From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Dec 2008 20:12:29 +0100 Subject: performance counters: core code Implement the core kernel bits of Performance Counters subsystem. The Linux Performance Counter subsystem provides an abstraction of performance counter hardware capabilities. It provides per task and per CPU counters, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: int perf_counter_open(u32 hw_event_type, u32 hw_event_period, u32 record_type, pid_t pid, int cpu); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. See more details in Documentation/perf-counters.txt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 171 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 9 +++ include/linux/syscalls.h | 6 ++ 3 files changed, 186 insertions(+) create mode 100644 include/linux/perf_counter.h (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h new file mode 100644 index 00000000000..22c4469abf4 --- /dev/null +++ b/include/linux/perf_counter.h @@ -0,0 +1,171 @@ +/* + * Performance counters: + * + * Copyright(C) 2008, Thomas Gleixner + * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_COUNTER_H +#define _LINUX_PERF_COUNTER_H + +#include + +#include +#include +#include +#include +#include + +struct task_struct; + +/* + * Generalized hardware event types, used by the hw_event_type parameter + * of the sys_perf_counter_open() syscall: + */ +enum hw_event_types { + PERF_COUNT_CYCLES, + PERF_COUNT_INSTRUCTIONS, + PERF_COUNT_CACHE_REFERENCES, + PERF_COUNT_CACHE_MISSES, + PERF_COUNT_BRANCH_INSTRUCTIONS, + PERF_COUNT_BRANCH_MISSES, + /* + * If this bit is set in the type, then trigger NMI sampling: + */ + PERF_COUNT_NMI = (1 << 30), +}; + +/* + * IRQ-notification data record type: + */ +enum perf_record_type { + PERF_RECORD_SIMPLE, + PERF_RECORD_IRQ, + PERF_RECORD_GROUP, +}; + +/** + * struct hw_perf_counter - performance counter hardware details + */ +struct hw_perf_counter { + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + s32 next_count; + u64 irq_period; +}; + +/* + * Hardcoded buffer length limit for now, for IRQ-fed events: + */ +#define PERF_DATA_BUFLEN 2048 + +/** + * struct perf_data - performance counter IRQ data sampling ... 
+ */ +struct perf_data { + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; +}; + +/** + * struct perf_counter - performance counter kernel representation: + */ +struct perf_counter { + struct list_head list; + int active; +#if BITS_PER_LONG == 64 + atomic64_t count; +#else + atomic_t count32[2]; +#endif + u64 __irq_period; + + struct hw_perf_counter hw; + + struct perf_counter_context *ctx; + struct task_struct *task; + + /* + * Protect attach/detach: + */ + struct mutex mutex; + + int oncpu; + int cpu; + + s32 hw_event_type; + enum perf_record_type record_type; + + /* read() / irq related data */ + wait_queue_head_t waitq; + /* optional: for NMIs */ + int wakeup_pending; + struct perf_data *irqdata; + struct perf_data *usrdata; + struct perf_data data[2]; +}; + +/** + * struct perf_counter_context - counter context structure + * + * Used as a container for task counters and CPU counters as well: + */ +struct perf_counter_context { +#ifdef CONFIG_PERF_COUNTERS + /* + * Protect the list of counters: + */ + spinlock_t lock; + struct list_head counters; + int nr_counters; + int nr_active; + struct task_struct *task; +#endif +}; + +/** + * struct perf_counter_cpu_context - per cpu counter context structure + */ +struct perf_cpu_context { + struct perf_counter_context ctx; + struct perf_counter_context *task_ctx; + int active_oncpu; + int max_pertask; +}; + +/* + * Set by architecture code: + */ +extern int perf_max_counters; + +#ifdef CONFIG_PERF_COUNTERS +extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); +extern void perf_counter_task_tick(struct task_struct *task, int cpu); +extern void perf_counter_init_task(struct task_struct *task); +extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_print_debug(void); +#else +static inline void +perf_counter_task_sched_in(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_sched_out(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_tick(struct task_struct *task, int cpu) { } +static inline void perf_counter_init_task(struct task_struct *task) { } +static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_print_debug(void) { } +#endif + +#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 55e30d11447..4c530278391 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,6 +71,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1326,6 +1327,7 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif + struct perf_counter_context perf_counter_ctx; #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; @@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +/* + * Call the function if the target task is executing on a CPU right now: + */ +extern void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info); + + #ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 04fb47bfb92..6cce728a626 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -624,4 +624,10 @@ asmlinkage long 
sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage int +sys_perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu); #endif -- cgit v1.2.3-70-g09d2 From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Dec 2008 21:43:39 +0100 Subject: perf counters: protect them against CSTATE transitions Impact: fix rare lost events problem There are CPUs whose performance counters misbehave on CSTATE transitions, so provide a way to just disable/enable them around deep idle methods. (hw_perf_enable_all() is cheap on x86.) Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++++++++- drivers/acpi/processor_idle.c | 8 ++++++++ include/linux/perf_counter.h | 4 ++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6a93d1f04d9..0a7f3bea2dc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -119,10 +120,21 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_disable_all(void) +void hw_perf_restore_ctrl(u64 ctrl) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); + +u64 hw_perf_disable_all(void) +{ + u64 ctrl; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } +EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5f8d746a9b8..cca804e6f1d 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -270,8 +270,11 @@ static atomic_t c3_cpu_count; /* Common C-state entry for C2, C3, .. */ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) { + u64 pctrl; + /* Don't trace irqs off for idle */ stop_critical_timings(); + pctrl = hw_perf_disable_all(); if (cstate->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cstate); @@ -284,6 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + hw_perf_restore_ctrl(pctrl); start_critical_timings(); } #endif /* !CONFIG_CPU_IDLE */ @@ -1425,8 +1429,11 @@ static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr, */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { + u64 pctrl; + /* Don't trace irqs off for idle */ stop_critical_timings(); + pctrl = hw_perf_disable_all(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -1441,6 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. 
*/ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + hw_perf_restore_ctrl(pctrl); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 22c4469abf4..5031b5614f2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -156,6 +156,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void hw_perf_restore_ctrl(u64 ctrl); +extern u64 hw_perf_disable_all(void); #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -166,6 +168,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void hw_perf_restore_ctrl(u64 ctrl) { } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ -- cgit v1.2.3-70-g09d2 From eab656ae04b9d3b83265e3db01c0d2c46b748ef7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:26:59 +0100 Subject: perf counters: clean up 'raw' type API Impact: cleanup Introduce a separate hw_event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++++ include/linux/syscalls.h | 8 +++----- kernel/perf_counter.c | 15 ++++++++------- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5031b5614f2..daedd7d87c2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -38,6 +38,7 @@ enum hw_event_types { * If this bit is set in the type, then trigger NMI sampling: */ PERF_COUNT_NMI = (1 << 30), + PERF_COUNT_RAW = (1 << 31), }; /* @@ -49,6 +50,12 @@ enum perf_record_type { PERF_RECORD_GROUP, }; +struct perf_counter_event { + u32 hw_event_type; + u32 hw_event_period; + u64 hw_raw_ctrl; +}; + /** * struct hw_perf_counter - performance counter hardware details */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6cce728a626..3ecd73d03da 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct perf_counter_event; #include #include @@ -625,9 +626,6 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu); +sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 20508f05365..96c333a5b0f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -734,26 +734,27 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) * @pid: target pid */ asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu) +sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd) { struct perf_counter_context *ctx; + struct 
perf_counter_event event; struct perf_counter *counter; int ret; + if (copy_from_user(&event, uevent, sizeof(event)) != 0) + return -EFAULT; + ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(hw_event_period, cpu, record_type); + counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, hw_event_type); + ret = hw_perf_counter_init(counter, event.hw_event_type); if (ret) goto err_free_put_context; -- cgit v1.2.3-70-g09d2 From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:35:37 +0100 Subject: perf counters: expand use of counter->event Impact: change syscall, cleanup Make use of the new perf_counters event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++++----------- include/linux/perf_counter.h | 4 +--- kernel/perf_counter.c | 10 +++++----- 3 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0a7f3bea2dc..30e7ebf7827 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +int hw_perf_counter_init(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->__irq_period; + hwc->irq_period = counter->event.hw_event_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->next_count = -((s32) hwc->irq_period); /* - * Negative event types mean raw encoded event+umask values: + * Raw event type provide the config in the event structure */ - if (hw_event_type < 0) { - counter->hw_event_type = -hw_event_type; - counter->hw_event_type &= ~PERF_COUNT_NMI; + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type == PERF_COUNT_RAW) { + hwc->config |= counter->event.hw_raw_ctrl; } else { - hw_event_type &= ~PERF_COUNT_NMI; if (hw_event_type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event_type]; } - hwc->config |= counter->hw_event_type; counter->wakeup_pending = 0; return 0; @@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, counter->event.hw_event_type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -418,7 +417,8 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + 
counter->event.hw_event_type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index daedd7d87c2..1f0017673e7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -96,8 +96,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - u64 __irq_period; - + struct perf_counter_event event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -111,7 +110,6 @@ struct perf_counter { int oncpu; int cpu; - s32 hw_event_type; enum perf_record_type record_type; /* read() / irq related data */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 96c333a5b0f..2557c670a3b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,7 +37,7 @@ static DEFINE_MUTEX(perf_resource_mutex); * Architecture provided APIs - weak aliases: */ -int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) +int __weak hw_perf_counter_init(struct perf_counter *counter) { return -EINVAL; } @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -722,7 +722,7 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->record_type = record_type; - counter->__irq_period = hw_event_period; + counter->event = *event; counter->wakeup_pending = 0; return counter; @@ -750,11 +750,11 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); + counter = perf_counter_alloc(&event, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, event.hw_event_type); + ret = hw_perf_counter_init(counter); if (ret) goto err_free_put_context; -- cgit v1.2.3-70-g09d2 From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. 
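To make the new calling convention concrete, here is a minimal user-space sketch -- illustrative only: no glibc wrapper exists yet, so a raw syscall() with a placeholder __NR_perf_counter_open number is assumed:

	#include <unistd.h>
	#include <sys/syscall.h>

	struct perf_counter_hw_event hw_event = {
		.type		= PERF_COUNT_INSTRUCTIONS,
		.irq_period	= 0,			/* pure counting, no sampling IRQs */
		.record_type	= PERF_RECORD_SIMPLE,
	};
	unsigned long long count;

	/* pid 0: current task, cpu -1: any CPU, group_fd -1: standalone counter */
	int fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);

	read(fd, &count, sizeof(count));	/* fetch the current counter value */

Once groups exist, passing an existing counter's fd as group_fd instead of -1 will attach the new counter to that leader.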
) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++----- include/linux/perf_counter.h | 98 ++++++++++++++++++++++++-------------- include/linux/syscalls.h | 12 +++-- kernel/perf_counter.c | 38 ++++++++------- 4 files changed, 106 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 30e7ebf7827..ef1936a871a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); */ int hw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; - u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 0; if (capable(CAP_SYS_ADMIN)) { hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event_type & PERF_COUNT_NMI) + if (hw_event->nmi) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->event.hw_event_period; + hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter) if (!hwc->irq_period) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -((s32) hwc->irq_period); + hwc->next_count = -(s32)hwc->irq_period; /* * Raw event type provide the config in the event structure */ - hw_event_type &= ~PERF_COUNT_NMI; - if (hw_event_type == PERF_COUNT_RAW) { - hwc->config |= counter->event.hw_raw_ctrl; + if (hw_event->raw) { + hwc->config |= hw_event->type; } else { - if (hw_event_type >= max_intel_perfmon_events) + if (hw_event->type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event->type]; } counter->wakeup_pending = 0; @@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) int bit; list_for_each_entry(counter, &ctx->counters, list) { - if (counter->record_type != PERF_RECORD_SIMPLE || + if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || counter == leader) continue; @@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->event.hw_event_type); + perf_store_irq_data(leader, counter->hw_event.type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -410,7 +409,7 @@ again: perf_save_and_restart(counter); - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: continue; case PERF_RECORD_IRQ: @@ -418,7 +417,7 @@ again: break; case PERF_RECORD_GROUP: perf_store_irq_data(counter, - counter->event.hw_event_type); + counter->hw_event.type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1f0017673e7..a2b4852e2d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -24,65 +24,93 @@ 
struct task_struct; /* - * Generalized hardware event types, used by the hw_event_type parameter - * of the sys_perf_counter_open() syscall: + * User-space ABI bits: + */ + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: */ enum hw_event_types { - PERF_COUNT_CYCLES, - PERF_COUNT_INSTRUCTIONS, - PERF_COUNT_CACHE_REFERENCES, - PERF_COUNT_CACHE_MISSES, - PERF_COUNT_BRANCH_INSTRUCTIONS, - PERF_COUNT_BRANCH_MISSES, /* - * If this bit is set in the type, then trigger NMI sampling: + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_NMI = (1 << 30), - PERF_COUNT_RAW = (1 << 31), + PERF_COUNT_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* * IRQ-notification data record type: */ -enum perf_record_type { - PERF_RECORD_SIMPLE, - PERF_RECORD_IRQ, - PERF_RECORD_GROUP, +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; -struct perf_counter_event { - u32 hw_event_type; - u32 hw_event_period; - u64 hw_raw_ctrl; +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + u64 type; + + u64 irq_period; + u32 record_type; + + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + __reserved_1 : 29; + + u64 __reserved_2; }; +/* + * Kernel-internal data types: + */ + /** - * struct hw_perf_counter - performance counter hardware details + * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - u64 prev_count; - s32 next_count; - u64 irq_period; + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + u64 irq_period; + s32 next_count; }; /* * Hardcoded buffer length limit for now, for IRQ-fed events: */ -#define PERF_DATA_BUFLEN 2048 +#define PERF_DATA_BUFLEN 2048 /** * struct perf_data - performance counter IRQ data sampling ... 
*/ struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; }; /** @@ -96,7 +124,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -110,8 +138,6 @@ struct perf_counter { int oncpu; int cpu; - enum perf_record_type record_type; - /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3ecd73d03da..a549678b7c3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,7 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; -struct perf_counter_event; +struct perf_counter_hw_event; #include #include @@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd); + +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2557c670a3b..0d323ceda3a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: return perf_read_hw(counter, buf, count); @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) INIT_LIST_HEAD(&counter->list); init_waitqueue_head(&counter->waitq); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; - counter->cpu = cpu; - counter->record_type = record_type; - counter->event = *event; - counter->wakeup_pending = 0; + counter->irqdata = &counter->data[0]; + counter->usrdata = &counter->data[1]; + counter->cpu = cpu; + counter->hw_event = *hw_event; + counter->wakeup_pending = 0; return counter; } /** - * sys_perf_task_open - open a performance counter associate it to a task - * @hw_event_type: event type for monitoring/sampling... 
+ * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * + * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd) +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd) + { struct perf_counter_context *ctx; - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct perf_counter *counter; int ret; - if (copy_from_user(&event, uevent, sizeof(event)) != 0) + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; ctx = find_get_context(pid, cpu); @@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&event, cpu, record_type); + counter = perf_counter_alloc(&hw_event, cpu); if (!counter) goto err_put_context; -- cgit v1.2.3-70-g09d2 From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 08:38:42 +0100 Subject: perf counters: add support for group counters Impact: add group counters This patch adds the "counter groups" abstraction. Groups of counters behave much like normal 'single' counters, with a few semantic and behavioral extensions on top of that. A counter group is created by creating a new counter with the open() syscall's group-leader group_fd file descriptor parameter pointing to another, already existing counter. Groups of counters are scheduled in and out in one atomic group, and they are also roundrobin-scheduled atomically. Counters that are member of a group can also record events with an (atomic) extended timestamp that extends to all members of the group, if the record type is set to PERF_RECORD_GROUP. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++-- include/linux/perf_counter.h | 8 +- kernel/perf_counter.c | 282 ++++++++++++++++++++++++++++--------- 3 files changed, 236 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ef1936a871a..54b4ad0cce6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter) } static void -perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { - struct perf_counter_context *ctx = leader->ctx; - struct perf_counter *counter; + struct perf_counter *counter, *group_leader = sibling->group_leader; int bit; - list_for_each_entry(counter, &ctx->counters, list) { - if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || - counter == leader) - continue; + /* + * Store the counter's own timestamp first: + */ + perf_store_irq_data(sibling, sibling->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(sibling)); - if (counter->active) { + /* + * Then store sibling timestamps (if any): + */ + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + if (!counter->active) { /* * When counter was not in the overflow mask, we have to * read it from hardware. 
We read it as well, when it @@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event.type); - perf_store_irq_data(leader, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(counter)); } } @@ -416,10 +420,6 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, - counter->hw_event.type); - perf_store_irq_data(counter, - atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); break; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a2b4852e2d7..7af7d896546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -117,7 +117,10 @@ struct perf_data { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { - struct list_head list; + struct list_head list_entry; + struct list_head sibling_list; + struct perf_counter *group_leader; + int active; #if BITS_PER_LONG == 64 atomic64_t count; @@ -158,7 +161,8 @@ struct perf_counter_context { * Protect the list of counters: */ spinlock_t lock; - struct list_head counters; + + struct list_head counter_list; int nr_counters; int nr_active; struct task_struct *task; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0d323ceda3a..fa59fe8c02d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,7 @@ void __weak hw_perf_counter_setup(void) { } * Read the cached counter in counter safe against cross CPU / NMI * modifications. 64 bit version - no complications. */ -static inline u64 perf_read_counter_safe(struct perf_counter *counter) +static inline u64 perf_counter_read_safe(struct perf_counter *counter) { return (u64) atomic64_read(&counter->count); } @@ -66,7 +67,7 @@ static inline u64 perf_read_counter_safe(struct perf_counter *counter) * Read the cached counter in counter safe against cross CPU / NMI * modifications. 32 bit version. 
*/ -static u64 perf_read_counter_safe(struct perf_counter *counter) +static u64 perf_counter_read_safe(struct perf_counter *counter) { u32 cntl, cnth; @@ -83,13 +84,55 @@ static u64 perf_read_counter_safe(struct perf_counter *counter) #endif +static void +list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *group_leader = counter->group_leader; + + /* + * Depending on whether it is a standalone or sibling counter, + * add it straight to the context's counter list, or to the group + * leader's sibling list: + */ + if (counter->group_leader == counter) + list_add_tail(&counter->list_entry, &ctx->counter_list); + else + list_add_tail(&counter->list_entry, &group_leader->sibling_list); +} + +static void +list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *sibling, *tmp; + + list_del_init(&counter->list_entry); + + if (list_empty(&counter->sibling_list)) + return; + + /* + * If this was a group counter with sibling counters then + * upgrade the siblings to singleton counters by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, + &counter->sibling_list, list_entry) { + + list_del_init(&sibling->list_entry); + list_add_tail(&sibling->list_entry, &ctx->counter_list); + WARN_ON_ONCE(!sibling->group_leader); + WARN_ON_ONCE(sibling->group_leader == sibling); + sibling->group_leader = sibling; + } +} + /* * Cross CPU call to remove a performance counter * * We disable the counter on the hardware level first. After that we * remove it from the context list. */ -static void __perf_remove_from_context(void *info) +static void __perf_counter_remove_from_context(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; @@ -119,7 +162,7 @@ static void __perf_remove_from_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_del_init(&counter->list); + list_del_counter(counter, ctx); hw_perf_enable_all(); if (!ctx->task) { @@ -144,7 +187,7 @@ static void __perf_remove_from_context(void *info) * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. */ -static void perf_remove_from_context(struct perf_counter *counter) +static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; @@ -155,32 +198,32 @@ static void perf_remove_from_context(struct perf_counter *counter) * the removal is always sucessful. */ smp_call_function_single(counter->cpu, - __perf_remove_from_context, + __perf_counter_remove_from_context, counter, 1); return; } retry: - task_oncpu_function_call(task, __perf_remove_from_context, + task_oncpu_function_call(task, __perf_counter_remove_from_context, counter); spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ - if (ctx->nr_active && !list_empty(&counter->list)) { + if (ctx->nr_active && !list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } /* * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if it the call above did not + * can remove the counter safely, if the call above did not * succeed. 
*/ - if (!list_empty(&counter->list)) { + if (!list_empty(&counter->list_entry)) { ctx->nr_counters--; - list_del_init(&counter->list); + list_del_counter(counter, ctx); counter->task = NULL; } spin_unlock_irq(&ctx->lock); @@ -211,7 +254,7 @@ static void __perf_install_in_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_add_tail(&counter->list, &ctx->counters); + list_add_counter(counter, ctx); hw_perf_enable_all(); ctx->nr_counters++; @@ -268,7 +311,7 @@ retry: * If the context is active and the counter has not been added * we need to retry the smp call. */ - if (ctx->nr_active && list_empty(&counter->list)) { + if (ctx->nr_active && list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -278,13 +321,45 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list)) { - list_add_tail(&counter->list, &ctx->counters); + if (list_empty(&counter->list_entry)) { + list_add_counter(counter, ctx); ctx->nr_counters++; } spin_unlock_irq(&ctx->lock); } +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (!counter->active) + return; + + hw_perf_counter_disable(counter); + counter->active = 0; + counter->oncpu = -1; + + cpuctx->active_oncpu--; + ctx->nr_active--; +} + +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -306,21 +381,48 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { - if (!ctx->nr_active) - break; - if (counter->active) { - hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; - ctx->nr_active--; - cpuctx->active_oncpu--; - } + if (ctx->nr_active) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) + group_sched_out(counter, cpuctx, ctx); } spin_unlock(&ctx->lock); cpuctx->task_ctx = NULL; } +static void +counter_sched_in(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + if (!counter->active) + return; + + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + + cpuctx->active_oncpu++; + ctx->nr_active++; +} + +static void +group_sched_in(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + struct perf_counter *counter; + + counter_sched_in(group_counter, cpuctx, ctx, cpu); + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_in(counter, cpuctx, ctx, cpu); +} + /* * Called from scheduler to add the counters of the current task * with interrupts disabled. 
@@ -342,19 +444,21 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (ctx->nr_active == cpuctx->max_pertask) break; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of counters: + */ if (counter->cpu != -1 && counter->cpu != cpu) continue; - hw_perf_counter_enable(counter); - counter->active = 1; - counter->oncpu = cpu; - ctx->nr_active++; - cpuctx->active_oncpu++; + group_sched_in(counter, cpuctx, ctx, cpu); } spin_unlock(&ctx->lock); + cpuctx->task_ctx = ctx; } @@ -371,12 +475,12 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) spin_lock(&ctx->lock); /* - * Rotate the first entry last: + * Rotate the first entry last (works just fine for group counters too): */ hw_perf_disable_all(); - list_for_each_entry(counter, &ctx->counters, list) { - list_del(&counter->list); - list_add_tail(&counter->list, &ctx->counters); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + list_del(&counter->list_entry); + list_add_tail(&counter->list_entry, &ctx->counter_list); break; } hw_perf_enable_all(); @@ -386,17 +490,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counter_list); + ctx->nr_counters = 0; + ctx->task = task; +} /* * Initialize the perf_counter context in task_struct */ void perf_counter_init_task(struct task_struct *task) { - struct perf_counter_context *ctx = &task->perf_counter_ctx; - - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&ctx->counters); - ctx->nr_counters = 0; - ctx->task = task; + __perf_counter_init_context(&task->perf_counter_ctx, task); } /* @@ -407,7 +518,7 @@ static void __hw_perf_counter_read(void *info) hw_perf_counter_read(info); } -static u64 perf_read_counter(struct perf_counter *counter) +static u64 perf_counter_read(struct perf_counter *counter) { /* * If counter is enabled and currently active on a CPU, update the @@ -418,7 +529,7 @@ static u64 perf_read_counter(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_read_counter_safe(counter); + return perf_counter_read_safe(counter); } /* @@ -555,7 +666,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); put_context(ctx); mutex_unlock(&counter->mutex); @@ -577,7 +688,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return -EINVAL; mutex_lock(&counter->mutex); - cntval = perf_read_counter(counter); + cntval = perf_counter_read(counter); mutex_unlock(&counter->mutex); return put_user(cntval, (u64 __user *) buf) ? 
-EFAULT : sizeof(cntval); @@ -707,15 +818,25 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, + int cpu, + struct perf_counter *group_leader) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; + /* + * Single counters are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = counter; + mutex_init(&counter->mutex); - INIT_LIST_HEAD(&counter->list); + INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); counter->irqdata = &counter->data[0]; @@ -723,6 +844,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) counter->cpu = cpu; counter->hw_event = *hw_event; counter->wakeup_pending = 0; + counter->group_leader = group_leader; return counter; } @@ -743,20 +865,45 @@ asmlinkage int sys_perf_counter_open( int group_fd) { - struct perf_counter_context *ctx; + struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; - struct perf_counter *counter; + struct perf_counter_context *ctx; + struct file *group_file = NULL; + int fput_needed = 0; int ret; if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; + /* + * Look up the group leader: + */ + group_leader = NULL; + if (group_fd != -1) { + ret = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto out_fput; + if (group_file->f_op != &perf_fops) + goto out_fput; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy: + */ + if (group_leader->group_leader) + goto out_fput; + } + + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&hw_event, cpu); + counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; @@ -770,11 +917,14 @@ asmlinkage int sys_perf_counter_open( if (ret < 0) goto err_remove_free_put_context; +out_fput: + fput_light(group_file, fput_needed); + return ret; err_remove_free_put_context: mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); err_free_put_context: @@ -783,40 +933,40 @@ err_free_put_context: err_put_context: put_context(ctx); - return ret; + goto out_fput; } -static void __cpuinit perf_init_cpu(int cpu) +static void __cpuinit perf_counter_init_cpu(int cpu) { - struct perf_cpu_context *ctx; + struct perf_cpu_context *cpuctx; - ctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_init(&ctx->ctx.lock); - INIT_LIST_HEAD(&ctx->ctx.counters); + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_counter_init_context(&cpuctx->ctx, NULL); mutex_lock(&perf_resource_mutex); - ctx->max_pertask = perf_max_counters - perf_reserved_percpu; + cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; mutex_unlock(&perf_resource_mutex); + hw_perf_counter_setup(); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_exit_cpu(void *info) +static void __perf_counter_exit_cpu(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = &cpuctx->ctx; struct perf_counter *counter, *tmp; - list_for_each_entry_safe(counter, tmp, &ctx->counters, 
list) - __perf_remove_from_context(counter); + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) + __perf_counter_remove_from_context(counter); } -static void perf_exit_cpu(int cpu) +static void perf_counter_exit_cpu(int cpu) { - smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1); + smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); } #else -static inline void perf_exit_cpu(int cpu) { } +static inline void perf_counter_exit_cpu(int cpu) { } #endif static int __cpuinit @@ -828,12 +978,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - perf_init_cpu(cpu); + perf_counter_init_cpu(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - perf_exit_cpu(cpu); + perf_counter_exit_cpu(cpu); break; default: -- cgit v1.2.3-70-g09d2 From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 12:46:46 +0100 Subject: perf counters: hw driver API Impact: restructure code, introduce hw_ops driver abstraction Introduce this abstraction to handle counter details: struct hw_perf_counter_ops { void (*hw_perf_counter_enable) (struct perf_counter *counter); void (*hw_perf_counter_disable) (struct perf_counter *counter); void (*hw_perf_counter_read) (struct perf_counter *counter); }; This will be useful to support assymetric hw details, and it will also be useful to implement "software counters". (Counters that count kernel managed sw events such as pagefaults, context-switches, wall-clock time or task-local time.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++++++++++++++++--------- include/linux/perf_counter.h | 15 +++++++++++++ kernel/perf_counter.c | 45 ++++++++++++++++++++------------------ 3 files changed, 66 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 54b4ad0cce6..718b635dece 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; @@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void) EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void -__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) { wrmsr(hwc->config_base + idx, hwc->config, 0); } @@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) wrmsr(hwc->counter_base + idx, hwc->next_count, 0); } -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -void hw_perf_counter_enable(struct perf_counter *counter) +static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - 
__hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; __hw_perf_counter_set_period(hwc, idx); - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } #ifdef CONFIG_X86_64 @@ -282,20 +282,20 @@ void perf_counter_print_debug(void) local_irq_enable(); } -void hw_perf_counter_disable(struct perf_counter *counter) +static void x86_perf_counter_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; __hw_perf_save_counter(counter, hwc, idx); } -void hw_perf_counter_read(struct perf_counter *counter) +static void x86_perf_counter_read(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; unsigned long addr = hwc->counter_base + hwc->idx; @@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } static void @@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } + +static struct hw_perf_counter_ops x86_perf_counter_ops = { + .hw_perf_counter_enable = x86_perf_counter_enable, + .hw_perf_counter_disable = x86_perf_counter_disable, + .hw_perf_counter_read = x86_perf_counter_read, +}; + +struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +{ + int err; + + err = __hw_perf_counter_init(counter); + if (err) + return NULL; + + return &x86_perf_counter_ops; +} diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7af7d896546..27385641ecb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -113,6 +113,17 @@ struct perf_data { u8 data[PERF_DATA_BUFLEN]; }; +struct perf_counter; + +/** + * struct hw_perf_counter_ops - performance counter hw ops + */ +struct hw_perf_counter_ops { + void (*hw_perf_counter_enable) (struct perf_counter *counter); + void (*hw_perf_counter_disable) (struct perf_counter *counter); + void (*hw_perf_counter_read) (struct perf_counter *counter); +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -120,6 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; + struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -185,6 +197,9 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS +extern struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter); + extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 278209c547a..e6e41ca9546 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,18 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ - -int __weak hw_perf_counter_init(struct perf_counter *counter) +extern __weak struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { - return -EINVAL; + return ERR_PTR(-EINVAL); } -void __weak 
hw_perf_counter_enable(struct perf_counter *counter) { } -void __weak hw_perf_counter_disable(struct perf_counter *counter) { } -void __weak hw_perf_counter_read(struct perf_counter *counter) { } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +void __weak hw_perf_disable_all(void) { } +void __weak hw_perf_enable_all(void) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -146,7 +143,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); if (counter->active) { - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; ctx->nr_active--; cpuctx->active_oncpu--; @@ -257,7 +254,7 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; ctx->nr_active++; @@ -333,7 +330,7 @@ counter_sched_out(struct perf_counter *counter, if (!counter->active) return; - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; counter->oncpu = -1; @@ -392,7 +389,7 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -509,7 +506,9 @@ void perf_counter_init_task(struct task_struct *task) */ static void __hw_perf_counter_read(void *info) { - hw_perf_counter_read(info); + struct perf_counter *counter = info; + + counter->hw_ops->hw_perf_counter_read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -816,8 +815,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); + struct hw_perf_counter_ops *hw_ops; + struct perf_counter *counter; + counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; @@ -839,6 +840,14 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->hw_event = *hw_event; counter->wakeup_pending = 0; counter->group_leader = group_leader; + counter->hw_ops = NULL; + + hw_ops = hw_perf_counter_init(counter); + if (!hw_ops) { + kfree(counter); + return NULL; + } + counter->hw_ops = hw_ops; return counter; } @@ -908,10 +917,6 @@ asmlinkage int sys_perf_counter_open( if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter); - if (ret) - goto err_free_put_context; - perf_install_in_context(ctx, counter, cpu); ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); @@ -927,8 +932,6 @@ err_remove_free_put_context: mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); - -err_free_put_context: kfree(counter); err_put_context: -- cgit v1.2.3-70-g09d2 From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:21:10 +0100 Subject: perf counters: implement PERF_COUNT_CPU_CLOCK Impact: add new perf-counter type The 'CPU clock' counter counts the amount of CPU clock time that is elapsing, in nanoseconds. 
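Requesting such a counter looks exactly like requesting a hardware one -- only the (negative) type value differs. A hypothetical sketch, under the same raw-syscall assumption as the earlier example:

	struct perf_counter_hw_event hw_event = {
		.type = PERF_COUNT_CPU_CLOCK,	/* negative value: kernel software counter */
	};
	int fd = syscall(__NR_perf_counter_open, &hw_event,
			 0 /* self */, -1 /* any cpu */, -1 /* no group */);

Reading this fd then yields the nanoseconds of CPU clock time elapsing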
(regardless of how much of it the task is spending on a CPU executing) This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 36 ++------------- include/linux/perf_counter.h | 9 ++-- kernel/perf_counter.c | 95 ++++++++++++++++++++++++++++++++------ 3 files changed, 92 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 718b635dece..43c8e9a38b4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter) __x86_perf_counter_enable(hwc, idx); } -#ifdef CONFIG_X86_64 -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} -#else -/* - * Todo: add proper atomic64_t support to 32-bit x86: - */ -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} -#endif - static void __hw_perf_save_counter(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter) } while (offs != hwc->prev_count); val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; + val = (s64)hwc->irq_period + (s64)val32; atomic64_counter_set(counter, hwc->prev_count + val); } @@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } -static struct hw_perf_counter_ops x86_perf_counter_ops = { +static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, .hw_perf_counter_read = x86_perf_counter_read, }; -struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +const struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { int err; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 27385641ecb..9a1713a1be2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -131,7 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -197,7 +197,7 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern struct hw_perf_counter_ops * +extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); @@ -208,6 +208,9 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern void hw_perf_restore_ctrl(u64 ctrl); extern u64 hw_perf_disable_all(void); +extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); +extern u64 
atomic64_counter_read(struct perf_counter *counter); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -219,7 +222,7 @@ static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e6e41ca9546..506286e5ba6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,15 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak struct hw_perf_counter_ops * +extern __weak const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter) { return ERR_PTR(-EINVAL); } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +u64 __weak hw_perf_disable_all(void) { return 0; } +void __weak hw_perf_restore_ctrl(u64 ctrl) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -58,6 +58,16 @@ static inline u64 perf_counter_read_safe(struct perf_counter *counter) return (u64) atomic64_read(&counter->count); } +void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} + #else /* @@ -79,6 +89,20 @@ static u64 perf_counter_read_safe(struct perf_counter *counter) return cntl | ((u64) cnth) << 32; } +void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} + #endif static void @@ -131,6 +155,7 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -155,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_del_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); if (!ctx->task) { /* @@ -232,6 +257,7 @@ static void __perf_install_in_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; int cpu = smp_processor_id(); + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -247,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. 
*/ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_add_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); ctx->nr_counters++; @@ -457,6 +483,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + u64 perf_flags; if (likely(!ctx->nr_counters)) return; @@ -468,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); spin_unlock(&ctx->lock); @@ -807,6 +834,42 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; +static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_read(struct perf_counter *counter) +{ + int cpu = raw_smp_processor_id(); + + atomic64_counter_set(counter, cpu_clock(cpu)); +} + +static const struct hw_perf_counter_ops perf_ops_cpu_clock = { + .hw_perf_counter_enable = cpu_clock_perf_counter_enable, + .hw_perf_counter_disable = cpu_clock_perf_counter_disable, + .hw_perf_counter_read = cpu_clock_perf_counter_read, +}; + +static const struct hw_perf_counter_ops * +sw_perf_counter_init(struct perf_counter *counter) +{ + const struct hw_perf_counter_ops *hw_ops = NULL; + + switch (counter->hw_event.type) { + case PERF_COUNT_CPU_CLOCK: + hw_ops = &perf_ops_cpu_clock; + break; + default: + break; + } + return hw_ops; +} + /* * Allocate and initialize a counter structure */ @@ -815,7 +878,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -842,7 +905,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->hw_ops = NULL; - hw_ops = hw_perf_counter_init(counter); + hw_ops = NULL; + if (!hw_event->raw && hw_event->type < 0) + hw_ops = sw_perf_counter_init(counter); + if (!hw_ops) { + hw_ops = hw_perf_counter_init(counter); + } + if (!hw_ops) { kfree(counter); return NULL; @@ -912,7 +981,7 @@ asmlinkage int sys_perf_counter_open( goto err_put_context; } - ret = -ENOMEM; + ret = -EINVAL; counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; -- cgit v1.2.3-70-g09d2 From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:45:51 +0100 Subject: perf counters: consolidate hw_perf save/restore APIs Impact: cleanup Rename them to better match up the usual IRQ disable/enable APIs: hw_perf_disable_all() => hw_perf_save_disable() hw_perf_restore_ctrl() => hw_perf_restore() Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- drivers/acpi/processor_idle.c | 10 +++++----- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 16 ++++++++-------- 4 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c 
b/arch/x86/kernel/cpu/perf_counter.c index 43c8e9a38b4..3e1dbebe22b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -118,13 +118,13 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore_ctrl(u64 ctrl) +void hw_perf_restore(u64 ctrl) { wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } -EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); +EXPORT_SYMBOL_GPL(hw_perf_restore); -u64 hw_perf_disable_all(void) +u64 hw_perf_save_disable(void) { u64 ctrl; @@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); return ctrl; } -EXPORT_SYMBOL_GPL(hw_perf_disable_all); +EXPORT_SYMBOL_GPL(hw_perf_save_disable); static inline void __x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index cca804e6f1d..a3e66a33b7a 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -270,11 +270,11 @@ static atomic_t c3_cpu_count; /* Common C-state entry for C2, C3, .. */ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) { - u64 pctrl; + u64 perf_flags; /* Don't trace irqs off for idle */ stop_critical_timings(); - pctrl = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); if (cstate->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cstate); @@ -287,7 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore_ctrl(pctrl); + hw_perf_restore(perf_flags); start_critical_timings(); } #endif /* !CONFIG_CPU_IDLE */ @@ -1433,7 +1433,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) /* Don't trace irqs off for idle */ stop_critical_timings(); - pctrl = hw_perf_disable_all(); + pctrl = hw_perf_save_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -1448,7 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. 
*/ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore_ctrl(pctrl); + hw_perf_restore(pctrl); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9a1713a1be2..68f6e3ad531 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -67,7 +67,7 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - u64 type; + s64 type; u64 irq_period; u32 record_type; @@ -206,8 +206,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); -extern void hw_perf_restore_ctrl(u64 ctrl); -extern u64 hw_perf_disable_all(void); +extern u64 hw_perf_save_disable(void); +extern void hw_perf_restore(u64 ctrl); extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); extern u64 atomic64_counter_read(struct perf_counter *counter); @@ -221,8 +221,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } -static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline void hw_perf_restore(u64 ctrl) { } +static inline u64 hw_perf_save_disable(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 506286e5ba6..0e93fea1712 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -43,8 +43,8 @@ hw_perf_counter_init(struct perf_counter *counter) return ERR_PTR(-EINVAL); } -u64 __weak hw_perf_disable_all(void) { return 0; } -void __weak hw_perf_restore_ctrl(u64 ctrl) { } +u64 __weak hw_perf_save_disable(void) { return 0; } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -180,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_del_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); if (!ctx->task) { /* @@ -273,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. 
*/ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_add_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); ctx->nr_counters++; @@ -495,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); -- cgit v1.2.3-70-g09d2 From bae43c9945ebeef15e7952e317efb02393d3bfc7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:03:20 +0100 Subject: perf counters: implement PERF_COUNT_TASK_CLOCK Impact: add new perf-counter type The 'task clock' counter counts the amount of time a task is executing, in nanoseconds. It stops ticking when a task is scheduled out either due to it blocking, sleeping or it being preempted. This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++-- kernel/perf_counter.c | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 68f6e3ad531..30c0ec8c1ee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -50,8 +50,11 @@ enum hw_event_types { */ PERF_COUNT_CPU_CLOCK = -1, PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, + /* + * Future software events: + */ + /* PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, */ }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0e93fea1712..a0fe8474ee2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -855,6 +855,25 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .hw_perf_counter_read = cpu_clock_perf_counter_read, }; +static void task_clock_perf_counter_enable(struct perf_counter *counter) +{ +} + +static void task_clock_perf_counter_disable(struct perf_counter *counter) +{ +} + +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ + atomic64_counter_set(counter, current->se.sum_exec_runtime); +} + +static const struct hw_perf_counter_ops perf_ops_task_clock = { + .hw_perf_counter_enable = task_clock_perf_counter_enable, + .hw_perf_counter_disable = task_clock_perf_counter_disable, + .hw_perf_counter_read = task_clock_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -864,6 +883,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; break; + case PERF_COUNT_TASK_CLOCK: + hw_ops = &perf_ops_task_clock; + break; default: break; } -- cgit v1.2.3-70-g09d2 From 1d1c7ddbfab358445a542715551301b7fc363e28 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:59:31 +0100 Subject: perf counters: add prctl interface to disable/enable counters Add a way for self-monitoring tasks to disable/enable counters summarily, via a prctl: PR_TASK_PERF_COUNTERS_DISABLE 31 PR_TASK_PERF_COUNTERS_ENABLE 32 Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++ 
include/linux/prctl.h | 3 ++ kernel/perf_counter.c | 86 ++++++++++++++++++++++++++++++++++++++++---- kernel/sys.c | 7 ++++ 4 files changed, 93 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 30c0ec8c1ee..97d86c293ee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -213,6 +213,8 @@ extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); extern u64 atomic64_counter_read(struct perf_counter *counter); +extern int perf_counter_task_disable(void); +extern int perf_counter_task_enable(void); #else static inline void @@ -226,6 +228,8 @@ static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } +static inline int perf_counter_task_disable(void) { return -EINVAL; } +static inline int perf_counter_task_enable(void) { return -EINVAL; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 48d887e3c6e..b00df4c79c6 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -85,4 +85,7 @@ #define PR_SET_TIMERSLACK 29 #define PR_GET_TIMERSLACK 30 +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a0fe8474ee2..4e679b91d8b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -415,6 +415,9 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { + if (counter->active == -1) + return; + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -479,6 +482,79 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) cpuctx->task_ctx = ctx; } +int perf_counter_task_disable(void) +{ + struct task_struct *curr = current; + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + u64 perf_flags; + int cpu; + + if (likely(!ctx->nr_counters)) + return 0; + + local_irq_disable(); + cpu = smp_processor_id(); + + perf_counter_task_sched_out(curr, cpu); + + spin_lock(&ctx->lock); + + /* + * Disable all the counters: + */ + perf_flags = hw_perf_save_disable(); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + WARN_ON_ONCE(counter->active == 1); + counter->active = -1; + } + hw_perf_restore(perf_flags); + + spin_unlock(&ctx->lock); + + local_irq_enable(); + + return 0; +} + +int perf_counter_task_enable(void) +{ + struct task_struct *curr = current; + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + u64 perf_flags; + int cpu; + + if (likely(!ctx->nr_counters)) + return 0; + + local_irq_disable(); + cpu = smp_processor_id(); + + spin_lock(&ctx->lock); + + /* + * Disable all the counters: + */ + perf_flags = hw_perf_save_disable(); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->active != -1) + continue; + counter->active = 0; + } + hw_perf_restore(perf_flags); + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(curr, cpu); + + local_irq_enable(); + + return 0; +} + void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_counter_context *ctx = 
&curr->perf_counter_ctx; @@ -951,13 +1027,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, * @cpu: target cpu * @group_fd: group leader counter fd */ -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd) - +asmlinkage int +sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, int cpu, int group_fd) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8f7d1..0f66633be31 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1716,6 +1717,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; + case PR_TASK_PERF_COUNTERS_DISABLE: + error = perf_counter_task_disable(); + break; + case PR_TASK_PERF_COUNTERS_ENABLE: + error = perf_counter_task_enable(); + break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; break; -- cgit v1.2.3-70-g09d2 From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 15:17:03 +0100 Subject: perf counters: clean up state transitions Impact: cleanup Introduce a proper enum for the 3 states of a counter: PERF_COUNTER_STATE_OFF = -1 PERF_COUNTER_STATE_INACTIVE = 0 PERF_COUNTER_STATE_ACTIVE = 1 and rename counter->active to counter->state and propagate the changes everywhere. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- include/linux/perf_counter.h | 11 ++++++++++- kernel/perf_counter.c | 29 ++++++++++++++--------------- 3 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3e1dbebe22b..4854cca7fff 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Then store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { /* * When counter was not in the overflow mask, we have to * read it from hardware. 
We read it as well, when it diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 97d86c293ee..8cb095fa442 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -127,6 +127,15 @@ struct hw_perf_counter_ops { void (*hw_perf_counter_read) (struct perf_counter *counter); }; +/** + * enum perf_counter_active_state - the states of a counter + */ +enum perf_counter_active_state { + PERF_COUNTER_STATE_OFF = -1, + PERF_COUNTER_STATE_INACTIVE = 0, + PERF_COUNTER_STATE_ACTIVE = 1, +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -136,7 +145,7 @@ struct perf_counter { struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; - int active; + enum perf_counter_active_state state; #if BITS_PER_LONG == 64 atomic64_t count; #else diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4e679b91d8b..559130b8774 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -167,9 +167,9 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; counter->task = NULL; @@ -281,7 +281,7 @@ static void __perf_install_in_context(void *info) if (cpuctx->active_oncpu < perf_max_counters) { counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; @@ -328,7 +328,6 @@ retry: spin_lock_irq(&ctx->lock); /* - * If the context is active and the counter has not been added * we need to retry the smp call. 
*/ if (ctx->nr_active && list_empty(&counter->list_entry)) { @@ -353,12 +352,12 @@ counter_sched_out(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx) { - if (!counter->active) + if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->oncpu = -1; cpuctx->active_oncpu--; ctx->nr_active--; @@ -415,11 +414,11 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (counter->active == -1) + if (counter->state == PERF_COUNTER_STATE_OFF) return; counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; @@ -506,8 +505,8 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - WARN_ON_ONCE(counter->active == 1); - counter->active = -1; + WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE); + counter->state = PERF_COUNTER_STATE_OFF; } hw_perf_restore(perf_flags); @@ -540,9 +539,9 @@ int perf_counter_task_enable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->active != -1) + if (counter->state != PERF_COUNTER_STATE_OFF) continue; - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; } hw_perf_restore(perf_flags); @@ -620,7 +619,7 @@ static u64 perf_counter_read(struct perf_counter *counter) * If counter is enabled and currently active on a CPU, update the * value in the counter structure: */ - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __hw_perf_counter_read, counter, 1); } @@ -673,7 +672,7 @@ static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) retry: spin_lock_irq(&ctx->lock); - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { counter->irqdata = counter->usrdata; counter->usrdata = oldirqdata; spin_unlock_irq(&ctx->lock); -- cgit v1.2.3-70-g09d2 From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Dec 2008 09:00:03 +0100 Subject: perfcounters: restructure x86 counter math Impact: restructure code Change counter math from absolute values to clear delta logic. We try to extract elapsed deltas from the raw hw counter - and put that into the generic counter. 
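To make the delta logic concrete, here is a minimal sketch of the update step (simplified from the x86_perf_counter_update() implementation in the patch below; sketch_counter_update() and read_hw_counter() are made-up illustrative names, the latter standing in for the rdmsrl() of the hardware counter MSR):

static void
sketch_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
	u64 prev_raw, new_raw, delta;

	/*
	 * An NMI can update hwc->prev_count behind our back, so
	 * re-read and retry until the cmpxchg succeeds:
	 */
	do {
		prev_raw = atomic64_read(&hwc->prev_count);
		new_raw = read_hw_counter(hwc);	/* stand-in for rdmsrl() */
	} while (atomic64_cmpxchg(&hwc->prev_count, prev_raw, new_raw) != prev_raw);

	/* not all hw sign-extends above its physical width - clip to 32 bits: */
	delta = (u64)(u32)((s32)new_raw - (s32)prev_raw);

	atomic64_add(delta, &counter->count);	/* fold into the generic counter */
	atomic64_sub(delta, &hwc->period_left);	/* progress towards the next IRQ */
}

The same helper can then serve both the read path and the disable path: when a counter is scheduled out, its remaining delta is simply drained into the generic counter.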
Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++++++----------------- include/linux/perf_counter.h | 15 ++- kernel/perf_counter.c | 68 +---------- 4 files changed, 137 insertions(+), 178 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f2fdc186724..fe94490bab6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -643,7 +643,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) - select HAVE_PERF_COUNTERS + select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b903f8df72b..5afae13d8d5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] = const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static void +x86_perf_counter_update(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + u64 prev_raw_count, new_raw_count, delta; + + WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); + /* + * Careful: an NMI might modify the previous counter value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic counter atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->counter_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (counter-)time and add that to the generic counter. 
+ * + * Careful, not all hw sign-extends above the physical width + * of the count, so we do that by clipping the delta to 32 bits: + */ + delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + WARN_ON_ONCE((int)delta < 0); + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if (!hwc->irq_period) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -(s32)hwc->irq_period; + atomic64_set(&hwc->period_left, hwc->irq_period); /* * Raw event type provide the config in the event structure @@ -118,12 +160,6 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore(u64 ctrl) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); -} -EXPORT_SYMBOL_GPL(hw_perf_restore); - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void) } EXPORT_SYMBOL_GPL(hw_perf_save_disable); +void hw_perf_restore(u64 ctrl) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore); + static inline void -__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { - wrmsr(hwc->config_base + idx, hwc->config, 0); + int err; + + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + WARN_ON_ONCE(err); } -static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); -static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) +/* + * Set the next IRQ period, based on the hwc->period_left value. 
+ * To be called with the counter disabled in hw: + */ +static void +__hw_perf_counter_set_period(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { - per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + s32 left = atomic64_read(&hwc->period_left); + s32 period = hwc->irq_period; + + WARN_ON_ONCE(period <= 0); + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + } - wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + WARN_ON_ONCE(left <= 0); + + per_cpu(prev_left[idx], smp_processor_id()) = left; + + /* + * The hw counter starts counting from this counter offset, + * mark it to be able to extract future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)(s64)-left); + + wrmsr(hwc->counter_base + idx, -left, 0); } -static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void +__x86_perf_counter_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - __hw_perf_counter_set_period(hwc, idx); - __x86_perf_counter_enable(hwc, idx); -} - -static void __hw_perf_save_counter(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 raw = -1; - s64 delta; - - /* - * Get the raw hw counter value: - */ - rdmsrl(hwc->counter_base + idx, raw); - - /* - * Rebase it to zero (it started counting at -irq_period), - * to see the delta since ->prev_count: - */ - delta = (s64)hwc->irq_period + (s64)(s32)raw; - - atomic64_counter_set(counter, hwc->prev_count + delta); - - /* - * Adjust the ->prev_count offset - if we went beyond - * irq_period of units, then we got an IRQ and the counter - * was set back to -irq_period: - */ - while (delta >= (s64)hwc->irq_period) { - hwc->prev_count += hwc->irq_period; - delta -= (s64)hwc->irq_period; - } - - /* - * Calculate the next raw counter value we'll write into - * the counter at the next sched-in time: - */ - delta -= (s64)hwc->irq_period; - - hwc->next_count = (s32)delta; + __hw_perf_counter_set_period(counter, hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; if (!nr_hw_counters) @@ -241,14 +286,14 @@ void perf_counter_print_debug(void) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); - next_count = per_cpu(prev_next_count[idx], cpu); + prev_left = per_cpu(prev_left[idx], cpu); printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", - cpu, idx, next_count); + printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + cpu, idx, prev_left); } 
local_irq_enable(); } @@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; - __hw_perf_save_counter(counter, hwc, idx); -} -static void x86_perf_counter_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - unsigned long addr = hwc->counter_base + hwc->idx; - s64 offs, val = -1LL; - s32 val32; - - /* Careful: NMI might modify the counter offset */ - do { - offs = hwc->prev_count; - rdmsrl(addr, val); - } while (offs != hwc->prev_count); - - val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; - atomic64_counter_set(counter, hwc->prev_count + val); + /* + * Drain the remaining delta count out of a counter + * that we are disabling: + */ + x86_perf_counter_update(counter, hwc, idx); } static void perf_store_irq_data(struct perf_counter *counter, u64 data) @@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } /* - * NMI-safe enable method: + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: */ static void perf_save_and_restart(struct perf_counter *counter) { @@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - __hw_perf_save_counter(counter, hwc, idx); - __hw_perf_counter_set_period(hwc, idx); + x86_perf_counter_update(counter, hwc, idx); + __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } static void perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { struct perf_counter *counter, *group_leader = sibling->group_leader; - int bit; - - /* - * Store the counter's own timestamp first: - */ - perf_store_irq_data(sibling, sibling->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(sibling)); /* - * Then store sibling timestamps (if any): + * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - /* - * When counter was not in the overflow mask, we have to - * read it from hardware. We read it as well, when it - * has not been read yet and clear the bit in the - * status mask. 
- */ - bit = counter->hw.idx; - if (!test_bit(bit, (unsigned long *) overflown) || - test_bit(bit, (unsigned long *) status)) { - clear_bit(bit, (unsigned long *) status); - perf_save_and_restart(counter); - } - } + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } @@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } +static void x86_perf_counter_read(struct perf_counter *counter) +{ + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8cb095fa442..72460289c65 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -91,14 +91,16 @@ struct perf_counter_hw_event { * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { +#ifdef CONFIG_PERF_COUNTERS u64 config; unsigned long config_base; unsigned long counter_base; int nmi; unsigned int idx; - u64 prev_count; + atomic64_t prev_count; u64 irq_period; - s32 next_count; + atomic64_t period_left; +#endif }; /* @@ -140,17 +142,15 @@ enum perf_counter_active_state { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { +#ifdef CONFIG_PERF_COUNTERS struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; enum perf_counter_active_state state; -#if BITS_PER_LONG == 64 atomic64_t count; -#else - atomic_t count32[2]; -#endif + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; @@ -172,6 +172,7 @@ struct perf_counter { struct perf_data *irqdata; struct perf_data *usrdata; struct perf_data data[2]; +#endif }; /** @@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); -extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); -extern u64 atomic64_counter_read(struct perf_counter *counter); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 559130b8774..416861ce8b2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter) } u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } -#if BITS_PER_LONG == 64 - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 64 bit version - no complications. - */ -static inline u64 perf_counter_read_safe(struct perf_counter *counter) -{ - return (u64) atomic64_read(&counter->count); -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} - -#else - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 32 bit version. 
- */ -static u64 perf_counter_read_safe(struct perf_counter *counter) -{ - u32 cntl, cnth; - - local_irq_disable(); - do { - cnth = atomic_read(&counter->count32[1]); - cntl = atomic_read(&counter->count32[0]); - } while (cnth != atomic_read(&counter->count32[1])); - - local_irq_enable(); - - return cntl | ((u64) cnth) << 32; -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} - -#endif - static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - counter->hw_ops->hw_perf_counter_enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; + counter->hw_ops->hw_perf_counter_enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_counter_read_safe(counter); + return atomic64_read(&counter->count); } /* @@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); - atomic64_counter_set(counter, cpu_clock(cpu)); + atomic64_set(&counter->count, cpu_clock(cpu)); } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { @@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - atomic64_counter_set(counter, current->se.sum_exec_runtime); + atomic64_set(&counter->count, current->se.sum_exec_runtime); } static const struct hw_perf_counter_ops perf_ops_task_clock = { -- cgit v1.2.3-70-g09d2 From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 13:49:45 +0100 Subject: perfcounters: implement "counter inheritance" Impact: implement new performance feature Counter inheritance can be used to run performance counters in a workload, transparently - and pipe back the counter results to the parent counter. Inheritance for performance counters works the following way: when creating a counter it can be marked with the .inherit=1 flag. Such counters are then 'inherited' by all child tasks (be they fork()-ed or clone()-ed). These counters get inherited through exec() boundaries as well (except through setuid boundaries). The counter values get added back to the parent counter(s) when the child task(s) exit - much like stime/utime statistics are gathered. So inherited counters are ideal to gather summary statistics about an application's behavior via shell commands, without having to modify that application. 
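As a concrete illustration, a monitoring tool could use the flag roughly like this (an illustrative sketch only, not taken from the patch: perf_counter_open() is assumed to be a thin userspace wrapper around the raw syscall, and all error handling is omitted):

	struct perf_counter_hw_event hw_event = {
		.type		= PERF_COUNT_INSTRUCTIONS,
		.inherit	= 1,	/* all children inherit this counter */
	};
	u64 count;
	int fd;

	/* pid 0: attach to the current task, cpu -1: any CPU, no group: */
	fd = perf_counter_open(&hw_event, 0, -1, -1);

	if (fork() == 0) {
		execlp("ls", "ls", "-lR", "/usr/include/", (char *)NULL);
		_exit(1);
	}
	wait(NULL);

	/* the exited child's counts have been folded back into our counter: */
	read(fd, &count, sizeof(count));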
The timec.c command utilizes counter inheritance: http://redhat.com/~mingo/perfcounters/timec.c Sample output: $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null Performance counter stats for 'ls': 163516953 instructions 2295 cache-misses 2855182 branch-misses Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 +++-- kernel/exit.c | 7 +- kernel/perf_counter.c | 248 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 228 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 72460289c65..e5d25bf8f74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -75,10 +75,11 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - __reserved_1 : 29; + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + __reserved_1 : 28; u64 __reserved_2; }; @@ -138,6 +139,8 @@ enum perf_counter_active_state { PERF_COUNTER_STATE_ACTIVE = 1, }; +struct file; + /** * struct perf_counter - performance counter kernel representation: */ @@ -156,7 +159,10 @@ struct perf_counter { struct perf_counter_context *ctx; struct task_struct *task; + struct file *filp; + unsigned int nr_inherited; + struct perf_counter *parent; /* * Protect attach/detach: */ @@ -210,13 +216,16 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS +extern void +perf_counter_show(struct perf_counter *counter, char *str, int trace); extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern void perf_counter_init_task(struct task_struct *task); +extern void perf_counter_init_task(struct task_struct *child); +extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern u64 hw_perf_save_disable(void); @@ -226,12 +235,15 @@ extern int perf_counter_task_enable(void); #else static inline void +perf_counter_show(struct perf_counter *counter, char *str, int trace) { } +static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline void perf_counter_init_task(struct task_struct *task) { } +static inline void perf_counter_init_task(struct task_struct *child) { } +static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore(u64 ctrl) { } diff --git a/kernel/exit.c b/kernel/exit.c index 2d8be7ebb0f..d336c90a5f1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif -#ifdef CONFIG_FUTEX /* - * This must happen late, after the PID is not - * hashed anymore: + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may 
sleep: */ + perf_counter_exit_task(tsk); +#ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 416861ce8b2..f5e81dd193d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&sibling->list_entry); list_add_tail(&sibling->list_entry, &ctx->counter_list); - WARN_ON_ONCE(!sibling->group_leader); - WARN_ON_ONCE(sibling->group_leader == sibling); sibling->group_leader = sibling; } } @@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + unsigned long flags; u64 perf_flags; /* @@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->hw_perf_counter_disable(counter); @@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; int cpu = smp_processor_id(); + unsigned long flags; u64 perf_flags; /* @@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * Protect the list operation against NMI by disabling the @@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -446,10 +446,9 @@ int perf_counter_task_disable(void) */ perf_flags = hw_perf_save_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE); + list_for_each_entry(counter, &ctx->counter_list, list_entry) counter->state = PERF_COUNTER_STATE_OFF; - } + hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); @@ -525,26 +524,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&ctx->counter_list); - ctx->nr_counters = 0; - ctx->task = task; -} -/* - * Initialize the perf_counter context in task_struct - */ -void perf_counter_init_task(struct task_struct *task) -{ - __perf_counter_init_context(&task->perf_counter_ctx, task); -} - /* * Cross CPU call to read the hardware counter */ @@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &cpuctx->ctx; - WARN_ON_ONCE(ctx->task); return ctx; } @@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter) static struct perf_counter * perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, - struct perf_counter *group_leader) + struct 
perf_counter *group_leader, + gfp_t gfpflags) { const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; - counter = kzalloc(sizeof(*counter), GFP_KERNEL); + counter = kzalloc(sizeof(*counter), gfpflags); if (!counter) return NULL; @@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); - if (!hw_ops) { + if (!hw_ops) hw_ops = hw_perf_counter_init(counter); - } if (!hw_ops) { kfree(counter); @@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; struct perf_counter_context *ctx; + struct file *counter_file = NULL; struct file *group_file = NULL; int fput_needed = 0; + int fput_needed2 = 0; int ret; if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) @@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, } ret = -EINVAL; - counter = perf_counter_alloc(&hw_event, cpu, group_leader); + counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); if (!counter) goto err_put_context; - perf_install_in_context(ctx, counter, cpu); - ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); if (ret < 0) - goto err_remove_free_put_context; + goto err_free_put_context; + + counter_file = fget_light(ret, &fput_needed2); + if (!counter_file) + goto err_free_put_context; + + counter->filp = counter_file; + perf_install_in_context(ctx, counter, cpu); + + fput_light(counter_file, fput_needed2); out_fput: fput_light(group_file, fput_needed); return ret; -err_remove_free_put_context: - mutex_lock(&counter->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&counter->mutex); +err_free_put_context: kfree(counter); err_put_context: @@ -1044,6 +1028,186 @@ err_put_context: goto out_fput; } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counter_list); + ctx->task = task; +} + +/* + * inherit a counter from parent task to child task: + */ +static int +inherit_counter(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *child_counter; + + child_counter = perf_counter_alloc(&parent_counter->hw_event, + parent_counter->cpu, NULL, + GFP_ATOMIC); + if (!child_counter) + return -ENOMEM; + + /* + * Link it up in the child's context: + */ + child_counter->ctx = child_ctx; + child_counter->task = child; + list_add_counter(child_counter, child_ctx); + child_ctx->nr_counters++; + + child_counter->parent = parent_counter; + parent_counter->nr_inherited++; + /* + * inherit into child's child as well: + */ + child_counter->hw_event.inherit = 1; + + /* + * Get a reference to the parent filp - we will fput it + * when the child counter exits. 
This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_counter->filp->f_count); + + return 0; +} + +static void +__perf_counter_exit_task(struct task_struct *child, + struct perf_counter *child_counter, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *parent_counter; + u64 parent_val, child_val; + u64 perf_flags; + + /* + * Disable and unlink this counter. + * + * Be careful about zapping the list - IRQ/NMI context + * could still be processing it: + */ + local_irq_disable(); + perf_flags = hw_perf_save_disable(); + + if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) + child_counter->hw_ops->hw_perf_counter_disable(child_counter); + list_del_init(&child_counter->list_entry); + + hw_perf_restore(perf_flags); + local_irq_enable(); + + parent_counter = child_counter->parent; + /* + * It can happen that the parent exits first, and has counters + * that are still around due to the child reference. These + * counters need to be zapped - but otherwise linger. + */ + if (!parent_counter) + return; + + parent_val = atomic64_read(&parent_counter->count); + child_val = atomic64_read(&child_counter->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_counter->count); + + fput(parent_counter->filp); + + kfree(child_counter); +} + +/* + * When a child task exits, feed back counter values to parent counters. + * + * Note: we are running in child context, but the PID is not hashed + * anymore so new counters will not be added. + */ +void perf_counter_exit_task(struct task_struct *child) +{ + struct perf_counter *child_counter, *tmp; + struct perf_counter_context *child_ctx; + + child_ctx = &child->perf_counter_ctx; + + if (likely(!child_ctx->nr_counters)) + return; + + list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, + list_entry) + __perf_counter_exit_task(child, child_counter, child_ctx); +} + +/* + * Initialize the perf_counter context in task_struct + */ +void perf_counter_init_task(struct task_struct *child) +{ + struct perf_counter_context *child_ctx, *parent_ctx; + struct perf_counter *counter, *parent_counter; + struct task_struct *parent = current; + unsigned long flags; + + child_ctx = &child->perf_counter_ctx; + parent_ctx = &parent->perf_counter_ctx; + + __perf_counter_init_context(child_ctx, child); + + /* + * This is executed from the parent task context, so inherit + * counters that have been marked for cloning: + */ + + if (likely(!parent_ctx->nr_counters)) + return; + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. 
+ */ + spin_lock_irqsave(&parent_ctx->lock, flags); + + /* + * We don't have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { + if (!counter->hw_event.inherit || counter->group_leader != counter) + continue; + + /* + * Instead of creating recursive hierarchies of counters, + * we link inherited counters back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + parent_counter = counter; + if (counter->parent) + parent_counter = counter->parent; + + if (inherit_counter(parent_counter, parent, + parent_ctx, child, child_ctx)) + break; + } + + spin_unlock_irqrestore(&parent_ctx->lock, flags); +} + static void __cpuinit perf_counter_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; -- cgit v1.2.3-70-g09d2 From 5d6a27d8a096868ae313f71f563b06074a7e34fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:28:33 +0100 Subject: perfcounters: add context switch counter Impact: add new feature, new sw counter Add a counter that counts the number of context-switches a task is doing. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e5d25bf8f74..d2a16563415 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -53,8 +53,8 @@ enum hw_event_types { /* * Future software events: */ - /* PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, */ + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1f81cde0dc4..09287091c52 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .hw_perf_counter_read = task_clock_perf_counter_read, }; +static u64 get_context_switches(void) +{ + struct task_struct *curr = current; + + return curr->nvcsw + curr->nivcsw; +} + +static void context_switches_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_context_switches(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void context_switches_perf_counter_read(struct perf_counter *counter) +{ + context_switches_perf_counter_update(counter); +} + +static void context_switches_perf_counter_enable(struct perf_counter *counter) +{ + /* + * curr->nvcsw + curr->nivcsw is a per-task value already, + * so we don't have to clear it on switch-in.
+ */ +} + +static void context_switches_perf_counter_disable(struct perf_counter *counter) +{ + context_switches_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_context_switches = { + .hw_perf_counter_enable = context_switches_perf_counter_enable, + .hw_perf_counter_disable = context_switches_perf_counter_disable, + .hw_perf_counter_read = context_switches_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -900,6 +948,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_TASK_CLOCK: hw_ops = &perf_ops_task_clock; break; + case PERF_COUNT_CONTEXT_SWITCHES: + hw_ops = &perf_ops_context_switches; + break; default: break; } -- cgit v1.2.3-70-g09d2 From 6c594c21fcb02c662f11c97be4d7d2b73060a205 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:34:15 +0100 Subject: perfcounters: add task migrations counter Impact: add new feature, new sw counter Add a counter that counts the number of cross-CPU migrations a task is suffering. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +++++--- include/linux/sched.h | 3 ++- kernel/perf_counter.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched.c | 7 +++++-- 4 files changed, 61 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d2a16563415..f30486fc55d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -42,6 +42,8 @@ enum hw_event_types { PERF_COUNT_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_BRANCH_MISSES = 5, + PERF_HW_EVENTS_MAX = 6, + /* * Special "software" counters provided by the kernel, even if * the hardware does not support performance counters. 
These @@ -50,11 +52,11 @@ enum hw_event_types { */ PERF_COUNT_CPU_CLOCK = -1, PERF_COUNT_TASK_CLOCK = -2, - /* - * Future software events: - */ PERF_COUNT_PAGE_FAULTS = -3, PERF_COUNT_CONTEXT_SWITCHES = -4, + PERF_COUNT_CPU_MIGRATIONS = -5, + + PERF_SW_EVENTS_MIN = -6, }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c530278391..2e15be8fc79 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1014,6 +1014,8 @@ struct sched_entity { u64 last_wakeup; u64 avg_overlap; + u64 nr_migrations; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; @@ -1029,7 +1031,6 @@ struct sched_entity { u64 exec_max; u64 slice_max; - u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 09287091c52..fb11e351e44 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -936,6 +936,52 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .hw_perf_counter_read = context_switches_perf_counter_read, }; +static inline u64 get_cpu_migrations(void) +{ + return current->se.nr_migrations; +} + +static void cpu_migrations_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_cpu_migrations(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void cpu_migrations_perf_counter_read(struct perf_counter *counter) +{ + cpu_migrations_perf_counter_update(counter); +} + +static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) +{ + /* + * se.nr_migrations is a per-task value already, + * so we don't have to clear it on switch-in.
+ */ +} + +static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) +{ + cpu_migrations_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { + .hw_perf_counter_enable = cpu_migrations_perf_counter_enable, + .hw_perf_counter_disable = cpu_migrations_perf_counter_disable, + .hw_perf_counter_read = cpu_migrations_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -951,6 +997,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CONTEXT_SWITCHES: hw_ops = &perf_ops_context_switches; break; + case PERF_COUNT_CPU_MIGRATIONS: + hw_ops = &perf_ops_cpu_migrations; + break; default: break; } diff --git a/kernel/sched.c b/kernel/sched.c index 5c3f4106314..382cfdb5e38 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1852,12 +1852,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; +#endif if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); + p->se.nr_migrations++; +#ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); - } #endif + } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -2375,6 +2377,7 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; -- cgit v1.2.3-70-g09d2 From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:04:16 +0100 Subject: perfcounters: remove warnings Impact: remove debug checks Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ------- include/linux/perf_counter.h | 4 ---- kernel/perf_counter.c | 8 -------- 3 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index bdbdb56eaa3..89fad5d4fb3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter, { u64 prev_raw_count, new_raw_count, delta; - WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); /* * Careful: an NMI might modify the previous counter value. 
* @@ -89,7 +88,6 @@ again: * of the count, so we do that by clipping the delta to 32 bits: */ delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); - WARN_ON_ONCE((int)delta < 0); atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); @@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter, int err; err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); - WARN_ON_ONCE(err); } static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); @@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, s32 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; - WARN_ON_ONCE(period <= 0); - /* * If we are way outside a reasoable range then just skip forward: */ @@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->period_left, left); } - WARN_ON_ONCE(left <= 0); - per_cpu(prev_left[idx], smp_processor_id()) = left; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f30486fc55d..d038450de87 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -218,8 +218,6 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern void -perf_counter_show(struct perf_counter *counter, char *str, int trace); extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); @@ -237,8 +235,6 @@ extern int perf_counter_task_enable(void); #else static inline void -perf_counter_show(struct perf_counter *counter, char *str, int trace) { } -static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5431e790b5d..aab6c123b02 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -861,8 +861,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -906,8 +904,6 @@ static void page_faults_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -954,8 +950,6 @@ static void context_switches_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -1000,8 +994,6 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } -- cgit v1.2.3-70-g09d2 From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:09:13 +0100 Subject: x86, perfcounters: prepare for fixed-mode PMCs Impact: refactor the x86 code for fixed-mode PMCs Extend the data structures and rename the existing facilities to allow for a 'generic' versus 'fixed' counter distinction. 
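To picture the 'generic' versus 'fixed' split, here is a minimal user-space model of the per-CPU bookkeeping (a sketch only: struct cpu_hw_model, claim_generic() and the harness are illustrative names, not part of the patch; only the slot-array-plus-bitmap layout mirrors the cpu_hw_counters change below):

#include <stdio.h>

#define X86_PMC_MAX_GENERIC 8
#define X86_PMC_MAX_FIXED   3

struct counter { int id; };

/* Per-CPU bookkeeping: one slot array plus used-bitmap per counter class. */
struct cpu_hw_model {
	struct counter *generic[X86_PMC_MAX_GENERIC];
	unsigned long used;		/* bitmap of busy generic PMCs */
	struct counter *fixed[X86_PMC_MAX_FIXED];
	unsigned long used_fixed;	/* bitmap of busy fixed PMCs */
};

/* Claim the first free generic slot; -1 if the PMU is fully booked. */
static int claim_generic(struct cpu_hw_model *c, struct counter *ctr)
{
	for (int idx = 0; idx < X86_PMC_MAX_GENERIC; idx++) {
		if (!(c->used & (1UL << idx))) {
			c->used |= 1UL << idx;
			c->generic[idx] = ctr;
			return idx;
		}
	}
	return -1;
}

int main(void)
{
	struct cpu_hw_model cpu = { 0 };
	struct counter a = { 1 }, b = { 2 };

	printf("a -> slot %d\n", claim_generic(&cpu, &a));
	printf("b -> slot %d\n", claim_generic(&cpu, &b));
	return 0;
}

Keeping separate used/used_fixed bitmaps means fixed-function counters can later be scheduled without contending with the generic slot allocator.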
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 11 ++++++++ arch/x86/kernel/cpu/perf_counter.c | 53 ++++++++++++++++++------------------- include/linux/perf_counter.h | 1 + 3 files changed, 38 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 9dadce1124e..dd5a4a559e2 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -1,6 +1,13 @@ #ifndef _ASM_X86_PERF_COUNTER_H #define _ASM_X86_PERF_COUNTER_H +/* + * Performance counter hw details: + */ + +#define X86_PMC_MAX_GENERIC 8 +#define X86_PMC_MAX_FIXED 3 + #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 @@ -20,6 +27,10 @@ #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ union cpuid10_eax { struct { unsigned int version_id:8; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a4a3a09a654..fc3af868823 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; -/* No support for fixed function counters yet */ - -#define MAX_HW_COUNTERS 8 - struct cpu_hw_counters { - struct perf_counter *counters[MAX_HW_COUNTERS]; - unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + struct perf_counter *generic[X86_PMC_MAX_GENERIC]; + unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; + + struct perf_counter *fixed[X86_PMC_MAX_FIXED]; + unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; }; /* @@ -185,7 +184,7 @@ void hw_perf_restore(u64 ctrl) EXPORT_SYMBOL_GPL(hw_perf_restore); static inline void -__x86_perf_counter_disable(struct perf_counter *counter, +__pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; @@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter, err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); /* * Set the next IRQ period, based on the hwc->period_left value. 
@@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, } static void -__x86_perf_counter_enable(struct perf_counter *counter, +__pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, @@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter, /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void x86_perf_counter_enable(struct perf_counter *counter) +static void pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); - cpuc->counters[idx] = counter; + cpuc->generic[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } void perf_counter_print_debug(void) @@ -301,16 +300,16 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void x86_perf_counter_disable(struct perf_counter *counter) +static void pmc_generic_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; + cpuc->generic[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } static void @@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -412,7 +411,7 @@ again: } /* * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these counters as + * do a task wakeup - but we mark these generic as * wakeup_pending and initate a wakeup callback: */ if (nmi) { @@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs) cpuc = &per_cpu(cpu_hw_counters, cpu); for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; if (!counter) continue; @@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... 
num_counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > MAX_HW_COUNTERS) { - nr_hw_counters = MAX_HW_COUNTERS; + if (nr_hw_counters > X86_PMC_MAX_GENERIC) { + nr_hw_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, MAX_HW_COUNTERS); + nr_hw_counters, X86_PMC_MAX_GENERIC); } perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; @@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_perf_counter_read(struct perf_counter *counter) +static void pmc_generic_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = x86_perf_counter_enable, - .hw_perf_counter_disable = x86_perf_counter_disable, - .hw_perf_counter_read = x86_perf_counter_read, + .hw_perf_counter_enable = pmc_generic_enable, + .hw_perf_counter_disable = pmc_generic_disable, + .hw_perf_counter_read = pmc_generic_read, }; const struct hw_perf_counter_ops * diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d038450de87..984da540224 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,6 +14,7 @@ #define _LINUX_PERF_COUNTER_H #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:20:28 +0100 Subject: perfcounters: hw ops rename Impact: rename field names Shorten them. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++--- include/linux/perf_counter.h | 6 ++--- kernel/perf_counter.c | 50 +++++++++++++++++++------------------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 358af526640..b6755712142 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = pmc_generic_enable, - .hw_perf_counter_disable = pmc_generic_disable, - .hw_perf_counter_read = pmc_generic_read, + .enable = pmc_generic_enable, + .disable = pmc_generic_disable, + .read = pmc_generic_read, }; const struct hw_perf_counter_ops * diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 984da540224..48f76d2e54c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,9 +128,9 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*hw_perf_counter_enable) (struct perf_counter *counter); - void (*hw_perf_counter_disable) (struct perf_counter *counter); - void (*hw_perf_counter_read) (struct perf_counter *counter); + void (*enable) (struct perf_counter *counter); + void (*disable) (struct perf_counter *counter); + void (*read) (struct perf_counter *counter); }; /** diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f8a4d9a5d5d..961d651aa57 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -109,7 +109,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock_irqsave(&ctx->lock, flags); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - 
counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; @@ -226,7 +226,7 @@ static void __perf_install_in_context(void *info) counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -297,7 +297,7 @@ counter_sched_out(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; - counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; @@ -327,7 +327,7 @@ group_sched_out(struct perf_counter *group_counter, * * We stop each counter and update the counter value in counter->count. * - * This does not protect us against NMI, but hw_perf_counter_disable() + * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * not restart the counter. @@ -359,7 +359,7 @@ counter_sched_in(struct perf_counter *counter, if (counter->state == PERF_COUNTER_STATE_OFF) return; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -395,7 +395,7 @@ group_sched_in(struct perf_counter *group_counter, * * We restore the counter value and then enable it. * - * This does not protect us against NMI, but hw_perf_counter_enable() + * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * keep the counter running. 
@@ -537,11 +537,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Cross CPU call to read the hardware counter */ -static void __hw_perf_counter_read(void *info) +static void __read(void *info) { struct perf_counter *counter = info; - counter->hw_ops->hw_perf_counter_read(counter); + counter->hw_ops->read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -552,7 +552,7 @@ static u64 perf_counter_read(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, - __hw_perf_counter_read, counter, 1); + __read, counter, 1); } return atomic64_read(&counter->count); @@ -855,9 +855,9 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { - .hw_perf_counter_enable = cpu_clock_perf_counter_enable, - .hw_perf_counter_disable = cpu_clock_perf_counter_disable, - .hw_perf_counter_read = cpu_clock_perf_counter_read, + .enable = cpu_clock_perf_counter_enable, + .disable = cpu_clock_perf_counter_disable, + .read = cpu_clock_perf_counter_read, }; static void task_clock_perf_counter_update(struct perf_counter *counter) @@ -891,9 +891,9 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_task_clock = { - .hw_perf_counter_enable = task_clock_perf_counter_enable, - .hw_perf_counter_disable = task_clock_perf_counter_disable, - .hw_perf_counter_read = task_clock_perf_counter_read, + .enable = task_clock_perf_counter_enable, + .disable = task_clock_perf_counter_disable, + .read = task_clock_perf_counter_read, }; static u64 get_page_faults(void) @@ -937,9 +937,9 @@ static void page_faults_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_page_faults = { - .hw_perf_counter_enable = page_faults_perf_counter_enable, - .hw_perf_counter_disable = page_faults_perf_counter_disable, - .hw_perf_counter_read = page_faults_perf_counter_read, + .enable = page_faults_perf_counter_enable, + .disable = page_faults_perf_counter_disable, + .read = page_faults_perf_counter_read, }; static u64 get_context_switches(void) @@ -983,9 +983,9 @@ static void context_switches_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_context_switches = { - .hw_perf_counter_enable = context_switches_perf_counter_enable, - .hw_perf_counter_disable = context_switches_perf_counter_disable, - .hw_perf_counter_read = context_switches_perf_counter_read, + .enable = context_switches_perf_counter_enable, + .disable = context_switches_perf_counter_disable, + .read = context_switches_perf_counter_read, }; static inline u64 get_cpu_migrations(void) @@ -1027,9 +1027,9 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { - .hw_perf_counter_enable = cpu_migrations_perf_counter_enable, - .hw_perf_counter_disable = cpu_migrations_perf_counter_disable, - .hw_perf_counter_read = cpu_migrations_perf_counter_read, + .enable = cpu_migrations_perf_counter_enable, + .disable = cpu_migrations_perf_counter_disable, + .read = cpu_migrations_perf_counter_read, }; static const struct hw_perf_counter_ops * @@ -1283,7 +1283,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - child_counter->hw_ops->hw_perf_counter_disable(child_counter); + 
child_counter->hw_ops->disable(child_counter); child_counter->state = PERF_COUNTER_STATE_INACTIVE; child_counter->oncpu = -1; -- cgit v1.2.3-70-g09d2 From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:10:57 +0100 Subject: perfcounters: fix task clock counter Impact: fix per task clock counter precision Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 8 ++++++ kernel/exit.c | 17 +++++++---- kernel/perf_counter.c | 70 ++++++++++++++++++++++++++++++++++----------- kernel/sched.c | 49 +++++++++++++++++++++++++++++-- 4 files changed, 120 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4a145caeee0..1b2e3242497 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq) return sum; } + +/* + * Lock/unlock the current runqueue - to extract task statistics: + */ +extern void curr_rq_lock_irq_save(unsigned long *flags); +extern void curr_rq_unlock_irq_restore(unsigned long *flags); +extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); + extern void account_user_time(struct task_struct *, cputime_t); extern void account_user_time_scaled(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); diff --git a/kernel/exit.c b/kernel/exit.c index d336c90a5f1..244edfd9686 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); #ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); @@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + /* + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: + */ + perf_counter_exit_task(tsk); + preempt_disable(); /* causes final put_task_struct in finish_task_switch(). 
*/ tsk->state = TASK_DEAD; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 961d651aa57..f1110ac1267 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->disable(counter); @@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } @@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the @@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } /* @@ -438,15 +443,19 @@ int perf_counter_task_disable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -463,7 +472,7 @@ int perf_counter_task_disable(void) spin_unlock(&ctx->lock); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -473,15 +482,19 @@ int perf_counter_task_enable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + spin_lock(&ctx->lock); /* @@ -493,6 +506,7 @@ int perf_counter_task_enable(void) if (counter->state != PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -500,7 +514,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + unsigned long flags; + curr_rq_lock_irq_save(&flags); counter->hw_ops->read(counter); + curr_rq_unlock_irq_restore(&flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; -static void task_clock_perf_counter_update(struct perf_counter *counter) +/* + * Called from within the scheduler: + */ +static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) { - u64 prev, now; + struct task_struct *curr = counter->task; + u64 
delta; + + WARN_ON_ONCE(counter->task != current); + + delta = __task_delta_exec(curr, update); + + return curr->se.sum_exec_runtime + delta; +} + +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +{ + u64 prev; s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = current->se.sum_exec_runtime; atomic64_set(&counter->hw.prev_count, now); @@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 1); + + task_clock_perf_counter_update(counter, now); } static void task_clock_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime); + u64 now = task_clock_perf_counter_val(counter, 0); + + atomic64_set(&counter->hw.prev_count, now); } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 0); + + task_clock_perf_counter_update(counter, now); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; u64 parent_val, child_val; + unsigned long flags; u64 perf_flags; /* @@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - local_irq_disable(); + curr_rq_lock_irq_save(&flags); perf_flags = hw_perf_save_disable(); if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { @@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child, list_del_init(&child_counter->list_entry); hw_perf_restore(perf_flags); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); parent_counter = child_counter->parent; /* diff --git a/kernel/sched.c b/kernel/sched.c index 382cfdb5e38..4d84ff4c877 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void curr_rq_lock_irq_save(unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_save(*flags); + rq = cpu_rq(smp_processor_id()); + spin_lock(&rq->lock); +} + +void curr_rq_unlock_irq_restore(unsigned long *flags) + __releases(rq->lock) +{ + struct rq *rq; + + rq = cpu_rq(smp_processor_id()); + spin_unlock(&rq->lock); + local_irq_restore(*flags); +} + void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); @@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); - perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -4089,6 +4108,29 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. 
+ */ +unsigned long long __task_delta_exec(struct task_struct *p, int update) +{ + s64 delta_exec; + struct rq *rq; + + rq = task_rq(p); + WARN_ON_ONCE(!runqueue_is_locked()); + WARN_ON_ONCE(!task_current(rq, p)); + + if (update) + update_rq_clock(rq); + + delta_exec = rq->clock - p->se.exec_start; + + WARN_ON_ONCE(delta_exec < 0); + + return delta_exec; +} + /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. @@ -4316,13 +4358,13 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); + perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif - perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -4512,6 +4554,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, cpu); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3-70-g09d2 From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 10:20:42 +0100 Subject: perfcounters: pull inherited counters Change counter inheritance from a 'push' to a 'pull' model: instead of child tasks pushing their final counts to the parent, reuse the wait4 infrastructure to pull counters as child tasks are exit-processed, much like how cutime/cstime is collected. Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 9 +++++++++ kernel/exit.c | 21 +++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e..54fa2fa2c8e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -113,6 +113,14 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk) \ + .perf_counter_ctx.counter_list = \ + LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -180,6 +188,7 @@ extern struct group_info init_groups; INIT_IDS \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_PERF_COUNTERS(tsk) \ } diff --git a/kernel/exit.c b/kernel/exit.c index 244edfd9686..101b7eeff44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); +#ifdef CONFIG_PERF_COUNTERS + WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); +#endif trace_sched_process_free(tsk); put_task_struct(tsk); } @@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. 
- */ - perf_counter_exit_task(tsk); - write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); - preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; @@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; -- cgit v1.2.3-70-g09d2 From 78b6084c907cea15bb40a564b974e072f5163781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 15:07:49 +0100 Subject: perfcounters: fix init context lock Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 54fa2fa2c8e..467cff545c3 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -116,7 +116,9 @@ extern struct group_info init_groups; #ifdef CONFIG_PERF_COUNTERS # define INIT_PERF_COUNTERS(tsk) \ .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), + LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ + .perf_counter_ctx.lock = \ + __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), #else # define INIT_PERF_COUNTERS(tsk) #endif -- cgit v1.2.3-70-g09d2 From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 13:50:42 +0100 Subject: perfcounters: enable lowlevel pmc code to schedule counters Allow the low-level ->enable() op to return an error if a counter cannot be added. This can be used to handle counter constraints.
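The new int-returning ->enable() contract can be sketched in user space (illustrative only: NR_SLOTS, model_enable() and the harness are made-up names; the -EAGAIN convention is the one the x86 code below adopts):

#include <errno.h>
#include <stdio.h>

#define NR_SLOTS 2		/* pretend PMU with two hardware slots */

static int slots_in_use;

/* Model of an arch ->enable() op: may refuse a counter that does not fit. */
static int model_enable(int id)
{
	if (slots_in_use >= NR_SLOTS)
		return -EAGAIN;	/* constraint: no free hardware slot */
	slots_in_use++;
	printf("counter %d scheduled in\n", id);
	return 0;
}

int main(void)
{
	for (int id = 0; id < 4; id++) {
		if (model_enable(id))
			printf("counter %d rejected, left inactive\n", id);
	}
	return 0;
}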
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 62 +++++++++++++++++++++++++++----------- 3 files changed, 51 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b6755712142..74090a393a7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void pmc_generic_enable(struct perf_counter *counter) +static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); + + return 0; } void perf_counter_print_debug(void) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 48f76d2e54c..53af11d3767 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,7 +128,7 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*enable) (struct perf_counter *counter); + int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f1110ac1267..2e73929a695 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -355,21 +355,25 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) cpuctx->task_ctx = NULL; } -static void +static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { if (counter->state == PERF_COUNTER_STATE_OFF) - return; + return 0; + + if (counter->hw_ops->enable(counter)) + return -EAGAIN; - counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; ctx->nr_active++; + + return 0; } static int @@ -378,20 +382,38 @@ group_sched_in(struct perf_counter *group_counter, struct perf_counter_context *ctx, int cpu) { - struct perf_counter *counter; - int was_group = 0; + struct perf_counter *counter, *partial_group; + int ret = 0; - counter_sched_in(group_counter, cpuctx, ctx, cpu); + if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + return -EAGAIN; /* * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter_sched_in(counter, cpuctx, ctx, cpu); - was_group = 1; + if (counter_sched_in(counter, cpuctx, ctx, cpu)) { + partial_group = counter; + goto group_error; + } + ret = -EAGAIN; } - return was_group; + return ret; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) 
{ + if (counter == partial_group) + break; + counter_sched_out(counter, cpuctx, ctx); + } + counter_sched_out(group_counter, cpuctx, ctx); + + return -EAGAIN; } /* @@ -416,9 +438,6 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (ctx->nr_active == cpuctx->max_pertask) - break; - /* * Listen to the 'cpu' scheduling filter constraint * of counters: @@ -856,8 +875,9 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; -static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { + return 0; } static void cpu_clock_perf_counter_disable(struct perf_counter *counter) @@ -913,11 +933,13 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, now); } -static void task_clock_perf_counter_enable(struct perf_counter *counter) +static int task_clock_perf_counter_enable(struct perf_counter *counter) { u64 now = task_clock_perf_counter_val(counter, 0); atomic64_set(&counter->hw.prev_count, now); + + return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) @@ -960,12 +982,14 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) page_faults_perf_counter_update(counter); } -static void page_faults_perf_counter_enable(struct perf_counter *counter) +static int page_faults_perf_counter_enable(struct perf_counter *counter) { /* * page-faults is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void page_faults_perf_counter_disable(struct perf_counter *counter) @@ -1006,12 +1030,14 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) context_switches_perf_counter_update(counter); } -static void context_switches_perf_counter_enable(struct perf_counter *counter) +static int context_switches_perf_counter_enable(struct perf_counter *counter) { /* * ->nvcsw + curr->nivcsw is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void context_switches_perf_counter_disable(struct perf_counter *counter) @@ -1050,12 +1076,14 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) +static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { /* * se.nr_migrations is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) -- cgit v1.2.3-70-g09d2 From 8fe91e61cdc407c7556d3cd71cf20141a25bbcea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:29:25 +0100 Subject: perfcounters: remove ->nr_inherited Impact: remove dead code nr_inherited was not maintained correctly (not decremented) - and also not used - remove it. 
Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 53af11d3767..1ea08e9f31c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -164,7 +164,6 @@ struct perf_counter { struct task_struct *task; struct file *filp; - unsigned int nr_inherited; struct perf_counter *parent; /* * Protect attach/detach: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2e73929a695..48e1dbcdc1c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1308,7 +1308,6 @@ inherit_counter(struct perf_counter *parent_counter, child_ctx->nr_counters++; child_counter->parent = parent_counter; - parent_counter->nr_inherited++; /* * inherit into child's child as well: */ -- cgit v1.2.3-70-g09d2 From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:17:29 +0100 Subject: perfcounters: add PERF_COUNT_BUS_CYCLES Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref on x86. (which is a good enough approximation) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- include/linux/perf_counter.h | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f3359c2b391..86b2fdd344a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); static const int intel_perfmon_event_map[] = { - [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, [PERF_COUNT_CACHE_MISSES] = 0x412e, [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_BUS_CYCLES] = 0x013c, }; static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1ea08e9f31c..ec77d1643d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -36,14 +36,15 @@ enum hw_event_types { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CYCLES = 0, + PERF_COUNT_CPU_CYCLES = 0, PERF_COUNT_INSTRUCTIONS = 1, PERF_COUNT_CACHE_REFERENCES = 2, PERF_COUNT_CACHE_MISSES = 3, PERF_COUNT_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 6, + PERF_HW_EVENTS_MAX = 7, /* * Special "software" counters provided by the kernel, even if -- cgit v1.2.3-70-g09d2 From e44aef58ecfbe061eb4c53b939bcc35fb1bee82d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 09:02:11 +0100 Subject: perfcounters: include asm/perf_counter.h only if CONFIG_PERF_COUNTERS=y Impact: build fix on ia64 KOSAKI Motohiro reported that -tip doesn't build on ia64 because asm/perf_counter.h only exists on x86 for now. Fix it.
Reported-by: KOSAKI Motohiro Tested-by: KOSAKI Motohiro Acked-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ec77d1643d3..cc3a75a239a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,7 +14,10 @@ #define _LINUX_PERF_COUNTER_H #include -#include + +#ifdef CONFIG_PERF_COUNTERS +# include +#endif #include #include -- cgit v1.2.3-70-g09d2 From 3cbed429a9ccdb7a243f733b1056fe5c39e9004c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:43:42 +1100 Subject: perf_counter: Add optional hw_perf_group_sched_in arch function Impact: extend perf_counter infrastructure This adds an optional hw_perf_group_sched_in() arch function that enables a whole group of counters in one go. It returns 1 if it added the group successfully, 0 if it did nothing (and therefore the core needs to add the counters individually), or a negative number if an error occurred. It should add all the counters and enable any software counters in the group, or else add none of them and return an error. There are a couple of related changes/improvements in the group handling here: * As an optimization, group_sched_out() and group_sched_in() now check the state of the group leader, and do nothing if the leader is not active or disabled. * We now call hw_perf_save_disable/hw_perf_restore around the complete set of counter enable/disable calls in __perf_counter_sched_in/out, to give the arch code the opportunity to defer updating the hardware state until the hw_perf_restore call if it wants. * We no longer stop adding groups after we get to a group that has more than one counter. We will ultimately add an option for a group to be exclusive. The current code doesn't really implement exclusive groups anyway, since a group could end up going on with other counters that get added before it. 
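The three-way return contract described above can be restated as a compact user-space sketch (the harness is illustrative; only the interpretation of 1/0/negative mirrors group_sched_in() in the diff below):

#include <stdio.h>

/*
 * Arch hook, default implementation: 1 = group fully scheduled by the
 * arch code, 0 = do nothing and let the core add counters one by one,
 * negative = error, the group cannot go on.
 */
static int arch_group_sched_in(int group)
{
	(void)group;
	return 0;
}

static int group_sched_in(int group)
{
	int ret = arch_group_sched_in(group);

	if (ret)			/* arch handled it, or it failed */
		return ret < 0 ? ret : 0;

	printf("group %d: core schedules each counter individually\n", group);
	return 0;
}

int main(void)
{
	return group_sched_in(1);
}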
Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 3 +++ kernel/perf_counter.c | 31 ++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index cc3a75a239a..b21d1ea4c05 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -236,6 +236,9 @@ extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); +extern int hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu); #else static inline void diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b7a027a2ef0..9ad11e44d9a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -47,6 +47,12 @@ hw_perf_counter_init(struct perf_counter *counter) u64 __weak hw_perf_save_disable(void) { return 0; } void __weak hw_perf_restore(u64 ctrl) { barrier(); } void __weak hw_perf_counter_setup(void) { barrier(); } +int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu) +{ + return 0; +} static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) @@ -341,6 +347,9 @@ group_sched_out(struct perf_counter *group_counter, { struct perf_counter *counter; + if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + counter_sched_out(group_counter, cpuctx, ctx); /* @@ -354,15 +363,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; + u64 flags; if (likely(!ctx->nr_counters)) return; spin_lock(&ctx->lock); + flags = hw_perf_save_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } + hw_perf_restore(flags); spin_unlock(&ctx->lock); } @@ -402,7 +414,14 @@ group_sched_in(struct perf_counter *group_counter, int cpu) { struct perf_counter *counter, *partial_group; - int ret = 0; + int ret; + + if (group_counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? 
ret : 0; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -415,10 +434,9 @@ group_sched_in(struct perf_counter *group_counter, partial_group = counter; goto group_error; } - ret = -EAGAIN; } - return ret; + return 0; group_error: /* @@ -440,11 +458,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; + u64 flags; if (likely(!ctx->nr_counters)) return; spin_lock(&ctx->lock); + flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { /* * Listen to the 'cpu' scheduling filter constraint @@ -454,12 +474,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, continue; /* - * If we scheduled in a group atomically and - * exclusively, break out: + * If we scheduled in a group atomically and exclusively, + * or if this group can't go on, break out: */ if (group_sched_in(counter, cpuctx, ctx, cpu)) break; } + hw_perf_restore(flags); spin_unlock(&ctx->lock); } -- cgit v1.2.3-70-g09d2 From 3b6f9e5cb21964b7ce12bf81076f830885563ec8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Jan 2009 21:00:30 +1100 Subject: perf_counter: Add support for pinned and exclusive counter groups Impact: New perf_counter features A pinned counter group is one that the user wants to have on the CPU whenever possible, i.e. whenever the associated task is running, for a per-task group, or always for a per-cpu group. If the system cannot satisfy that, it puts the group into an error state where it is not scheduled any more and reads from it return EOF (i.e. 0 bytes read). The group can be released from error state and made readable again using prctl(PR_TASK_PERF_COUNTERS_ENABLE). When we have finer-grained enable/disable controls on counters we'll be able to reset the error state on individual groups. An exclusive group is one that the user wants to be the only group using the CPU performance monitor hardware whenever it is on. The counter group scheduler will not schedule an exclusive group if there are already other groups on the CPU and will not schedule other groups onto the CPU if there is an exclusive group scheduled (that statement does not apply to groups containing only software counters, which can always go on and which do not prevent an exclusive group from going on). With an exclusive group, we will be able to let users program PMU registers at a low level without the concern that those settings will perturb other measurements. Along the way this reorganizes things a little: - is_software_counter() is moved to perf_counter.h. - cpuctx->active_oncpu now records the number of hardware counters on the CPU, i.e. it now excludes software counters. Nothing was reading cpuctx->active_oncpu before, so this change is harmless. - A new cpuctx->exclusive field records whether we currently have an exclusive group on the CPU. - counter_sched_out moves higher up in perf_counter.c and gets called from __perf_counter_remove_from_context and __perf_counter_exit_task, where we used to have essentially the same code. - __perf_counter_sched_in now goes through the counter list twice, doing the pinned counters in the first loop and the non-pinned counters in the second loop, in order to give the pinned counters the best chance to be scheduled in. Note that only a group leader can be exclusive or pinned, and that attribute applies to the whole group. This avoids some awkwardness in some corner cases (e.g. 
where a group leader is closed and the other group members get added to the context list). If we want to relax that restriction later, we can, and it is easier to relax a restriction than to apply a new one. This doesn't yet handle the case where a pinned counter is inherited and goes into error state in the child - the error state is not propagated up to the parent when the child exits, and arguably it should. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 10 +- include/linux/perf_counter.h | 15 ++- kernel/perf_counter.c | 226 +++++++++++++++++++++++++------------ 3 files changed, 169 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 85ad25923c2..5b0211348c7 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -35,14 +35,6 @@ void perf_counter_print_debug(void) { } -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return !counter->hw_event.raw && counter->hw_event.type < 0; -} - /* * Read one performance monitor counter (PMC). */ @@ -443,6 +435,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, */ for (i = n0; i < n0 + n; ++i) cpuhw->counter[i]->hw.config = cpuhw->events[i]; + cpuctx->active_oncpu += n; n = 1; counter_sched_in(group_leader, cpu); list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { @@ -451,7 +444,6 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, ++n; } } - cpuctx->active_oncpu += n; ctx->nr_active += n; return 1; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b21d1ea4c05..7ab8e5f96f5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -86,7 +86,10 @@ struct perf_counter_hw_event { nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ - __reserved_1 : 28; + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only counter on PMU */ + + __reserved_1 : 26; u64 __reserved_2; }; @@ -141,6 +144,7 @@ struct hw_perf_counter_ops { * enum perf_counter_active_state - the states of a counter */ enum perf_counter_active_state { + PERF_COUNTER_STATE_ERROR = -2, PERF_COUNTER_STATE_OFF = -1, PERF_COUNTER_STATE_INACTIVE = 0, PERF_COUNTER_STATE_ACTIVE = 1, @@ -214,6 +218,7 @@ struct perf_cpu_context { struct perf_counter_context *task_ctx; int active_oncpu; int max_pertask; + int exclusive; }; /* @@ -240,6 +245,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ + return !counter->hw_event.raw && counter->hw_event.type < 0; +} + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52f2f526248..faf671b2956 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -93,6 +93,25 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) } } +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter->state = PERF_COUNTER_STATE_INACTIVE; + 
counter->hw_ops->disable(counter); + counter->oncpu = -1; + + if (!is_software_counter(counter)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + /* * Cross CPU call to remove a performance counter * @@ -118,14 +137,9 @@ static void __perf_counter_remove_from_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); - ctx->nr_active--; - cpuctx->active_oncpu--; - counter->task = NULL; - counter->oncpu = -1; - } + counter_sched_out(counter, cpuctx, ctx); + + counter->task = NULL; ctx->nr_counters--; /* @@ -207,7 +221,7 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state <= PERF_COUNTER_STATE_OFF) return 0; counter->state = PERF_COUNTER_STATE_ACTIVE; @@ -223,12 +237,63 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } - cpuctx->active_oncpu++; + if (!is_software_counter(counter)) + cpuctx->active_oncpu++; ctx->nr_active++; + if (counter->hw_event.exclusive) + cpuctx->exclusive = 1; + return 0; } +/* + * Return 1 for a group consisting entirely of software counters, + * 0 if the group contains any hardware counters. + */ +static int is_software_only_group(struct perf_counter *leader) +{ + struct perf_counter *counter; + + if (!is_software_counter(leader)) + return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) + if (!is_software_counter(counter)) + return 0; + return 1; +} + +/* + * Work out whether we can put this counter group on the CPU now. + */ +static int group_can_go_on(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software counters can always go on. + */ + if (is_software_only_group(counter)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * counters can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * counters on the CPU, it can't go on. + */ + if (counter->hw_event.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -240,6 +305,7 @@ static void __perf_install_in_context(void *info) int cpu = smp_processor_id(); unsigned long flags; u64 perf_flags; + int err; /* * If this is a task context, we need to check whether it is @@ -261,9 +327,21 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; - counter_sched_in(counter, cpuctx, ctx, cpu); + /* + * An exclusive counter can't go on if there are already active + * hardware counters, and no hardware counter can go on if there + * is already an exclusive counter on. 
+ */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE && + !group_can_go_on(counter, cpuctx, 1)) + err = -EEXIST; + else + err = counter_sched_in(counter, cpuctx, ctx, cpu); + + if (err && counter->hw_event.pinned) + counter->state = PERF_COUNTER_STATE_ERROR; - if (!ctx->task && cpuctx->max_pertask) + if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; hw_perf_restore(perf_flags); @@ -326,22 +404,6 @@ retry: spin_unlock_irq(&ctx->lock); } -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); - counter->oncpu = -1; - - cpuctx->active_oncpu--; - ctx->nr_active--; -} - static void group_sched_out(struct perf_counter *group_counter, struct perf_cpu_context *cpuctx, @@ -359,6 +421,9 @@ group_sched_out(struct perf_counter *group_counter, */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); + + if (group_counter->hw_event.exclusive) + cpuctx->exclusive = 0; } void __perf_counter_sched_out(struct perf_counter_context *ctx, @@ -455,30 +520,6 @@ group_error: return -EAGAIN; } -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return !counter->hw_event.raw && counter->hw_event.type < 0; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - list_for_each_entry(counter, &leader->sibling_list, list_entry) - if (!is_software_counter(counter)) - return 0; - return 1; -} - static void __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) @@ -492,7 +533,38 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, spin_lock(&ctx->lock); flags = hw_perf_save_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state <= PERF_COUNTER_STATE_OFF || + !counter->hw_event.pinned) + continue; + if (counter->cpu != -1 && counter->cpu != cpu) + continue; + + if (group_can_go_on(counter, cpuctx, 1)) + group_sched_in(counter, cpuctx, ctx, cpu); + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + counter->state = PERF_COUNTER_STATE_ERROR; + } + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + /* + * Ignore counters in OFF or ERROR state, and + * ignore pinned counters since we did them already. + */ + if (counter->state <= PERF_COUNTER_STATE_OFF || + counter->hw_event.pinned) + continue; + /* * Listen to the 'cpu' scheduling filter constraint * of counters: @@ -500,14 +572,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (counter->cpu != -1 && counter->cpu != cpu) continue; - /* - * If we scheduled in a group atomically and exclusively, - * or if this group can't go on, don't add any more - * hardware counters. 
- */ - if (can_add_hw || is_software_only_group(counter)) + if (group_can_go_on(counter, cpuctx, can_add_hw)) { if (group_sched_in(counter, cpuctx, ctx, cpu)) can_add_hw = 0; + } } hw_perf_restore(flags); spin_unlock(&ctx->lock); @@ -567,8 +635,10 @@ int perf_counter_task_disable(void) */ perf_flags = hw_perf_save_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) - counter->state = PERF_COUNTER_STATE_OFF; + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ERROR) + counter->state = PERF_COUNTER_STATE_OFF; + } hw_perf_restore(perf_flags); @@ -607,7 +677,7 @@ int perf_counter_task_enable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_OFF) + if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->hw_event.disabled = 0; @@ -849,6 +919,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (count != sizeof(cntval)) return -EINVAL; + /* + * Return end-of-file for a read on a counter that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + return 0; + mutex_lock(&counter->mutex); cntval = perf_counter_read(counter); mutex_unlock(&counter->mutex); @@ -884,7 +962,7 @@ perf_read_irq_data(struct perf_counter *counter, { struct perf_data *irqdata, *usrdata; DECLARE_WAITQUEUE(wait, current); - ssize_t res; + ssize_t res, res2; irqdata = counter->irqdata; usrdata = counter->usrdata; @@ -905,6 +983,9 @@ perf_read_irq_data(struct perf_counter *counter, if (signal_pending(current)) break; + if (counter->state == PERF_COUNTER_STATE_ERROR) + break; + spin_unlock_irq(&counter->waitq.lock); schedule(); spin_lock_irq(&counter->waitq.lock); @@ -913,7 +994,8 @@ perf_read_irq_data(struct perf_counter *counter, __set_current_state(TASK_RUNNING); spin_unlock_irq(&counter->waitq.lock); - if (usrdata->len + irqdata->len < count) + if (usrdata->len + irqdata->len < count && + counter->state != PERF_COUNTER_STATE_ERROR) return -ERESTARTSYS; read_pending: mutex_lock(&counter->mutex); @@ -925,11 +1007,12 @@ read_pending: /* Switch irq buffer: */ usrdata = perf_switch_irq_data(counter); - if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { + res2 = perf_copy_usrdata(usrdata, buf + res, count - res); + if (res2 < 0) { if (!res) res = -EFAULT; } else { - res = count; + res += res2; } out: mutex_unlock(&counter->mutex); @@ -1348,6 +1431,11 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, */ if (group_leader->ctx != ctx) goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (hw_event.exclusive || hw_event.pinned) + goto err_put_context; } ret = -EINVAL; @@ -1473,13 +1561,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - child_counter->hw_ops->disable(child_counter); - cpuctx->active_oncpu--; - child_ctx->nr_active--; - child_counter->oncpu = -1; - } + counter_sched_out(child_counter, cpuctx, child_ctx); list_del_init(&child_counter->list_entry); -- cgit v1.2.3-70-g09d2 From d859e29fe34cb833071b20aef860ee94fbad9bb2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 17 Jan 2009 18:10:22 +1100 Subject: perf_counter: 
Add counter enable/disable ioctls Impact: New perf_counter features This primarily adds a way for perf_counter users to enable and disable counters and groups. Enabling or disabling a counter or group also enables or disables all of the child counters that have been cloned from it to monitor children of the task monitored by the top-level counter. The userspace interface to enable/disable counters is via ioctl on the counter file descriptor. Along the way this extends the code that handles child counters to handle child counter groups properly. A group with multiple counters will be cloned to child tasks if and only if the group leader has the hw_event.inherit bit set - if it is set the whole group is cloned as a group in the child task. In order to be able to enable or disable all child counters of a given top-level counter, we need a way to find them all. Hence I have added a child_list field to struct perf_counter, which is the head of the list of children for a top-level counter, or the link in that list for a child counter. That list is protected by the perf_counter.mutex field. This also adds a mutex to the perf_counter_context struct. Previously the list of counters was protected just by the lock field in the context, which meant that perf_counter_init_task had to take that lock and then take whatever lock/mutex protects the top-level counter's child_list. But the counter enable/disable functions need to take that lock in order to traverse the list, then for each counter take the lock in that counter's context in order to change the counter's state safely, which would lead to a deadlock. To solve this, we now have both a mutex and a spinlock in the context, and taking either is sufficient to ensure the list of counters can't change - you have to take both before changing the list. Now perf_counter_init_task takes the mutex instead of the lock (which incidentally means that inherit_counter can use GFP_KERNEL instead of GFP_ATOMIC) and thus avoids the possible deadlock. Similarly the new enable/disable functions can take the mutex while traversing the list of child counters without incurring a possible deadlock when the counter manipulation code locks the context for a child counter. We also had a misfeature that the first counter added to a context would possibly not go on until the next sched-in, because we were using ctx->nr_active to detect if the context was running on a CPU. But nr_active is the number of active counters, and if that was zero (because the context didn't have any counters yet) it would look like the context wasn't running on a CPU and so the retry code in __perf_install_in_context wouldn't retry. So this adds an 'is_active' field that is set when the context is on a CPU, even if it has no counters. The is_active field is only used for task contexts, not for per-cpu contexts. If we enable a subsidiary counter in a group that is active on a CPU, and the arch code can't enable the counter, then we have to pull the whole group off the CPU. We do this with group_sched_out, which gets moved up in the file so it comes before all its callers. This also adds similar logic to __perf_install_in_context so that the "all on, or none" invariant of groups is preserved when adding a new counter to a group.
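For illustration, a minimal userspace sketch of the new interface (not part of the patch; it assumes a syscall wrapper already produced the counter fd, and it takes the PERF_COUNTER_IOC_* definitions from <linux/perf_counter.h> as added below):

	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/perf_counter.h>	/* PERF_COUNTER_IOC_{ENABLE,DISABLE} */

	/* 'fd' is a counter file descriptor from perf_counter_open().
	 * The ioctls also propagate to all child counters cloned from
	 * this counter via hw_event.inherit. */
	static uint64_t measure(int fd, void (*workload)(void))
	{
		uint64_t count = 0;

		ioctl(fd, PERF_COUNTER_IOC_ENABLE);	/* counter + children on */
		workload();
		ioctl(fd, PERF_COUNTER_IOC_DISABLE);	/* counter + children off */

		if (read(fd, &count, sizeof(count)) != sizeof(count))
			perror("read");
		return count;
	}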
Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 21 +- kernel/perf_counter.c | 455 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 415 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7ab8e5f96f5..33ba9fe0a78 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,6 +14,7 @@ #define _LINUX_PERF_COUNTER_H #include +#include #ifdef CONFIG_PERF_COUNTERS # include @@ -94,6 +95,12 @@ struct perf_counter_hw_event { u64 __reserved_2; }; +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) + /* * Kernel-internal data types: */ @@ -173,8 +180,10 @@ struct perf_counter { struct file *filp; struct perf_counter *parent; + struct list_head child_list; + /* - * Protect attach/detach: + * Protect attach/detach and child_list: */ struct mutex mutex; @@ -199,13 +208,21 @@ struct perf_counter { struct perf_counter_context { #ifdef CONFIG_PERF_COUNTERS /* - * Protect the list of counters: + * Protect the states of the counters in the list, + * nr_active, and the list: */ spinlock_t lock; + /* + * Protect the list of counters. Locking either mutex or lock + * is sufficient to ensure the list doesn't change; to change + * the list you need to lock both the mutex and the spinlock. + */ + struct mutex mutex; struct list_head counter_list; int nr_counters; int nr_active; + int is_active; struct task_struct *task; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index faf671b2956..1ac18daa424 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -112,6 +112,28 @@ counter_sched_out(struct perf_counter *counter, cpuctx->exclusive = 0; } +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); + + if (group_counter->hw_event.exclusive) + cpuctx->exclusive = 0; +} + /* * Cross CPU call to remove a performance counter * @@ -168,7 +190,7 @@ static void __perf_counter_remove_from_context(void *info) /* * Remove the counter from a task's (or a CPU's) list of counters. * - * Must be called with counter->mutex held. + * Must be called with counter->mutex and ctx->mutex held. * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. @@ -215,6 +237,99 @@ retry: spin_unlock_irq(&ctx->lock); } +/* + * Cross CPU call to disable a performance counter + */ +static void __perf_counter_disable(void *info) +{ + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + unsigned long flags; + + /* + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); + + /* + * If the counter is on, turn it off. + * If it is in error state, leave it in error state. 
+ */ + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + if (counter == counter->group_leader) + group_sched_out(counter, cpuctx, ctx); + else + counter_sched_out(counter, cpuctx, ctx); + counter->state = PERF_COUNTER_STATE_OFF; + } + + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); +} + +/* + * Disable a counter. + */ +static void perf_counter_disable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_disable, + counter, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_counter_disable, counter); + + spin_lock_irq(&ctx->lock); + /* + * If the counter is still active, we need to retry the cross-call. + */ + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + counter->state = PERF_COUNTER_STATE_OFF; + + spin_unlock_irq(&ctx->lock); +} + +/* + * Disable a counter and all its children. + */ +static void perf_counter_disable_family(struct perf_counter *counter) +{ + struct perf_counter *child; + + perf_counter_disable(counter); + + /* + * Lock the mutex to protect the list of children + */ + mutex_lock(&counter->mutex); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_disable(child); + mutex_unlock(&counter->mutex); +} + static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, @@ -302,6 +417,7 @@ static void __perf_install_in_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; u64 perf_flags; @@ -327,23 +443,40 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; + /* + * Don't put the counter on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (counter->state != PERF_COUNTER_STATE_INACTIVE || + (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) + goto unlock; + /* * An exclusive counter can't go on if there are already active * hardware counters, and no hardware counter can go on if there * is already an exclusive counter on. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE && - !group_can_go_on(counter, cpuctx, 1)) + if (!group_can_go_on(counter, cpuctx, 1)) err = -EEXIST; else err = counter_sched_in(counter, cpuctx, ctx, cpu); - if (err && counter->hw_event.pinned) - counter->state = PERF_COUNTER_STATE_ERROR; + if (err) { + /* + * This counter couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the counter group is pinned then put it in error state. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->hw_event.pinned) + leader->state = PERF_COUNTER_STATE_ERROR; + } if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; + unlock: hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); @@ -359,6 +492,8 @@ static void __perf_install_in_context(void *info) * If the counter is attached to a task which is on a CPU we use a smp * call to enable it in the task context. 
The task might have been * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. */ static void perf_install_in_context(struct perf_counter_context *ctx, @@ -387,7 +522,7 @@ retry: /* * we need to retry the smp call. */ - if (ctx->nr_active && list_empty(&counter->list_entry)) { + if (ctx->is_active && list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -404,26 +539,131 @@ retry: spin_unlock_irq(&ctx->lock); } -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) +/* + * Cross CPU call to enable a performance counter + */ +static void __perf_counter_enable(void *info) { - struct perf_counter *counter; + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; + unsigned long flags; + int err; - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + /* + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) return; - counter_sched_out(group_counter, cpuctx, ctx); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); + + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto unlock; + counter->state = PERF_COUNTER_STATE_INACTIVE; /* - * Schedule out siblings (if any): + * If the counter is in a group and isn't the group leader, + * then don't put it on unless the group is on. */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) - counter_sched_out(counter, cpuctx, ctx); + if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) + goto unlock; - if (group_counter->hw_event.exclusive) - cpuctx->exclusive = 0; + if (!group_can_go_on(counter, cpuctx, 1)) + err = -EEXIST; + else + err = counter_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + + if (err) { + /* + * If this counter can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->hw_event.pinned) + leader->state = PERF_COUNTER_STATE_ERROR; + } + + unlock: + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); +} + +/* + * Enable a counter. + */ +static void perf_counter_enable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_enable, + counter, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto out; + + /* + * If the counter is in error state, clear that first. + * That way, if we see the counter in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + counter->state = PERF_COUNTER_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_counter_enable, counter); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the counter is still off, + * we need to retry the cross-call. 
+ */ + if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_OFF) + counter->state = PERF_COUNTER_STATE_INACTIVE; + out: + spin_unlock_irq(&ctx->lock); +} + +/* + * Enable a counter and all its children. + */ +static void perf_counter_enable_family(struct perf_counter *counter) +{ + struct perf_counter *child; + + perf_counter_enable(counter); + + /* + * Lock the mutex to protect the list of children + */ + mutex_lock(&counter->mutex); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_enable(child); + mutex_unlock(&counter->mutex); } void __perf_counter_sched_out(struct perf_counter_context *ctx, @@ -432,16 +672,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_counter *counter; u64 flags; + spin_lock(&ctx->lock); + ctx->is_active = 0; if (likely(!ctx->nr_counters)) - return; + goto out; - spin_lock(&ctx->lock); flags = hw_perf_save_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } hw_perf_restore(flags); + out: spin_unlock(&ctx->lock); } @@ -528,10 +770,11 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, u64 flags; int can_add_hw = 1; + spin_lock(&ctx->lock); + ctx->is_active = 1; if (likely(!ctx->nr_counters)) - return; + goto out; - spin_lock(&ctx->lock); flags = hw_perf_save_disable(); /* @@ -578,6 +821,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, } } hw_perf_restore(flags); + out: spin_unlock(&ctx->lock); } @@ -896,12 +1140,14 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; + mutex_lock(&ctx->mutex); mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); put_context(ctx); mutex_unlock(&counter->mutex); + mutex_unlock(&ctx->mutex); kfree(counter); @@ -1053,10 +1299,30 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_counter *counter = file->private_data; + int err = 0; + + switch (cmd) { + case PERF_COUNTER_IOC_ENABLE: + perf_counter_enable_family(counter); + break; + case PERF_COUNTER_IOC_DISABLE: + perf_counter_disable_family(counter); + break; + default: + err = -ENOTTY; + } + return err; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, }; static int cpu_clock_perf_counter_enable(struct perf_counter *counter) @@ -1348,6 +1614,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); + INIT_LIST_HEAD(&counter->child_list); + counter->irqdata = &counter->data[0]; counter->usrdata = &counter->data[1]; counter->cpu = cpu; @@ -1452,7 +1720,9 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, goto err_free_put_context; counter->filp = counter_file; + mutex_lock(&ctx->mutex); perf_install_in_context(ctx, counter, cpu); + mutex_unlock(&ctx->mutex); fput_light(counter_file, fput_needed2); @@ -1479,6 +1749,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, { memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); ctx->task = task; } 
@@ -1486,20 +1757,30 @@ __perf_counter_init_context(struct perf_counter_context *ctx, /* * inherit a counter from parent task to child task: */ -static int +static struct perf_counter * inherit_counter(struct perf_counter *parent_counter, struct task_struct *parent, struct perf_counter_context *parent_ctx, struct task_struct *child, + struct perf_counter *group_leader, struct perf_counter_context *child_ctx) { struct perf_counter *child_counter; + /* + * Instead of creating recursive hierarchies of counters, + * we link inherited counters back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_counter->parent) + parent_counter = parent_counter->parent; + child_counter = perf_counter_alloc(&parent_counter->hw_event, - parent_counter->cpu, NULL, - GFP_ATOMIC); + parent_counter->cpu, group_leader, + GFP_KERNEL); if (!child_counter) - return -ENOMEM; + return NULL; /* * Link it up in the child's context: @@ -1523,16 +1804,82 @@ inherit_counter(struct perf_counter *parent_counter, */ atomic_long_inc(&parent_counter->filp->f_count); + /* + * Link this into the parent counter's child list + */ + mutex_lock(&parent_counter->mutex); + list_add_tail(&child_counter->child_list, &parent_counter->child_list); + + /* + * Make the child state follow the state of the parent counter, + * not its hw_event.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en,dis}able_family. + */ + if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + else + child_counter->state = PERF_COUNTER_STATE_OFF; + + mutex_unlock(&parent_counter->mutex); + + return child_counter; +} + +static int inherit_group(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *leader; + struct perf_counter *sub; + + leader = inherit_counter(parent_counter, parent, parent_ctx, + child, NULL, child_ctx); + if (!leader) + return -ENOMEM; + list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { + if (!inherit_counter(sub, parent, parent_ctx, + child, leader, child_ctx)) + return -ENOMEM; + } return 0; } +static void sync_child_counter(struct perf_counter *child_counter, + struct perf_counter *parent_counter) +{ + u64 parent_val, child_val; + + parent_val = atomic64_read(&parent_counter->count); + child_val = atomic64_read(&child_counter->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_counter->count); + + /* + * Remove this counter from the parent's list + */ + mutex_lock(&parent_counter->mutex); + list_del_init(&child_counter->child_list); + mutex_unlock(&parent_counter->mutex); + + /* + * Release the parent counter, if this was the last + * reference to it. 
+ */ + fput(parent_counter->filp); +} + static void __perf_counter_exit_task(struct task_struct *child, struct perf_counter *child_counter, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; - u64 parent_val, child_val; + struct perf_counter *sub, *tmp; /* * If we do not self-reap then we have to wait for the @@ -1561,7 +1908,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - counter_sched_out(child_counter, cpuctx, child_ctx); + group_sched_out(child_counter, cpuctx, child_ctx); list_del_init(&child_counter->list_entry); @@ -1577,26 +1924,23 @@ __perf_counter_exit_task(struct task_struct *child, * that are still around due to the child reference. These * counters need to be zapped - but otherwise linger. */ - if (!parent_counter) - return; - - parent_val = atomic64_read(&parent_counter->count); - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - - fput(parent_counter->filp); + if (parent_counter) { + sync_child_counter(child_counter, parent_counter); + list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, + list_entry) { + if (sub->parent) + sync_child_counter(sub, sub->parent); + kfree(sub); + } + } kfree(child_counter); } /* - * When a child task exist, feed back counter values to parent counters. + * When a child task exits, feed back counter values to parent counters. * - * Note: we are running in child context, but the PID is not hashed + * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. */ void perf_counter_exit_task(struct task_struct *child) @@ -1620,9 +1964,8 @@ void perf_counter_exit_task(struct task_struct *child) void perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter *counter, *parent_counter; + struct perf_counter *counter; struct task_struct *parent = current; - unsigned long flags; child_ctx = &child->perf_counter_ctx; parent_ctx = &parent->perf_counter_ctx; @@ -1641,32 +1984,22 @@ void perf_counter_init_task(struct task_struct *child) * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. 
*/ - spin_lock_irqsave(&parent_ctx->lock, flags); + mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { - if (!counter->hw_event.inherit || counter->group_leader != counter) + if (!counter->hw_event.inherit) continue; - /* - * Instead of creating recursive hierarchies of counters, - * we link inheritd counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - parent_counter = counter; - if (counter->parent) - parent_counter = counter->parent; - - if (inherit_counter(parent_counter, parent, + if (inherit_group(counter, parent, parent_ctx, child, child_ctx)) break; } - spin_unlock_irqrestore(&parent_ctx->lock, flags); + mutex_unlock(&parent_ctx->mutex); } static void __cpuinit perf_counter_init_cpu(int cpu) @@ -1692,11 +2025,15 @@ static void __perf_counter_exit_cpu(void *info) list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) __perf_counter_remove_from_context(counter); - } static void perf_counter_exit_cpu(int cpu) { + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); } #else static inline void perf_counter_exit_cpu(int cpu) { } -- cgit v1.2.3-70-g09d2 From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 10:13:01 +0100 Subject: perfcounters: throttle on too high IRQ rates Starting kerneltop with only -c 100 seems to be a bad idea; it can easily lock the system due to perfcounter IRQ overload. So add throttling: if a new IRQ arrives less than PERFMON_MIN_PERIOD_NS after the previous one, turn off perfcounters and unthrottle them at the next timer tick.
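The heart of the change is a simple inter-arrival check in the interrupt handler (shown here with the patch's own names; the clock figure below is an illustrative assumption, not from the patch):

	now = sched_clock();
	if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS)
		cpuc->throttled = 1;	/* PMU stays disabled on return */
	cpuc->last_interrupt = now;

With PERFMON_MIN_PERIOD_NS = 10000 this caps the rate at roughly 1e9 / 10000 = 100,000 IRQs/sec per CPU, whereas "kerneltop -c 100" on a ~1 GHz machine would otherwise request on the order of 1e9 / 100 = 10,000,000 IRQs/sec.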
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 ++ arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++++++++------ include/linux/perf_counter.h | 4 ++++ 3 files changed, 38 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 7b434e5b14c..849c23009bf 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); + + perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9376771f757..1a040b179b5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 last_interrupt; + u64 global_enable; + int throttled; }; /* @@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, saved_global; - struct cpu_hw_counters *cpuc; + u64 ack, status, now; + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); /* Disable counters globally */ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + now = sched_clock(); + if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) + cpuc->throttled = 1; + cpuc->last_interrupt = now; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) @@ -533,9 +539,29 @@ again: goto again; out: /* - * Restore - do not reenable when global enable is off: + * Restore - do not reenable when global enable is off or throttled: */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + if (!cpuc->throttled) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); +} + +void perf_counter_unthrottle(void) +{ + struct cpu_hw_counters *cpuc; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + if (unlikely(!perf_counters_initialized)) + return; + + cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + if (cpuc->throttled) { + if (printk_ratelimit()) + printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->throttled = 0; + } } void smp_perf_counter_interrupt(struct pt_regs *regs) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 33ba9fe0a78..91f1ca4c01c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -254,6 +254,7 @@ extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern int perf_counter_task_disable(void); @@ -270,6 +271,8 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } +#define PERFMON_MIN_PERIOD_NS 10000 + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ 
-281,6 +284,7 @@ static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:36:16 +0100 Subject: perfcounters: ratelimit performance counter interrupts Ratelimit performance counter interrupts to 100KHz per CPU. This replaces the irq-delta-time based method. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 +++++++++++++++------------ include/linux/perf_counter.h | 2 -- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1a040b179b5..a56d4cf92f3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 last_interrupt; + unsigned long interrupts; u64 global_enable; - int throttled; }; /* @@ -470,6 +469,11 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) } } +/* + * Maximum interrupt frequency of 100KHz per CPU + */ +#define PERFMON_MAX_INTERRUPTS 100000/HZ + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -477,7 +481,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, now; + u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); @@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - now = sched_clock(); - if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) - cpuc->throttled = 1; - cpuc->last_interrupt = now; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) goto out; @@ -541,13 +540,14 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (!cpuc->throttled) + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; + u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -556,12 +556,15 @@ void perf_counter_unthrottle(void) return; cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); - if (cpuc->throttled) { + if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - cpuc->throttled = 0; } + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); + if (unlikely(cpuc->global_enable && !global_enable)) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 
cpuc->global_enable); + cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 91f1ca4c01c..f55381fbcac 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -271,8 +271,6 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } -#define PERFMON_MIN_PERIOD_NS 10000 - #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -- cgit v1.2.3-70-g09d2 From 23a185ca8abbeef64b6ffc33059b1d630e43ec10 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 9 Feb 2009 22:42:47 +1100 Subject: perf_counters: make software counters work as per-cpu counters Impact: kernel crash fix Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software counter as a per-cpu counter would reliably crash the system, because it calls __task_delta_exec with a null pointer. The page fault, context switch and cpu migration counters also won't function correctly as per-cpu counters since they reference the current task. This fixes the problem by redirecting the task_clock counter to the cpu_clock counter when used as a per-cpu counter, and by implementing per-cpu page fault, context switch and cpu migration counters. Along the way, this: - Initializes counter->ctx earlier, in perf_counter_alloc, so that sw_perf_counter_init can use it - Adds code to kernel/sched.c to count task migrations into each cpu, in rq->nr_migrations_in - Exports the per-cpu context switch and task migration counts via new functions added to kernel/sched.c - Makes sure that if sw_perf_counter_init fails, we don't try to initialize the counter as a hardware counter. Since the user has passed a negative, non-raw event type, they clearly don't intend for it to be interpreted as a hardware event. 
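A hypothetical userspace sketch of the case this commit fixes - a per-cpu task-clock counter (the syscall wrapper, its argument order and the __NR_perf_counter_open value are assumptions here; the hw_event layout comes from <linux/perf_counter.h> in this tree):

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_counter.h>

	int open_percpu_task_clock(int cpu)
	{
		struct perf_counter_hw_event ev;

		memset(&ev, 0, sizeof(ev));
		ev.type = PERF_COUNT_TASK_CLOCK;	/* software event */

		/* pid == -1 selects per-cpu mode. Before this commit,
		 * reading such a counter dereferenced a NULL task; now
		 * it is transparently backed by the cpu_clock counter. */
		return syscall(__NR_perf_counter_open, &ev, -1 /* pid */,
			       cpu, -1 /* group_fd */, 0 /* flags */);
	}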
Reported-by: "Zhang Yanmin" Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/perf_counter.c | 78 +++++++++++++++++++++++++++++---------------------- kernel/sched.c | 17 +++++++++++ 3 files changed, 64 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b85b10abf77..1e5f70062a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern u64 cpu_nr_switches(int cpu); +extern u64 cpu_nr_migrations(int cpu); struct seq_file; struct cfs_rq; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f27a7e9f3c4..544193cbc47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include /* * Each CPU has a list of per CPU counters: @@ -502,7 +504,6 @@ perf_install_in_context(struct perf_counter_context *ctx, { struct task_struct *task = ctx->task; - counter->ctx = ctx; if (!task) { /* * Per cpu counters are installed via an smp call and @@ -1417,11 +1418,19 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -static u64 get_page_faults(void) +#ifdef CONFIG_VM_EVENT_COUNTERS +#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] +#else +#define cpu_page_faults() 0 +#endif + +static u64 get_page_faults(struct perf_counter *counter) { - struct task_struct *curr = current; + struct task_struct *curr = counter->ctx->task; - return curr->maj_flt + curr->min_flt; + if (curr) + return curr->maj_flt + curr->min_flt; + return cpu_page_faults(); } static void page_faults_perf_counter_update(struct perf_counter *counter) @@ -1430,7 +1439,7 @@ static void page_faults_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_page_faults(); + now = get_page_faults(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1446,11 +1455,7 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) static int page_faults_perf_counter_enable(struct perf_counter *counter) { - /* - * page-faults is a per-task value already, - * so we dont have to clear it on switch-in. 
- */ - + atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); return 0; } @@ -1465,11 +1470,13 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = { .read = page_faults_perf_counter_read, }; -static u64 get_context_switches(void) +static u64 get_context_switches(struct perf_counter *counter) { - struct task_struct *curr = current; + struct task_struct *curr = counter->ctx->task; - return curr->nvcsw + curr->nivcsw; + if (curr) + return curr->nvcsw + curr->nivcsw; + return cpu_nr_switches(smp_processor_id()); } static void context_switches_perf_counter_update(struct perf_counter *counter) @@ -1478,7 +1485,7 @@ static void context_switches_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_context_switches(); + now = get_context_switches(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1494,11 +1501,7 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) static int context_switches_perf_counter_enable(struct perf_counter *counter) { - /* - * ->nvcsw + curr->nivcsw is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_context_switches(counter)); return 0; } @@ -1513,9 +1516,13 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .read = context_switches_perf_counter_read, }; -static inline u64 get_cpu_migrations(void) +static inline u64 get_cpu_migrations(struct perf_counter *counter) { - return current->se.nr_migrations; + struct task_struct *curr = counter->ctx->task; + + if (curr) + return curr->se.nr_migrations; + return cpu_nr_migrations(smp_processor_id()); } static void cpu_migrations_perf_counter_update(struct perf_counter *counter) @@ -1524,7 +1531,7 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_cpu_migrations(); + now = get_cpu_migrations(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1540,11 +1547,7 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { - /* - * se.nr_migrations is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter)); return 0; } @@ -1569,7 +1572,14 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: - hw_ops = &perf_ops_task_clock; + /* + * If the user instantiates this as a per-cpu counter, + * use the cpu_clock counter instead. 
+ */ + if (counter->ctx->task) + hw_ops = &perf_ops_task_clock; + else + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: hw_ops = &perf_ops_page_faults; @@ -1592,6 +1602,7 @@ sw_perf_counter_init(struct perf_counter *counter) static struct perf_counter * perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, + struct perf_counter_context *ctx, struct perf_counter *group_leader, gfp_t gfpflags) { @@ -1623,6 +1634,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->wakeup_pending = 0; counter->group_leader = group_leader; counter->hw_ops = NULL; + counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -1631,7 +1643,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); - if (!hw_ops) + else hw_ops = hw_perf_counter_init(counter); if (!hw_ops) { @@ -1707,7 +1719,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, } ret = -EINVAL; - counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); + counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + GFP_KERNEL); if (!counter) goto err_put_context; @@ -1777,15 +1790,14 @@ inherit_counter(struct perf_counter *parent_counter, parent_counter = parent_counter->parent; child_counter = perf_counter_alloc(&parent_counter->hw_event, - parent_counter->cpu, group_leader, - GFP_KERNEL); + parent_counter->cpu, child_ctx, + group_leader, GFP_KERNEL); if (!child_counter) return NULL; /* * Link it up in the child's context: */ - child_counter->ctx = child_ctx; child_counter->task = child; list_add_counter(child_counter, child_ctx); child_ctx->nr_counters++; diff --git a/kernel/sched.c b/kernel/sched.c index 8db1a4cf208..173768f142a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -558,6 +558,7 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; + u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -1908,6 +1909,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; + new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2810,6 +2812,21 @@ unsigned long nr_active(void) return running + uninterruptible; } +/* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_switches(cpu) - number of context switches on that cpu + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_switches(int cpu) +{ + return cpu_rq(cpu)->nr_switches; +} + +u64 cpu_nr_migrations(int cpu) +{ + return cpu_rq(cpu)->nr_migrations_in; +} + /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). -- cgit v1.2.3-70-g09d2 From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. 
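As a concrete illustration, counting kernel-mode events only might look like this from userspace (a sketch using the new bitfields added by this patch; PERF_COUNT_INSTRUCTIONS stands in for any hardware event type in this tree):

	struct perf_counter_hw_event ev;

	memset(&ev, 0, sizeof(ev));
	ev.type = PERF_COUNT_INSTRUCTIONS;
	ev.exclude_user = 1;	/* don't count user-mode events */
	ev.exclude_hv = 1;	/* don't count hypervisor events */
	/* exclude_kernel stays 0, so kernel-mode events are counted */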
For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 68 ++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++++------- include/linux/perf_counter.h | 19 ++++++----- kernel/perf_counter.c | 26 ++++++++++++--- 4 files changed, 117 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5b0211348c7..bd6ba85beb5 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -16,6 +16,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -214,6 +215,36 @@ static int power_check_constraints(unsigned int event[], int n_ev) return 0; } +/* + * Check if newly-added counters have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added counters. + */ +static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) +{ + int eu, ek, eh; + int i, n; + struct perf_counter *counter; + + n = n_prev + n_new; + if (n <= 1) + return 0; + + eu = ctrs[0]->hw_event.exclude_user; + ek = ctrs[0]->hw_event.exclude_kernel; + eh = ctrs[0]->hw_event.exclude_hv; + if (n_prev == 0) + n_prev = 1; + for (i = n_prev; i < n; ++i) { + counter = ctrs[i]; + if (counter->hw_event.exclude_user != eu || + counter->hw_event.exclude_kernel != ek || + counter->hw_event.exclude_hv != eh) + return -EAGAIN; + } + return 0; +} + static void power_perf_read(struct perf_counter *counter) { long val, delta, prev; @@ -323,6 +354,20 @@ void hw_perf_restore(u64 disable) goto out; } + /* + * Add in MMCR0 freeze bits corresponding to the + * hw_event.exclude_* bits for the first counter. + * We have already checked that all counters have the + * same values for these bits as the first counter. + */ + counter = cpuhw->counter[0]; + if (counter->hw_event.exclude_user) + cpuhw->mmcr[0] |= MMCR0_FCP; + if (counter->hw_event.exclude_kernel) + cpuhw->mmcr[0] |= MMCR0_FCS; + if (counter->hw_event.exclude_hv) + cpuhw->mmcr[0] |= MMCR0_FCHV; + /* * Write the new configuration to MMCR* with the freeze * bit set and set the hardware counters to their initial values. 
@@ -424,6 +469,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, &cpuhw->counter[n0], &cpuhw->events[n0]); if (n < 0) return -EAGAIN; + if (check_excludes(cpuhw->counter, n0, n)) + return -EAGAIN; if (power_check_constraints(cpuhw->events, n + n0)) return -EAGAIN; cpuhw->n_counters = n0 + n; @@ -476,6 +523,8 @@ static int power_perf_enable(struct perf_counter *counter) goto out; cpuhw->counter[n0] = counter; cpuhw->events[n0] = counter->hw.config; + if (check_excludes(cpuhw->counter, n0, 1)) + goto out; if (power_check_constraints(cpuhw->events, n0 + 1)) goto out; @@ -554,6 +603,17 @@ hw_perf_counter_init(struct perf_counter *counter) counter->hw.config_base = ev; counter->hw.idx = 0; + /* + * If we are not running on a hypervisor, force the + * exclude_hv bit to 0 so that we don't care what + * the user set it to. This also means that we don't + * set the MMCR0_FCHV bit, which unconditionally freezes + * the counters on the PPC970 variants used in Apple G5 + * machines (since MSR.HV is always 1 on those machines). + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + counter->hw_event.exclude_hv = 0; + /* * If this is in a group, check if it can go on with all the * other hardware counters in the group. We assume the counter @@ -566,11 +626,13 @@ hw_perf_counter_init(struct perf_counter *counter) if (n < 0) return NULL; } - events[n++] = ev; - if (power_check_constraints(events, n)) + events[n] = ev; + if (check_excludes(ctrs, n, 1)) + return NULL; + if (power_check_constraints(events, n + 1)) return NULL; - counter->hw.config = events[n - 1]; + counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); return &power_perf_ops; } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9901e46998d..383d4c6423a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EINVAL; /* - * Count user events, and generate PMC IRQs: + * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ - hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + hwc->config = ARCH_PERFMON_EVENTSEL_INT; /* - * If privileged enough, count OS events too, and allow - * NMI events as well: + * Count user and OS events unless requested not to. 
*/ - hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN)) { + if (!hw_event->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event->nmi) - hwc->nmi = 1; - } + + /* + * If privileged enough, allow NMI events: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; /* @@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter, int err; /* - * Enable IRQ generation (0x8) and ring-3 counting (0x2), - * and enable ring-0 counting if allowed: + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: */ - bits = 0x8ULL | 0x2ULL; + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; bits <<= (idx * 4); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f55381fbcac..c83f51d6e35 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,14 +83,17 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only counter on PMU */ - - __reserved_1 : 26; + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + + __reserved_1 : 23; u64 __reserved_2; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 544193cbc47..89d5e3fe970 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1567,11 +1567,25 @@ sw_perf_counter_init(struct perf_counter *counter) { const struct hw_perf_counter_ops *hw_ops = NULL; + /* + * Software counters (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv)) + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: + if (counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv) + break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. 
@@ -1582,13 +1596,17 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - hw_ops = &perf_ops_page_faults; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel)) + hw_ops = &perf_ops_page_faults; break; case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_context_switches; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_context_switches; break; case PERF_COUNT_CPU_MIGRATIONS: - hw_ops = &perf_ops_cpu_migrations; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_cpu_migrations; break; default: break; -- cgit v1.2.3-70-g09d2 From c07c99b67233ccaad38a961c17405dc1e1542aa4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 13 Feb 2009 22:10:34 +1100 Subject: perfcounters: make context switch and migration software counters work again Jaswinder Singh Rajput reported that commit 23a185ca8abbeef caused the context switch and migration software counters to report zero always. With that commit, the software counters only count events that occur between sched-in and sched-out for a task. This is necessary for the counter enable/disable prctls and ioctls to work. However, the context switch and migration counts are incremented after sched-out for one task and before sched-in for the next. Since the increment doesn't occur while a task is scheduled in (as far as the software counters are concerned) it doesn't count towards any counter. Thus the context switch and migration counters need to count events that occur at any time, provided the counter is enabled, not just those that occur while the task is scheduled in (from the perf_counter subsystem's point of view). The problem though is that the software counter code can't tell the difference between being enabled and being scheduled in, and between being disabled and being scheduled out, since we use the one pair of enable/disable entry points for both. That is, the high-level disable operation simply arranges for the counter to not be scheduled in any more, and the high-level enable operation arranges for it to be scheduled in again. One way to solve this would be to have sched_in/out operations in the hw_perf_counter_ops struct as well as enable/disable. However, this takes a simpler approach: it adds a 'prev_state' field to the perf_counter struct that allows a counter's enable method to know whether the counter was previously disabled or just inactive (scheduled out), and therefore whether the enable method is being called as a result of a high-level enable or a schedule-in operation. This then allows the context switch, migration and page fault counters to reset their hw.prev_count value in their enable functions only if they are called as a result of a high-level enable operation. Although page faults would normally only occur while the counter is scheduled in, this changes the page fault counter code too in case there are ever circumstances where page faults get counted against a task while its counters are not scheduled in. 
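As a minimal sketch of the resulting pattern (get_events() here is a hypothetical stand-in for get_context_switches() and friends; the real hunks follow below):

	static int sw_counter_enable(struct perf_counter *counter)
	{
		/*
		 * prev_state distinguishes a high-level enable (coming from
		 * OFF) from a schedule-in (coming from INACTIVE): only a
		 * real enable resets the baseline count.
		 */
		if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
			atomic64_set(&counter->hw.prev_count,
				     get_events(counter));
		return 0;
	}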
Reported-by: Jaswinder Singh Rajput Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c83f51d6e35..32cd1acb738 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -173,6 +173,7 @@ struct perf_counter { const struct hw_perf_counter_ops *hw_ops; enum perf_counter_active_state state; + enum perf_counter_active_state prev_state; atomic64_t count; struct perf_counter_hw_event hw_event; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fcefb0a726f..ad62965828d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -444,6 +444,7 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; + counter->prev_state = PERF_COUNTER_STATE_OFF; /* * Don't put the counter on if it is disabled or if @@ -562,6 +563,7 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; @@ -733,6 +735,7 @@ group_sched_in(struct perf_counter *group_counter, if (ret) return ret < 0 ? ret : 0; + group_counter->prev_state = group_counter->state; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -740,6 +743,7 @@ group_sched_in(struct perf_counter *group_counter, * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + counter->prev_state = counter->state; if (counter_sched_in(counter, cpuctx, ctx, cpu)) { partial_group = counter; goto group_error; @@ -1398,9 +1402,9 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) static int task_clock_perf_counter_enable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 0); - - atomic64_set(&counter->hw.prev_count, now); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + task_clock_perf_counter_val(counter, 0)); return 0; } @@ -1455,7 +1459,8 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) static int page_faults_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); return 0; } @@ -1501,7 +1506,9 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) static int context_switches_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_context_switches(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + get_context_switches(counter)); return 0; } @@ -1547,7 +1554,9 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + get_cpu_migrations(counter)); return 0; } -- cgit v1.2.3-70-g09d2 From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 
Feb 2009 22:43:46 +1100 Subject: perfcounters: fix a few minor cleanliness issues This fixes three issues noticed by Arnd Bergmann: - Add #ifdef __KERNEL__ and move some things around in perf_counter.h to make sure only the bits that userspace needs are exported to userspace. - Use __u64, __s64, __u32 types in the structs exported to userspace rather than u64, s64, u32. - Make the sys_perf_counter_open syscall available to the SPUs on Cell platforms. And one issue that I noticed in looking at the code again: - Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get the proper handling of int arguments on ppc64 (and some other 64-bit architectures). Reported-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/systbl.h | 2 +- include/linux/perf_counter.h | 43 +++++++++++++++++++++------------------ include/linux/syscalls.h | 9 +++----- kernel/perf_counter.c | 6 +++--- 4 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 4c8095f6bec..d312eec8abb 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,4 +322,4 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) -SYSCALL(perf_counter_open) +SYSCALL_SPU(perf_counter_open) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 32cd1acb738..186efaf4966 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -13,20 +13,8 @@ #ifndef _LINUX_PERF_COUNTER_H #define _LINUX_PERF_COUNTER_H -#include -#include - -#ifdef CONFIG_PERF_COUNTERS -# include -#endif - -#include -#include -#include -#include -#include - -struct task_struct; +#include +#include /* * User-space ABI bits: @@ -78,12 +66,12 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - s64 type; + __s64 type; - u64 irq_period; - u32 record_type; + __u64 irq_period; + __u32 record_type; - u32 disabled : 1, /* off by default */ + __u32 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -95,7 +83,7 @@ struct perf_counter_hw_event { __reserved_1 : 23; - u64 __reserved_2; + __u64 __reserved_2; }; /* @@ -104,10 +92,24 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +#ifdef __KERNEL__ /* - * Kernel-internal data types: + * Kernel-internal data types and definitions: */ +#ifdef CONFIG_PERF_COUNTERS +# include +#endif + +#include +#include +#include +#include +#include +#include + +struct task_struct; + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -293,4 +295,5 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } #endif +#endif /* __KERNEL__ */ #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 88255d3261a..28ef2be839c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -696,10 +696,7 @@ asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd); +asmlinkage long sys_perf_counter_open( + const struct 
perf_counter_hw_event __user *hw_event_uptr, + pid_t pid, int cpu, int group_fd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ad62965828d..16b14ba99d3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1690,9 +1690,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, * @cpu: target cpu * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, int cpu, int group_fd) +SYSCALL_DEFINE4(perf_counter_open, + const struct perf_counter_hw_event __user *, hw_event_uptr, + pid_t, pid, int, cpu, int, group_fd) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; -- cgit v1.2.3-70-g09d2 From 2743a5b0fa6f309da904f2190a9cc25deee34dbd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 4 Mar 2009 20:36:51 +1100 Subject: perfcounters: provide expansion room in the ABI Impact: ABI change This expands several fields in the perf_counter_hw_event struct and adds a "flags" argument to the perf_counter_open system call, in order that features can be added in future without ABI changes. In particular the record_type field is expanded to 64 bits, and the space for flag bits has been expanded from 32 to 64 bits. This also adds some new fields: * read_format (64 bits) is intended to provide a way to specify what userspace wants to get back when it does a read() on a simple (non-interrupting) counter; * exclude_idle (1 bit) provides a way for userspace to ask that events that occur when the cpu is idle be excluded; * extra_config_len will provide a way for userspace to supply an arbitrary amount of extra machine-specific PMU configuration data immediately following the perf_counter_hw_event struct, to allow sophisticated users to program things such as instruction matching CAMs and address range registers; * __reserved_3 and __reserved_4 provide space for future expansion. 
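A hypothetical caller of the extended ABI might look like this (the syscall number is an assumption and pid, cpu and group_fd are assumed to be in scope; the essential points are that reserved fields must stay zero and that the new flags argument must be 0 for now):

	struct perf_counter_hw_event hw_event;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));	/* clear all reserved fields */
	hw_event.type = PERF_COUNT_CPU_CLOCK;
	hw_event.exclude_idle = 1;	/* new: don't count while the cpu is idle */

	/* the fifth argument is the new flags word; non-zero is rejected */
	fd = syscall(__NR_perf_counter_open, &hw_event, pid, cpu, group_fd, 0UL);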
Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 12 +++++++++--- include/linux/syscalls.h | 2 +- kernel/perf_counter.c | 10 +++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 186efaf4966..c42455ab155 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -69,9 +69,10 @@ struct perf_counter_hw_event { __s64 type; __u64 irq_period; - __u32 record_type; + __u64 record_type; + __u64 read_format; - __u32 disabled : 1, /* off by default */ + __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -80,10 +81,15 @@ struct perf_counter_hw_event { exclude_user : 1, /* don't count user */ exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 23; + __reserved_1 : 55; + + __u32 extra_config_len; + __u32 __reserved_4; __u64 __reserved_2; + __u64 __reserved_3; }; /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 28ef2be839c..ab1d7724739 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -698,5 +698,5 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( const struct perf_counter_hw_event __user *hw_event_uptr, - pid_t pid, int cpu, int group_fd); + pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 16b14ba99d3..b2e838959f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1683,16 +1683,16 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, } /** - * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ -SYSCALL_DEFINE4(perf_counter_open, +SYSCALL_DEFINE5(perf_counter_open, const struct perf_counter_hw_event __user *, hw_event_uptr, - pid_t, pid, int, cpu, int, group_fd) + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; @@ -1703,6 +1703,10 @@ SYSCALL_DEFINE4(perf_counter_open, int fput_needed2 = 0; int ret; + /* for future expandability... */ + if (flags) + return -EINVAL; + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; -- cgit v1.2.3-70-g09d2 From 2485e5184452ccbda34ff83883883d9107800dff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 12:33:16 +0100 Subject: perfcounters: fix reserved bits sizing The sum of bits is 65 currently not 64 - so reduce the # of reserved bits from 55 to 54. 
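Spelled out: the flag word carries ten single-bit flags (disabled, nmi, raw, inherit, pinned, exclusive, exclude_user, exclude_kernel, exclude_hv, exclude_idle), so the widths must satisfy

	10 + __reserved_1 = 64   =>   __reserved_1 = 54

With __reserved_1 : 55 the bit-fields totalled 65 bits, overflowing the 64-bit flag word.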
Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c42455ab155..dde564517b6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,7 +83,7 @@ struct perf_counter_hw_event { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 55; + __reserved_1 : 54; __u32 extra_config_len; __u32 __reserved_4; -- cgit v1.2.3-70-g09d2 From bac429f037f1a51a74d62bad6d1518c3be065df3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Mar 2009 12:50:56 -0400 Subject: tracing: add function profiler Impact: new profiling feature This patch adds a function profiler. In debugfs/tracing/ two new files are created. function_profile_enabled - to enable or disable profiling trace_stat/functions - the profiled functions. For example: echo 1 > /debugfs/tracing/function_profile_enabled ./hackbench 50 echo 0 > /debugfs/tracing/function_profile_enabled yields: cat /debugfs/tracing/trace_stat/functions Function Hit -------- --- _spin_lock 10106442 _spin_unlock 10097492 kfree 6013704 _spin_unlock_irqrestore 4423941 _spin_lock_irqsave 4406825 __phys_addr 4181686 __slab_free 4038222 dput 4030130 path_put 4023387 unroll_tree_refs 4019532 [...] The most hit functions are listed first. Functions that are not hit are not listed. This feature depends on and uses dynamic function tracing. When function profiling is disabled, no overhead occurs. But it still takes up around 300KB to hold the data, thus it is not recommended to keep it enabled on systems low on memory. When a '1' is echoed into the function_profile_enabled file, the counters for each function are reset back to zero. Thus you can see which functions are hit most by different programs. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 + kernel/trace/Kconfig | 19 +++ kernel/trace/ftrace.c | 313 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 334 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015a3d22cf7..0456c3a51c6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -153,6 +153,10 @@ struct dyn_ftrace { unsigned long flags; struct dyn_ftrace *newlist; }; +#ifdef CONFIG_FUNCTION_PROFILER + unsigned long counter; + struct hlist_node node; +#endif struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8a4d7293104..95e9ad5735d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -105,6 +105,7 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n @@ -376,6 +377,24 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on DYNAMIC_FTRACE + default n + help + This option enables the kernel function profiler. When the dynamic + function tracing is enabled, a counter is added into the function + records used by the dynamic function tracer. A file is created in + debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops.
A file in the trace_stats + directory called functions, that show the list of functions that + have been hit and their counters. + + This takes up around 320K more memory. + + If in doubt, say N + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b8722baf15..11f364c776d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,6 +34,7 @@ #include #include "trace.h" +#include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ do { \ @@ -261,7 +262,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -309,6 +309,307 @@ static struct dyn_ftrace *ftrace_free_records; } \ } +#ifdef CONFIG_FUNCTION_PROFILER +static struct hlist_head *ftrace_profile_hash; +static int ftrace_profile_bits; +static int ftrace_profile_enabled; +static DEFINE_MUTEX(ftrace_profile_lock); + +static void * +function_stat_next(void *v, int idx) +{ + struct dyn_ftrace *rec = v; + struct ftrace_page *pg; + + pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); + + again: + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + } + + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED || + !(rec->flags & FTRACE_FL_CONVERTED) || + /* ignore non hit functions */ + !rec->counter) + goto again; + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + return function_stat_next(&ftrace_pages_start->records[0], 0); +} + +static int function_stat_cmp(void *p1, void *p2) +{ + struct dyn_ftrace *a = p1; + struct dyn_ftrace *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} + +static int function_stat_headers(struct seq_file *m) +{ + seq_printf(m, " Function Hit\n" + " -------- ---\n"); + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); + return 0; +} + +static struct tracer_stat function_stats = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static void ftrace_profile_init(int nr_funcs) +{ + unsigned long addr; + int order; + int size; + + /* + * We are profiling all functions, lets make it 1/4th of the + * number of functions that are in core kernel. So we have to + * iterate 4 times. + */ + order = (sizeof(struct hlist_head) * nr_funcs) / 4; + order = get_order(order); + size = 1 << (PAGE_SHIFT + order); + + pr_info("Allocating %d KB for profiler hash\n", size >> 10); + + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!addr) { + pr_warning("Could not allocate function profiler hash\n"); + return; + } + + ftrace_profile_hash = (void *)addr; + + /* + * struct hlist_head should be a pointer of 4 or 8 bytes. + * And a simple bit manipulation can be done, but if for + * some reason struct hlist_head is not a mulitple of 2, + * then we play it safe, and simply count. This function + * is done once at boot up, so it is not that critical in + * performance. 
+ */ + + size--; + size /= sizeof(struct hlist_head); + + for (; size; size >>= 1) + ftrace_profile_bits++; + + pr_info("Function profiler has %d hash buckets\n", + 1 << ftrace_profile_bits); + + return; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void ftrace_profile_reset(void) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + rec->counter = 0; + } while_for_each_ftrace_rec(); +} + +static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) +{ + struct dyn_ftrace *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long flags; + unsigned long key; + + if (!ftrace_profile_hash) + return NULL; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &ftrace_profile_hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + local_irq_save(flags); + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + goto out; + } + rec = NULL; + out: + local_irq_restore(flags); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + rec = ftrace_find_profiled_func(ip); + if (!rec) + goto out; + + rec->counter++; + out: + local_irq_restore(flags); +} + +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int ret; + + if (!ftrace_profile_hash) { + pr_info("Can not enable hash due to earlier problems\n"); + return -ENODEV; + } + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ftrace_profile_reset(); + register_ftrace_function(&ftrace_profile_ops); + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + unregister_ftrace_function(&ftrace_profile_ops); + } + } + mutex_unlock(&ftrace_profile_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + int ret; + + ret = register_stat_tracer(&function_stats); + if (ret) { + pr_warning("Warning: could not register " + "function stats\n"); + return; + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ + unsigned long key; + + if (!ftrace_profile_hash) + return; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ + mutex_lock(&ftrace_profile_lock); + hlist_del(&rec->node); + mutex_unlock(&ftrace_profile_lock); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void 
ftrace_profile_init(int nr_funcs) +{ +} +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ +} +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -359,8 +660,10 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) + !(rec->flags & FTRACE_FL_FREE)) { ftrace_free_rec(rec); + ftrace_profile_release(rec); + } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -414,6 +717,8 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; + ftrace_add_profile(rec); + return rec; } @@ -2157,6 +2462,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + ftrace_profile_debugfs(d_tracer); + return 0; } @@ -2225,6 +2532,8 @@ void __init ftrace_init(void) if (ret) goto failed; + ftrace_profile_init(count); + last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, -- cgit v1.2.3-70-g09d2 From 493762fc534c71d11d489f872c4b4a2c61173668 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 17:12:36 -0400 Subject: tracing: move function profiler data out of function struct Impact: reduce size of memory in function profiler The function profiler originally introduced its counters into the function records themselves. There are 20 thousand different functions on a normal system, and that adds 20 thousand counters for profiling even when they are not needed. A normal run of the profiler yields only a couple of thousand functions executed, depending on what is being profiled. This means we have around 18 thousand useless counters. This patch rectifies this by moving the data out of the function records used by dynamic ftrace. Data is preallocated to hold the functions when profiling begins. Checks are made during profiling to see if more records should be allocated, and they are allocated if it is safe to do so. This also removes the dependency on dynamic ftrace, and removes the overhead of having it enabled. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 - kernel/trace/Kconfig | 10 +- kernel/trace/ftrace.c | 440 +++++++++++++++++++++++++++++-------------------- 3 files changed, 263 insertions(+), 191 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0456c3a51c6..015a3d22cf7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -153,10 +153,6 @@ struct dyn_ftrace { unsigned long flags; struct dyn_ftrace *newlist; }; -#ifdef CONFIG_FUNCTION_PROFILER - unsigned long counter; - struct hlist_node node; -#endif struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 95e9ad5735d..8a4136096d7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -379,20 +379,16 @@ config DYNAMIC_FTRACE config FUNCTION_PROFILER bool "Kernel function profiler" - depends on DYNAMIC_FTRACE + depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. When the dynamic - function tracing is enabled, a counter is added into the function - records used by the dynamic function tracer. A file is created in - debugfs called function_profile_enabled which defaults to zero.
+ This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. When a 1 is echoed into this file profiling begins, and when a zero is entered, profiling stops. A file in the trace_stats directory called functions, that show the list of functions that have been hit and their counters. - This takes up around 320K more memory. - If in doubt, say N config FTRACE_MCOUNT_RECORD diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 11f364c776d..24dac448cdc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -241,87 +241,48 @@ static void ftrace_update_pid_func(void) #endif } -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; -static struct pid * const ftrace_swapper_pid = &init_struct_pid; - -#ifdef CONFIG_DYNAMIC_FTRACE - -#ifndef CONFIG_FTRACE_MCOUNT_RECORD -# error Dynamic ftrace depends on MCOUNT_RECORD -#endif - -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long flags; - unsigned long ip; - void *data; - struct rcu_head rcu; +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; }; -enum { - FTRACE_ENABLE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), - FTRACE_START_FUNC_RET = (1 << 5), - FTRACE_STOP_FUNC_RET = (1 << 6), +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; }; -static int ftrace_filtered; +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) -static struct dyn_ftrace *ftrace_new_addrs; +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static DEFINE_MUTEX(ftrace_regex_lock); - -struct ftrace_page { - struct ftrace_page *next; - int index; - struct dyn_ftrace records[]; -}; +/* TODO: make these percpu, to prevent cache line bouncing */ +static struct ftrace_profile_page *profile_pages_start; +static struct ftrace_profile_page *profile_pages; -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -static struct dyn_ftrace *ftrace_free_records; - -/* - * This is a double for. Do not use 'break' to break out of the loop, - * you must use a goto. 
- */ -#define do_for_each_ftrace_rec(pg, rec) \ - for (pg = ftrace_pages_start; pg; pg = pg->next) { \ - int _____i; \ - for (_____i = 0; _____i < pg->index; _____i++) { \ - rec = &pg->records[_____i]; - -#define while_for_each_ftrace_rec() \ - } \ - } - -#ifdef CONFIG_FUNCTION_PROFILER static struct hlist_head *ftrace_profile_hash; static int ftrace_profile_bits; static int ftrace_profile_enabled; static DEFINE_MUTEX(ftrace_profile_lock); +static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable); + +#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +static raw_spinlock_t ftrace_profile_rec_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static void * function_stat_next(void *v, int idx) { - struct dyn_ftrace *rec = v; - struct ftrace_page *pg; + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; - pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); again: rec++; @@ -330,27 +291,22 @@ function_stat_next(void *v, int idx) if (!pg) return NULL; rec = &pg->records[0]; + if (!rec->counter) + goto again; } - if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED || - !(rec->flags & FTRACE_FL_CONVERTED) || - /* ignore non hit functions */ - !rec->counter) - goto again; - return rec; } static void *function_stat_start(struct tracer_stat *trace) { - return function_stat_next(&ftrace_pages_start->records[0], 0); + return function_stat_next(&profile_pages_start->records[0], 0); } static int function_stat_cmp(void *p1, void *p2) { - struct dyn_ftrace *a = p1; - struct dyn_ftrace *b = p2; + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; if (a->counter < b->counter) return -1; @@ -369,7 +325,7 @@ static int function_stat_headers(struct seq_file *m) static int function_stat_show(struct seq_file *m, void *v) { - struct dyn_ftrace *rec = v; + struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -387,115 +343,191 @@ static struct tracer_stat function_stats = { .stat_show = function_stat_show }; -static void ftrace_profile_init(int nr_funcs) +static void ftrace_profile_reset(void) { - unsigned long addr; - int order; - int size; + struct ftrace_profile_page *pg; - /* - * We are profiling all functions, lets make it 1/4th of the - * number of functions that are in core kernel. So we have to - * iterate 4 times. - */ - order = (sizeof(struct hlist_head) * nr_funcs) / 4; - order = get_order(order); - size = 1 << (PAGE_SHIFT + order); - - pr_info("Allocating %d KB for profiler hash\n", size >> 10); + pg = profile_pages = profile_pages_start; - addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!addr) { - pr_warning("Could not allocate function profiler hash\n"); - return; + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; } - ftrace_profile_hash = (void *)addr; + memset(ftrace_profile_hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} - /* - * struct hlist_head should be a pointer of 4 or 8 bytes. - * And a simple bit manipulation can be done, but if for - * some reason struct hlist_head is not a mulitple of 2, - * then we play it safe, and simply count. This function - * is done once at boot up, so it is not that critical in - * performance. 
- */ +int ftrace_profile_pages_init(void) +{ + struct ftrace_profile_page *pg; + int i; - size--; - size /= sizeof(struct hlist_head); + /* If we already allocated, do nothing */ + if (profile_pages) + return 0; - for (; size; size >>= 1) - ftrace_profile_bits++; + profile_pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!profile_pages) + return -ENOMEM; - pr_info("Function profiler has %d hash buckets\n", - 1 << ftrace_profile_bits); + pg = profile_pages_start = profile_pages; - return; + /* allocate 10 more pages to start */ + for (i = 0; i < 10; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + /* + * We only care about allocating profile_pages, if + * we failed to allocate here, hopefully we will allocate + * later. + */ + if (!pg->next) + break; + pg = pg->next; + } + + return 0; } -static ssize_t -ftrace_profile_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int ftrace_profile_init(void) { - char buf[64]; - int r; + int size; - r = sprintf(buf, "%u\n", ftrace_profile_enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} + if (ftrace_profile_hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(); + return 0; + } -static void ftrace_profile_reset(void) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. + */ + size = FTRACE_PROFILE_HASH_SIZE; - do_for_each_ftrace_rec(pg, rec) { - rec->counter = 0; - } while_for_each_ftrace_rec(); + ftrace_profile_hash = + kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + + if (!ftrace_profile_hash) + return -ENOMEM; + + size--; + + for (; size; size >>= 1) + ftrace_profile_bits++; + + /* Preallocate a few pages */ + if (ftrace_profile_pages_init() < 0) { + kfree(ftrace_profile_hash); + ftrace_profile_hash = NULL; + return -ENOMEM; + } + + return 0; } -static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) +/* interrupts must be disabled */ +static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) { - struct dyn_ftrace *rec; + struct ftrace_profile *rec; struct hlist_head *hhd; struct hlist_node *n; - unsigned long flags; unsigned long key; - if (!ftrace_profile_hash) - return NULL; - key = hash_long(ip, ftrace_profile_bits); hhd = &ftrace_profile_hash[key]; if (hlist_empty(hhd)) return NULL; - local_irq_save(flags); hlist_for_each_entry_rcu(rec, n, hhd, node) { if (rec->ip == ip) - goto out; + return rec; + } + + return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +/* Interrupts must be disabled calling this */ +static struct ftrace_profile * +ftrace_profile_alloc(unsigned long ip, bool alloc_safe) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion */ + if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1) + goto out; + + __raw_spin_lock(&ftrace_profile_rec_lock); + + /* Try to always keep another page available */ + if (!profile_pages->next && alloc_safe) + profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC); + + /* + * Try to find the function again since another + * task on another CPU could have added it + */ + rec = ftrace_find_profiled_func(ip); + if (rec) + goto out_unlock; + + if (profile_pages->index == PROFILES_PER_PAGE) { + if (!profile_pages->next) + goto out_unlock; + 
profile_pages = profile_pages->next; } - rec = NULL; + + rec = &profile_pages->records[profile_pages->index++]; + rec->ip = ip; + ftrace_add_profile(rec); + + out_unlock: + __raw_spin_unlock(&ftrace_profile_rec_lock); out: - local_irq_restore(flags); + atomic_dec(&__get_cpu_var(ftrace_profile_disable)); return rec; } +/* + * If we are not in an interrupt, or softirq and + * and interrupts are disabled and preemption is not enabled + * (not in a spinlock) then it should be safe to allocate memory. + */ +static bool ftrace_safe_to_allocate(void) +{ + return !in_interrupt() && irqs_disabled() && !preempt_count(); +} + static void function_profile_call(unsigned long ip, unsigned long parent_ip) { - struct dyn_ftrace *rec; + struct ftrace_profile *rec; unsigned long flags; + bool alloc_safe; if (!ftrace_profile_enabled) return; + alloc_safe = ftrace_safe_to_allocate(); + local_irq_save(flags); rec = ftrace_find_profiled_func(ip); - if (!rec) - goto out; + if (!rec) { + rec = ftrace_profile_alloc(ip, alloc_safe); + if (!rec) + goto out; + } rec->counter++; out: @@ -515,11 +547,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, char buf[64]; int ret; - if (!ftrace_profile_hash) { - pr_info("Can not enable hash due to earlier problems\n"); - return -ENODEV; - } - if (cnt >= sizeof(buf)) return -EINVAL; @@ -537,7 +564,12 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, mutex_lock(&ftrace_profile_lock); if (ftrace_profile_enabled ^ val) { if (val) { - ftrace_profile_reset(); + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + register_ftrace_function(&ftrace_profile_ops); ftrace_profile_enabled = 1; } else { @@ -545,6 +577,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, unregister_ftrace_function(&ftrace_profile_ops); } } + out: mutex_unlock(&ftrace_profile_lock); filp->f_pos += cnt; @@ -552,6 +585,17 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + static const struct file_operations ftrace_profile_fops = { .open = tracing_open_generic, .read = ftrace_profile_read, @@ -577,39 +621,80 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) "'function_profile_enabled' entry\n"); } -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ - unsigned long key; - - if (!ftrace_profile_hash) - return; - - key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); -} - -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ - mutex_lock(&ftrace_profile_lock); - hlist_del(&rec->node); - mutex_unlock(&ftrace_profile_lock); -} - #else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_init(int nr_funcs) -{ -} -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ -} static void ftrace_profile_debugfs(struct dentry *d_tracer) { } -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ -} #endif /* CONFIG_FUNCTION_PROFILER */ +/* set when tracing only a pid */ +struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; + +#ifdef CONFIG_DYNAMIC_FTRACE + +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +# error Dynamic ftrace depends on MCOUNT_RECORD +#endif + +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + 
+struct ftrace_func_probe { + struct hlist_node node; + struct ftrace_probe_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), + FTRACE_START_FUNC_RET = (1 << 5), + FTRACE_STOP_FUNC_RET = (1 << 6), +}; + +static int ftrace_filtered; + +static struct dyn_ftrace *ftrace_new_addrs; + +static DEFINE_MUTEX(ftrace_regex_lock); + +struct ftrace_page { + struct ftrace_page *next; + int index; + struct dyn_ftrace records[]; +}; + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +static struct dyn_ftrace *ftrace_free_records; + +/* + * This is a double for. Do not use 'break' to break out of the loop, + * you must use a goto. + */ +#define do_for_each_ftrace_rec(pg, rec) \ + for (pg = ftrace_pages_start; pg; pg = pg->next) { \ + int _____i; \ + for (_____i = 0; _____i < pg->index; _____i++) { \ + rec = &pg->records[_____i]; + +#define while_for_each_ftrace_rec() \ + } \ + } + #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -660,10 +745,8 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) { + !(rec->flags & FTRACE_FL_FREE)) ftrace_free_rec(rec); - ftrace_profile_release(rec); - } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -717,8 +800,6 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; - ftrace_add_profile(rec); - return rec; } @@ -2462,8 +2543,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - ftrace_profile_debugfs(d_tracer); - return 0; } @@ -2532,8 +2611,6 @@ void __init ftrace_init(void) if (ret) goto failed; - ftrace_profile_init(count); - last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, @@ -2734,6 +2811,9 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_pid' entry\n"); + + ftrace_profile_debugfs(d_tracer); + return 0; } fs_initcall(ftrace_init_debugfs); -- cgit v1.2.3-70-g09d2 From a2a16d6a3156ef7309ca7328a20c35df9418e670 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 23:17:58 -0400 Subject: function-graph: add option to calculate graph time or not Graph time is the time that a function spends executing the functions it calls. Thus if function A calls B and graph-time is set, then the time reported for A includes the time spent in B. This is the default behavior. But if graph-time is off, then the time spent executing B is subtracted from A's time.
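A worked example with made-up numbers: suppose A runs 2us of its own code, calls B which runs for 3us, then runs 1us more before returning.

	graph-time on (default):  time(A) = 2 + 3 + 1 = 6us   time(B) = 3us
	graph-time off:           time(A) = 2 + 1     = 3us   time(B) = 3us

The new ret_stack[].subtime field below implements the subtraction: when B returns, its call time is added to its parent's subtime, and when A returns, its accumulated subtime is subtracted from its own call time.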
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 21 ++++++++++++++++++++- kernel/trace/trace.c | 4 +++- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 8 ++++---- 5 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015a3d22cf7..9e0a8d245e5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -365,6 +365,7 @@ struct ftrace_ret_stack { unsigned long ret; unsigned long func; unsigned long long calltime; + unsigned long long subtime; }; /* @@ -376,8 +377,6 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); -extern void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); /* * Sometimes we don't want to trace a function with the function diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ed1fc5021d4..71e5faef12a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -604,6 +604,7 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) static void profile_graph_return(struct ftrace_graph_ret *trace) { struct ftrace_profile_stat *stat; + unsigned long long calltime; struct ftrace_profile *rec; unsigned long flags; @@ -612,9 +613,27 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!stat->hash) goto out; + calltime = trace->rettime - trace->calltime; + + if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + int index; + + index = trace->depth; + + /* Append this call time to the parent time to subtract */ + if (index) + current->ret_stack[index - 1].subtime += calltime; + + if (current->ret_stack[index].subtime < calltime) + calltime -= current->ret_stack[index].subtime; + else + calltime = 0; + } + rec = ftrace_find_profiled_func(stat, trace->func); if (rec) - rec->time += trace->rettime - trace->calltime; + rec->time += calltime; + out: local_irq_restore(flags); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 821bf49771d..5d1a16cae37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -255,7 +255,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | + TRACE_ITER_GRAPH_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -317,6 +318,7 @@ static const char *trace_options[] = { "latency-format", "global-clock", "sleep-time", + "graph-time", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c66ca3b6605..e3429a8ab05 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -685,6 +685,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, TRACE_ITER_SLEEP_TIME = 0x100000, + TRACE_ITER_GRAPH_TIME = 0x200000, }; /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 85bba0f018b..10f6ad7d85f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; + current->ret_stack[index].subtime = 0; *depth = index; return 0; } /* 
Retrieve a function return address to the trace stack on thread info.*/ -void +static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; @@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index; - barrier(); - current->curr_ret_stack--; - } /* @@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void) ftrace_pop_return_trace(&trace, &ret); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); + barrier(); + current->curr_ret_stack--; if (unlikely(!ret)) { ftrace_graph_stop(); -- cgit v1.2.3-70-g09d2 From 15dbf27cc18559a14e99609f78678aa86b9c6ff1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:32 +0100 Subject: perf_counter: software counter event infrastructure Provide generic software counter infrastructure that supports software events. This will be used to allow sample based profiling based on software events such as pagefaults. The current infrastructure can only provide a count of such events, no place information. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +- kernel/perf_counter.c | 201 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index dde564517b6..3fefc3b8150 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -126,6 +126,7 @@ struct hw_perf_counter { unsigned long counter_base; int nmi; unsigned int idx; + atomic64_t count; /* software */ atomic64_t prev_count; u64 irq_period; atomic64_t period_left; @@ -283,6 +284,8 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } +extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -295,10 +298,13 @@ static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } -static inline void hw_perf_restore(u64 ctrl) { } +static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } + +static inline void perf_swcounter_event(enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0fe22c916e2..eeb1b46cf70 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1328,6 +1328,185 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_ioctl, }; +/* + * Generic software counter infrastructure + */ + +static void perf_swcounter_update(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 prev, now; + s64 delta; + +again: + prev = atomic64_read(&hwc->prev_count); + now = atomic64_read(&hwc->count); + if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + + atomic64_add(delta, &counter->count); + 
atomic64_sub(delta, &hwc->period_left); +} + +static void perf_swcounter_set_period(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->irq_period; + + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_add(period, &hwc->period_left); + } + + atomic64_set(&hwc->prev_count, -left); + atomic64_set(&hwc->count, -left); +} + +static void perf_swcounter_save_and_restart(struct perf_counter *counter) +{ + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); +} + +static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_swcounter_handle_group(struct perf_counter *sibling) +{ + struct perf_counter *counter, *group_leader = sibling->group_leader; + + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + perf_swcounter_update(counter); + perf_swcounter_store_irq(sibling, counter->hw_event.type); + perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); + } +} + +static void perf_swcounter_interrupt(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_swcounter_save_and_restart(counter); + + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + break; + + case PERF_RECORD_IRQ: + perf_swcounter_store_irq(counter, instruction_pointer(regs)); + break; + + case PERF_RECORD_GROUP: + perf_swcounter_handle_group(counter); + break; + } + + if (nmi) { + counter->wakeup_pending = 1; + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + } else + wake_up(&counter->waitq); +} + +static int perf_swcounter_match(struct perf_counter *counter, + enum hw_event_types event, + struct pt_regs *regs) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return 0; + + if (counter->hw_event.raw) + return 0; + + if (counter->hw_event.type != event) + return 0; + + if (counter->hw_event.exclude_user && user_mode(regs)) + return 0; + + if (counter->hw_event.exclude_kernel && !user_mode(regs)) + return 0; + + return 1; +} + +static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, + enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) +{ + struct perf_counter *counter; + unsigned long flags; + int neg; + + if (list_empty(&ctx->counter_list)) + return; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * XXX: make counter_list RCU safe + */ + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (perf_swcounter_match(counter, event, regs)) { + neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) + perf_swcounter_interrupt(counter, nmi, regs); + } + } + + spin_unlock_irqrestore(&ctx->lock, flags); +} + +void perf_swcounter_event(enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + + perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); + if (cpuctx->task_ctx) + perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + + put_cpu_var(perf_cpu_context); +} + +static void perf_swcounter_read(struct perf_counter *counter) +{ + perf_swcounter_update(counter); +} + +static int 
perf_swcounter_enable(struct perf_counter *counter) +{ + perf_swcounter_set_period(counter); + return 0; +} + +static void perf_swcounter_disable(struct perf_counter *counter) +{ + perf_swcounter_update(counter); +} + +/* + * Software counter: cpu wall time clock + */ + static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); @@ -1364,6 +1543,10 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; +/* + * Software counter: task time clock + */ + /* * Called from within the scheduler: */ @@ -1420,6 +1603,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; +/* + * Software counter: page faults + */ + #ifdef CONFIG_VM_EVENT_COUNTERS #define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] #else @@ -1473,6 +1660,10 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = { .read = page_faults_perf_counter_read, }; +/* + * Software counter: context switches + */ + static u64 get_context_switches(struct perf_counter *counter) { struct task_struct *curr = counter->ctx->task; @@ -1521,6 +1712,10 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .read = context_switches_perf_counter_read, }; +/* + * Software counter: cpu migrations + */ + static inline u64 get_cpu_migrations(struct perf_counter *counter) { struct task_struct *curr = counter->ctx->task; @@ -1572,7 +1767,9 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; const struct hw_perf_counter_ops *hw_ops = NULL; + struct hw_perf_counter *hwc = &counter->hw; /* * Software counters (currently) can't in general distinguish @@ -1618,6 +1815,10 @@ sw_perf_counter_init(struct perf_counter *counter) default: break; } + + if (hw_ops) + hwc->irq_period = hw_event->irq_period; + return hw_ops; } -- cgit v1.2.3-70-g09d2 From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:34 +0100 Subject: perf_counter: provide major/minor page fault software events Provide separate sw counters for major and minor page faults. 
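The per-architecture wiring is deliberately small: each fault handler reports the event at the same place where it already bumps the task's fault counters. The generic shape of the hook, condensed from the x86 hunk in the diff below, is:

    if (fault & VM_FAULT_MAJOR) {
            tsk->maj_flt++;
            perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
    } else {
            tsk->min_flt++;
            perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
    }

The nmi argument is 0 here: fault handlers run in normal process context, so the counter wakeup can be performed directly rather than deferred.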
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 5 ++++- arch/x86/mm/fault.c | 7 +++++-- include/linux/perf_counter.h | 4 +++- kernel/perf_counter.c | 22 +++++++++------------- 4 files changed, 21 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index eda5b0ca4af..17bbf6f91fb 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -312,6 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -319,8 +320,10 @@ good_area: preempt_enable(); } #endif - } else + } else { current->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } up_read(&mm->mmap_sem); return 0; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c8725752b6c..f2d3324d921 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1140,10 +1140,13 @@ good_area: return; } - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + } else { tsk->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3fefc3b8150..4b14a8e9dbf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -49,8 +49,10 @@ enum hw_event_types { PERF_COUNT_PAGE_FAULTS = -3, PERF_COUNT_CONTEXT_SWITCHES = -4, PERF_COUNT_CPU_MIGRATIONS = -5, + PERF_COUNT_PAGE_FAULTS_MIN = -6, + PERF_COUNT_PAGE_FAULTS_MAJ = -7, - PERF_SW_EVENTS_MIN = -6, + PERF_SW_EVENTS_MIN = -8, }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1773c5d7427..68950a3a52b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1503,6 +1503,12 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } +static const struct hw_perf_counter_ops perf_ops_generic = { + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, +}; + /* * Software counter: cpu wall time clock */ @@ -1603,16 +1609,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: page faults - */ - -static const struct hw_perf_counter_ops perf_ops_page_faults = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, -}; - /* * Software counter: context switches */ @@ -1753,9 +1749,9 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel)) - hw_ops = &perf_ops_page_faults; + case PERF_COUNT_PAGE_FAULTS_MIN: + case PERF_COUNT_PAGE_FAULTS_MAJ: + hw_ops = &perf_ops_generic; break; case PERF_COUNT_CONTEXT_SWITCHES: if (!counter->hw_event.exclude_kernel) -- cgit v1.2.3-70-g09d2 From d6d020e9957745c61285ef3da9f294c5e6801f0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:35 +0100 Subject: perf_counter: hrtimer based sampling for software time events Use hrtimers to provide timer-based sampling for the software time counters. This allows platforms without hardware counter support to still perform sample-based profiling.
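The sampling engine is a self-rearming hrtimer: the callback reads the counter through its own read op, emits a sample against the interrupted registers, then forwards the timer by one period. A condensed sketch of the handler (the full version in the diff below additionally honours exclude_kernel when picking which registers to sample):

    static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
    {
            struct perf_counter *counter;
            struct pt_regs *regs;

            counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
            counter->hw_ops->read(counter);

            regs = get_irq_regs();
            if (!regs)
                    regs = task_pt_regs(current);   /* fall back to user regs */

            perf_swcounter_interrupt(counter, 0, regs);

            /* re-arm for the next period */
            hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));

            return HRTIMER_RESTART;
    }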
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 20 ++++--- kernel/perf_counter.c | 123 ++++++++++++++++++++++++++++++------------- 2 files changed, 100 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4b14a8e9dbf..dfb4c7ce18b 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -114,6 +114,7 @@ struct perf_counter_hw_event { #include #include #include +#include #include struct task_struct; @@ -123,12 +124,19 @@ struct task_struct; */ struct hw_perf_counter { #ifdef CONFIG_PERF_COUNTERS - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - atomic64_t count; /* software */ + union { + struct { /* hardware */ + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + }; + union { /* software */ + atomic64_t count; + struct hrtimer hrtimer; + }; + }; atomic64_t prev_count; u64 irq_period; atomic64_t period_left; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 68950a3a52b..f9330d5827c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1395,7 +1395,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) struct perf_counter *counter, *group_leader = sibling->group_leader; list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - perf_swcounter_update(counter); + counter->hw_ops->read(counter); perf_swcounter_store_irq(sibling, counter->hw_event.type); perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); } @@ -1404,8 +1404,6 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) static void perf_swcounter_interrupt(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - perf_swcounter_save_and_restart(counter); - switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: break; @@ -1426,6 +1424,38 @@ static void perf_swcounter_interrupt(struct perf_counter *counter, wake_up(&counter->waitq); } +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + struct perf_counter *counter; + struct pt_regs *regs; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->hw_ops->read(counter); + + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. 
+ */ + if ((counter->hw_event.exclude_kernel || !regs) && + !counter->hw_event.exclude_user) + regs = task_pt_regs(current); + + if (regs) + perf_swcounter_interrupt(counter, 0, regs); + + hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + + return HRTIMER_RESTART; +} + +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_swcounter_save_and_restart(counter); + perf_swcounter_interrupt(counter, nmi, regs); +} + static int perf_swcounter_match(struct perf_counter *counter, enum hw_event_types event, struct pt_regs *regs) @@ -1448,13 +1478,20 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct pt_regs *regs) +{ + int neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) + perf_swcounter_overflow(counter, nmi, regs); +} + static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum hw_event_types event, u64 nr, int nmi, struct pt_regs *regs) { struct perf_counter *counter; unsigned long flags; - int neg; if (list_empty(&ctx->counter_list)) return; @@ -1465,11 +1502,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, * XXX: make counter_list RCU safe */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (perf_swcounter_match(counter, event, regs)) { - neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) - perf_swcounter_interrupt(counter, nmi, regs); - } + if (perf_swcounter_match(counter, event, regs)) + perf_swcounter_add(counter, nr, nmi, regs); } spin_unlock_irqrestore(&ctx->lock, flags); @@ -1513,14 +1547,6 @@ static const struct hw_perf_counter_ops perf_ops_generic = { * Software counter: cpu wall time clock */ -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - - atomic64_set(&counter->hw.prev_count, cpu_clock(cpu)); - return 0; -} - static void cpu_clock_perf_counter_update(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); @@ -1533,8 +1559,26 @@ static void cpu_clock_perf_counter_update(struct perf_counter *counter) atomic64_add(now - prev, &counter->count); } +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + if (hwc->irq_period) { + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { + hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -1580,27 +1624,33 @@ static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now atomic64_add(delta, &counter->count); } -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 now = task_clock_perf_counter_val(counter, 1); - - task_clock_perf_counter_update(counter, now); -} - static int task_clock_perf_counter_enable(struct perf_counter *counter) { - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - task_clock_perf_counter_val(counter, 0)); + struct hw_perf_counter *hwc = &counter->hw; + + atomic64_set(&hwc->prev_count, 
task_clock_perf_counter_val(counter, 0)); + if (hwc->irq_period) { + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 0); + hrtimer_cancel(&counter->hw.hrtimer); + task_clock_perf_counter_update(counter, + task_clock_perf_counter_val(counter, 0)); +} - task_clock_perf_counter_update(counter, now); +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ + task_clock_perf_counter_update(counter, + task_clock_perf_counter_val(counter, 1)); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1729,16 +1779,12 @@ sw_perf_counter_init(struct perf_counter *counter) */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv)) - hw_ops = &perf_ops_cpu_clock; + hw_ops = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: - if (counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv) - break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -1747,6 +1793,9 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_task_clock; else hw_ops = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: -- cgit v1.2.3-70-g09d2 From 592903cdcbf606a838056bae6d03fc557806c914 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:36 +0100 Subject: perf_counter: add an event_list I noticed that the counter_list only includes top-level counters, thus perf_swcounter_event() will miss sw-counters in groups. Since perf_swcounter_event() also wants an RCU safe list, create a new event_list that includes all counters and uses RCU list ops and use call_rcu to free the counter structure. 
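The resulting scheme is the classic RCU list recipe: mutation still happens under the context lock, traversal becomes lock-free, and the final kfree() is deferred past any concurrent readers. In outline, as the hunks below implement it:

    /* writer side: attach/detach, still under ctx->lock */
    list_add_rcu(&counter->event_entry, &ctx->event_list);
    /* ... later ... */
    list_del_rcu(&counter->event_entry);

    /* reader side: perf_swcounter_ctx_event() */
    rcu_read_lock();
    list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
            if (perf_swcounter_match(counter, event, regs))
                    perf_swcounter_add(counter, nr, nmi, regs);
    }
    rcu_read_unlock();

    /* free only after a grace period has elapsed */
    call_rcu(&counter->rcu_head, free_counter_rcu);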
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++++ kernel/perf_counter.c | 30 +++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index dfb4c7ce18b..08c11a6afeb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -187,6 +187,7 @@ struct file; struct perf_counter { #ifdef CONFIG_PERF_COUNTERS struct list_head list_entry; + struct list_head event_entry; struct list_head sibling_list; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; @@ -220,6 +221,8 @@ struct perf_counter { struct perf_data *irqdata; struct perf_data *usrdata; struct perf_data data[2]; + + struct rcu_head rcu_head; #endif }; @@ -243,6 +246,7 @@ struct perf_counter_context { struct mutex mutex; struct list_head counter_list; + struct list_head event_list; int nr_counters; int nr_active; int is_active; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f9330d5827c..8d6ecfa64c0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Each CPU has a list of per CPU counters: @@ -72,6 +73,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_tail(&counter->list_entry, &ctx->counter_list); else list_add_tail(&counter->list_entry, &group_leader->sibling_list); + + list_add_rcu(&counter->event_entry, &ctx->event_list); } static void @@ -80,6 +83,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) struct perf_counter *sibling, *tmp; list_del_init(&counter->list_entry); + list_del_rcu(&counter->event_entry); /* * If this was a group counter with sibling counters then @@ -1133,6 +1137,14 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ctx; } +static void free_counter_rcu(struct rcu_head *head) +{ + struct perf_counter *counter; + + counter = container_of(head, struct perf_counter, rcu_head); + kfree(counter); +} + /* * Called when the last reference to the file is gone. 
*/ @@ -1151,7 +1163,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - kfree(counter); + call_rcu(&counter->rcu_head, free_counter_rcu); put_context(ctx); return 0; @@ -1491,22 +1503,16 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, int nmi, struct pt_regs *regs) { struct perf_counter *counter; - unsigned long flags; - if (list_empty(&ctx->counter_list)) + if (list_empty(&ctx->event_list)) return; - spin_lock_irqsave(&ctx->lock, flags); - - /* - * XXX: make counter_list RCU safe - */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, event, regs)) perf_swcounter_add(counter, nr, nmi, regs); } - - spin_unlock_irqrestore(&ctx->lock, flags); + rcu_read_unlock(); } void perf_swcounter_event(enum hw_event_types event, u64 nr, @@ -1846,6 +1852,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mutex); INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->event_entry); INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); @@ -1992,6 +1999,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); ctx->task = task; } -- cgit v1.2.3-70-g09d2 From 01ef09d9ffb5ce9f8d62d1e5206da3d5ca612acc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:11 +0100 Subject: perf_counter: fix uninitialized usage of event_list Impact: fix boot crash When doing the generic context switch event I ran into some early boot hangs, which were caused by infinite function recursion (event, fault, event, fault). I eventually tracked it down to event_list not being initialized at the time of the first event. Fix this. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.195392657@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 2 ++ kernel/perf_counter.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 219748d0026..ca226a91abe 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -124,6 +124,8 @@ extern struct cred init_cred; # define INIT_PERF_COUNTERS(tsk) \ .perf_counter_ctx.counter_list = \ LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ + .perf_counter_ctx.event_list = \ + LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ .perf_counter_ctx.lock = \ __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), #else diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b39456ad74a..4c4e9eb37ab 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1506,7 +1506,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, { struct perf_counter *counter; - if (list_empty(&ctx->event_list)) + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); -- cgit v1.2.3-70-g09d2 From 4a0deca657f3dbb8a707b5dc8f173beec01e7ed2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:12 +0100 Subject: perf_counter: generic context switch event Impact: cleanup Use the generic software events for context switches.
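With the generic software-counter machinery in place, the dedicated context-switch counter ops can be deleted wholesale; the scheduler hook reduces to a single event call, taken from the perf_counter_task_sched_out() hunk below:

    regs = task_pt_regs(task);
    perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);

Note the nmi argument is 1 here: that routes the wakeup through the deferred pending-flag path instead of calling wake_up() from inside the scheduler.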
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.283522645@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/perf_counter.c | 60 ++++----------------------------------------------- kernel/sched.c | 6 ------ 3 files changed, 4 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 75b2fc5306d..7ed41f7c5ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -138,7 +138,6 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern u64 cpu_nr_switches(int cpu); extern u64 cpu_nr_migrations(int cpu); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c4e9eb37ab..99d5930f0a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -710,10 +710,13 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct pt_regs *regs; if (likely(!cpuctx->task_ctx)) return; + regs = task_pt_regs(task); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -1667,58 +1670,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: context switches - */ - -static u64 get_context_switches(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->nvcsw + curr->nivcsw; - return cpu_nr_switches(smp_processor_id()); -} - -static void context_switches_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_context_switches(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void context_switches_perf_counter_read(struct perf_counter *counter) -{ - context_switches_perf_counter_update(counter); -} - -static int context_switches_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_context_switches(counter)); - return 0; -} - -static void context_switches_perf_counter_disable(struct perf_counter *counter) -{ - context_switches_perf_counter_update(counter); -} - -static const struct hw_perf_counter_ops perf_ops_context_switches = { - .enable = context_switches_perf_counter_enable, - .disable = context_switches_perf_counter_disable, - .read = context_switches_perf_counter_read, -}; - /* * Software counter: cpu migrations */ @@ -1808,11 +1759,8 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: - hw_ops = &perf_ops_generic; - break; case PERF_COUNT_CONTEXT_SWITCHES: - if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_context_switches; + hw_ops = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) diff --git a/kernel/sched.c b/kernel/sched.c index 39e70860216..f76e3c0188a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2900,14 +2900,8 @@ unsigned long nr_active(void) /* * Externally visible per-cpu 
scheduler statistics: - * cpu_nr_switches(cpu) - number of context switches on that cpu * cpu_nr_migrations(cpu) - number of migrations into that cpu */ -u64 cpu_nr_switches(int cpu) -{ - return cpu_rq(cpu)->nr_switches; -} - u64 cpu_nr_migrations(int cpu) { return cpu_rq(cpu)->nr_migrations_in; -- cgit v1.2.3-70-g09d2 From e077df4f439681e43f0db8255b2d215b342ebdc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:17 +0100 Subject: perf_counter: hook up the tracepoint events Impact: new perfcounters feature Enable usage of tracepoints as perf counter events. tracepoint event ids can be found in /debug/tracing/event/*/*/id and (for now) are represented as -65536+id in the type field. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.744044174@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ init/Kconfig | 5 +++++ kernel/perf_counter.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 08c11a6afeb..065984c1ff5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -53,6 +53,8 @@ enum hw_event_types { PERF_COUNT_PAGE_FAULTS_MAJ = -7, PERF_SW_EVENTS_MIN = -8, + + PERF_TP_EVENTS_MIN = -65536 }; /* @@ -222,6 +224,7 @@ struct perf_counter { struct perf_data *usrdata; struct perf_data data[2]; + void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; #endif }; diff --git a/init/Kconfig b/init/Kconfig index 38a2ecd47c3..4f647142f2e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -947,6 +947,11 @@ config PERF_COUNTERS Say Y if unsure. +config EVENT_PROFILE + bool "Tracepoint profile sources" + depends on PERF_COUNTERS && EVENT_TRACER + default y + endmenu config VM_EVENT_COUNTERS diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 97f891ffeb4..0bbe3e45ba0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1152,6 +1152,9 @@ static void free_counter_rcu(struct rcu_head *head) static void free_counter(struct perf_counter *counter) { + if (counter->destroy) + counter->destroy(counter); + call_rcu(&counter->rcu_head, free_counter_rcu); } @@ -1727,6 +1730,45 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { .read = cpu_migrations_perf_counter_read, }; +#ifdef CONFIG_EVENT_PROFILE +void perf_tpcounter_event(int event_id) +{ + perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1, + task_pt_regs(current)); +} + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_counter_destroy(struct perf_counter *counter) +{ + int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + + ftrace_profile_disable(event_id); +} + +static const struct hw_perf_counter_ops * +tp_perf_counter_init(struct perf_counter *counter) +{ + int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + int ret; + + ret = ftrace_profile_enable(event_id); + if (ret) + return NULL; + + counter->destroy = tp_perf_counter_destroy; + + return &perf_ops_generic; +} +#else +static const struct hw_perf_counter_ops * +tp_perf_counter_init(struct perf_counter *counter) +{ + return NULL; +} +#endif + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -1772,6 +1814,7 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_migrations; break; default: + hw_ops = tp_perf_counter_init(counter); break; } 
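Under this interim encoding, userspace reads the numeric id out of debugfs and biases it by PERF_TP_EVENTS_MIN before passing it as hw_event.type; tp_perf_counter_init() then reverses the bias before calling ftrace_profile_enable(). A hypothetical user-side helper (not part of the patch) makes the mapping explicit:

    /* illustrative only: map a debugfs tracepoint id to hw_event.type */
    static inline __s64 tp_event_to_type(int tracepoint_id)
    {
            return PERF_TP_EVENTS_MIN + tracepoint_id;      /* -65536 + id */
    }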
-- cgit v1.2.3-70-g09d2 From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 4 +- arch/x86/kernel/cpu/perf_counter.c | 10 ++-- include/linux/perf_counter.h | 95 ++++++++++++++++++++++++-------------- kernel/perf_counter.c | 83 ++++++++++++++++++++------------- 4 files changed, 117 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5008762e8bf..26f69dc7130 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,7 +602,7 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - ev = counter->hw_event.type; + ev = counter->hw_event.event_id; if (!counter->hw_event.raw) { if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -692,7 +692,7 @@ static void perf_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.type); + perf_store_irq_data(counter, sub->hw_event.event_config); perf_store_irq_data(counter, atomic64_read(&sub->count)); } } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cba9d47b71..d844ae41d5a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw) { - hwc->config |= pmc_ops->raw_event(hw_event->type); + if (hw_event->raw_type) { + hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); } else { - if (hw_event->type >= pmc_ops->max_events) + if (hw_event->event_id >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->type); + hwc->config |= pmc_ops->event_map(hw_event->event_id); } counter->wakeup_pending = 0; @@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, counter->hw_event.event_config); perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 065984c1ff5..8f939490550 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -21,56 +21,81 @@ */ /* - * Generalized performance counter event types, used by the hw_event.type - * parameter of the sys_perf_counter_open() syscall: + * hw_event.type */ -enum hw_event_types { +enum perf_event_types { + PERF_TYPE_HARDWARE = 0, 
+ PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + /* - * Common hardware events, generalized by the kernel: + * available TYPE space, raw is the max value. */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 7, + PERF_TYPE_RAW = 128, +}; +/* + * Generalized performance counter event types, used by the hw_event.event_id + * parameter of the sys_perf_counter_open() syscall: + */ +enum hw_event_ids { /* - * Special "software" counters provided by the kernel, even if - * the hardware does not support performance counters. These - * counters measure various physical and sw events of the - * kernel (and allow the profiling of them as well): + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CLOCK = -1, - PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, - PERF_COUNT_CPU_MIGRATIONS = -5, - PERF_COUNT_PAGE_FAULTS_MIN = -6, - PERF_COUNT_PAGE_FAULTS_MAJ = -7, - - PERF_SW_EVENTS_MIN = -8, + PERF_COUNT_CPU_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, + + PERF_HW_EVENTS_MAX = 7, +}; - PERF_TP_EVENTS_MIN = -65536 +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum sw_event_ids { + PERF_COUNT_CPU_CLOCK = 0, + PERF_COUNT_TASK_CLOCK = 1, + PERF_COUNT_PAGE_FAULTS = 2, + PERF_COUNT_CONTEXT_SWITCHES = 3, + PERF_COUNT_CPU_MIGRATIONS = 4, + PERF_COUNT_PAGE_FAULTS_MIN = 5, + PERF_COUNT_PAGE_FAULTS_MAJ = 6, + + PERF_SW_EVENTS_MAX = 7, }; /* * IRQ-notification data record type: */ enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - __s64 type; + union { + struct { + __u64 event_id : 56, + type : 8; + }; + struct { + __u64 raw_event_id : 63, + raw_type : 1; + }; + __u64 event_config; + }; __u64 irq_period; __u64 record_type; @@ -78,7 +103,6 @@ struct perf_counter_hw_event { __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ @@ -87,7 +111,7 @@ struct perf_counter_hw_event { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 54; + __reserved_1 : 55; __u32 extra_config_len; __u32 __reserved_4; @@ -298,10 +322,11 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw && counter->hw_event.type < 0; + return !counter->hw_event.raw_type && + counter->hw_event.type != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); #else static inline void @@ -320,7 +345,7 @@ static inline u64 hw_perf_save_disable(void) { 
return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(enum hw_event_types event, u64 nr, +static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0bbe3e45ba0..68a56a68bc7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1395,12 +1395,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_save_and_restart(struct perf_counter *counter) -{ - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); -} - static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) { struct perf_data *irqdata = counter->irqdata; @@ -1421,7 +1415,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.type); + perf_swcounter_store_irq(sibling, counter->hw_event.event_config); perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); } } @@ -1477,21 +1471,25 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - perf_swcounter_save_and_restart(counter); + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); perf_swcounter_interrupt(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, - enum hw_event_types event, - struct pt_regs *regs) + enum perf_event_types type, + u32 event, struct pt_regs *regs) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw) + if (counter->hw_event.raw_type) + return 0; + + if (counter->hw_event.type != type) return 0; - if (counter->hw_event.type != event) + if (counter->hw_event.event_id != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1512,8 +1510,8 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) + enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_counter *counter; @@ -1522,24 +1520,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, event, regs)) + if (perf_swcounter_match(counter, type, event, regs)) perf_swcounter_add(counter, nr, nmi, regs); } rcu_read_unlock(); } -void perf_swcounter_event(enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) +static void __perf_swcounter_event(enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); - if (cpuctx->task_ctx) - perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + if (cpuctx->task_ctx) { + perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, + nr, nmi, regs); + } put_cpu_var(perf_cpu_context); } +void perf_swcounter_event(u32 event, u64 nr, int 
nmi, struct pt_regs *regs) +{ + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); +} + static void perf_swcounter_read(struct perf_counter *counter) { perf_swcounter_update(counter); @@ -1733,8 +1738,12 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { - perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1, - task_pt_regs(current)); + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); } extern int ftrace_profile_enable(int); @@ -1742,15 +1751,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; - - ftrace_profile_disable(event_id); + ftrace_profile_disable(counter->hw_event.event_id); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + int event_id = counter->hw_event.event_id; int ret; ret = ftrace_profile_enable(event_id); @@ -1758,6 +1765,7 @@ tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; + counter->hw.irq_period = counter->hw_event.irq_period; return &perf_ops_generic; } @@ -1783,7 +1791,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->hw_event.type) { + switch (counter->hw_event.event_id) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1813,9 +1821,6 @@ sw_perf_counter_init(struct perf_counter *counter) if (!counter->hw_event.exclude_kernel) hw_ops = &perf_ops_cpu_migrations; break; - default: - hw_ops = tp_perf_counter_init(counter); - break; } if (hw_ops) @@ -1870,10 +1875,22 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->state = PERF_COUNTER_STATE_OFF; hw_ops = NULL; - if (!hw_event->raw && hw_event->type < 0) - hw_ops = sw_perf_counter_init(counter); - else + + if (hw_event->raw_type) + hw_ops = hw_perf_counter_init(counter); + else switch (hw_event->type) { + case PERF_TYPE_HARDWARE: hw_ops = hw_perf_counter_init(counter); + break; + + case PERF_TYPE_SOFTWARE: + hw_ops = sw_perf_counter_init(counter); + break; + + case PERF_TYPE_TRACEPOINT: + hw_ops = tp_perf_counter_init(counter); + break; + } if (!hw_ops) { kfree(counter); -- cgit v1.2.3-70-g09d2 From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. 
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 51 +----------------- arch/x86/kernel/cpu/perf_counter.c | 53 +------------------ include/linux/perf_counter.h | 2 + kernel/perf_counter.c | 106 ++++++++++++++++++++----------------- 4 files changed, 61 insertions(+), 151 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 26f69dc7130..88b72eb4af1 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -662,41 +662,6 @@ void perf_counter_do_pending(void) } } -/* - * Record data for an irq counter. - * This function was lifted from the x86 code; maybe it should - * go in the core? - */ -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -/* - * Record all the values of the counters in a group - */ -static void perf_handle_group(struct perf_counter *counter) -{ - struct perf_counter *leader, *sub; - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.event_config); - perf_store_irq_data(counter, atomic64_read(&sub->count)); - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -736,20 +701,8 @@ static void record_and_restart(struct perf_counter *counter, long val, /* * Finally record data if requested. */ - if (record) { - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - counter->wakeup_pending = 1; - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter); - counter->wakeup_pending = 1; - break; - } - } + if (record) + perf_counter_output(counter, 1, regs); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d844ae41d5a..902282d68b0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); } -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - /* * Save and restart an expired counter. 
Called by NMI contexts, * so it has to be careful about preempting normal counter ops: @@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter) __pmc_generic_enable(counter, hwc, idx); } -static void -perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - /* - * Store sibling timestamps (if any): - */ - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.event_config); - perf_store_irq_data(sibling, atomic64_read(&counter->count)); - } -} - /* * Maximum interrupt frequency of 100KHz per CPU */ @@ -754,28 +724,7 @@ again: continue; perf_save_and_restart(counter); - - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - continue; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter, &status, &ack); - break; - } - /* - * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these generic as - * wakeup_pending and initate a wakeup callback: - */ - if (nmi) { - counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); - } else { - wake_up(&counter->waitq); - } + perf_counter_output(counter, nmi, regs); } hw_perf_ack_status(ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8f939490550..a4b76c0175f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -317,6 +317,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +extern void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 68a56a68bc7..f054b8c9bf9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1353,6 +1353,60 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_ioctl, }; +/* + * Output + */ + +static void perf_counter_store_irq(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_counter_handle_group(struct perf_counter *counter) +{ + struct perf_counter *leader, *sub; + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); + perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, atomic64_read(&sub->count)); + } +} + +void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + return; + + case PERF_RECORD_IRQ: + perf_counter_store_irq(counter, instruction_pointer(regs)); + break; + + case PERF_RECORD_GROUP: + perf_counter_handle_group(counter); + break; + } + + if (nmi) { + counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&counter->waitq); +} + /* * Generic software counter infrastructure */ @@ -1395,54 +1449,6 @@ static void 
perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -static void perf_swcounter_handle_group(struct perf_counter *sibling) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.event_config); - perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); - } -} - -static void perf_swcounter_interrupt(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - - case PERF_RECORD_IRQ: - perf_swcounter_store_irq(counter, instruction_pointer(regs)); - break; - - case PERF_RECORD_GROUP: - perf_swcounter_handle_group(counter); - break; - } - - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); -} - static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { struct perf_counter *counter; @@ -1461,7 +1467,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) - perf_swcounter_interrupt(counter, 0, regs); + perf_counter_output(counter, 0, regs); hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); @@ -1473,7 +1479,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_swcounter_interrupt(counter, nmi, regs); + perf_counter_output(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3-70-g09d2 From 9aaa131a279834dff75c290c91f0058f62d72d46 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 21 Mar 2009 15:31:47 +1100 Subject: perf_counter: fix type/event_id layout on big-endian systems Impact: build fix for powerpc Commit db3a944aca35ae61 ("perf_counter: revamp syscall input ABI") expanded the hw_event.type field into a union of structs containing bitfields. In particular it introduced a type field and a raw_type field, with the intention that the 1-bit raw_type field should overlay the most-significant bit of the 8-bit type field, and in fact perf_counter_alloc() now assumes that (or at least, assumes that raw_type doesn't overlay any of the bits that are 1 in the values of PERF_TYPE_{HARDWARE,SOFTWARE,TRACEPOINT}). Unfortunately this is not true on big-endian systems such as PowerPC, where bitfields are laid out from left to right, i.e. from most significant bit to least significant. This means that setting hw_event.type = PERF_TYPE_SOFTWARE will set hw_event.raw_type to 1. This fixes it by making the layout depend on whether or not __BIG_ENDIAN_BITFIELD is defined. It's a bit ugly, but that's what we get for using bitfields in a user/kernel ABI. Also, that commit didn't fix up some places in arch/powerpc/kernel/ perf_counter.c where hw_event.raw and hw_event.event_id were used. This fixes them too. 
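Concretely, the union as merged declares:

    struct {
            __u64 event_id : 56,
                  type     :  8;
    };
    struct {
            __u64 raw_event_id : 63,
                  raw_type     :  1;
    };

A little-endian machine allocates bitfields from the least significant bit up, so event_id occupies bits 0-55, type bits 56-63, and raw_type coincides with the top bit of type, as intended. A big-endian machine allocates from the most significant bit down: event_id takes bits 8-63, type bits 0-7, and raw_type bit 0. Writing type = PERF_TYPE_SOFTWARE (1) therefore sets bit 0 and reads back as raw_type == 1. Swapping the declaration order under __BIG_ENDIAN_BITFIELD, as done below, restores the intended overlay.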
Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 9 +++++---- include/linux/perf_counter.h | 12 ++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 830ca9c4494..6413d9c0313 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,12 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - ev = counter->hw_event.event_id; - if (!counter->hw_event.raw) { - if (ev >= ppmu->n_generic || - ppmu->generic_events[ev] == 0) + if (!counter->hw_event.raw_type) { + ev = counter->hw_event.event_id; + if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return NULL; ev = ppmu->generic_events[ev]; + } else { + ev = counter->hw_event.raw_event_id; } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a4b76c0175f..98f5990be1e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -15,6 +15,7 @@ #include #include +#include /* * User-space ABI bits: @@ -86,6 +87,7 @@ enum perf_counter_record_type { */ struct perf_counter_hw_event { union { +#ifndef __BIG_ENDIAN_BITFIELD struct { __u64 event_id : 56, type : 8; @@ -94,6 +96,16 @@ struct perf_counter_hw_event { __u64 raw_event_id : 63, raw_type : 1; }; +#else + struct { + __u64 type : 8, + event_id : 56; + }; + struct { + __u64 raw_type : 1, + raw_event_id : 63; + }; +#endif /* __BIT_ENDIAN_BITFIELD */ __u64 event_config; }; -- cgit v1.2.3-70-g09d2 From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. 
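With the bitfields gone, encoding and decoding hw_event.config becomes plain shift-and-mask arithmetic on a single __u64. A sketch of composing the config word for a non-raw software event, using the masks introduced below:

    __u64 config;

    /* bit 63 clear (not raw), type in bits 56-62, event id in bits 0-55 */
    config = ((__u64)PERF_TYPE_SOFTWARE << PERF_COUNTER_TYPE_SHIFT) |
             ((__u64)PERF_COUNT_CPU_CLOCK & PERF_COUNTER_EVENT_MASK);

Decoding is the inverse, which is exactly what the new perf_event_raw(), perf_event_type() and perf_event_id() helpers below do.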
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 ++-- arch/x86/kernel/cpu/perf_counter.c | 8 ++--- include/linux/perf_counter.h | 74 +++++++++++++++++++++++++------------- kernel/perf_counter.c | 22 +++++++----- 4 files changed, 70 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 6413d9c0313..d05651584d4 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - if (!counter->hw_event.raw_type) { - ev = counter->hw_event.event_id; + if (!perf_event_raw(&counter->hw_event)) { + ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return NULL; ev = ppmu->generic_events[ev]; } else { - ev = counter->hw_event.raw_event_id; + ev = perf_event_config(&counter->hw_event); } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 902282d68b0..3f95b0cdc55 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw_type) { - hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); + if (perf_event_raw(hw_event)) { + hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); } else { - if (hw_event->event_id >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->event_id); + hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } counter->wakeup_pending = 0; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 98f5990be1e..56099e52970 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -82,32 +82,37 @@ enum perf_counter_record_type { PERF_RECORD_GROUP = 2, }; +#define __PERF_COUNTER_MASK(name) \ + (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ + PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW_BITS 1 +#define PERF_COUNTER_RAW_SHIFT 63 +#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) + +#define PERF_COUNTER_CONFIG_BITS 63 +#define PERF_COUNTER_CONFIG_SHIFT 0 +#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) + +#define PERF_COUNTER_TYPE_BITS 7 +#define PERF_COUNTER_TYPE_SHIFT 56 +#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) + +#define PERF_COUNTER_EVENT_BITS 56 +#define PERF_COUNTER_EVENT_SHIFT 0 +#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) + /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - union { -#ifndef __BIG_ENDIAN_BITFIELD - struct { - __u64 event_id : 56, - type : 8; - }; - struct { - __u64 raw_event_id : 63, - raw_type : 1; - }; -#else - struct { - __u64 type : 8, - event_id : 56; - }; - struct { - __u64 raw_type : 1, - raw_event_id : 63; - }; -#endif /* __BIT_ENDIAN_BITFIELD */ - __u64 event_config; - }; + /* + * The MSB of the config word signifies if the rest contains cpu + * specific (raw) counter configuration data, if unset, the next + * 7 bits are an event type and the rest of the 
bits are the event + * identifier. + */ + __u64 config; __u64 irq_period; __u64 record_type; @@ -157,6 +162,27 @@ struct perf_counter_hw_event { struct task_struct; +static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_RAW_MASK; +} + +static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_CONFIG_MASK; +} + +static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +{ + return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> + PERF_COUNTER_TYPE_SHIFT; +} + +static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_EVENT_MASK; +} + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -336,8 +362,8 @@ extern void perf_counter_output(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw_type && - counter->hw_event.type != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->hw_event) && + perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f054b8c9bf9..ca14fc41ccd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1379,7 +1379,7 @@ static void perf_counter_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, sub->hw_event.config); perf_counter_store_irq(counter, atomic64_read(&sub->count)); } } @@ -1489,13 +1489,13 @@ static int perf_swcounter_match(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw_type) + if (perf_event_raw(&counter->hw_event)) return 0; - if (counter->hw_event.type != type) + if (perf_event_type(&counter->hw_event) != type) return 0; - if (counter->hw_event.event_id != event) + if (perf_event_id(&counter->hw_event) != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1757,13 +1757,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(counter->hw_event.event_id); + ftrace_profile_disable(perf_event_id(&counter->hw_event)); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.event_id; + int event_id = perf_event_id(&counter->hw_event); int ret; ret = ftrace_profile_enable(event_id); @@ -1797,7 +1797,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. 
*/ - switch (counter->hw_event.event_id) { + switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1882,9 +1882,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; - if (hw_event->raw_type) + if (perf_event_raw(hw_event)) { hw_ops = hw_perf_counter_init(counter); - else switch (hw_event->type) { + goto done; + } + + switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: hw_ops = hw_perf_counter_init(counter); break; @@ -1902,6 +1905,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, kfree(counter); return NULL; } +done: counter->hw_ops = hw_ops; return counter; -- cgit v1.2.3-70-g09d2 From 96f6d4444302bb2ea2cf409529eef816462f6ce0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:07 +0100 Subject: perf_counter: avoid recursion Tracepoint events like lock_acquire and software counters like pagefaults can recurse into the perf counter code again; avoid that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.152096433@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++++ kernel/perf_counter.c | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 56099e52970..18dc17d0a61 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -328,6 +328,13 @@ struct perf_cpu_context { int active_oncpu; int max_pertask; int exclusive; + + /* + * Recursion avoidance: + * + * task, softirq, irq, nmi context + */ + int recursion[4]; }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ca14fc41ccd..ce34bff07bd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -1532,10 +1533,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_unlock(); } +static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + static void __perf_swcounter_event(enum perf_event_types type, u32 event, u64 nr, int nmi, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swcounter_recursion_context(cpuctx); + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); if (cpuctx->task_ctx) { @@ -1543,6 +1565,10 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, nr, nmi, regs); } + barrier(); + (*recursion)--; + +out: put_cpu_var(perf_cpu_context); } -- cgit v1.2.3-70-g09d2 From 37d81828385f8ff823caaaf1a83e72d065b6cfa1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Mar 2009 18:22:08 +0100 Subject: perf_counter: add an mmap method to allow userspace to read hardware counters Impact: new feature giving performance improvement This adds the ability for userspace to do an mmap on a hardware counter fd and get access to a read-only page that contains the information needed to translate a hardware counter value to the full 64-bit counter value that would be returned by a read on the fd. This is useful on architectures that allow user programs to read the hardware counters, such as PowerPC.
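(An illustrative aside: in outline, the user-space fast-read loop this page enables looks like the sketch below. read_pmc() stands in for whatever instruction the architecture provides for reading hardware counter number idx from user mode, and barrier() for a compiler barrier; neither name comes from the patch. The retry condition matches the seqlock-style lock field the patch puts in the page: the kernel increments it before and after each update, so an odd or changed value means a torn read.)

static __u64 read_counter_fast(volatile struct perf_counter_mmap_page *pg)
{
	__u32 seq, idx;
	__u64 count;

	do {
		seq = pg->lock;
		barrier();
		idx = pg->index;		/* 0: counter not currently on the PMU */
		count = pg->offset;		/* kernel-maintained base value */
		if (idx)
			count += read_pmc(idx);	/* arch-specific user-level read */
		barrier();
	} while (pg->lock != seq || (seq & 1));	/* raced with an update: retry */

	return count;
}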
The mmap will only succeed if the counter is a hardware counter monitoring the current process. On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter and translate it to the full 64-bit value in about 30ns using the mmapped page, compared to about 830ns for the read syscall on the counter, so this does give a significant performance improvement. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090323172417.297057964@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 +++ include/linux/perf_counter.h | 15 ++++++++ kernel/perf_counter.c | 76 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d05651584d4..e4349281b07 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -417,6 +417,8 @@ void hw_perf_restore(u64 disable) atomic64_set(&counter->hw.prev_count, val); counter->hw.idx = hwc_index[i] + 1; write_pmc(counter->hw.idx, val); + if (counter->user_page) + perf_counter_update_userpage(counter); } mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; @@ -572,6 +574,8 @@ static void power_perf_disable(struct perf_counter *counter) ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; + if (counter->user_page) + perf_counter_update_userpage(counter); break; } } @@ -698,6 +702,8 @@ static void record_and_restart(struct perf_counter *counter, long val, write_pmc(counter->hw.idx, val); atomic64_set(&counter->hw.prev_count, val); atomic64_set(&counter->hw.period_left, left); + if (counter->user_page) + perf_counter_update_userpage(counter); /* * Finally record data if requested. 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 18dc17d0a61..40b324e91bf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -143,6 +143,17 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_counter_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware counter identifier */ + __s64 offset; /* add to hardware counter value */ +}; + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -278,6 +289,9 @@ struct perf_counter { int oncpu; int cpu; + /* pointer to page shared with userspace via mmap */ + unsigned long user_page; + /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ @@ -361,6 +375,7 @@ extern int perf_counter_task_enable(void); extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +extern void perf_counter_update_userpage(struct perf_counter *counter); extern void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ce34bff07bd..d9cfd902140 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1177,6 +1177,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); + free_page(counter->user_page); free_counter(counter); put_context(ctx); @@ -1346,12 +1347,87 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } +void perf_counter_update_userpage(struct perf_counter *counter) +{ + struct perf_counter_mmap_page *userpg; + + if (!counter->user_page) + return; + userpg = (struct perf_counter_mmap_page *) counter->user_page; + + ++userpg->lock; + smp_wmb(); + userpg->index = counter->hw.idx; + userpg->offset = atomic64_read(&counter->count); + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + userpg->offset -= atomic64_read(&counter->hw.prev_count); + smp_wmb(); + ++userpg->lock; +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + if (!counter->user_page) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(counter->user_page); + get_page(vmf->page); + return 0; +} + +static struct vm_operations_struct perf_mmap_vmops = { + .fault = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_counter *counter = file->private_data; + unsigned long userpg; + + if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + return -EINVAL; + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + /* + * For now, restrict to the case of a hardware counter + * on the current task. 
+ */ + if (is_software_counter(counter) || counter->task != current) + return -EINVAL; + + userpg = counter->user_page; + if (!userpg) { + userpg = get_zeroed_page(GFP_KERNEL); + mutex_lock(&counter->mutex); + if (counter->user_page) { + free_page(userpg); + userpg = counter->user_page; + } else { + counter->user_page = userpg; + } + mutex_unlock(&counter->mutex); + if (!userpg) + return -ENOMEM; + } + + perf_counter_update_userpage(counter); + + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + return 0; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, }; /* -- cgit v1.2.3-70-g09d2 From 9ab772cd535c4b256a577eae516f9c7462346b2d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 23 Mar 2009 18:22:09 +0100 Subject: mutex: add atomic_dec_and_mutex_lock() Much like the atomic_dec_and_lock() function, in which we take and hold a spin_lock if we drop the atomic to 0, this function takes and holds the mutex if we dec the atomic to 0. Signed-off-by: Eric Paris Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.410913479@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3069ec7e0ab..93054fc3635 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} + #endif -- cgit v1.2.3-70-g09d2 From 7b732a75047738e4f85438ed2f9cd34bf5f2a19a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:10 +0100 Subject: perf_counter: new output ABI - part 1 Impact: Rework the perfcounter output ABI. Use sys_read() only for instant data and provide mmap() output for all async overflow data. The first mmap() determines the size of the output buffer. The mmap() size must be a PAGE_SIZE multiple of 1+pages, where pages must be a power of 2 or 0. Further mmap()s of the same fd must have the same size. Once all maps are gone, you can again mmap() with a new size. In case of 0 extra pages there is no data output and the first page only contains meta data. When there are data pages, a poll() event will be generated for each full page of data. Furthermore, the output is circular. This means that although 1 page is a valid configuration, it's useless, since we'll start overwriting it the instant we report a full page. Future work will focus on the output format (currently maintained) where we'll likely want each entry denoted by a header which includes a type and length.
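(An illustrative aside: a minimal user-space consumer of this layout might look like the sketch below, assuming the data_head field this patch adds to the mmapped metadata page. consume() is a placeholder and error handling is elided; the kernel publishes only the head, so the consumer tracks its own tail and must keep up, since the circular buffer is overwritten rather than flow-controlled.)

#include <poll.h>
#include <sys/mman.h>
#include <unistd.h>

static void drain(int fd, void (*consume)(unsigned char))
{
	long page_size = sysconf(_SC_PAGESIZE);
	int n = 8;	/* data pages: power of 2, fixed by the first mmap() */
	void *base = mmap(NULL, (1 + n) * page_size, PROT_READ, MAP_SHARED, fd, 0);
	struct perf_counter_mmap_page *meta = base;
	unsigned char *data = (unsigned char *)base + page_size;
	__u32 tail = 0;
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	for (;;) {
		poll(&pfd, 1, -1);	/* one wakeup per page boundary crossed */
		while (tail != meta->data_head) {
			consume(data[tail % (n * page_size)]);
			tail++;
		}
	}
}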
Further future work will allow splice()ing the fd, with the spliced stream also containing the async overflow data -- splice() would be mutually exclusive with mmap() of the data. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.470536358@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 9 +- include/linux/perf_counter.h | 36 ++- kernel/perf_counter.c | 464 ++++++++++++++++++++----------------- 3 files changed, 263 insertions(+), 246 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index e4349281b07..d48596ab655 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable) atomic64_set(&counter->hw.prev_count, val); counter->hw.idx = hwc_index[i] + 1; write_pmc(counter->hw.idx, val); - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); } mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; @@ -574,8 +573,7 @@ static void power_perf_disable(struct perf_counter *counter) ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); break; } } @@ -702,8 +700,7 @@ static void record_and_restart(struct perf_counter *counter, long val, write_pmc(counter->hw.idx, val); atomic64_set(&counter->hw.prev_count, val); atomic64_set(&counter->hw.period_left, left); - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); /* * Finally record data if requested. diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 40b324e91bf..2b5e66d5ebd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -152,6 +152,8 @@ struct perf_counter_mmap_page { __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ + + __u32 data_head; /* head in the data section */ }; #ifdef __KERNEL__ @@ -218,21 +220,6 @@ struct hw_perf_counter { #endif }; -/* - * Hardcoded buffer length limit for now, for IRQ-fed events: - */ -#define PERF_DATA_BUFLEN 2048 - -/** - * struct perf_data - performance counter IRQ data sampling ... 
- */ -struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; -}; - struct perf_counter; /** @@ -256,6 +243,14 @@ enum perf_counter_active_state { struct file; +struct perf_mmap_data { + struct rcu_head rcu_head; + int nr_pages; + atomic_t head; + struct perf_counter_mmap_page *user_page; + void *data_pages[0]; +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -289,16 +284,15 @@ struct perf_counter { int oncpu; int cpu; - /* pointer to page shared with userspace via mmap */ - unsigned long user_page; + /* mmap bits */ + struct mutex mmap_mutex; + atomic_t mmap_count; + struct perf_mmap_data *data; - /* read() / irq related data */ + /* poll related */ wait_queue_head_t waitq; /* optional: for NMIs */ int wakeup_pending; - struct perf_data *irqdata; - struct perf_data *usrdata; - struct perf_data data[2]; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d9cfd902140..0dfe91094fd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4,7 +4,8 @@ * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar * - * For licencing details see kernel-base/COPYING + * + * For licensing details see kernel-base/COPYING */ #include @@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } -/* - * Cross CPU call to switch performance data pointers - */ -static void __perf_switch_irq_data(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. 
- */ - if (ctx->task) { - if (cpuctx->task_ctx != ctx) - return; - spin_lock(&ctx->lock); - } - - /* Change the pointer NMI safe */ - atomic_long_set((atomic_long_t *)&counter->irqdata, - (unsigned long) counter->usrdata); - counter->usrdata = oldirqdata; - - if (ctx->task) - spin_unlock(&ctx->lock); -} - -static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - struct task_struct *task = ctx->task; - - if (!task) { - smp_call_function_single(counter->cpu, - __perf_switch_irq_data, - counter, 1); - return counter->usrdata; - } - -retry: - spin_lock_irq(&ctx->lock); - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - counter->irqdata = counter->usrdata; - counter->usrdata = oldirqdata; - spin_unlock_irq(&ctx->lock); - return oldirqdata; - } - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_switch_irq_data, counter); - /* Might have failed, because task was scheduled out */ - if (counter->irqdata == oldirqdata) - goto retry; - - return counter->usrdata; -} - static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - free_page(counter->user_page); free_counter(counter); put_context(ctx); @@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { u64 cntval; - if (count != sizeof(cntval)) + if (count < sizeof(cntval)) return -EINVAL; /* @@ -1210,122 +1150,21 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); } -static ssize_t -perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) -{ - if (!usrdata->len) - return 0; - - count = min(count, (size_t)usrdata->len); - if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) - return -EFAULT; - - /* Adjust the counters */ - usrdata->len -= count; - if (!usrdata->len) - usrdata->rd_idx = 0; - else - usrdata->rd_idx += count; - - return count; -} - -static ssize_t -perf_read_irq_data(struct perf_counter *counter, - char __user *buf, - size_t count, - int nonblocking) -{ - struct perf_data *irqdata, *usrdata; - DECLARE_WAITQUEUE(wait, current); - ssize_t res, res2; - - irqdata = counter->irqdata; - usrdata = counter->usrdata; - - if (usrdata->len + irqdata->len >= count) - goto read_pending; - - if (nonblocking) - return -EAGAIN; - - spin_lock_irq(&counter->waitq.lock); - __add_wait_queue(&counter->waitq, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (usrdata->len + irqdata->len >= count) - break; - - if (signal_pending(current)) - break; - - if (counter->state == PERF_COUNTER_STATE_ERROR) - break; - - spin_unlock_irq(&counter->waitq.lock); - schedule(); - spin_lock_irq(&counter->waitq.lock); - } - __remove_wait_queue(&counter->waitq, &wait); - __set_current_state(TASK_RUNNING); - spin_unlock_irq(&counter->waitq.lock); - - if (usrdata->len + irqdata->len < count && - counter->state != PERF_COUNTER_STATE_ERROR) - return -ERESTARTSYS; -read_pending: - mutex_lock(&counter->mutex); - - /* Drain pending data first: */ - res = perf_copy_usrdata(usrdata, buf, count); - if (res < 0 || res == count) - goto out; - - /* Switch irq buffer: */ - usrdata = perf_switch_irq_data(counter); - res2 = perf_copy_usrdata(usrdata, buf + res, count - res); - if (res2 < 0) { - if (!res) 
- res = -EFAULT; - } else { - res += res2; - } -out: - mutex_unlock(&counter->mutex); - - return res; -} - static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - return perf_read_hw(counter, buf, count); - - case PERF_RECORD_IRQ: - case PERF_RECORD_GROUP: - return perf_read_irq_data(counter, buf, count, - file->f_flags & O_NONBLOCK); - } - return -EINVAL; + return perf_read_hw(counter, buf, count); } static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; - unsigned int events = 0; - unsigned long flags; + unsigned int events = POLLIN; poll_wait(file, &counter->waitq, wait); - spin_lock_irqsave(&counter->waitq.lock, flags); - if (counter->usrdata->len || counter->irqdata->len) - events |= POLLIN; - spin_unlock_irqrestore(&counter->waitq.lock, flags); - return events; } @@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -void perf_counter_update_userpage(struct perf_counter *counter) +static void __perf_counter_update_userpage(struct perf_counter *counter, + struct perf_mmap_data *data) { - struct perf_counter_mmap_page *userpg; - - if (!counter->user_page) - return; - userpg = (struct perf_counter_mmap_page *) counter->user_page; + struct perf_counter_mmap_page *userpg = data->user_page; + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); ++userpg->lock; smp_wmb(); userpg->index = counter->hw.idx; userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); + + userpg->data_head = atomic_read(&data->head); smp_wmb(); ++userpg->lock; + preempt_enable(); +} + +void perf_counter_update_userpage(struct perf_counter *counter) +{ + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + __perf_counter_update_userpage(counter, data); + rcu_read_unlock(); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_counter *counter = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; - if (!counter->user_page) - return VM_FAULT_SIGBUS; + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; - vmf->page = virt_to_page(counter->user_page); + if ((unsigned)nr > data->nr_pages) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } get_page(vmf->page); + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&counter->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + + rcu_assign_pointer(counter->data, 
data); + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data = container_of(rcu_head, + struct perf_mmap_data, rcu_head); + int i; + + free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + free_page((unsigned long)data->data_pages[i]); + kfree(data); +} + +static void perf_mmap_data_free(struct perf_counter *counter) +{ + struct perf_mmap_data *data = counter->data; + + WARN_ON(atomic_read(&counter->mmap_count)); + + rcu_assign_pointer(counter->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + atomic_inc(&counter->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + if (atomic_dec_and_mutex_lock(&counter->mmap_count, + &counter->mmap_mutex)) { + perf_mmap_data_free(counter); + mutex_unlock(&counter->mmap_mutex); + } } static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, .fault = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; - unsigned long userpg; + unsigned long vma_size; + unsigned long nr_pages; + unsigned long locked, lock_limit; + int ret = 0; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (nr_pages == 0 || !is_power_of_2(nr_pages)) return -EINVAL; - /* - * For now, restrict to the case of a hardware counter - * on the current task. 
- */ - if (is_software_counter(counter) || counter->task != current) + if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - userpg = counter->user_page; - if (!userpg) { - userpg = get_zeroed_page(GFP_KERNEL); - mutex_lock(&counter->mutex); - if (counter->user_page) { - free_page(userpg); - userpg = counter->user_page; - } else { - counter->user_page = userpg; - } - mutex_unlock(&counter->mutex); - if (!userpg) - return -ENOMEM; - } + if (vma->vm_pgoff != 0) + return -EINVAL; + + locked = vma_size >> PAGE_SHIFT; + locked += vma->vm_mm->locked_vm; - perf_counter_update_userpage(counter); + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) + return -EPERM; + + mutex_lock(&counter->mmap_mutex); + if (atomic_inc_not_zero(&counter->mmap_count)) + goto out; + + WARN_ON(counter->data); + ret = perf_mmap_data_alloc(counter, nr_pages); + if (!ret) + atomic_set(&counter->mmap_count, 1); +out: + mutex_unlock(&counter->mmap_mutex); vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; - return 0; + + return ret; } static const struct file_operations perf_fops = { @@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = { * Output */ -static void perf_counter_store_irq(struct perf_counter *counter, u64 data) +static int perf_output_write(struct perf_counter *counter, int nmi, + void *buf, ssize_t size) { - struct perf_data *irqdata = counter->irqdata; + struct perf_mmap_data *data; + unsigned int offset, head, nr; + unsigned int len; + int ret, wakeup; - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; + rcu_read_lock(); + ret = -ENOSPC; + data = rcu_dereference(counter->data); + if (!data) + goto out; + + if (!data->nr_pages) + goto out; + + ret = -EINVAL; + if (size > PAGE_SIZE) + goto out; + + do { + offset = head = atomic_read(&data->head); + head += sizeof(u64); + } while (atomic_cmpxchg(&data->head, offset, head) != offset); + + wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); - *p = data; - irqdata->len += sizeof(u64); + nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1); + offset &= PAGE_SIZE - 1; + + len = min_t(unsigned int, PAGE_SIZE - offset, size); + memcpy(data->data_pages[nr] + offset, buf, len); + size -= len; + + if (size) { + nr = (nr + 1) & (data->nr_pages - 1); + memcpy(data->data_pages[nr], buf + len, size); + } + + /* + * generate a poll() wakeup for every page boundary crossed + */ + if (wakeup) { + __perf_counter_update_userpage(counter, data); + if (nmi) { + counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&counter->waitq); } + ret = 0; +out: + rcu_read_unlock(); + + return ret; } -static void perf_counter_handle_group(struct perf_counter *counter) +static void perf_output_simple(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + u64 entry; + + entry = instruction_pointer(regs); + + perf_output_write(counter, nmi, &entry, sizeof(entry)); +} + +struct group_entry { + u64 event; + u64 counter; +}; + +static void perf_output_group(struct perf_counter *counter, int nmi) { struct perf_counter *leader, *sub; leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { + struct group_entry entry; + if (sub != counter) sub->hw_ops->read(sub); - perf_counter_store_irq(counter, sub->hw_event.config); - perf_counter_store_irq(counter, 
atomic64_read(&sub->count)); + + entry.event = sub->hw_event.config; + entry.counter = atomic64_read(&sub->count); + + perf_output_write(counter, nmi, &entry, sizeof(entry)); } } @@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter, return; case PERF_RECORD_IRQ: - perf_counter_store_irq(counter, instruction_pointer(regs)); + perf_output_simple(counter, nmi, regs); break; case PERF_RECORD_GROUP: - perf_counter_handle_group(counter); + perf_output_group(counter, nmi); break; } - - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); } /* @@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); + mutex_init(&counter->mmap_mutex); + INIT_LIST_HEAD(&counter->child_list); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->hw_event = *hw_event; counter->wakeup_pending = 0; -- cgit v1.2.3-70-g09d2 From c7138f37f905bb7987b1f9f5a8ee73667db39f25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Mar 2009 13:18:16 +0100 Subject: perf_counter: fix perf_poll() Impact: fix kerneltop 100% CPU usage Only return a poll event when there's actually been one: poll_wait() doesn't actually wait for the waitq you pass it; it only enqueues you on it. Only once all FDs have been iterated and none of them returned a poll-event will it schedule(). Also make it return POLL_HUP when there's no mmap() area to read from. Further, fix a silly bug in the write code. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arjan van de Ven Orig-LKML-Reference: <1237897096.24918.181.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b5e66d5ebd..48212c15b7d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -246,6 +246,7 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; + atomic_t wakeup; atomic_t head; struct perf_counter_mmap_page *user_page; void *data_pages[0]; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0dfe91094fd..affe227d56a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1161,7 +1161,16 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; - unsigned int events = POLLIN; + struct perf_mmap_data *data; + unsigned int events; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + events = atomic_xchg(&data->wakeup, 0); + else + events = POLL_HUP; + rcu_read_unlock(); poll_wait(file, &counter->waitq, wait); @@ -1425,7 +1434,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi, do { offset = head = atomic_read(&data->head); - head += sizeof(u64); + head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); @@ -1446,6 +1455,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi, * generate a poll() wakeup for every page boundary crossed */ if (wakeup) { + atomic_xchg(&data->wakeup, POLL_IN); __perf_counter_update_userpage(counter, data); if (nmi) {
counter->wakeup_pending = 1; -- cgit v1.2.3-70-g09d2 From 5c1481943250ab65fa5130e05ec479c93216e9f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:23 +0100 Subject: perf_counter: output objects Provide a {type,size} header for each output entry. This should provide extensible output, and the ability to mix multiple streams. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113316.831607932@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 +++++++++ kernel/perf_counter.c | 53 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 48212c15b7d..c256635377d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -156,6 +156,16 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +struct perf_event_header { + __u32 type; + __u32 size; +}; + +enum perf_event_type { + PERF_EVENT_IP = 0, + PERF_EVENT_GROUP = 1, +}; + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -260,6 +270,7 @@ struct perf_counter { struct list_head list_entry; struct list_head event_entry; struct list_head sibling_list; + int nr_siblings; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0422fd9bf62..d76e3112d38 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -75,8 +75,10 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) */ if (counter->group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); - else + else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } list_add_rcu(&counter->event_entry, &ctx->event_list); } @@ -89,6 +91,9 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); + if (counter->group_leader != counter) + counter->group_leader->nr_siblings--; + /* * If this was a group counter with sibling counters then * upgrade the siblings to singleton counters by adding them @@ -381,9 +386,11 @@ static int is_software_only_group(struct perf_counter *leader) if (!is_software_counter(leader)) return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) if (!is_software_counter(counter)) return 0; + return 1; } @@ -1480,6 +1487,9 @@ static void perf_output_copy(struct perf_output_handle *handle, handle->offset = offset; } +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + static void perf_output_end(struct perf_output_handle *handle, int nmi) { if (handle->wakeup) { @@ -1514,34 +1524,53 @@ out: static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - u64 entry; + struct { + struct perf_event_header header; + u64 ip; + } event; - entry = instruction_pointer(regs); + event.header.type = PERF_EVENT_IP; + event.header.size = sizeof(event); + event.ip = instruction_pointer(regs); - perf_output_write(counter, nmi, &entry, sizeof(entry)); + perf_output_write(counter, nmi, &event, sizeof(event)); } -struct group_entry { - u64 event; - u64 counter; -}; - static void perf_output_group(struct perf_counter *counter, int nmi) { + struct perf_output_handle handle; + struct 
perf_event_header header; struct perf_counter *leader, *sub; unsigned int size; struct { u64 event; u64 counter; } entry; int ret; + + size = sizeof(header) + counter->nr_siblings * sizeof(entry); + + ret = perf_output_begin(&handle, counter, size); + if (ret) + return; + + header.type = PERF_EVENT_GROUP; + header.size = size; + + perf_output_put(&handle, header); leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { - struct group_entry entry; - if (sub != counter) sub->hw_ops->read(sub); entry.event = sub->hw_event.config; entry.counter = atomic64_read(&sub->count); - perf_output_write(counter, nmi, &entry, sizeof(entry)); + perf_output_put(&handle, entry); } + + perf_output_end(&handle, nmi); } void perf_counter_output(struct perf_counter *counter, -- cgit v1.2.3-70-g09d2 From ea5d20cf99db5d26d43b6d322d3ace17e08a6552 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:25 +0100 Subject: perf_counter: optionally provide the pid/tid of the sampled task Allow cpu wide counters to profile userspace by recording which process the sample belongs to. This raises the first issue with the output type: lots of these options (group, tid, callchain, etc.) are non-exclusive and could be combined, suggesting a bitfield. However, things like the mmap() data stream don't fit in that. How to split the type field... Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113317.013775235@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- kernel/perf_counter.c | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c256635377d..7fdbdf8be77 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -127,8 +127,9 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ + include_tid : 1, /* include the tid */ - __reserved_1 : 55; + __reserved_1 : 54; __u32 extra_config_len; __u32 __reserved_4; @@ -164,6 +165,8 @@ struct perf_event_header { enum perf_event_type { PERF_EVENT_IP = 0, PERF_EVENT_GROUP = 1, + + __PERF_EVENT_TID = 0x100, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7669afe82cc..f3e1b27bc1b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1528,16 +1528,30 @@ out: static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { + unsigned int size; struct { struct perf_event_header header; u64 ip; + u32 pid, tid; } event; event.header.type = PERF_EVENT_IP; - event.header.size = sizeof(event); event.ip = instruction_pointer(regs); - perf_output_write(counter, nmi, &event, sizeof(event)); + size = sizeof(event); + + if (counter->hw_event.include_tid) { + /* namespace issues */ + event.pid = current->group_leader->pid; + event.tid = current->pid; + + event.header.type |= __PERF_EVENT_TID; + } else + size -= sizeof(u64); + + event.header.size = size; + + perf_output_write(counter, nmi, &event, size); } static void perf_output_group(struct perf_counter *counter, int nmi) -- cgit v1.2.3-70-g09d2 From 53cfbf593758916aac41db728f029986a62f1254 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 25 Mar 2009 22:46:58 +1100 Subject: perf_counter: record time running and time enabled for each 
counter Impact: new functionality Currently, if there are more counters enabled than can fit on the CPU, the kernel will multiplex the counters on to the hardware using round-robin scheduling. That isn't too bad for sampling counters, but for counting counters it means that the value read from a counter represents some unknown fraction of the true count of events that occurred while the counter was enabled. This remedies the situation by keeping track of how long each counter is enabled for, and how long it is actually on the cpu and counting events. These times are recorded in nanoseconds using the task clock for per-task counters and the cpu clock for per-cpu counters. These values can be supplied to userspace on a read from the counter. Userspace requests that they be supplied after the counter value by setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field when creating the counter. (There is no way to change the read format after the counter is created, though it would be possible to add some way to do that.) Using this information it is possible for userspace to scale the count it reads from the counter to get an estimate of the true count: true_count_estimate = count * total_time_enabled / total_time_running This also lets userspace detect the situation where the counter never got to go on the cpu: total_time_running == 0. This functionality has been requested by the PAPI developers, and will be generally needed for interpreting the count values from counting counters correctly. In the implementation, this keeps 5 time values (in nanoseconds) for each counter: total_time_enabled and total_time_running are used when the counter is in state OFF or ERROR and for reporting back to userspace. When the counter is in state INACTIVE or ACTIVE, it is the tstamp_enabled, tstamp_running and tstamp_stopped values that are relevant, and total_time_enabled and total_time_running are determined from them. (tstamp_stopped is only used in INACTIVE state.) The reason for doing it like this is that it means that only counters being enabled or disabled at sched-in and sched-out time need to be updated. There are no new loops that iterate over all counters to update total_time_enabled or total_time_running. This also keeps separate child_total_time_running and child_total_time_enabled fields that get added in when reporting the totals to userspace. They are separate fields so that they can be atomic. We don't want to use atomics for total_time_running, total_time_enabled etc., because then we would have to use atomic sequences to update them, which are slower than regular arithmetic and memory accesses. It is possible to measure total_time_running by adding a task_clock counter to each group of counters, and total_time_enabled can be measured approximately with a top-level task_clock counter (though inaccuracies will creep in if you need to disable and enable groups since it is not possible in general to disable/enable the top-level task_clock counter simultaneously with another group). However, that adds extra overhead - I measured around 15% increase in the context switch latency reported by lat_ctx (from lmbench) when a task_clock counter was added to each of 2 groups, and around 25% increase when a task_clock counter was added to each of 4 groups. (In both cases a top-level task-clock counter was also added.) 
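(An illustrative aside: the user-space read-out this enables might look like the sketch below, assuming the counter was opened with both PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING set in hw_event.read_format, so that a read returns three u64 values in order.)

#include <unistd.h>

static __u64 scaled_count(int fd)
{
	__u64 values[3];	/* count, time_enabled, time_running */

	if (read(fd, values, sizeof(values)) != sizeof(values))
		return 0;
	if (!values[2])		/* time_running == 0: never got onto the cpu */
		return 0;
	return (__u64)((double)values[0] * values[1] / values[2]);
}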
In contrast, the code added in this commit gives better information with no overhead that I could measure (in fact in some cases I measured lower times with this code, but the differences were all less than one standard deviation). [ v2: address review comments by Andrew Morton. ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Andrew Morton Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 + include/linux/perf_counter.h | 53 +++++++++++++ kernel/perf_counter.c | 157 ++++++++++++++++++++++++++++++++----- 3 files changed, 191 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d48596ab655..df007fe0cc0 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -455,6 +455,8 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) { counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; + counter->tstamp_running += counter->ctx->time_now - + counter->tstamp_stopped; if (is_software_counter(counter)) counter->hw_ops->enable(counter); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7fdbdf8be77..6bf67ce1762 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -102,6 +102,16 @@ enum perf_counter_record_type { #define PERF_COUNTER_EVENT_SHIFT 0 #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) +/* + * Bits that can be set in hw_event.read_format to request that + * reads on the counter should return the indicated quantities, + * in increasing order of bit value, after the counter value. + */ +enum perf_counter_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1, + PERF_FORMAT_TOTAL_TIME_RUNNING = 2, +}; + /* * Hardware event to monitor via a performance monitoring counter: */ @@ -281,6 +291,32 @@ struct perf_counter { enum perf_counter_active_state prev_state; atomic64_t count; + /* + * These are the total time in nanoseconds that the counter + * has been enabled (i.e. eligible to run, and the task has + * been scheduled in, if this is a per-task counter) + * and running (scheduled onto the CPU), respectively. + * + * They are computed from tstamp_enabled, tstamp_running and + * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. + */ + u64 total_time_enabled; + u64 total_time_running; + + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the counter is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * tstamp_enabled: the notional time when the counter was enabled + * tstamp_running: the notional time when the counter was scheduled on + * tstamp_stopped: in INACTIVE state, the notional time when the + * counter was scheduled off. + */ + u64 tstamp_enabled; + u64 tstamp_running; + u64 tstamp_stopped; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; @@ -291,6 +327,13 @@ struct perf_counter { struct perf_counter *parent; struct list_head child_list; + /* + * These accumulate total time (in nanoseconds) that children + * counters have been enabled and running, respectively. 
+ */ + atomic64_t child_total_time_enabled; + atomic64_t child_total_time_running; + /* * Protect attach/detach and child_list: */ @@ -339,6 +382,16 @@ struct perf_counter_context { int nr_active; int is_active; struct task_struct *task; + + /* + * time_now is the current time in nanoseconds since an arbitrary + * point in the past. For per-task counters, this is based on the + * task clock, and for per-cpu counters it is based on the cpu clock. + * time_lost is an offset from the task/cpu clock, used to make it + * appear that time only passes while the context is scheduled in. + */ + u64 time_now; + u64 time_lost; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 95e02575546..3b862a7988c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -116,6 +116,7 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_stopped = ctx->time_now; counter->hw_ops->disable(counter); counter->oncpu = -1; @@ -251,6 +252,60 @@ retry: spin_unlock_irq(&ctx->lock); } +/* + * Get the current time for this context. + * If this is a task context, we use the task's task clock, + * or for a per-cpu context, we use the cpu clock. + */ +static u64 get_context_time(struct perf_counter_context *ctx, int update) +{ + struct task_struct *curr = ctx->task; + + if (!curr) + return cpu_clock(smp_processor_id()); + + return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime; +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_counter_context *ctx, int update) +{ + ctx->time_now = get_context_time(ctx, update) - ctx->time_lost; +} + +/* + * Update the total_time_enabled and total_time_running fields for a counter. + */ +static void update_counter_times(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + u64 run_end; + + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + counter->total_time_enabled = ctx->time_now - + counter->tstamp_enabled; + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time_now; + counter->total_time_running = run_end - counter->tstamp_running; + } +} + +/* + * Update total_time_enabled and total_time_running for all counters in a group. + */ +static void update_group_times(struct perf_counter *leader) +{ + struct perf_counter *counter; + + update_counter_times(leader); + list_for_each_entry(counter, &leader->sibling_list, list_entry) + update_counter_times(counter); +} + /* * Cross CPU call to disable a performance counter */ @@ -276,6 +331,8 @@ static void __perf_counter_disable(void *info) * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + update_context_time(ctx, 1); + update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); else @@ -320,8 +377,10 @@ static void perf_counter_disable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. 
*/ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } spin_unlock_irq(&ctx->lock); } @@ -366,6 +425,8 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } + counter->tstamp_running += ctx->time_now - counter->tstamp_stopped; + if (!is_software_counter(counter)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -425,6 +486,17 @@ static int group_can_go_on(struct perf_counter *counter, return can_add_hw; } +static void add_counter_to_ctx(struct perf_counter *counter, + struct perf_counter_context *ctx) +{ + list_add_counter(counter, ctx); + ctx->nr_counters++; + counter->prev_state = PERF_COUNTER_STATE_OFF; + counter->tstamp_enabled = ctx->time_now; + counter->tstamp_running = ctx->time_now; + counter->tstamp_stopped = ctx->time_now; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -449,6 +521,7 @@ static void __perf_install_in_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + update_context_time(ctx, 1); /* * Protect the list operation against NMI by disabling the @@ -456,9 +529,7 @@ static void __perf_install_in_context(void *info) */ perf_flags = hw_perf_save_disable(); - list_add_counter(counter, ctx); - ctx->nr_counters++; - counter->prev_state = PERF_COUNTER_STATE_OFF; + add_counter_to_ctx(counter, ctx); /* * Don't put the counter on if it is disabled or if @@ -486,8 +557,10 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } if (!err && !ctx->task && cpuctx->max_pertask) @@ -548,10 +621,8 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list_entry)) { - list_add_counter(counter, ctx); - ctx->nr_counters++; - } + if (list_empty(&counter->list_entry)) + add_counter_to_ctx(counter, ctx); spin_unlock_irq(&ctx->lock); } @@ -576,11 +647,13 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + update_context_time(ctx, 1); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -602,8 +675,10 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } unlock: @@ -659,8 +734,11 @@ static void perf_counter_enable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. 
*/ - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - + counter->total_time_enabled; + } out: spin_unlock_irq(&ctx->lock); } @@ -693,6 +771,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; + update_context_time(ctx, 0); flags = hw_perf_save_disable(); if (ctx->nr_active) { @@ -797,6 +876,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; + /* + * Add any time since the last sched_out to the lost time + * so it doesn't get included in the total_time_enabled and + * total_time_running measures for counters in the context. + */ + ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now; + flags = hw_perf_save_disable(); /* @@ -817,8 +903,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_ERROR; + } } list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -902,8 +990,10 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) + if (counter->state != PERF_COUNTER_STATE_ERROR) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } } hw_perf_restore(perf_flags); @@ -946,6 +1036,8 @@ int perf_counter_task_enable(void) if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - + counter->total_time_enabled; counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -1009,10 +1101,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; unsigned long flags; curr_rq_lock_irq_save(&flags); + if (ctx->is_active) + update_context_time(ctx, 1); counter->hw_ops->read(counter); + update_counter_times(counter); curr_rq_unlock_irq_restore(&flags); } @@ -1025,6 +1121,8 @@ static u64 perf_counter_read(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __read, counter, 1); + } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); } return atomic64_read(&counter->count); @@ -1137,10 +1235,8 @@ static int perf_release(struct inode *inode, struct file *file) static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 cntval; - - if (count < sizeof(cntval)) - return -EINVAL; + u64 values[3]; + int n; /* * Return end-of-file for a read on a counter that is in @@ -1151,10 +1247,24 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return 0; mutex_lock(&counter->mutex); - cntval = perf_counter_read(counter); + values[0] = perf_counter_read(counter); + n = 1; + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = counter->total_time_running + + 
atomic64_read(&counter->child_total_time_running); mutex_unlock(&counter->mutex); - return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); + if (count < n * sizeof(u64)) + return -EINVAL; + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; } static ssize_t @@ -2290,8 +2400,7 @@ inherit_counter(struct perf_counter *parent_counter, * Link it up in the child's context: */ child_counter->task = child; - list_add_counter(child_counter, child_ctx); - child_ctx->nr_counters++; + add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; /* @@ -2361,6 +2470,10 @@ static void sync_child_counter(struct perf_counter *child_counter, * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_counter->count); + atomic64_add(child_counter->total_time_enabled, + &parent_counter->child_total_time_enabled); + atomic64_add(child_counter->total_time_running, + &parent_counter->child_total_time_running); /* * Remove this counter from the parent's list @@ -2395,6 +2508,7 @@ __perf_counter_exit_task(struct task_struct *child, if (child != current) { wait_task_inactive(child, 0); list_del_init(&child_counter->list_entry); + update_counter_times(child_counter); } else { struct perf_cpu_context *cpuctx; unsigned long flags; @@ -2412,6 +2526,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); group_sched_out(child_counter, cpuctx, child_ctx); + update_counter_times(child_counter); list_del_init(&child_counter->list_entry); -- cgit v1.2.3-70-g09d2 From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based singly-linked list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ] Furthermore, the x86 method of using TIF flags was flawed in that it's quite possible to end up setting the bit on the idle task, losing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient.
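[ For illustration only: a user-space sketch of such a cmpxchg() based pending list, with C11 atomics standing in for the kernel's cmpxchg()/xchg() and a single global head instead of per-cpu storage. The wakeup callback is a placeholder. ]

#include <stdatomic.h>
#include <stddef.h>

struct wakeup_entry {
        _Atomic(struct wakeup_entry *) next;
};

/*
 * Sentinel marking both "end of list" and "empty head", so that a
 * NULL ->next reliably means "this entry is not queued".
 */
#define PENDING_TAIL ((struct wakeup_entry *)-1UL)

static _Atomic(struct wakeup_entry *) wakeup_head = PENDING_TAIL;

static void pending_queue(struct wakeup_entry *entry)
{
        struct wakeup_entry *expected = NULL;
        struct wakeup_entry *prev;

        /* Claim the entry; a non-NULL ->next means it is already queued. */
        if (!atomic_compare_exchange_strong(&entry->next, &expected,
                                            PENDING_TAIL))
                return;

        /* Lock-free push onto the singly-linked list. */
        prev = atomic_load(&wakeup_head);
        do {
                atomic_store(&entry->next, prev);
        } while (!atomic_compare_exchange_weak(&wakeup_head, &prev, entry));
}

static int pending_run(void (*wakeup)(struct wakeup_entry *entry))
{
        /* Steal the whole list in one xchg(); safe against concurrent adds. */
        struct wakeup_entry *list = atomic_exchange(&wakeup_head, PENDING_TAIL);
        int nr = 0;

        while (list != PENDING_TAIL) {
                struct wakeup_entry *entry = list;

                list = atomic_load(&entry->next);
                /* Unqueue before waking, so a syncing thread cannot wait forever. */
                atomic_store(&entry->next, NULL);
                wakeup(entry);
                nr++;
        }
        return nr;
}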
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/hw_irq.h | 4 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/perf_counter.c | 22 +------ arch/x86/include/asm/perf_counter.h | 5 +- arch/x86/include/asm/thread_info.h | 4 +- arch/x86/kernel/cpu/perf_counter.c | 29 -------- arch/x86/kernel/signal.c | 6 -- include/linux/perf_counter.h | 15 +++-- kernel/perf_counter.c | 128 +++++++++++++++++++++++++++++++++--- kernel/timer.c | 3 + 10 files changed, 142 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index cb32d571c9c..20a44d0c9fd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags) struct irq_chip; #ifdef CONFIG_PERF_COUNTERS -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { unsigned long x; @@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void); #else -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { return 0; } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 469e9635ff0..2cd471f92fe 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } - if (get_perf_counter_pending()) { + if (test_perf_counter_pending()) { clear_perf_counter_pending(); perf_counter_do_pending(); } diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index df007fe0cc0..cde720fc495 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -649,24 +649,6 @@ hw_perf_counter_init(struct perf_counter *counter) return &power_perf_ops; } -/* - * Handle wakeups. - */ -void perf_counter_do_pending(void) -{ - int i; - struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); - struct perf_counter *counter; - - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter && counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); struct perf_counter *counter; long val; - int need_wakeup = 0, found = 0; + int found = 0; for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; @@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) * immediately; otherwise we'll have to do the wakeup when interrupts * get soft-enabled.
*/ - if (get_perf_counter_pending() && regs->softe) { + if (test_perf_counter_pending() && regs->softe) { irq_enter(); clear_perf_counter_pending(); perf_counter_do_pending(); diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 1662043b340..e2b0e66b235 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,8 +84,9 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) -#define set_perf_counter_pending() \ - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); +#define set_perf_counter_pending() do { } while (0) +#define clear_perf_counter_pending() do { } while (0) +#define test_perf_counter_pending() (0) #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 3ffd5d2a367..8820a73ae09 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,7 +83,6 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ -#define TIF_PERF_COUNTERS 11 /* notify perf counter work */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -107,7 +106,6 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) -#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -141,7 +139,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f95b0cdc55..7aab177fb56 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) */ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } - counter->wakeup_pending = 0; return 0; } @@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } -/* - * This handler is triggered by NMI contexts: - */ -void perf_counter_notify(struct pt_regs *regs) -{ - struct cpu_hw_counters *cpuc; - unsigned long flags; - int bit, cpu; - - local_irq_save(flags); - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - if (!counter) - continue; - - if (counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } - - local_irq_restore(flags); -} - void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 611615a92c9..0a813b17b17 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ -#include #include #include #include @@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, 
__u32 thread_info_flags) tracehook_notify_resume(regs); } - if (thread_info_flags & _TIF_PERF_COUNTERS) { - clear_thread_flag(TIF_PERF_COUNTERS); - perf_counter_notify(regs); - } - #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6bf67ce1762..0d833228eee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -275,6 +275,10 @@ struct perf_mmap_data { void *data_pages[0]; }; +struct perf_wakeup_entry { + struct perf_wakeup_entry *next; +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -350,7 +354,7 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; /* optional: for NMIs */ - int wakeup_pending; + struct perf_wakeup_entry wakeup; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; @@ -427,7 +431,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); -extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); @@ -461,7 +465,7 @@ static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } -static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } @@ -469,8 +473,9 @@ static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(u32 event, u64 nr, - int nmi, struct pt_regs *regs) { } +static inline void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } + #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3b862a7988c..f70ff80e79d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head) kfree(counter); } +static void perf_pending_sync(struct perf_counter *counter); + static void free_counter(struct perf_counter *counter) { + perf_pending_sync(counter); + if (counter->destroy) counter->destroy(counter); @@ -1528,6 +1532,118 @@ static const struct file_operations perf_fops = { .mmap = perf_mmap, }; +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) { + (void)atomic_xchg(&data->wakeup, POLL_IN); + __perf_counter_update_userpage(counter, data); + } + rcu_read_unlock(); + + wake_up_all(&counter->waitq); +} + +/* + * Pending wakeups + * + * Handle the case where we need to wake up from NMI (or rq->lock) context.
+ * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_counter *counter) +{ + struct perf_wakeup_entry **head; + struct perf_wakeup_entry *prev, *next; + + if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL) + return; + + head = &get_cpu_var(perf_wakeup_head); + + do { + prev = counter->wakeup.next = *head; + next = &counter->wakeup; + } while (cmpxchg(head, prev, next) != prev); + + set_perf_counter_pending(); + + put_cpu_var(perf_wakeup_head); +} + +static int __perf_pending_run(void) +{ + struct perf_wakeup_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + struct perf_counter *counter = container_of(list, + struct perf_counter, wakeup); + + list = list->next; + + counter->wakeup.next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + perf_counter_wakeup(counter); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return counter->wakeup.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ + wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ + __perf_pending_run(); +} /* * Output */ @@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle, int nmi) { if (handle->wakeup) { - (void)atomic_xchg(&handle->data->wakeup, POLL_IN); - __perf_counter_update_userpage(handle->counter, handle->data); - if (nmi) { - handle->counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&handle->counter->waitq); + if (nmi) + perf_pending_queue(handle->counter); + else + perf_counter_wakeup(handle->counter); } rcu_read_unlock(); } @@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; - counter->wakeup_pending = 0; counter->group_leader = group_leader; counter->hw_ops = NULL; counter->ctx = ctx; diff --git a/kernel/timer.c b/kernel/timer.c index b4555568b4e..672ca25fbc4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); + perf_counter_do_pending(); + hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) -- cgit v1.2.3-70-g09d2 From 38ff667b321b00f5e6830e93fb4ab11a653a2920 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:03 +0200 Subject: perf_counter: fix update_userpage() It just occurred to me it is possible to have multiple contending updates of the userpage (mmap information vs overflow vs counter). This would break the seqlock logic.
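[ The problem, sketched: the userpage update is an open-coded seqlock write, and the even/odd protocol only holds if writers never nest. Fragment grounded in the hunk below: ]

        /* Writer side, as in perf_counter_update_userpage(): */
        ++userpg->lock;                         /* odd: update in flight */
        smp_wmb();
        userpg->index  = counter->hw.idx;
        userpg->offset = atomic64_read(&counter->count);
        smp_wmb();
        ++userpg->lock;                         /* even: update complete */

        /*
         * If a second writer nests between the two increments, ->lock goes
         * odd -> even while ->index/->offset are half written; a reader
         * sampling ->lock twice then sees an unchanged, even value and
         * trusts torn data.
         */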
It appears the arch code uses this from NMI context, so we cannot possibly serialize its use; therefore separate the data_head update from it and let it return to its original use. The arch code needs to make sure there are no contending callers by disabling the counter before using it -- powerpc appears to do this nicely. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.241410660@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 35 +++++++++++++++++++++++++++++++++++ kernel/perf_counter.c | 38 +++++++++++++++++++++++--------------- 2 files changed, 58 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0d833228eee..8ac18852dcf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -160,10 +160,45 @@ struct perf_counter_hw_event { struct perf_counter_mmap_page { __u32 version; /* version number of this structure */ __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw counters in user-space. + * + * The index and offset should be read atomically using the seqlock: + * + * __u32 seq, index; + * __s64 offset; + * + * again: + * rmb(); + * seq = pc->lock; + * + * if (unlikely(seq & 1)) { + * cpu_relax(); + * goto again; + * } + * + * index = pc->index; + * offset = pc->offset; + * + * rmb(); + * if (pc->lock != seq) + * goto again; + * + * After this, index contains architecture specific counter index + 1, + * so that 0 means unavailable, offset contains the value to be added + * to the result of the raw timer read to obtain this counter's value. + */ __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ + /* + * Control data for the mmap() data buffer. + * + * User-space reading this value should issue an rmb(), on SMP capable + * platforms, after reading this value -- see perf_counter_wakeup(). + */ __u32 data_head; /* head in the data section */ }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f70ff80e79d..c95e92329b9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1316,10 +1316,22 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -static void __perf_counter_update_userpage(struct perf_counter *counter, - struct perf_mmap_data *data) +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We cannot serialize this because the arch + * code calls this from NMI context.
+ */ +void perf_counter_update_userpage(struct perf_counter *counter) { - struct perf_counter_mmap_page *userpg = data->user_page; + struct perf_mmap_data *data; + struct perf_counter_mmap_page *userpg; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + userpg = data->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -1333,20 +1345,10 @@ static void __perf_counter_update_userpage(struct perf_counter *counter, if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); - userpg->data_head = atomic_read(&data->head); smp_wmb(); ++userpg->lock; preempt_enable(); -} - -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - __perf_counter_update_userpage(counter, data); +unlock: rcu_read_unlock(); } @@ -1547,7 +1549,13 @@ void perf_counter_wakeup(struct perf_counter *counter) data = rcu_dereference(counter->data); if (data) { (void)atomic_xchg(&data->wakeup, POLL_IN); - __perf_counter_update_userpage(counter, data); + /* + * Ensure all data writes are issued before updating the + * user-space data head information. The matching rmb() + * will be in userspace after reading this value. + */ + smp_wmb(); + data->user_page->data_head = atomic_read(&data->head); } rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 0a4a93919bdc5cee48fe4367591e8e0449c1086c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:05 +0200 Subject: perf_counter: executable mmap() information Currently the profiling information returns userspace IPs but provides no way to correlate them to userspace code. Userspace could look into /proc/$pid/maps but that might not be current or even present anymore at the time of analyzing the IPs. Therefore provide means to track the mmap information and provide it in the output stream. XXX: only covers mmap()/munmap(), mremap() and mprotect() are missing.
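[ Reader-side illustration, not part of the patch: assuming the two-word header layout used by this series, a PERF_EVENT_MMAP record in the output stream could be decoded like this. Field order follows struct perf_mmap_event below; the header layout and pgoff units are assumptions of the sketch. ]

#include <stdint.h>
#include <stdio.h>

struct mmap_record {
        struct {
                uint32_t type;          /* PERF_EVENT_MMAP or PERF_EVENT_MUNMAP */
                uint32_t size;          /* total record size, name included */
        } header;                       /* assumed header layout */
        uint32_t pid, tid;
        uint64_t start, len, pgoff;
        char filename[];                /* padded to a multiple of sizeof(u64) */
};

static void handle_mmap_record(const struct mmap_record *rec)
{
        printf("pid %u: %#llx-%#llx @ pgoff %llu: %s\n",
               rec->pid,
               (unsigned long long)rec->start,
               (unsigned long long)(rec->start + rec->len),
               (unsigned long long)rec->pgoff,
               rec->filename);
}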
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Andrew Morton Orig-LKML-Reference: <20090330171023.417259499@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 ++++++- kernel/perf_counter.c | 145 +++++++++++++++++++++++++++++++++++++++++++ mm/mmap.c | 10 +++ 3 files changed, 177 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8ac18852dcf..037a81145ac 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -137,9 +137,11 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - include_tid : 1, /* include the tid */ + include_tid : 1, /* include the tid */ + mmap : 1, /* include mmap data */ + munmap : 1, /* include munmap data */ - __reserved_1 : 54; + __reserved_1 : 52; __u32 extra_config_len; __u32 __reserved_4; @@ -211,6 +213,9 @@ enum perf_event_type { PERF_EVENT_IP = 0, PERF_EVENT_GROUP = 1, + PERF_EVENT_MMAP = 2, + PERF_EVENT_MUNMAP = 3, + __PERF_EVENT_TID = 0x100, }; @@ -491,6 +496,12 @@ static inline int is_software_counter(struct perf_counter *counter) extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); +extern void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file); + +extern void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -511,6 +522,15 @@ static inline int perf_counter_task_enable(void) { return -EINVAL; } static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } + +static inline void +perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) { } + +static inline void +perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) { } + #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c95e92329b9..f35e89e3d6a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -1843,6 +1844,150 @@ void perf_counter_output(struct perf_counter *counter, } } +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct file *file; + char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event; +}; + +static void perf_counter_mmap_output(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size); + + if (ret) + return; + + perf_output_put(&handle, mmap_event->event); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle, 0); +} + +static int perf_counter_mmap_match(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + if (counter->hw_event.mmap && + mmap_event->event.header.type == PERF_EVENT_MMAP) + return 1; + + if (counter->hw_event.munmap && + mmap_event->event.header.type == PERF_EVENT_MUNMAP) + return 1; + + return 0; +} + +static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_counter *counter; + + if (system_state != 
SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_mmap_match(counter, mmap_event)) + perf_counter_mmap_output(counter, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct file *file = mmap_event->file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + char *name; + + if (file) { + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = dentry_path(file->f_dentry, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name), sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event.header.size = sizeof(mmap_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + + kfree(buf); +} + +void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event = { + .file = file, + .event = { + .header = { .type = PERF_EVENT_MMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + +void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event = { + .file = file, + .event = { + .header = { .type = PERF_EVENT_MUNMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + /* * Generic software counter infrastructure */ diff --git a/mm/mmap.c b/mm/mmap.c index 4a3841186c1..1df63f614f9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1223,6 +1224,9 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: + if (vm_flags & VM_EXEC) + perf_counter_mmap(addr, len, pgoff, file); + mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { @@ -1756,6 +1760,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) do { long nrpages = vma_pages(vma); + if (vma->vm_flags & VM_EXEC) { + perf_counter_munmap(vma->vm_start, + nrpages << PAGE_SHIFT, + vma->vm_pgoff, vma->vm_file); + } + mm->total_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); -- cgit v1.2.3-70-g09d2 From 5ed00415e304203a0a9dcaef226d6d3f1106070e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:12 +0200 Subject: perf_counter: re-arrange the perf_event_type Breaks ABI yet again :-) Change the event type so that [0, 2^31-1] are regular event types, but [2^31, 2^32-1] forms a bitmask for overflow events. 
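[ Illustration: with this split, a stream reader dispatches on the top bit first and treats the remaining high bits as payload flags; constants as in the hunk below: ]

#include <stdint.h>

#define PERF_EVENT_OVERFLOW     (1U << 31)
#define __PERF_EVENT_IP         (1U << 30)
#define __PERF_EVENT_TID        (1U << 29)

static void dispatch(uint32_t type)
{
        if (!(type & PERF_EVENT_OVERFLOW)) {
                /* [0, 2^31-1]: plain enumerated events (GROUP, MMAP, ...) */
                return;
        }

        /* [2^31, 2^32-1]: an overflow event; low bits describe the payload. */
        if (type & __PERF_EVENT_IP) {
                /* a u64 instruction pointer follows the header */
        }
        if (type & __PERF_EVENT_TID) {
                /* u32 pid and tid follow */
        }
}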
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.047961770@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 +++-- kernel/perf_counter.c | 56 ++++++++++++++++++++------------------------ 2 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 037a81145ac..edf5bfb7ff5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -210,13 +210,15 @@ struct perf_event_header { }; enum perf_event_type { - PERF_EVENT_IP = 0, + PERF_EVENT_GROUP = 1, PERF_EVENT_MMAP = 2, PERF_EVENT_MUNMAP = 3, - __PERF_EVENT_TID = 0x100, + PERF_EVENT_OVERFLOW = 1UL << 31, + __PERF_EVENT_IP = 1UL << 30, + __PERF_EVENT_TID = 1UL << 29, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4471e7e2c10..d93e9ddf784 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1754,50 +1754,44 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static int perf_output_write(struct perf_counter *counter, int nmi, - void *buf, ssize_t size) -{ - struct perf_output_handle handle; - int ret; - - ret = perf_output_begin(&handle, counter, size, nmi); - if (ret) - goto out; - - perf_output_copy(&handle, buf, size); - perf_output_end(&handle); - -out: - return ret; -} - static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - unsigned int size; + int ret; + struct perf_output_handle handle; + struct perf_event_header header; + u64 ip; struct { - struct perf_event_header header; - u64 ip; u32 pid, tid; - } event; + } tid_entry; - event.header.type = PERF_EVENT_IP; - event.ip = instruction_pointer(regs); + header.type = PERF_EVENT_OVERFLOW; + header.size = sizeof(header); - size = sizeof(event); + ip = instruction_pointer(regs); + header.type |= __PERF_EVENT_IP; + header.size += sizeof(ip); if (counter->hw_event.include_tid) { /* namespace issues */ - event.pid = current->group_leader->pid; - event.tid = current->pid; + tid_entry.pid = current->group_leader->pid; + tid_entry.tid = current->pid; - event.header.type |= __PERF_EVENT_TID; - } else - size -= sizeof(u64); + header.type |= __PERF_EVENT_TID; + header.size += sizeof(tid_entry); + } - event.header.size = size; + ret = perf_output_begin(&handle, counter, header.size, nmi); + if (ret) + return; + + perf_output_put(&handle, header); + perf_output_put(&handle, ip); + + if (counter->hw_event.include_tid) + perf_output_put(&handle, tid_entry); - perf_output_write(counter, nmi, &event, size); + perf_output_end(&handle); } static void perf_output_group(struct perf_counter *counter, int nmi) -- cgit v1.2.3-70-g09d2 From 394ee07623cf556c8daae2b3c00cf5fea47f0811 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:14 +0200 Subject: perf_counter: provide generic callchain bits Provide the generic callchain support bits. If hw_event->callchain is set the arch specific perf_callchain() function is called upon to provide a perf_callchain_entry structure filled with the current callchain. If it does so, it is added to the overflow output event. 
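[ An arch implementation overriding the weak default below would look roughly like this; the unwind loop is elided, and the per-cpu buffer mirrors the x86 version added later in this series: ]

static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);

static void callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
        if (entry->nr < MAX_STACK_DEPTH)
                entry->ip[entry->nr++] = ip;
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
        struct perf_callchain_entry *entry = &__get_cpu_var(irq_entry);

        entry->nr = 0;
        callchain_store(entry, instruction_pointer(regs));
        /* ... unwind the stack, calling callchain_store() per frame ... */

        return entry;
}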
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.254266860@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 13 ++++++++++++- kernel/perf_counter.c | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index edf5bfb7ff5..43083afffe0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -140,8 +140,9 @@ struct perf_counter_hw_event { include_tid : 1, /* include the tid */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ + callchain : 1, /* add callchain data */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 extra_config_len; __u32 __reserved_4; @@ -219,6 +220,7 @@ enum perf_event_type { PERF_EVENT_OVERFLOW = 1UL << 31, __PERF_EVENT_IP = 1UL << 30, __PERF_EVENT_TID = 1UL << 29, + __PERF_EVENT_CALLCHAIN = 1UL << 28, }; #ifdef __KERNEL__ @@ -504,6 +506,15 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); +#define MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + u64 nr; + u64 ip[MAX_STACK_DEPTH]; +}; + +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d93e9ddf784..860cdc26bd7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1653,6 +1653,17 @@ void perf_counter_do_pending(void) __perf_pending_run(); } +/* + * Callchain support -- arch specific + */ + +struct perf_callchain_entry * +__attribute__((weak)) +perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + /* * Output */ @@ -1764,6 +1775,8 @@ static void perf_output_simple(struct perf_counter *counter, struct { u32 pid, tid; } tid_entry; + struct perf_callchain_entry *callchain = NULL; + int callchain_size = 0; header.type = PERF_EVENT_OVERFLOW; header.size = sizeof(header); @@ -1781,6 +1794,17 @@ static void perf_output_simple(struct perf_counter *counter, header.size += sizeof(tid_entry); } + if (counter->hw_event.callchain) { + callchain = perf_callchain(regs); + + if (callchain) { + callchain_size = (1 + callchain->nr) * sizeof(u64); + + header.type |= __PERF_EVENT_CALLCHAIN; + header.size += callchain_size; + } + } + ret = perf_output_begin(&handle, counter, header.size, nmi); if (ret) return; @@ -1791,6 +1815,9 @@ static void perf_output_simple(struct perf_counter *counter, if (counter->hw_event.include_tid) perf_output_put(&handle, tid_entry); + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + perf_output_end(&handle); } -- cgit v1.2.3-70-g09d2 From 8a057d84912f36e53f970c4d177cb4bb6b2f9e08 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:11:59 +0200 Subject: perf_counter: move the event overflow output bits to record_type Per suggestion from Paul, move the event overflow bits to record_type and sanitize the enums a bit. 
Breaks the ABI -- again ;-) Suggested-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.151921176@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 50 +++++++++++---------- kernel/perf_counter.c | 101 +++++++++++++++++-------------------------- 2 files changed, 68 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 43083afffe0..06a6fba9f53 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,15 +73,6 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -/* - * IRQ-notification data record type: - */ -enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, -}; - #define __PERF_COUNTER_MASK(name) \ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ PERF_COUNTER_##name##_SHIFT) @@ -102,6 +93,17 @@ enum perf_counter_record_type { #define PERF_COUNTER_EVENT_SHIFT 0 #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) +/* + * Bits that can be set in hw_event.record_type to request information + * in the overflow packets. + */ +enum perf_counter_record_format { + PERF_RECORD_IP = 1U << 0, + PERF_RECORD_TID = 1U << 1, + PERF_RECORD_GROUP = 1U << 2, + PERF_RECORD_CALLCHAIN = 1U << 3, +}; + /* * Bits that can be set in hw_event.read_format to request that * reads on the counter should return the indicated quantities, @@ -125,8 +127,8 @@ struct perf_counter_hw_event { __u64 config; __u64 irq_period; - __u64 record_type; - __u64 read_format; + __u32 record_type; + __u32 read_format; __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ @@ -137,12 +139,10 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - include_tid : 1, /* include the tid */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ - callchain : 1, /* add callchain data */ - __reserved_1 : 51; + __reserved_1 : 53; __u32 extra_config_len; __u32 __reserved_4; @@ -212,15 +212,21 @@ struct perf_event_header { enum perf_event_type { - PERF_EVENT_GROUP = 1, - - PERF_EVENT_MMAP = 2, - PERF_EVENT_MUNMAP = 3, + PERF_EVENT_MMAP = 1, + PERF_EVENT_MUNMAP = 2, - PERF_EVENT_OVERFLOW = 1UL << 31, - __PERF_EVENT_IP = 1UL << 30, - __PERF_EVENT_TID = 1UL << 29, - __PERF_EVENT_CALLCHAIN = 1UL << 28, + /* + * Half the event type space is reserved for the counter overflow + * bitfields, as found in hw_event.record_type. 
+ * + * These events will have types of the form: + * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + */ + PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, + __PERF_EVENT_IP = PERF_RECORD_IP, + __PERF_EVENT_TID = PERF_RECORD_TID, + __PERF_EVENT_GROUP = PERF_RECORD_GROUP, + __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 860cdc26bd7..995063df910 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1765,27 +1765,34 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void perf_output_simple(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) { int ret; + u64 record_type = counter->hw_event.record_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; struct { u32 pid, tid; } tid_entry; + struct { + u64 event; + u64 counter; + } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; - header.type = PERF_EVENT_OVERFLOW; + header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); - ip = instruction_pointer(regs); - header.type |= __PERF_EVENT_IP; - header.size += sizeof(ip); + if (record_type & PERF_RECORD_IP) { + ip = instruction_pointer(regs); + header.type |= __PERF_EVENT_IP; + header.size += sizeof(ip); + } - if (counter->hw_event.include_tid) { + if (record_type & PERF_RECORD_TID) { /* namespace issues */ tid_entry.pid = current->group_leader->pid; tid_entry.tid = current->pid; @@ -1794,7 +1801,13 @@ static void perf_output_simple(struct perf_counter *counter, header.size += sizeof(tid_entry); } - if (counter->hw_event.callchain) { + if (record_type & PERF_RECORD_GROUP) { + header.type |= __PERF_EVENT_GROUP; + header.size += sizeof(u64) + + counter->nr_siblings * sizeof(group_entry); + } + + if (record_type & PERF_RECORD_CALLCHAIN) { callchain = perf_callchain(regs); if (callchain) { @@ -1810,69 +1823,35 @@ static void perf_output_simple(struct perf_counter *counter, return; perf_output_put(&handle, header); - perf_output_put(&handle, ip); - if (counter->hw_event.include_tid) - perf_output_put(&handle, tid_entry); + if (record_type & PERF_RECORD_IP) + perf_output_put(&handle, ip); - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); - - perf_output_end(&handle); -} - -static void perf_output_group(struct perf_counter *counter, int nmi) -{ - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_counter *leader, *sub; - unsigned int size; - struct { - u64 event; - u64 counter; - } entry; - int ret; - - size = sizeof(header) + counter->nr_siblings * sizeof(entry); + if (record_type & PERF_RECORD_TID) + perf_output_put(&handle, tid_entry); - ret = perf_output_begin(&handle, counter, size, nmi); - if (ret) - return; + if (record_type & PERF_RECORD_GROUP) { + struct perf_counter *leader, *sub; + u64 nr = counter->nr_siblings; - header.type = PERF_EVENT_GROUP; - header.size = size; + perf_output_put(&handle, nr); - perf_output_put(&handle, header); + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); + group_entry.event = sub->hw_event.config; + group_entry.counter = atomic64_read(&sub->count); - entry.event = 
sub->hw_event.config; - entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, entry); + perf_output_put(&handle, group_entry); + } } - perf_output_end(&handle); -} - -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - return; - - case PERF_RECORD_IRQ: - perf_output_simple(counter, nmi, regs); - break; + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); - case PERF_RECORD_GROUP: - perf_output_group(counter, nmi); - break; - } + perf_output_end(&handle); } /* -- cgit v1.2.3-70-g09d2 From c457810ab4a825161aec6ef71b581e1bc8febd1a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:01 +0200 Subject: perf_counter: per event wakeups By request, provide a way to request a wakeup every 'n' events instead of every page of output. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.323309784@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- kernel/perf_counter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 06a6fba9f53..5428ba120d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -145,7 +145,7 @@ struct perf_counter_hw_event { __reserved_1 : 53; __u32 extra_config_len; - __u32 __reserved_4; + __u32 wakeup_events; /* wakeup every n events */ __u64 __reserved_2; __u64 __reserved_3; @@ -321,6 +321,7 @@ struct perf_mmap_data { int nr_pages; atomic_t wakeup; atomic_t head; + atomic_t events; struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 995063df910..9bcab10e735 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1760,7 +1760,15 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle) { - if (handle->wakeup) + int wakeup_events = handle->counter->hw_event.wakeup_events; + + if (wakeup_events) { + int events = atomic_inc_return(&handle->data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &handle->data->events); + __perf_output_wakeup(handle); + } + } else if (handle->wakeup) __perf_output_wakeup(handle); rcu_read_unlock(); } -- cgit v1.2.3-70-g09d2 From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:03 +0200 Subject: perf_counter: add more context information Put in counts to tell which ips belong to what context. 
 -----
| |  hv
| --
nr  | |  kernel
| --
| |  user
 -----
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.493101305@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++ include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2a946a160ca..c74e20d593a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { unsigned long bp; char *stack; + int nr = entry->nr; callchain_store(entry, instruction_pointer(regs)); @@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) #endif dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + + entry->kernel = entry->nr - nr; } @@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; + int nr = entry->nr; regs = (struct pt_regs *)current->thread.sp0 - 1; fp = (void __user *)regs->bp; @@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_fp; } + + entry->user = entry->nr - nr; } static void @@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; + entry->hv = 0; + entry->kernel = 0; + entry->user = 0; perf_do_callchain(regs, entry); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5428ba120d7..90cce0c74a0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -513,10 +513,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -#define MAX_STACK_DEPTH 255 +#define MAX_STACK_DEPTH 254 struct perf_callchain_entry { - u64 nr; + u32 nr, hv, kernel, user; u64 ip[MAX_STACK_DEPTH]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 9bcab10e735..f105a6e696c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1819,7 +1819,7 @@ void perf_counter_output(struct perf_counter *counter, callchain = perf_callchain(regs); if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); + callchain_size = (2 + callchain->nr) * sizeof(u64); header.type |= __PERF_EVENT_CALLCHAIN; header.size += callchain_size; -- cgit v1.2.3-70-g09d2 From 92f22a3865abe87eea2609a6f8e5be5123f7ce4f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:04 +0200 Subject: perf_counter: update mmap() counter read Paul noted that we don't need SMP barriers for the mmap() counter read because it's always on the same cpu (otherwise you can't access the hw counter anyway). So remove the SMP barriers and replace them with regular compiler barriers. Further, update the comment to include a race-free method of reading said hardware counter. The primary change is putting the pmc_read inside the seq-loop, otherwise we can still race and read rubbish.
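[ Made concrete: a compilable user-space version of the documented loop, for x86 self-monitoring; rdpmc and the reduced struct are assumptions of the sketch: ]

#include <stdint.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

static inline uint64_t pmc_read(uint32_t idx)
{
        uint32_t lo, hi;

        __asm__ __volatile__("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx));
        return (uint64_t)hi << 32 | lo;
}

struct mmap_page {                      /* stand-in for perf_counter_mmap_page */
        volatile uint32_t lock;
        volatile uint32_t index;
        volatile int64_t offset;
};

static int read_counter(struct mmap_page *pc, int64_t *count)
{
        uint32_t seq;

again:
        seq = pc->lock;
        if (seq & 1)                    /* update in flight, retry */
                goto again;

        if (!pc->index)
                return -1;              /* not on the PMU: fall back to read() */

        /* the pmc_read stays inside the loop, per the comment above */
        *count = pmc_read(pc->index - 1) + pc->offset;

        barrier();
        if (pc->lock != seq)
                goto again;

        return 0;
}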
Noticed-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.577951445@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 22 ++++++++++------------ kernel/perf_counter.c | 4 ++-- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 90cce0c74a0..f2b914de3f0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -167,30 +167,28 @@ struct perf_counter_mmap_page { /* * Bits needed to read the hw counters in user-space. * - * The index and offset should be read atomically using the seqlock: - * - * __u32 seq, index; - * __s64 offset; + * u32 seq; + * s64 count; * * again: - * rmb(); * seq = pc->lock; - * * if (unlikely(seq & 1)) { * cpu_relax(); * goto again; * } * - * index = pc->index; - * offset = pc->offset; + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; * - * rmb(); + * barrier(); * if (pc->lock != seq) * goto again; * - * After this, index contains architecture specific counter index + 1, - * so that 0 means unavailable, offset contains the value to be added - * to the result of the raw timer read to obtain this counter's value. + * NOTE: for obvious reason this only works on self-monitoring + * processes. */ __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f105a6e696c..2a5d4f52556 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1340,13 +1340,13 @@ void perf_counter_update_userpage(struct perf_counter *counter) */ preempt_disable(); ++userpg->lock; - smp_wmb(); + barrier(); userpg->index = counter->hw.idx; userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); - smp_wmb(); + barrier(); ++userpg->lock; preempt_enable(); unlock: -- cgit v1.2.3-70-g09d2 From 52400ba946759af28442dee6265c5c0180ac7122 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:49 -0700 Subject: futex: add requeue_pi functionality PI Futexes and their underlying rt_mutex cannot be left ownerless if there are pending waiters as this will break the PI boosting logic, so the standard requeue commands aren't sufficient. The new commands properly manage pi futex ownership by ensuring a futex with waiters has an owner at all times. This will allow glibc to properly handle pi mutexes with pthread_condvars. The approach taken here is to create two new futex op codes: FUTEX_WAIT_REQUEUE_PI: Tasks will use this op code to wait on a futex (such as a non-pi waitqueue) and wake after they have been requeued to a pi futex. Prior to returning to userspace, they will acquire this pi futex (and the underlying rt_mutex). futex_wait_requeue_pi() is the result of a high speed collision between futex_wait() and futex_lock_pi() (with the first part of futex_lock_pi() being done by futex_proxy_trylock_atomic() on behalf of the top_waiter). FUTEX_REQUEUE_PI (and FUTEX_CMP_REQUEUE_PI): This call must be used to wake tasks waiting with FUTEX_WAIT_REQUEUE_PI, regardless of how many tasks the caller intends to wake or requeue. pthread_cond_broadcast() should call this with nr_wake=1 and nr_requeue=INT_MAX. pthread_cond_signal() should call this with nr_wake=1 and nr_requeue=0. 
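[ Hypothetical caller-side sketch, following the nr_wake/nr_requeue rules above; the raw syscall wrapper, the argument slots, and the bitset value are assumptions, not part of this patch: ]

#include <limits.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#define FUTEX_WAIT_REQUEUE_PI   11
#define FUTEX_CMP_REQUEUE_PI    13
#define FUTEX_BITSET_MATCH_ANY  0xffffffff

/* raw wrapper; the fourth slot carries either a timeout or nr_requeue */
static long sys_futex(uint32_t *uaddr, int op, uint32_t val,
                      unsigned long timeout_or_val2,
                      uint32_t *uaddr2, uint32_t val3)
{
        return syscall(SYS_futex, uaddr, op, val, timeout_or_val2,
                       uaddr2, val3);
}

/* pthread_cond_wait() style: wait on the condvar futex, come back
 * owning the PI mutex futex (the kernel takes the rt_mutex for us). */
static long cond_wait_pi(uint32_t *cond, uint32_t seq, uint32_t *pi_mutex)
{
        return sys_futex(cond, FUTEX_WAIT_REQUEUE_PI, seq,
                         0 /* no timeout */, pi_mutex,
                         FUTEX_BITSET_MATCH_ANY);
}

/* pthread_cond_broadcast() style: wake one waiter and requeue the
 * rest onto the PI mutex, per the rules above. */
static long cond_broadcast_pi(uint32_t *cond, uint32_t *pi_mutex)
{
        return sys_futex(cond, FUTEX_CMP_REQUEUE_PI, 1 /* nr_wake */,
                         INT_MAX /* nr_requeue */, pi_mutex,
                         *cond /* expected value of *cond */);
}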
The reason being we need both callers to get the benefit of the futex_proxy_trylock_atomic() routine. futex_requeue() also enqueues the top_waiter on the rt_mutex via rt_mutex_start_proxy_lock(). Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- include/linux/futex.h | 8 + include/linux/thread_info.h | 3 +- kernel/futex.c | 519 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 510 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 3bf5bb5a34f..b05519ca9e5 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -23,6 +23,9 @@ union ktime; #define FUTEX_TRYLOCK_PI 8 #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_REQUEUE_PI 12 +#define FUTEX_CMP_REQUEUE_PI 13 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -38,6 +41,11 @@ union ktime; #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) +#define FUTEX_REQUEUE_PI_PRIVATE (FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG) +#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) /* * Support for robust futexes: the kernel cleans up held futexes at diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e6b820f8b56..a8cc4e13434 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -21,13 +21,14 @@ struct restart_block { struct { unsigned long arg0, arg1, arg2, arg3; }; - /* For futex_wait */ + /* For futex_wait and futex_wait_requeue_pi */ struct { u32 *uaddr; u32 val; u32 flags; u32 bitset; u64 time; + u32 *uaddr2; } futex; /* For nanosleep */ struct { diff --git a/kernel/futex.c b/kernel/futex.c index dbe857aa438..185c981d89e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -19,6 +19,10 @@ * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet * + * Requeue-PI support by Darren Hart + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -109,6 +113,9 @@ struct futex_q { struct futex_pi_state *pi_state; struct task_struct *task; + /* rt_waiter storage for requeue_pi: */ + struct rt_mutex_waiter *rt_waiter; + /* Bitset for the optional bitmasked wakeup */ u32 bitset; }; @@ -827,7 +834,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { - if (this->pi_state) { + if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } @@ -968,20 +975,138 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, q->key = *key2; } -/* - * Requeue all waiters hashed on one physical page to another - * physical page. +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * q: the futex_q + * key: the key of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. 
Set the futex_q key + to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition. Must be called with the q->lock_ptr held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +{ + drop_futex_key_refs(&q->key); + get_futex_key_refs(key); + q->key = *key; + + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + wake_up(&q->waiter); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller. + * + * Returns: + * 0 - failed to acquire the lock atomically + * 1 - acquired the lock + * <0 - error + */ +static int futex_proxy_trylock_atomic(u32 __user *pifutex, + struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, + union futex_key *key1, union futex_key *key2, + struct futex_pi_state **ps) +{ + struct futex_q *top_waiter; + u32 curval; + int ret; + + if (get_futex_value_locked(&curval, pifutex)) + return -EFAULT; + + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* + * Either take the lock for top_waiter or set the FUTEX_WAITERS bit. + * The pi_state is returned in ps in contended cases. + */ + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task); + if (ret == 1) + requeue_pi_wake_futex(top_waiter, key2); + + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * uaddr1: source futex user address + * uaddr2: target futex user address + * nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * nr_requeue: number of waiters to requeue (0-INT_MAX) + * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Returns: + * >=0 - on success, the number of tasks requeued or woken + * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval) + int nr_wake, int nr_requeue, u32 *cmpval, + int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int drop_count = 0, task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; - int ret, drop_count = 0; + u32 curval2; + + if (requeue_pi) { + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + /* + * requeue_pi must wake as many tasks as it can, up to nr_wake + * + nr_requeue, since it acquires the rt_mutex prior to + * returning to userspace, so as to not leave the rt_mutex with + * waiters and no owner.
However, second and third wake-ups + * cannot be predicted as they involve race conditions with the + * first wake and a fault while looking up the pi_state. Both + * pthread_cond_signal() and pthread_cond_broadcast() should + * use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + } retry: + if (pi_state != NULL) { + /* + * We will have to lookup the pi_state again, so free this one + * to keep the accounting correct. + */ + free_pi_state(pi_state); + pi_state = NULL; + } + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -1020,19 +1145,94 @@ retry_private: } } + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + /* Attempt to acquire uaddr2 and wake the top_waiter. */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state); + + /* + * At this point the top_waiter has either taken uaddr2 or is + * waiting on it. If the former, then the pi_state will not + * exist yet, look it up one more time to ensure we have a + * reference to it. + */ + if (ret == 1) { + WARN_ON(pi_state); + task_count++; + ret = get_futex_value_locked(&curval2, uaddr2); + if (!ret) + ret = lookup_pi_state(curval2, hb2, &key2, + &pi_state); + } + + switch (ret) { + case 0: + break; + case -EFAULT: + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + ret = get_user(curval2, uaddr2); + if (!ret) + goto retry; + goto out; + case -EAGAIN: + /* The owner was exiting, try again. */ + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + head1 = &hb1->chain; plist_for_each_entry_safe(this, next, head1, list) { - if (!match_futex (&this->key, &key1)) + if (task_count - nr_wake >= nr_requeue) + break; + + if (!match_futex(&this->key, &key1)) continue; - if (++ret <= nr_wake) { + + WARN_ON(!requeue_pi && this->rt_waiter); + WARN_ON(requeue_pi && !this->rt_waiter); + + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. If not, it will be + * woken by futex_unlock_pi(). + */ + if (++task_count <= nr_wake && !requeue_pi) { wake_futex(this); - } else { - requeue_futex(this, hb1, hb2, &key2); - drop_count++; + continue; + } - if (ret - nr_wake >= nr_requeue) - break; + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + */ + if (requeue_pi) { + /* Prepare the waiter to take the rt_mutex. */ + atomic_inc(&pi_state->refcount); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task, 1); + if (ret == 1) { + /* We got the lock. */ + requeue_pi_wake_futex(this, &key2); + continue; + } else if (ret) { + /* -EDEADLK */ + this->pi_state = NULL; + free_pi_state(pi_state); + goto out_unlock; + } } + requeue_futex(this, hb1, hb2, &key2); + drop_count++; } out_unlock: @@ -1047,7 +1247,9 @@ out_put_keys: out_put_key1: put_futex_key(fshared, &key1); out: - return ret; + if (pi_state != NULL) + free_pi_state(pi_state); + return ret ? ret : task_count; } /* The key must be already stored in q->key. 
*/ @@ -1270,6 +1472,7 @@ handle_fault: #define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); +static long futex_lock_pi_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management @@ -1489,6 +1692,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; + q.rt_waiter = NULL; if (abs_time) { to = &timeout; @@ -1596,6 +1800,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; + q.rt_waiter = NULL; retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); @@ -1701,6 +1906,20 @@ uaddr_faulted: goto retry; } +static long futex_lock_pi_restart(struct restart_block *restart) +{ + u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + ktime_t t, *tp = NULL; + int fshared = restart->futex.flags & FLAGS_SHARED; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } + restart->fn = do_no_restart_syscall; + + return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0); +} /* * Userspace attempted a TID -> 0 atomic transition, and failed. @@ -1803,6 +2022,253 @@ pi_faulted: return ret; } +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb: the hash_bucket futex_q was originally enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Returns: + * 0 - no early wakeup detected + * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?) + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &q->list.plist); + drop_futex_key_refs(&q->key); + + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else { + /* + * We expect signal_pending(current), but another + * thread may have handled it for us already. + */ + /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if + * the user specified SA_RESTART or not? */ + ret = -ERESTARTSYS; + } + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initially wait on (non-pi) + * @fshared: whether the futexes are shared (1) or not (0). They must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
+ * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and + * complete the acquisition of the rt_mutex prior to returning to userspace. + * This ensures the rt_mutex maintains an owner when it has waiters; without + * one, the pi logic wouldn't know which task to boost/deboost, if there was a + * need to. + * + * We call schedule() in futex_wait_queue_me() when we enqueue and return there + * via the following: + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue and subsequent unlock + * 3) signal (before or after requeue) + * 4) timeout (before or after requeue) + * + * If 3, we set up a restart_block with futex_wait_requeue_pi() as the function. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, we set up a restart_block with futex_lock_pi() as the function. + * + * If 4 or 7, we clean up and return with -ETIMEDOUT. + * + * Returns: + * 0 - On success + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, + int clockrt, u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct rt_mutex_waiter rt_waiter; + struct rt_mutex *pi_mutex = NULL; + DECLARE_WAITQUEUE(wait, current); + struct restart_block *restart; + struct futex_hash_bucket *hb; + union futex_key key2; + struct futex_q q; + int res, ret; + u32 uval; + + if (!bitset) + return -EINVAL; + + if (abs_time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + debug_rt_mutex_init_waiter(&rt_waiter); + rt_waiter.task = NULL; + + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + + key2 = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr2, fshared, &key2); + if (unlikely(ret != 0)) + goto out; + + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) { + put_futex_key(fshared, &key2); + goto out; + } + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to, &wait); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquisition by the requeue code. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case.
+ */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current, + fshared); + spin_unlock(q.lock_ptr); + } + } else { + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. + */ + WARN_ON(!&q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + debug_rt_mutex_free_waiter(&rt_waiter); + + spin_lock(q.lock_ptr); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, propagate that. If it + * acquired the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock the rt_mutex and return the fault to userspace. + */ + if (ret == -EFAULT) { + if (rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + } else if (ret == -EINTR) { + ret = -EFAULT; + if (get_user(uval, uaddr2)) + goto out_put_keys; + + /* + * We've already been requeued, so restart by calling + * futex_lock_pi() directly, rather than returning to this + * function. + */ + ret = -ERESTART_RESTARTBLOCK; + restart = &current_thread_info()->restart_block; + restart->fn = futex_lock_pi_restart; + restart->futex.uaddr = (u32 *)uaddr2; + restart->futex.val = uval; + restart->futex.flags = 0; + if (abs_time) { + restart->futex.flags |= FLAGS_HAS_TIMEOUT; + restart->futex.time = abs_time->tv64; + } + + if (fshared) + restart->futex.flags |= FLAGS_SHARED; + if (clockrt) + restart->futex.flags |= FLAGS_CLOCKRT; + } + +out_put_keys: + put_futex_key(fshared, &q.key); + put_futex_key(fshared, &key2); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. */
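Taken together, the new operations are meant to be driven from userspace roughly as follows. This is an illustrative sketch only, not part of the patch: the op numbers are assumptions (take the real ones from the futex header this series extends), the raw futex(2) syscall stands in for whatever glibc glue eventually lands, and cond_seq / mutex_val stand in for a condvar's non-pi futex and its pi mutex futex.

	#include <limits.h>
	#include <stdint.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef FUTEX_WAIT_REQUEUE_PI
	#define FUTEX_WAIT_REQUEUE_PI	11	/* assumed op number */
	#endif
	#ifndef FUTEX_CMP_REQUEUE_PI
	#define FUTEX_CMP_REQUEUE_PI	12	/* assumed op number */
	#endif

	/* Wait on the non-pi futex; the kernel requeues us onto the pi futex. */
	static long cond_wait_requeue_pi(uint32_t *cond_seq, uint32_t seq,
					 uint32_t *mutex_val)
	{
		/* val3 is ignored here: do_futex() forces FUTEX_BITSET_MATCH_ANY. */
		return syscall(SYS_futex, cond_seq, FUTEX_WAIT_REQUEUE_PI,
			       seq, NULL /* no timeout */, mutex_val, 0);
	}

	/*
	 * Wake exactly one waiter (nr_wake must be 1) and requeue the rest
	 * onto the pi futex. nr_requeue travels in the utime slot, and val3
	 * carries the expected value of the non-pi futex for the CMP check.
	 */
	static long cond_broadcast_requeue_pi(uint32_t *cond_seq, uint32_t seq,
					      uint32_t *mutex_val)
	{
		return syscall(SYS_futex, cond_seq, FUTEX_CMP_REQUEUE_PI,
			       1, (unsigned long)INT_MAX, mutex_val, seq);
	}

The do_futex() dispatch below is what routes these two calls to futex_wait_requeue_pi() and futex_requeue().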
@@ -2025,7 +2491,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, fshared = 1; clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET) + if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; switch (cmd) { @@ -2040,10 +2506,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake(uaddr, fshared, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 0); break; case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); @@ -2060,6 +2527,18 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (futex_cmpxchg_enabled) ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, + clockrt, uaddr2); + break; + case FUTEX_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 1); + break; default: ret = -ENOSYS; } @@ -2077,7 +2556,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) @@ -2089,10 +2569,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } /* - * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; -- cgit v1.2.3-70-g09d2 From a2e87d06ddbe6e6fdb8d6d2e5e985efe4efb07dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:44:59 +0200 Subject: perf_counter: update mmap() counter read, take 2 Update the userspace read method. Paul noted that: - userspace cannot observe ->lock & 1 on the same cpu. - we need a barrier() between reading ->lock and ->index to ensure we read them in that particular order.
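Rendered as compilable C, the reader the new comment describes might look like the sketch below. Only the loop structure comes from the patch: pmc_read() here is x86 rdpmc, regular_read() stands for falling back to read() on the counter fd, and the struct is abbreviated to the three fields the loop touches.

	#include <stdint.h>

	#define barrier()	asm volatile("" ::: "memory")

	struct mmap_page {		/* abbreviated perf_counter_mmap_page */
		uint32_t lock;		/* seqcount, even while stable */
		uint32_t index;		/* hw counter index + 1; 0 = not active */
		int64_t  offset;	/* add to the hw counter value */
	};

	static inline uint64_t pmc_read(uint32_t idx)	/* x86; needs CR4.PCE */
	{
		uint32_t lo, hi;

		asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx));
		return (uint64_t)lo | ((uint64_t)hi << 32);
	}

	extern int64_t regular_read(void);	/* fall back to read() on the fd */

	static int64_t counter_read(volatile struct mmap_page *pc)
	{
		uint32_t seq;
		int64_t count;

		do {
			seq = pc->lock;
			barrier();	/* read ->lock before ->index */

			if (!pc->index)
				return regular_read();
			count = pmc_read(pc->index - 1) + pc->offset;

			barrier();	/* finish the reads before re-checking */
		} while (pc->lock != seq);

		return count;
	}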
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.368446033@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f2b914de3f0..e22ab47a2f4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -170,22 +170,18 @@ struct perf_counter_mmap_page { * u32 seq; * s64 count; * - * again: - * seq = pc->lock; - * if (unlikely(seq & 1)) { - * cpu_relax(); - * goto again; - * } + * do { + * seq = pc->lock; * - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; * - * barrier(); - * if (pc->lock != seq) - * goto again; + * barrier(); + * } while (pc->lock != seq); * * NOTE: for obvious reason this only works on self-monitoring * processes. -- cgit v1.2.3-70-g09d2 From 9c03d88e328d5f28f13191622c2ea1349c36b799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:00 +0200 Subject: perf_counter: add more context information Change the callchain context entries to u16, so as to gain some space. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.457320003@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e22ab47a2f4..f9d5cf0bfbd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -507,10 +507,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -#define MAX_STACK_DEPTH 254 +#define MAX_STACK_DEPTH 255 struct perf_callchain_entry { - u32 nr, hv, kernel, user; + u16 nr, hv, kernel, user; u64 ip[MAX_STACK_DEPTH]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2a5d4f52556..727624db507 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1657,9 +1657,7 @@ void perf_counter_do_pending(void) * Callchain support -- arch specific */ -struct perf_callchain_entry * -__attribute__((weak)) -perf_callchain(struct pt_regs *regs) +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) { return NULL; } @@ -1819,7 +1817,7 @@ void perf_counter_output(struct perf_counter *counter, callchain = perf_callchain(regs); if (callchain) { - callchain_size = (2 + callchain->nr) * sizeof(u64); + callchain_size = (1 + callchain->nr) * sizeof(u64); header.type |= __PERF_EVENT_CALLCHAIN; header.size += callchain_size; -- cgit v1.2.3-70-g09d2 From 3c446b3d3b38f991f97e9d2df0ad26a60a94dcff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:01 +0200 Subject: perf_counter: SIGIO support Provide support for fcntl() I/O availability signals. 
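On the user side this is the stock O_ASYNC dance; a sketch, assuming the counter fd comes from the perf_counter_open() syscall:

	#include <fcntl.h>
	#include <signal.h>
	#include <string.h>
	#include <unistd.h>

	static void on_sigio(int sig)
	{
		/* Counter data is available; defer the actual read()/mmap scan. */
	}

	static int arm_counter_sigio(int counter_fd)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_handler = on_sigio;
		sigemptyset(&sa.sa_mask);
		if (sigaction(SIGIO, &sa, NULL) == -1)
			return -1;

		/* Route SIGIO for this fd to us ... */
		if (fcntl(counter_fd, F_SETOWN, getpid()) == -1)
			return -1;

		/* ... and switch the fd to async notification. */
		return fcntl(counter_fd, F_SETFL,
			     fcntl(counter_fd, F_GETFL) | O_ASYNC);
	}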
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.579788800@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f9d5cf0bfbd..8d5d11b8d01 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -238,6 +238,7 @@ enum perf_event_type { #include #include #include +#include #include struct task_struct; @@ -398,6 +399,7 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; + struct fasync_struct *fasync; /* optional: for NMIs */ struct perf_wakeup_entry wakeup; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 727624db507..c58cc64319e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1526,6 +1526,22 @@ out: return ret; } +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct perf_counter *counter = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &counter->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, @@ -1533,6 +1549,7 @@ static const struct file_operations perf_fops = { .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_ioctl, .mmap = perf_mmap, + .fasync = perf_fasync, }; /* @@ -1549,7 +1566,7 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_lock(); data = rcu_dereference(counter->data); if (data) { - (void)atomic_xchg(&data->wakeup, POLL_IN); + atomic_set(&data->wakeup, POLL_IN); /* * Ensure all data writes are issued before updating the * user-space data head information. The matching rmb() @@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_unlock(); wake_up_all(&counter->waitq); + kill_fasync(&counter->fasync, SIGIO, POLL_IN); } /* -- cgit v1.2.3-70-g09d2 From 671dec5daf3b3c43c5777be282f00120a44cf37f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:02 +0200 Subject: perf_counter: generalize pending infrastructure Prepare the pending infrastructure to do more than wakeups. 
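The structure at the heart of this patch is a per-CPU singly linked stack, terminated by a sentinel so that next == NULL can mean "not queued", and pushed with cmpxchg() so NMI context never needs a lock. The same machinery can be seen in isolation in the following userspace rendering with C11 atomics (the names and the single global head are ours):

	#include <stdatomic.h>
	#include <stddef.h>

	struct pending_entry {
		_Atomic(struct pending_entry *) next;
		void (*func)(struct pending_entry *);
	};

	/* Sentinel tail: a queued entry never has next == NULL. */
	#define PENDING_TAIL	((struct pending_entry *)-1UL)

	static _Atomic(struct pending_entry *) pending_head = PENDING_TAIL;

	static void pending_queue(struct pending_entry *entry,
				  void (*func)(struct pending_entry *))
	{
		struct pending_entry *free_mark = NULL;
		struct pending_entry *head;

		/* Claim the entry; if it is already queued, nothing to do. */
		if (!atomic_compare_exchange_strong(&entry->next, &free_mark,
						    PENDING_TAIL))
			return;

		entry->func = func;

		/* Lockless push: retry until our cmpxchg on the head wins. */
		head = atomic_load(&pending_head);
		do {
			atomic_store(&entry->next, head);
		} while (!atomic_compare_exchange_weak(&pending_head, &head,
						       entry));
	}

	static void pending_run(void)
	{
		struct pending_entry *list =
			atomic_exchange(&pending_head, PENDING_TAIL);

		while (list != PENDING_TAIL) {
			struct pending_entry *entry = list;

			list = atomic_load(&entry->next);
			/* Mark free before calling out, so func() may requeue. */
			atomic_store(&entry->next, NULL);
			entry->func(entry);
		}
	}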
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.634732847@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++--- kernel/perf_counter.c | 53 ++++++++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8d5d11b8d01..977fb15a53f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -321,8 +321,9 @@ struct perf_mmap_data { void *data_pages[0]; }; -struct perf_wakeup_entry { - struct perf_wakeup_entry *next; +struct perf_pending_entry { + struct perf_pending_entry *next; + void (*func)(struct perf_pending_entry *); }; /** @@ -401,7 +402,7 @@ struct perf_counter { wait_queue_head_t waitq; struct fasync_struct *fasync; /* optional: for NMIs */ - struct perf_wakeup_entry wakeup; + struct perf_pending_entry pending; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c58cc64319e..0a2ade2e4f1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1581,6 +1581,14 @@ void perf_counter_wakeup(struct perf_counter *counter) kill_fasync(&counter->fasync, SIGIO, POLL_IN); } +static void perf_pending_wakeup(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + perf_counter_wakeup(counter); +} + /* * Pending wakeups * @@ -1590,45 +1598,47 @@ void perf_counter_wakeup(struct perf_counter *counter) * single linked list and use cmpxchg() to add entries lockless. */ -#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL) +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) -static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = { +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { PENDING_TAIL, }; -static void perf_pending_queue(struct perf_counter *counter) +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) { - struct perf_wakeup_entry **head; - struct perf_wakeup_entry *prev, *next; + struct perf_pending_entry **head; - if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL) + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) return; - head = &get_cpu_var(perf_wakeup_head); + entry->func = func; + + head = &get_cpu_var(perf_pending_head); do { - prev = counter->wakeup.next = *head; - next = &counter->wakeup; - } while (cmpxchg(head, prev, next) != prev); + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); set_perf_counter_pending(); - put_cpu_var(perf_wakeup_head); + put_cpu_var(perf_pending_head); } static int __perf_pending_run(void) { - struct perf_wakeup_entry *list; + struct perf_pending_entry *list; int nr = 0; - list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL); + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); while (list != PENDING_TAIL) { - struct perf_counter *counter = container_of(list, - struct perf_counter, wakeup); + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; list = list->next; - counter->wakeup.next = NULL; + func = entry->func; + entry->next = NULL; /* * Ensure we observe the unqueue before we issue the wakeup, * so that we won't be waiting forever. 
@@ -1636,7 +1646,7 @@ static int __perf_pending_run(void) */ smp_wmb(); - perf_counter_wakeup(counter); + func(entry); nr++; } @@ -1658,7 +1668,7 @@ static inline int perf_not_pending(struct perf_counter *counter) * so that we do not miss the wakeup. -- see perf_pending_handle() */ smp_rmb(); - return counter->wakeup.next == NULL; + return counter->pending.next == NULL; } static void perf_pending_sync(struct perf_counter *counter) @@ -1695,9 +1705,10 @@ struct perf_output_handle { static inline void __perf_output_wakeup(struct perf_output_handle *handle) { - if (handle->nmi) - perf_pending_queue(handle->counter); - else + if (handle->nmi) { + perf_pending_queue(&handle->counter->pending, + perf_pending_wakeup); + } else perf_counter_wakeup(handle->counter); } -- cgit v1.2.3-70-g09d2 From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: perf_counter: there's more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a non-zero value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 3 ++- include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 29 +++++++++++++++++++++++------ 4 files changed, 28 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0a4d14f279a..f88c35d0710 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -732,7 +732,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested.
*/ if (record) - perf_counter_output(counter, 1, regs); + perf_counter_overflow(counter, 1, regs); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 438415866fe..1116a41bc7b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,8 @@ again: continue; perf_save_and_restart(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + __pmc_generic_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 977fb15a53f..ca2d4df29e0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -491,8 +491,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_counter_context *ctx, int cpu); extern void perf_counter_update_userpage(struct perf_counter *counter); -extern void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs); +extern int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0a2ade2e4f1..195e976eb07 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1800,8 +1800,8 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +static void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) { int ret; u64 record_type = counter->hw_event.record_type; @@ -2033,6 +2033,17 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } +/* + * Generic counter overflow handling. + */ + +int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_counter_output(counter, nmi, regs); + return 0; +} + /* * Generic software counter infrastructure */ @@ -2077,6 +2088,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { + enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; @@ -2092,12 +2104,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) !counter->hw_event.exclude_user) regs = task_pt_regs(current); - if (regs) - perf_counter_output(counter, 0, regs); + if (regs) { + if (perf_counter_overflow(counter, 0, regs)) + ret = HRTIMER_NORESTART; + } hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); - return HRTIMER_RESTART; + return ret; } static void perf_swcounter_overflow(struct perf_counter *counter, @@ -2105,7 +2119,10 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + /* soft-disable the counter */ + ; + } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3-70-g09d2 From 339f7c90b8a2f3aa2dd4267e79f797999e8a3c59 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:06 +0200 Subject: perf_counter: PERF_RECORD_TIME By popular request, provide means to log a timestamp along with the counter overflow event. 
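On the reader side the timestamp, when requested, is the final u64 of an overflow record (the full record layout is spelled out in the perf_event_type comment added a few patches later), so it can be peeled off the end once the header is known. A sketch of ours:

	#include <stdint.h>

	struct perf_event_header {		/* as in the perf_counter ABI */
		uint32_t type;
		uint32_t size;
	};

	#define RECORD_TIME_BIT	(1U << 4)	/* __PERF_EVENT_TIME, per the patch */

	/* Returns 1 and stores the timestamp if the record carries one. */
	static int record_time(const struct perf_event_header *hdr,
			       uint64_t *time)
	{
		const unsigned char *rec = (const unsigned char *)hdr;

		if (!(hdr->type & RECORD_TIME_BIT))
			return 0;

		/* PERF_RECORD_TIME data is emitted last, after any callchain. */
		*time = *(const uint64_t *)(rec + hdr->size - sizeof(uint64_t));
		return 1;
	}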
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.024173282@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ca2d4df29e0..928a7fae096 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -102,6 +102,7 @@ enum perf_counter_record_format { PERF_RECORD_TID = 1U << 1, PERF_RECORD_GROUP = 1U << 2, PERF_RECORD_CALLCHAIN = 1U << 3, + PERF_RECORD_TIME = 1U << 4, }; /* @@ -221,6 +222,7 @@ enum perf_event_type { __PERF_EVENT_TID = PERF_RECORD_TID, __PERF_EVENT_GROUP = PERF_RECORD_GROUP, __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, + __PERF_EVENT_TIME = PERF_RECORD_TIME, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c841563de04..19990d1f021 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1826,6 +1826,7 @@ static void perf_counter_output(struct perf_counter *counter, } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; + u64 time; header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); @@ -1862,6 +1863,16 @@ static void perf_counter_output(struct perf_counter *counter, } } + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= __PERF_EVENT_TIME; + header.size += sizeof(u64); + } + ret = perf_output_begin(&handle, counter, header.size, nmi); if (ret) return; @@ -1895,6 +1906,9 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) perf_output_copy(&handle, callchain, callchain_size); + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + perf_output_end(&handle); } -- cgit v1.2.3-70-g09d2 From 79f146415623fe74f39af67c0f6adc208939a410 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:07 +0200 Subject: perf_counter: counter overflow limit Provide means to auto-disable the counter after 'n' overflow events. Create the counter with hw_event.disabled = 1, and then issue an ioctl(fd, PERF_COUNTER_IOC_REFRESH, n); to set the limit and enable the counter.
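From userspace the intended pattern composes with poll(); a hypothetical sketch, assuming PERF_COUNTER_IOC_REFRESH comes from the patched <linux/perf_counter.h>, that the fd was created with hw_event.disabled = 1, and that hw_event.wakeup_events = 1 so every overflow posts a wakeup:

	#include <poll.h>
	#include <sys/ioctl.h>

	static int count_n_overflows(int counter_fd, int n)
	{
		struct pollfd pfd = { .fd = counter_fd, .events = POLLIN };

		/* Arm the limit; this also enables the (disabled) counter. */
		if (ioctl(counter_fd, PERF_COUNTER_IOC_REFRESH, n) == -1)
			return -1;

		/*
		 * After the n-th overflow the kernel auto-disables the
		 * counter, so the loop is naturally bounded.
		 */
		while (n--) {
			if (poll(&pfd, 1, -1) == -1)
				return -1;
			/* consume overflow records from the mmap buffer here */
		}
		return 0;
	}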
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.083139737@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 12 ++++++++--- kernel/perf_counter.c | 51 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 928a7fae096..ef4dcbff75a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -155,8 +155,9 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) /* * Structure of the page that can be mapped via mmap @@ -403,9 +404,14 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; - /* optional: for NMIs */ + + /* delayed work for NMIs and such */ + int pending_wakeup; + int pending_disable; struct perf_pending_entry pending; + atomic_t event_limit; + void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 19990d1f021..c05e10354bc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -744,6 +744,12 @@ static void perf_counter_enable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } +static void perf_counter_refresh(struct perf_counter *counter, int refresh) +{ + atomic_add(refresh, &counter->event_limit); + perf_counter_enable(counter); +} + /* * Enable a counter and all its children. */ @@ -1311,6 +1317,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_DISABLE: perf_counter_disable_family(counter); break; + case PERF_COUNTER_IOC_REFRESH: + perf_counter_refresh(counter, arg); + break; default: err = -ENOTTY; } @@ -1590,14 +1599,6 @@ void perf_counter_wakeup(struct perf_counter *counter) kill_fasync(&counter->fasync, SIGIO, POLL_IN); } -static void perf_pending_wakeup(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - perf_counter_wakeup(counter); -} - /* * Pending wakeups * @@ -1607,6 +1608,22 @@ static void perf_pending_wakeup(struct perf_pending_entry *entry) * single linked list and use cmpxchg() to add entries lockless. 
*/ +static void perf_pending_counter(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + if (counter->pending_disable) { + counter->pending_disable = 0; + perf_counter_disable(counter); + } + + if (counter->pending_wakeup) { + counter->pending_wakeup = 0; + perf_counter_wakeup(counter); + } +} + #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { @@ -1715,8 +1732,9 @@ struct perf_output_handle { static inline void __perf_output_wakeup(struct perf_output_handle *handle) { if (handle->nmi) { + handle->counter->pending_wakeup = 1; perf_pending_queue(&handle->counter->pending, - perf_pending_wakeup); + perf_pending_counter); } else perf_counter_wakeup(handle->counter); } @@ -2063,8 +2081,21 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, int perf_counter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs) { + int events = atomic_read(&counter->event_limit); + int ret = 0; + + if (events && atomic_dec_and_test(&counter->event_limit)) { + ret = 1; + if (nmi) { + counter->pending_disable = 1; + perf_pending_queue(&counter->pending, + perf_pending_counter); + } else + perf_counter_disable(counter); + } + perf_counter_output(counter, nmi, regs); - return 0; + return ret; } /* -- cgit v1.2.3-70-g09d2 From 0c593b3411341e3a05a61f5527df36ab02bd11e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:08 +0200 Subject: perf_counter: comment the perf_event_type stuff Describe the event format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.211174347@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ef4dcbff75a..81220188d05 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -208,6 +208,20 @@ struct perf_event_header { enum perf_event_type { + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ PERF_EVENT_MMAP = 1, PERF_EVENT_MUNMAP = 2, @@ -217,6 +231,24 @@ enum perf_event_type { * * These events will have types of the form: * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + * + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && __PERF_EVENT_IP + * { u32 pid, tid; } && __PERF_EVENT_TID + * + * { u64 nr; + * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP + * + * { u16 nr, + * hv, + * kernel, + * user; + * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN + * + * { u64 time; } && __PERF_EVENT_TIME + * }; */ PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, __PERF_EVENT_IP = PERF_RECORD_IP, -- cgit v1.2.3-70-g09d2 From 4c9e25428ff46b968a30f1dfafdba550cb6e4141 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:09 +0200 Subject: perf_counter: change event definition Currently the definition of an event is slightly ambiguous. We have wakeup events, for poll() and SIGIO, which are either generated when a record crosses a page boundary (hw_events.wakeup_events == 0), or every wakeup_events new records. 
Now a record can be either a counter overflow record, or a number of different things, like the mmap PROT_EXEC region notifications. Then there is the PERF_COUNTER_IOC_REFRESH event limit, which only considers counter overflows. This patch changes the wakeup_events and SIGIO notification to only consider overflow events. Furthermore, it changes the SIGIO notification to report SIGHUP when the event limit is reached and the counter will be disabled. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.266679874@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 22 +++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 81220188d05..0f5a4005048 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -439,6 +439,7 @@ struct perf_counter { /* delayed work for NMIs and such */ int pending_wakeup; + int pending_kill; int pending_disable; struct perf_pending_entry pending; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c05e10354bc..8c8eaf0625f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1596,7 +1596,11 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_unlock(); wake_up_all(&counter->waitq); - kill_fasync(&counter->fasync, SIGIO, POLL_IN); + + if (counter->pending_kill) { + kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); + counter->pending_kill = 0; + } } /* @@ -1727,6 +1731,7 @@ struct perf_output_handle { unsigned int head; int wakeup; int nmi; + int overflow; }; static inline void __perf_output_wakeup(struct perf_output_handle *handle) @@ -1741,7 +1746,7 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle) static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, - int nmi) + int nmi, int overflow) { struct perf_mmap_data *data; unsigned int offset, head; @@ -1751,8 +1756,9 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; - handle->counter = counter; - handle->nmi = nmi; + handle->counter = counter; + handle->nmi = nmi; + handle->overflow = overflow; if (!data->nr_pages) goto fail; @@ -1816,7 +1822,7 @@ static void perf_output_end(struct perf_output_handle *handle) { int wakeup_events = handle->counter->hw_event.wakeup_events; - if (wakeup_events) { + if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&handle->data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &handle->data->events); @@ -1891,7 +1897,7 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } - ret = perf_output_begin(&handle, counter, header.size, nmi); + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -1955,7 +1961,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, { struct perf_output_handle handle; int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0); + int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; @@ -2084,8 +2090,10 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, int events = atomic_read(&counter->event_limit); int ret = 0; + counter->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&counter->event_limit)) { ret = 1; + counter->pending_kill = POLL_HUP; if (nmi) {
counter->pending_disable = 1; perf_pending_queue(&counter->pending, -- cgit v1.2.3-70-g09d2 From 4af4998b8aa35600f4c4a4f3c3a23baca6081d02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:10 +0200 Subject: perf_counter: rework context time Since perf_counter_context is switched along with tasks, we can maintain the context time without using the task runtime clock. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.353552838@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++---- kernel/perf_counter.c | 78 +++++++++++++++++++------------------------- 2 files changed, 37 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0f5a4005048..7f5d353d78a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -477,14 +477,10 @@ struct perf_counter_context { struct task_struct *task; /* - * time_now is the current time in nanoseconds since an arbitrary - * point in the past. For per-task counters, this is based on the - * task clock, and for per-cpu counters it is based on the cpu clock. - * time_lost is an offset from the task/cpu clock, used to make it - * appear that time only passes while the context is scheduled in. + * Context clock, runs when context enabled. */ - u64 time_now; - u64 time_lost; + u64 time; + u64 timestamp; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8c8eaf0625f..84d85ab4e16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -117,7 +117,7 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_stopped = ctx->time_now; + counter->tstamp_stopped = ctx->time; counter->hw_ops->disable(counter); counter->oncpu = -1; @@ -253,27 +253,20 @@ retry: spin_unlock_irq(&ctx->lock); } -/* - * Get the current time for this context. - * If this is a task context, we use the task's task clock, - * or for a per-cpu context, we use the cpu clock. - */ -static u64 get_context_time(struct perf_counter_context *ctx, int update) +static inline u64 perf_clock(void) { - struct task_struct *curr = ctx->task; - - if (!curr) - return cpu_clock(smp_processor_id()); - - return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime; + return cpu_clock(smp_processor_id()); } /* * Update the record of the current time in a context. 
*/ -static void update_context_time(struct perf_counter_context *ctx, int update) +static void update_context_time(struct perf_counter_context *ctx) { - ctx->time_now = get_context_time(ctx, update) - ctx->time_lost; + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; } /* @@ -284,15 +277,17 @@ static void update_counter_times(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; u64 run_end; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - counter->total_time_enabled = ctx->time_now - - counter->tstamp_enabled; - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time_now; - counter->total_time_running = run_end - counter->tstamp_running; - } + if (counter->state < PERF_COUNTER_STATE_INACTIVE) + return; + + counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time; + + counter->total_time_running = run_end - counter->tstamp_running; } /* @@ -332,7 +327,7 @@ static void __perf_counter_disable(void *info) * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx, 1); + update_context_time(ctx); update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); @@ -426,7 +421,7 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } - counter->tstamp_running += ctx->time_now - counter->tstamp_stopped; + counter->tstamp_running += ctx->time - counter->tstamp_stopped; if (!is_software_counter(counter)) cpuctx->active_oncpu++; @@ -493,9 +488,9 @@ static void add_counter_to_ctx(struct perf_counter *counter, list_add_counter(counter, ctx); ctx->nr_counters++; counter->prev_state = PERF_COUNTER_STATE_OFF; - counter->tstamp_enabled = ctx->time_now; - counter->tstamp_running = ctx->time_now; - counter->tstamp_stopped = ctx->time_now; + counter->tstamp_enabled = ctx->time; + counter->tstamp_running = ctx->time; + counter->tstamp_stopped = ctx->time; } /* @@ -522,7 +517,7 @@ static void __perf_install_in_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - update_context_time(ctx, 1); + update_context_time(ctx); /* * Protect the list operation against NMI by disabling the @@ -648,13 +643,13 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - update_context_time(ctx, 1); + update_context_time(ctx); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled; + counter->tstamp_enabled = ctx->time - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -737,8 +732,8 @@ static void perf_counter_enable(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - - counter->total_time_enabled; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; } out: spin_unlock_irq(&ctx->lock); @@ -778,7 +773,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; - update_context_time(ctx, 0); + update_context_time(ctx); flags = hw_perf_save_disable(); if 
(ctx->nr_active) { @@ -883,12 +878,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; - /* - * Add any time since the last sched_out to the lost time - * so it doesn't get included in the total_time_enabled and - * total_time_running measures for counters in the context. - */ - ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now; + ctx->timestamp = perf_clock(); flags = hw_perf_save_disable(); @@ -1043,8 +1033,8 @@ int perf_counter_task_enable(void) if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - - counter->total_time_enabled; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -1113,7 +1103,7 @@ static void __read(void *info) curr_rq_lock_irq_save(&flags); if (ctx->is_active) - update_context_time(ctx, 1); + update_context_time(ctx); counter->hw_ops->read(counter); update_counter_times(counter); curr_rq_unlock_irq_restore(&flags); -- cgit v1.2.3-70-g09d2 From 849691a6cd40270ff5f4a8846d5f6bf8df663ffc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:12 +0200 Subject: perf_counter: remove rq->lock usage Now that all the task runtime clock users are gone, remove the ugly rq->lock usage from perf counters, which solves the nasty deadlock seen when a software task clock counter was read from an NMI overflow context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.531137582@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 2 -- kernel/perf_counter.c | 42 ++++++++++++++++-------------------------- kernel/sched.c | 20 -------------------- 3 files changed, 16 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index b6d2887a5d8..080d1fd461d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -85,8 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq) /* * Lock/unlock the current runqueue - to extract task statistics: */ -extern void curr_rq_lock_irq_save(unsigned long *flags); -extern void curr_rq_unlock_irq_restore(unsigned long *flags); extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 56b7eb53d67..f4f7596f784 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -172,8 +172,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); counter_sched_out(counter, cpuctx, ctx); @@ -198,8 +197,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -319,8 +317,7 @@ static void __perf_counter_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * If the counter is on, turn it off. 
@@ -336,8 +333,7 @@ static void __perf_counter_disable(void *info) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -515,8 +511,7 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); update_context_time(ctx); /* @@ -565,8 +560,7 @@ static void __perf_install_in_context(void *info) unlock: hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -641,8 +635,7 @@ static void __perf_counter_enable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); update_context_time(ctx); counter->prev_state = counter->state; @@ -678,8 +671,7 @@ static void __perf_counter_enable(void *info) } unlock: - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -971,7 +963,7 @@ int perf_counter_task_disable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); perf_counter_task_sched_out(curr, cpu); @@ -992,9 +984,7 @@ int perf_counter_task_disable(void) hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); return 0; } @@ -1011,7 +1001,7 @@ int perf_counter_task_enable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); perf_counter_task_sched_out(curr, cpu); @@ -1037,7 +1027,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); return 0; } @@ -1095,12 +1085,12 @@ static void __read(void *info) struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); counter->hw_ops->read(counter); update_counter_times(counter); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -2890,7 +2880,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); perf_flags = hw_perf_save_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); @@ -2903,7 +2893,7 @@ __perf_counter_exit_task(struct task_struct *child, child_ctx->nr_counters--; hw_perf_restore(perf_flags); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); } parent_counter = child_counter->parent; diff --git a/kernel/sched.c b/kernel/sched.c index f76e3c0188a..0de2f814fb1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -997,26 +997,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } -void curr_rq_lock_irq_save(unsigned long *flags) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_save(*flags); - rq = cpu_rq(smp_processor_id()); - spin_lock(&rq->lock); -} - -void curr_rq_unlock_irq_restore(unsigned long *flags) - __releases(rq->lock) -{ - struct rq *rq; - - rq = cpu_rq(smp_processor_id()); - spin_unlock(&rq->lock); - local_irq_restore(*flags); -} - 
void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); -- cgit v1.2.3-70-g09d2 From a26b89f05d194413c7238e0bea071054f6b5d3c8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:34 +0200 Subject: sched, hw-branch-tracer: add wait_task_context_switch() function to sched.h Add a function to wait until some other task has been switched out at least once. This differs from wait_task_inactive() subtly, in that the latter will wait until the task has left the CPU. Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144549.794157000@intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b94f3541f67..a5b9a83065f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1993,8 +1993,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +extern void wait_task_context_switch(struct task_struct *p); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void wait_task_context_switch(struct task_struct *p) {} static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { diff --git a/kernel/sched.c b/kernel/sched.c index 6cc1fd5d507..f91bc8141dc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2002,6 +2002,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) return 1; } +/* + * wait_task_context_switch - wait for a thread to complete at least one + * context switch. + * + * @p must not be current. + */ +void wait_task_context_switch(struct task_struct *p) +{ + unsigned long nvcsw, nivcsw, flags; + int running; + struct rq *rq; + + nvcsw = p->nvcsw; + nivcsw = p->nivcsw; + for (;;) { + /* + * The runqueue is assigned before the actual context + * switch. We need to take the runqueue lock. + * + * We could check initially without the lock but it is + * very likely that we need to take the lock in every + * iteration. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + task_rq_unlock(rq, &flags); + + if (likely(!running)) + break; + /* + * The switch count is incremented before the actual + * context switch. We thus wait for two switches to be + * sure at least one completed. + */ + if ((p->nvcsw - nvcsw) > 1) + break; + if ((p->nivcsw - nivcsw) > 1) + break; + + cpu_relax(); + } +} + /* * wait_task_inactive - wait for a thread to unschedule. * -- cgit v1.2.3-70-g09d2 From e2b371f00a6f529f6362654239bdec8dcd510760 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:35 +0200 Subject: mm, x86, ptrace, bts: defer branch trace stopping When a ptraced task is unlinked, we need to stop branch tracing for that task. Since the unlink is called with interrupts disabled, and we need interrupts enabled to stop branch tracing, we defer the work. Collect all branch tracing related stuff in a branch tracing context. 
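Stripped of the BTS specifics, the deferral the patch introduces is the standard schedule_work() idiom for teardown that needs interrupts enabled; a minimal restatement (the names are ours):

	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct trace_ctx {
		struct work_struct work;
		/* ... tracer handle, buffer, mm and task references ... */
	};

	static void trace_ctx_destroy(struct work_struct *w)
	{
		struct trace_ctx *ctx = container_of(w, struct trace_ctx, work);

		/* Worker context: interrupts are enabled, sleeping is allowed. */
		/* release the tracer, drop references, free the buffer ... */
		kfree(ctx);
	}

	static void trace_ctx_release(struct trace_ctx *ctx)
	{
		/* Callable with interrupts disabled: it only queues the work. */
		INIT_WORK(&ctx->work, trace_ctx_destroy);
		schedule_work(&ctx->work);
	}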
Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: Andrew Morton Cc: Peter Zijlstra Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144550.712401000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 4 - arch/x86/kernel/ptrace.c | 254 ++++++++++++++++++++++++++------------- include/linux/mm.h | 3 +- include/linux/sched.h | 9 +- mm/mlock.c | 13 +- 5 files changed, 179 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 34c52370f2f..2483807e06e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -458,10 +458,6 @@ struct thread_struct { /* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ struct ds_context *ds_ctx; #endif /* CONFIG_X86_DS */ -#ifdef CONFIG_X86_PTRACE_BTS -/* the signal to send on a bts buffer overflow */ - unsigned int bts_ovfl_signal; -#endif /* CONFIG_X86_PTRACE_BTS */ }; static inline unsigned long native_get_debugreg(int regno) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index fe9345c967d..7c21d1e8cae 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -577,17 +578,119 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS +/* + * A branch trace store context. + * + * Contexts may only be installed by ptrace_bts_config() and only for + * ptraced tasks. + * + * Contexts are destroyed when the tracee is detached from the tracer. + * The actual destruction work requires interrupts enabled, so the + * work is deferred and will be scheduled during __ptrace_unlink(). + * + * Contexts hold an additional task_struct reference on the traced + * task, as well as a reference on the tracer's mm. + * + * Ptrace already holds a task_struct for the duration of ptrace operations, + * but since destruction is deferred, it may be executed after both + * tracer and tracee exited. + */ +struct bts_context { + /* The branch trace handle. */ + struct bts_tracer *tracer; + + /* The buffer used to store the branch trace and its size. */ + void *buffer; + unsigned int size; + + /* The mm that paid for the above buffer. */ + struct mm_struct *mm; + + /* The task this context belongs to. */ + struct task_struct *task; + + /* The signal to send on a bts buffer overflow. */ + unsigned int bts_ovfl_signal; + + /* The work struct to destroy a context. 
*/ + struct work_struct work; +}; + +static inline void alloc_bts_buffer(struct bts_context *context, + unsigned int size) +{ + void *buffer; + + buffer = alloc_locked_buffer(size); + if (buffer) { + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + } +} + +static inline void free_bts_buffer(struct bts_context *context) +{ + if (!context->buffer) + return; + + kfree(context->buffer); + context->buffer = NULL; + + refund_locked_buffer_memory(context->mm, context->size); + context->size = 0; + + mmput(context->mm); + context->mm = NULL; +} + +static void free_bts_context_work(struct work_struct *w) +{ + struct bts_context *context; + + context = container_of(w, struct bts_context, work); + + ds_release_bts(context->tracer); + put_task_struct(context->task); + free_bts_buffer(context); + kfree(context); +} + +static inline void free_bts_context(struct bts_context *context) +{ + INIT_WORK(&context->work, free_bts_context_work); + schedule_work(&context->work); +} + +static inline struct bts_context *alloc_bts_context(struct task_struct *task) +{ + struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); + if (context) { + context->task = task; + task->bts = context; + + get_task_struct(task); + } + + return context; +} + static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; struct bts_struct bts; const unsigned char *at; int error; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; at = trace->ds.top - ((index + 1) * trace->ds.size); if ((void *)at < trace->ds.begin) @@ -596,7 +699,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (!trace->read) return -EOPNOTSUPP; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -610,13 +713,18 @@ static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; const unsigned char *at; int error, drained = 0; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; if (!trace->read) return -EOPNOTSUPP; @@ -627,9 +735,8 @@ static int ptrace_bts_drain(struct task_struct *child, for (at = trace->ds.begin; (void *)at < trace->ds.top; out++, drained++, at += trace->ds.size) { struct bts_struct bts; - int error; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -639,35 +746,18 @@ static int ptrace_bts_drain(struct task_struct *child, memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - error = ds_reset_bts(child->bts); + error = ds_reset_bts(context->tracer); if (error < 0) return error; return drained; } -static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) -{ - child->bts_buffer = alloc_locked_buffer(size); - if (!child->bts_buffer) - return -ENOMEM; - - child->bts_size = size; - - return 0; -} - -static void ptrace_bts_free_buffer(struct task_struct *child) -{ - free_locked_buffer(child->bts_buffer, child->bts_size); - child->bts_buffer = NULL; - child->bts_size = 0; -} - static int ptrace_bts_config(struct task_struct *child, long 
cfg_size, const struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; struct ptrace_bts_config cfg; unsigned int flags = 0; @@ -677,28 +767,31 @@ static int ptrace_bts_config(struct task_struct *child, if (copy_from_user(&cfg, ucfg, sizeof(cfg))) return -EFAULT; - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - } + context = child->bts; + if (!context) + context = alloc_bts_context(child); + if (!context) + return -ENOMEM; if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) return -EINVAL; - child->thread.bts_ovfl_signal = cfg.signal; return -EOPNOTSUPP; + context->bts_ovfl_signal = cfg.signal; } - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && - (cfg.size != child->bts_size)) { - int error; + ds_release_bts(context->tracer); + context->tracer = NULL; - ptrace_bts_free_buffer(child); + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + free_bts_buffer(context); + if (!cfg.size) + return 0; - error = ptrace_bts_allocate_buffer(child, cfg.size); - if (error < 0) - return error; + alloc_bts_buffer(context, cfg.size); + if (!context->buffer) + return -ENOMEM; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -707,15 +800,13 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - flags); - if (IS_ERR(child->bts)) { - int error = PTR_ERR(child->bts); - - ptrace_bts_free_buffer(child); - child->bts = NULL; + context->tracer = ds_request_bts(child, context->buffer, context->size, + NULL, (size_t)-1, flags); + if (unlikely(IS_ERR(context->tracer))) { + int error = PTR_ERR(context->tracer); + free_bts_buffer(context); + context->tracer = NULL; return error; } @@ -726,20 +817,25 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; const struct bts_trace *trace; struct ptrace_bts_config cfg; + context = child->bts; + if (!context) + return -ESRCH; + if (cfg_size < sizeof(cfg)) return -EIO; - trace = ds_read_bts(child->bts); + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = child->thread.bts_ovfl_signal; - cfg.bts_size = sizeof(struct bts_struct); + cfg.size = trace->ds.end - trace->ds.begin; + cfg.signal = context->bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; @@ -758,67 +854,56 @@ static int ptrace_bts_status(struct task_struct *child, static int ptrace_bts_clear(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - return ds_reset_bts(child->bts); + return ds_reset_bts(context->tracer); } static int ptrace_bts_size(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static void ptrace_bts_fork(struct task_struct *tsk) +static inline void 
ptrace_bts_fork(struct task_struct *tsk) { tsk->bts = NULL; - tsk->bts_buffer = NULL; - tsk->bts_size = 0; - tsk->thread.bts_ovfl_signal = 0; } -static void ptrace_bts_untrace(struct task_struct *child) +/* + * Called from __ptrace_unlink() after the child has been moved back + * to its original parent. + */ +static inline void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { - ds_release_bts(child->bts); + free_bts_context(child->bts); child->bts = NULL; - - /* We cannot update total_vm and locked_vm since - child's mm is already gone. But we can reclaim the - memory. */ - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; } } - -static void ptrace_bts_detach(struct task_struct *child) -{ - /* - * Ptrace_detach() races with ptrace_untrace() in case - * the child dies and is reaped by another thread. - * - * We only do the memory accounting at this point and - * leave the buffer deallocation and the bts tracer - * release to ptrace_bts_untrace() which will be called - * later on with tasklist_lock held. - */ - release_locked_buffer(child->bts_buffer, child->bts_size); -} #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_detach(struct task_struct *child) {} static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ @@ -843,7 +928,6 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/include/linux/mm.h b/include/linux/mm.h index bff1f0d475c..64d8ed2538a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,6 +13,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1321,6 +1322,6 @@ void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); extern void free_locked_buffer(void *buffer, size_t size); -extern void release_locked_buffer(void *buffer, size_t size); +extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a5b9a83065f..52b8cd049c2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,8 +96,8 @@ struct exec_domain; struct futex_pi_state; struct robust_list_head; struct bio; -struct bts_tracer; struct fs_struct; +struct bts_context; /* * List of flags we want to share for kernel threads, @@ -1210,12 +1210,7 @@ struct task_struct { * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ - struct bts_tracer *bts; - /* - * The buffer to hold the BTS data. - */ - void *bts_buffer; - size_t bts_size; + struct bts_context *bts; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. 
*/ diff --git a/mm/mlock.c index cbe9e0581b7..749383b442c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -660,21 +660,20 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void release_locked_buffer(void *buffer, size_t size) +void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - down_write(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); - current->mm->total_vm -= pgsz; - current->mm->locked_vm -= pgsz; + mm->total_vm -= pgsz; + mm->locked_vm -= pgsz; - up_write(&current->mm->mmap_sem); + up_write(&mm->mmap_sem); } void free_locked_buffer(void *buffer, size_t size) { - release_locked_buffer(buffer, size); - + refund_locked_buffer_memory(current->mm, size); kfree(buffer); } -- cgit v1.2.3-70-g09d2 From 0f4814065ff8c24ca8bfd75c9b73502be152c287 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:48 +0200 Subject: x86, ptrace: add bts context unconditionally Add the ptrace bts context field to task_struct unconditionally. Initialize the field directly in copy_process(). Remove all the unneeded functionality used to initialize that field. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144603.292754000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 9 ++++----- arch/x86/kernel/ptrace.c | 20 +------------------- include/linux/ptrace.h | 10 ---------- include/linux/sched.h | 2 -- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 ---------- 6 files changed, 7 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index e304b66abee..5cdd19f20b5 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,12 +235,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -extern void x86_ptrace_untrace(struct task_struct *); -extern void x86_ptrace_fork(struct task_struct *child, - unsigned long clone_flags); +#ifdef CONFIG_X86_PTRACE_BTS +extern void ptrace_bts_untrace(struct task_struct *tsk); -#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) -#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) +#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk) +#endif /* CONFIG_X86_PTRACE_BTS */ #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index adbb24322d8..b32a8ee5338 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -887,37 +887,19 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static inline void ptrace_bts_fork(struct task_struct *tsk) -{ - tsk->bts = NULL; -} - /* * Called from __ptrace_unlink() after the child has been moved back * to its original parent.
*/ -static inline void ptrace_bts_untrace(struct task_struct *child) +void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { free_bts_context(child->bts); child->bts = NULL; } } -#else -static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ -void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - ptrace_bts_fork(child); -} - -void x86_ptrace_untrace(struct task_struct *child) -{ - ptrace_bts_untrace(child); -} - /* * Called by kernel/ptrace.c when detaching.. * diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 67c15653fc2..59e133d39d5 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -95,7 +95,6 @@ extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer); -extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ @@ -327,15 +326,6 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_untrace(task) do { } while (0) #endif -#ifndef arch_ptrace_fork -/* - * Do machine-specific work to initialize a new task. - * - * This is called from copy_process(). - */ -#define arch_ptrace_fork(child, clone_flags) do { } while (0) -#endif - extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/include/linux/sched.h b/include/linux/sched.h index 52b8cd049c2..451186a22ef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1205,13 +1205,11 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; -#ifdef CONFIG_X86_PTRACE_BTS /* * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ struct bts_context *bts; -#endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; diff --git a/kernel/fork.c b/kernel/fork.c index 660c2b8765b..69bde7a22e9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1086,8 +1086,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(current->ptrace)) - ptrace_fork(p, clone_flags); + + p->bts = NULL; /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aaad0ec3419..321127d965c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -26,16 +26,6 @@ #include -/* - * Initialize a new task whose father had been ptraced. - * - * Called from copy_process(). - */ -void ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - arch_ptrace_fork(child, clone_flags); -} - /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. -- cgit v1.2.3-70-g09d2 From 44bc9dc729e33a4ec6ebed4d0b6c08e8d20b42cf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 10:47:17 +0200 Subject: mm, x86, ptrace, bts: defer branch trace stopping, cleanup Andrew Morton noticed that mm.h needlessly includes sched.h - remove it. 
Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 64d8ed2538a..776b641f37e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,7 +13,6 @@ #include #include #include -#include struct mempolicy; struct anon_vma; -- cgit v1.2.3-70-g09d2 From a34b50ddc265bae058c66661b096ef6384c5a8b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 10:56:54 +0200 Subject: mm, x86, ptrace, bts: defer branch trace stopping, remove dead code Remove the unused free_locked_buffer() API. Signed-off-by: Ingo Molnar --- include/linux/mm.h | 1 - mm/mlock.c | 6 ------ 2 files changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 776b641f37e..a3963ba23a6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1320,7 +1320,6 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); -extern void free_locked_buffer(void *buffer, size_t size); extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 749383b442c..28be15ead9c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -671,9 +671,3 @@ void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) up_write(&mm->mmap_sem); } - -void free_locked_buffer(void *buffer, size_t size) -{ - refund_locked_buffer_memory(current->mm, size); - kfree(buffer); -} -- cgit v1.2.3-70-g09d2 From 42d7c5e353cef9062129b0de3ec9ddf10567b9ca Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:21 -0500 Subject: swiotlb: change swiotlb_bus_to[phys,virt] prototypes Add a hwdev argument that is needed on some architectures in order to access a per-device offset that is taken into account when producing a physical address (also needed to get from bus address to virtual address because the physical address is an intermediate step). Also make swiotlb_bus_to_virt weak so architectures can override it. 
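To make the motivation concrete: an architecture with a per-device bus offset could override the now-weak helpers roughly as sketched below. This is an illustration only; the get_dma_direct_offset() helper is assumed for the sketch and is not part of this patch.

	phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
	{
		/* bus address -> CPU physical address, using a hypothetical
		 * per-device offset helper: */
		return baddr - get_dma_direct_offset(hwdev);
	}

	dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
	{
		/* the inverse mapping: CPU physical address -> bus address */
		return paddr + get_dma_direct_offset(hwdev);
	}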
Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-8-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 2 +- include/linux/swiotlb.h | 3 ++- lib/swiotlb.c | 10 +++++----- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 34f12e9996e..887388a1c57 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ac9ff54f7cb..cb1a6631b8f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,7 +29,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t address); -extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); +extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, + dma_addr_t address); extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index d912f068145..bffe6d7ef9d 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -129,7 +129,7 @@ dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } @@ -140,9 +140,9 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); } -static void *swiotlb_bus_to_virt(dma_addr_t address) +void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address) { - return phys_to_virt(swiotlb_bus_to_phys(address)); + return phys_to_virt(swiotlb_bus_to_phys(hwdev, address)); } int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, @@ -691,7 +691,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); @@ -728,7 +728,7 @@ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); -- cgit v1.2.3-70-g09d2 From 6fab01927e8bdbbc77bafba2abb4810c5591ad52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:26 +0200 Subject: perf_counter: provide misc bits in the event header Limit the size of each record to 64k (or should we count in multiples of u64 and have a 512K limit?), this gives 16 bits of spare room in the header, which we can use for misc bits, so as to not have to grow the record with u64 every time we have a few bits to report.
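As a sketch of what the reshaped header means for a consumer: the struct layout and misc bit values below mirror the hunks that follow; the reader-side helper itself is invented for illustration.

	#include <stdint.h>

	#define PERF_EVENT_MISC_KERNEL	(1 << 0)
	#define PERF_EVENT_MISC_USER	(1 << 1)

	struct perf_event_header {
		uint32_t type;
		uint16_t misc;	/* the newly freed-up bits */
		uint16_t size;	/* 16 bits: each record is capped at 64k */
	};

	/* Hypothetical reader-side helper: classify where a sample came from. */
	static const char *record_origin(const struct perf_event_header *hdr)
	{
		if (hdr->misc & PERF_EVENT_MISC_KERNEL)
			return "kernel";
		if (hdr->misc & PERF_EVENT_MISC_USER)
			return "user";
		return "unknown";
	}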
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.769271806@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 +++++- kernel/perf_counter.c | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7f5d353d78a..5bd8817b12d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -201,9 +201,13 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (1 << 1) + struct perf_event_header { __u32 type; - __u32 size; + __u16 misc; + __u16 size; }; enum perf_event_type { diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 84a39081344..4af98f943d3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1831,6 +1831,9 @@ static void perf_counter_output(struct perf_counter *counter, header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); + header.misc = user_mode(regs) ? + PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; + if (record_type & PERF_RECORD_IP) { ip = instruction_pointer(regs); header.type |= __PERF_EVENT_IP; -- cgit v1.2.3-70-g09d2 From 6b6e5486b3a168f0328c82a8d4376caf901472b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:27 +0200 Subject: perf_counter: use misc field to widen type Push the PERF_EVENT_COUNTER_OVERFLOW bit into the misc field so that we can have the full 32bit for PERF_RECORD_ bits. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.891867663@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 28 ++++++++++------------------ kernel/perf_counter.c | 15 ++++++++------- 2 files changed, 18 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5bd8817b12d..4809ae18a94 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -201,8 +201,9 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -230,36 +231,27 @@ enum perf_event_type { PERF_EVENT_MUNMAP = 2, /* - * Half the event type space is reserved for the counter overflow - * bitfields, as found in hw_event.record_type. 
- * - * These events will have types of the form: - * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field + * will be PERF_RECORD_* * * struct { * struct perf_event_header header; * - * { u64 ip; } && __PERF_EVENT_IP - * { u32 pid, tid; } && __PERF_EVENT_TID + * { u64 ip; } && PERF_RECORD_IP + * { u32 pid, tid; } && PERF_RECORD_TID * * { u64 nr; - * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP + * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP * * { u16 nr, * hv, * kernel, * user; - * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN + * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN * - * { u64 time; } && __PERF_EVENT_TIME + * { u64 time; } && PERF_RECORD_TIME * }; */ - PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, - __PERF_EVENT_IP = PERF_RECORD_IP, - __PERF_EVENT_TID = PERF_RECORD_TID, - __PERF_EVENT_GROUP = PERF_RECORD_GROUP, - __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, - __PERF_EVENT_TIME = PERF_RECORD_TIME, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4af98f943d3..bf12df6f353 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1828,15 +1828,16 @@ static void perf_counter_output(struct perf_counter *counter, int callchain_size = 0; u64 time; - header.type = PERF_EVENT_COUNTER_OVERFLOW; + header.type = 0; header.size = sizeof(header); - header.misc = user_mode(regs) ? + header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc |= user_mode(regs) ? PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; if (record_type & PERF_RECORD_IP) { ip = instruction_pointer(regs); - header.type |= __PERF_EVENT_IP; + header.type |= PERF_RECORD_IP; header.size += sizeof(ip); } @@ -1845,12 +1846,12 @@ static void perf_counter_output(struct perf_counter *counter, tid_entry.pid = current->group_leader->pid; tid_entry.tid = current->pid; - header.type |= __PERF_EVENT_TID; + header.type |= PERF_RECORD_TID; header.size += sizeof(tid_entry); } if (record_type & PERF_RECORD_GROUP) { - header.type |= __PERF_EVENT_GROUP; + header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } @@ -1861,7 +1862,7 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - header.type |= __PERF_EVENT_CALLCHAIN; + header.type |= PERF_RECORD_CALLCHAIN; header.size += callchain_size; } } @@ -1872,7 +1873,7 @@ static void perf_counter_output(struct perf_counter *counter, */ time = sched_clock(); - header.type |= __PERF_EVENT_TIME; + header.type |= PERF_RECORD_TIME; header.size += sizeof(u64); } -- cgit v1.2.3-70-g09d2 From 8740f9418c78dcad694b46ab25d1645d5aef1f5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:29 +0200 Subject: perf_counter: add some comments Add a few comments because I was forgetting what field was for what functionality.
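The fields being annotated back the counter's mmap()ed data area. A much-simplified reader sketch follows, assuming the first mapped page is the perf_counter_mmap_page control page with the data pages directly behind it; wrap-around handling is omitted and both kernel structs are taken from the header above.

	#include <stddef.h>
	#include <stdint.h>

	/* Simplified, assumption-laden consumer of the mmap()ed data area. */
	static void drain_samples(void *base, size_t page_size)
	{
		struct perf_counter_mmap_page *control = base;
		unsigned char *data = (unsigned char *)base + page_size;
		uint32_t tail = 0;			/* a real reader persists this */
		uint32_t head = control->data_head;	/* advanced by the kernel */

		__sync_synchronize();	/* read barrier before touching the data */

		while (tail < head) {
			struct perf_event_header *hdr =
				(struct perf_event_header *)(data + tail);
			/* consume hdr->size bytes of record here ... */
			tail += hdr->size;
		}
	}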
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.036984214@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4809ae18a94..8bf764fc622 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -344,10 +344,12 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; - int nr_pages; - atomic_t wakeup; - atomic_t head; - atomic_t events; + int nr_pages; /* nr of data pages */ + + atomic_t wakeup; /* POLL_ for wakeups */ + atomic_t head; /* write position */ + atomic_t events; /* event limit */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; -- cgit v1.2.3-70-g09d2 From 8d1b2d9361b494bfc761700c348c65ebbe3deb5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:30 +0200 Subject: perf_counter: track task-comm data Similar to the mmap data stream, add one that tracks the task COMM field, so that the userspace reporting knows what to call a task. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.127422406@chello.nl> Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/perf_counter.h | 16 +++++++- kernel/perf_counter.c | 93 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index e015c0b5a08..bf47ed0278f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -951,6 +951,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) task_lock(tsk); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); + perf_counter_comm(tsk); } int flush_old_exec(struct linux_binprm * bprm) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8bf764fc622..a70a55f2759 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -142,8 +142,9 @@ struct perf_counter_hw_event { exclude_idle : 1, /* don't count when idle */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ + comm : 1, /* include comm data */ - __reserved_1 : 53; + __reserved_1 : 52; __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ @@ -230,6 +231,16 @@ enum perf_event_type { PERF_EVENT_MMAP = 1, PERF_EVENT_MUNMAP = 2, + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_EVENT_COMM = 3, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -545,6 +556,8 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); +extern void perf_counter_comm(struct task_struct *tsk); + #define MAX_STACK_DEPTH 255 struct perf_callchain_entry { @@ -583,6 +596,7 @@ static inline void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } +static inline void perf_counter_comm(struct task_struct *tsk) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index bf12df6f353..2d4aebb2982 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1916,6 +1916,99 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_end(&handle); } +/* + * comm tracking + */ + +struct perf_comm_event { 
+ struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event; +}; + +static void perf_counter_comm_output(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + perf_output_put(&handle, comm_event->event); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_counter_comm_match(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + if (counter->hw_event.comm && + comm_event->event.header.type == PERF_EVENT_COMM) + return 1; + + return 0; +} + +static void perf_counter_comm_ctx(struct perf_counter_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_comm_match(counter, comm_event)) + perf_counter_comm_output(counter, comm_event); + } + rcu_read_unlock(); +} + +static void perf_counter_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + unsigned int size; + char *comm = comm_event->task->comm; + + size = ALIGN(strlen(comm), sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event.header.size = sizeof(comm_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event); +} + +void perf_counter_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event = { + .task = task, + .event = { + .header = { .type = PERF_EVENT_COMM, }, + .pid = task->group_leader->pid, + .tid = task->pid, + }, + }; + + perf_counter_comm_event(&comm_event); +} + /* * mmap tracking */ -- cgit v1.2.3-70-g09d2 From 4d855457d84b819fefcd1cd1b0a2a0a0ec475c07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:32 +0200 Subject: perf_counter: move PERF_RECORD_TIME Move PERF_RECORD_TIME so that all the fixed length items come before the variable length ones.
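With every fixed-size field in front, a decoder can walk a sample linearly before dealing with the variable-length tails. A hedged sketch: the bit values match the reordered enum in the diff below, while the decoder function itself is invented.

	#include <stdint.h>

	#define PERF_RECORD_IP		(1U << 0)
	#define PERF_RECORD_TID		(1U << 1)
	#define PERF_RECORD_TIME	(1U << 2)

	/* Skip past the fixed-size fields, in record order. */
	static const uint64_t *decode_fixed(const uint64_t *p, uint64_t record_type)
	{
		if (record_type & PERF_RECORD_IP)
			p++;		/* u64 ip */
		if (record_type & PERF_RECORD_TID)
			p++;		/* u32 pid, tid packed into one u64 slot */
		if (record_type & PERF_RECORD_TIME)
			p++;		/* u64 time */
		return p;		/* GROUP/CALLCHAIN data starts here */
	}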
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.307926436@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 9 ++++----- kernel/perf_counter.c | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a70a55f2759..8bd1be58c93 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -100,9 +100,9 @@ enum sw_event_ids { enum perf_counter_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, - PERF_RECORD_GROUP = 1U << 2, - PERF_RECORD_CALLCHAIN = 1U << 3, - PERF_RECORD_TIME = 1U << 4, + PERF_RECORD_TIME = 1U << 2, + PERF_RECORD_GROUP = 1U << 3, + PERF_RECORD_CALLCHAIN = 1U << 4, }; /* @@ -250,6 +250,7 @@ enum perf_event_type { * * { u64 ip; } && PERF_RECORD_IP * { u32 pid, tid; } && PERF_RECORD_TID + * { u64 time; } && PERF_RECORD_TIME * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP @@ -259,8 +260,6 @@ enum perf_event_type { * kernel, * user; * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN - * - * { u64 time; } && PERF_RECORD_TIME * }; */ }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2d4aebb2982..4dc8600d282 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1850,6 +1850,16 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(tid_entry); } + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= PERF_RECORD_TIME; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -1867,16 +1877,6 @@ static void perf_counter_output(struct perf_counter *counter, } } - if (record_type & PERF_RECORD_TIME) { - /* - * Maybe do better on x86 and provide cpu_clock_nmi() - */ - time = sched_clock(); - - header.type |= PERF_RECORD_TIME; - header.size += sizeof(u64); - } - ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -1889,6 +1889,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TID) perf_output_put(&handle, tid_entry); + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -1910,9 +1913,6 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) perf_output_copy(&handle, callchain, callchain_size); - if (record_type & PERF_RECORD_TIME) - perf_output_put(&handle, time); - perf_output_end(&handle); } -- cgit v1.2.3-70-g09d2 From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. 
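The plumbing below threads the new addr argument through perf_swcounter_event() and perf_counter_overflow(). As a sketch, a fault handler on some other architecture would report the faulting address the same way the x86 and powerpc hunks do; the wrapper function here is hypothetical.

	/* Hypothetical arch fault-handler fragment mirroring the hunks below. */
	static void report_fault_counters(struct pt_regs *regs,
					  unsigned long address, int major)
	{
		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);

		if (major)
			perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
					     regs, address);
		else
			perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
					     regs, address);
	}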
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- arch/powerpc/mm/fault.c | 8 ++++--- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/mm/fault.c | 8 ++++--- include/linux/perf_counter.h | 14 +++++++----- kernel/perf_counter.c | 46 ++++++++++++++++++++++++-------------- 6 files changed, 49 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0697ade84dd..c9d019f1907 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) - perf_counter_overflow(counter, 1, regs); + perf_counter_overflow(counter, 1, regs, 0); } /* diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 17bbf6f91fb..ac0e112031b 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -312,7 +312,8 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -322,7 +323,8 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } up_read(&mm->mmap_sem); return 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1116a41bc7b..0fcbaab83f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,7 @@ again: continue; perf_save_and_restart(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, 0)) __pmc_generic_disable(counter, &counter->hw, bit); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f2d3324d921..6f9df2babe4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1142,10 +1142,12 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8bd1be58c93..c22363a4f74 100644 --- a/include/linux/perf_counter.h +++ 
b/include/linux/perf_counter.h @@ -101,8 +101,9 @@ enum perf_counter_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_GROUP = 1U << 3, - PERF_RECORD_CALLCHAIN = 1U << 4, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, }; /* @@ -251,6 +252,7 @@ enum perf_event_type { * { u64 ip; } && PERF_RECORD_IP * { u32 pid, tid; } && PERF_RECORD_TID * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP @@ -537,7 +539,7 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); extern int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs); + int nmi, struct pt_regs *regs, u64 addr); /* * Return 1 for a software counter, 0 for a hardware counter */ @@ -547,7 +549,7 @@ static inline int is_software_counter(struct perf_counter *counter) perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); extern void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); @@ -584,8 +586,8 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } - +perf_swcounter_event(u32 event, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { } static inline void perf_counter_mmap(unsigned long addr, unsigned long len, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4dc8600d282..321c57e3556 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -800,7 +800,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) update_context_time(ctx); regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -1810,7 +1810,7 @@ static void perf_output_end(struct perf_output_handle *handle) } static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int ret; u64 record_type = counter->hw_event.record_type; @@ -1860,6 +1860,11 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_ADDR) { + header.type |= PERF_RECORD_ADDR; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -1892,6 +1897,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TIME) perf_output_put(&handle, time); + if (record_type & PERF_RECORD_ADDR) + perf_output_put(&handle, addr); + if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2158,7 +2166,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, */ int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); int ret = 0; @@ -2175,7 +2183,7 @@ int perf_counter_overflow(struct 
perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs); + perf_counter_output(counter, nmi, regs, addr); return ret; } @@ -2240,7 +2248,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_counter_overflow(counter, 0, regs)) + if (perf_counter_overflow(counter, 0, regs, 0)) ret = HRTIMER_NORESTART; } @@ -2250,11 +2258,11 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, addr)) /* soft-disable the counter */ ; @@ -2286,16 +2294,17 @@ static int perf_swcounter_match(struct perf_counter *counter, } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int neg = atomic64_add_negative(nr, &counter->hw.count); if (counter->hw.irq_period && !neg) - perf_swcounter_overflow(counter, nmi, regs); + perf_swcounter_overflow(counter, nmi, regs, addr); } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_counter *counter; @@ -2305,7 +2314,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, regs); + perf_swcounter_add(counter, nr, nmi, regs, addr); } rcu_read_unlock(); } @@ -2325,7 +2334,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) } static void __perf_swcounter_event(enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -2336,10 +2346,11 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, (*recursion)++; barrier(); - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + nr, nmi, regs, addr); if (cpuctx->task_ctx) { perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, - nr, nmi, regs); + nr, nmi, regs, addr); } barrier(); @@ -2349,9 +2360,10 @@ out: put_cpu_var(perf_cpu_context); } -void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); } static void perf_swcounter_read(struct perf_counter *counter) @@ -2548,7 +2560,7 @@ void perf_tpcounter_event(int event_id) if (!regs) regs = task_pt_regs(current); - __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); } extern int ftrace_profile_enable(int); -- cgit v1.2.3-70-g09d2 From 1c1452be2e9ae282a7316c3b23987811bd7acda6 Mon Sep 17 00:00:00 2001 From: Jonas Larsson Date: Tue, 31 Mar 
2009 11:16:48 +0200 Subject: atmel-mci: Add support for inverted detect pin Same patch as before, modified to use bool. Also adds description of the new field in struct atmel_mci that I missed in the first patch. This patch adds Atmel MCI support for inverted detect pins. Signed-off-by: Jonas Larsson Acked-by: Pierre Ossman Signed-off-by: Haavard Skinnemoen --- drivers/mmc/host/atmel-mci.c | 12 +++++++++--- include/linux/atmel-mci.h | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index cf6a100bb38..7b603e4b41d 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -177,6 +177,7 @@ struct atmel_mci { * available. * @wp_pin: GPIO pin used for card write protect sending, or negative * if not available. + * @detect_is_active_high: The state of the detect pin when it is active. * @detect_timer: Timer used for debouncing @detect_pin interrupts. */ struct atmel_mci_slot { @@ -196,6 +197,7 @@ struct atmel_mci_slot { int detect_pin; int wp_pin; + bool detect_is_active_high; struct timer_list detect_timer; }; @@ -924,7 +926,8 @@ static int atmci_get_cd(struct mmc_host *mmc) struct atmel_mci_slot *slot = mmc_priv(mmc); if (gpio_is_valid(slot->detect_pin)) { - present = !gpio_get_value(slot->detect_pin); + present = !(gpio_get_value(slot->detect_pin) ^ + slot->detect_is_active_high); dev_dbg(&mmc->class_dev, "card is %spresent\n", present ? "" : "not "); } @@ -1028,7 +1031,8 @@ static void atmci_detect_change(unsigned long data) return; enable_irq(gpio_to_irq(slot->detect_pin)); - present = !gpio_get_value(slot->detect_pin); + present = !(gpio_get_value(slot->detect_pin) ^ + slot->detect_is_active_high); present_old = test_bit(ATMCI_CARD_PRESENT, &slot->flags); dev_vdbg(&slot->mmc->class_dev, "detect change: %d (was %d)\n", @@ -1456,6 +1460,7 @@ static int __init atmci_init_slot(struct atmel_mci *host, slot->host = host; slot->detect_pin = slot_data->detect_pin; slot->wp_pin = slot_data->wp_pin; + slot->detect_is_active_high = slot_data->detect_is_active_high; slot->sdc_reg = sdc_reg; mmc->ops = &atmci_ops; @@ -1477,7 +1482,8 @@ static int __init atmci_init_slot(struct atmel_mci *host, if (gpio_request(slot->detect_pin, "mmc_detect")) { dev_dbg(&mmc->class_dev, "no detect pin available\n"); slot->detect_pin = -EBUSY; - } else if (gpio_get_value(slot->detect_pin)) { + } else if (gpio_get_value(slot->detect_pin) ^ + slot->detect_is_active_high) { clear_bit(ATMCI_CARD_PRESENT, &slot->flags); } } diff --git a/include/linux/atmel-mci.h b/include/linux/atmel-mci.h index 2f1f95737ac..57b1846a3c8 100644 --- a/include/linux/atmel-mci.h +++ b/include/linux/atmel-mci.h @@ -10,6 +10,7 @@ * @bus_width: Number of data lines wired up the slot * @detect_pin: GPIO pin wired to the card detect switch * @wp_pin: GPIO pin wired to the write protect sensor + * @detect_is_active_high: The state of the detect pin when it is active * * If a given slot is not present on the board, @bus_width should be * set to 0. The other fields are ignored in this case. 
@@ -24,6 +25,7 @@ struct mci_slot_pdata { unsigned int bus_width; int detect_pin; int wp_pin; + bool detect_is_active_high; }; /** -- cgit v1.2.3-70-g09d2 From 1ccd15497869f3ed83b5225d410df53a96e52757 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Apr 2009 10:53:45 +0200 Subject: perf_counter: sysctl for system wide perf counters Impact: add sysctl for paranoid/relaxed perfcounters policy Allow the use of system wide perf counters to everybody, but provide a sysctl to disable it for the paranoid security minded. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090409085524.514046352@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 4 +++- kernel/sysctl.c | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c22363a4f74..98143288530 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -568,6 +568,8 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern int sysctl_perf_counter_priv; + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 76376ecb23b..7efb7ebaaae 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,8 @@ static atomic_t nr_mmap_tracking __read_mostly; static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; +int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ + /* * Mutex for (sysadmin-configurable) counter reservations: */ @@ -1132,7 +1134,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (!capable(CAP_SYS_ADMIN)) + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4286b62b34a..8ba457838d9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -920,6 +921,16 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif +#ifdef CONFIG_PERF_COUNTERS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_privileged", + .data = &sysctl_perf_counter_priv, + .maxlen = sizeof(sysctl_perf_counter_priv), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3-70-g09d2 From 02af61bb50f5d5f0322dbe5ab2a0d75808d25c7b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 10 Apr 2009 14:26:18 +0800 Subject: tracing, kmemtrace: Separate include/trace/kmemtrace.h to kmemtrace part and tracepoint part Impact: refactor code for future changes Current kmemtrace.h is used both as the header file of kmemtrace and as the definition of kmem's tracepoints. A tracepoints definition file may be used by other code, and should contain only the tracepoint definitions.
We can separate include/trace/kmemtrace.h into 2 files: include/linux/kmemtrace.h: header file for kmemtrace include/trace/kmem.h: definition of kmem tracepoints Signed-off-by: Zhao Lei Acked-by: Eduard - Gabriel Munteanu Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DEE68A.5040902@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/kmemtrace.h | 25 +++++++++++++++++++ include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/trace/kmem.h | 44 +++++++++++++++++++++++++++++++++ include/trace/kmemtrace.h | 63 ----------------------------------------------- init/main.c | 2 +- kernel/trace/kmemtrace.c | 2 +- kernel/trace/trace.h | 2 +- mm/slab.c | 2 +- mm/slob.c | 2 +- mm/slub.c | 2 +- 11 files changed, 77 insertions(+), 71 deletions(-) create mode 100644 include/linux/kmemtrace.h create mode 100644 include/trace/kmem.h delete mode 100644 include/trace/kmemtrace.h (limited to 'include/linux') diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h new file mode 100644 index 00000000000..15c45a27a92 --- /dev/null +++ b/include/linux/kmemtrace.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include + +#ifdef CONFIG_KMEMTRACE +extern void kmemtrace_init(void); +#else +static inline void kmemtrace_init(void) +{ +} +#endif + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 5ac9b0bcaf9..713f841ecaa 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include +#include /* Size description struct for general caches. 
*/ struct cache_sizes { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 5046f90c117..be5d40c43bd 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ diff --git a/include/trace/kmem.h b/include/trace/kmem.h new file mode 100644 index 00000000000..24d25192818 --- /dev/null +++ b/include/trace/kmem.h @@ -0,0 +1,44 @@ +#ifndef _TRACE_KMEM_H +#define _TRACE_KMEM_H + +#include +#include + +DECLARE_TRACE(kmalloc, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); +DECLARE_TRACE(kmem_cache_alloc, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); +DECLARE_TRACE(kmalloc_node, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); +DECLARE_TRACE(kmem_cache_alloc_node, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); +DECLARE_TRACE(kfree, + TP_PROTO(unsigned long call_site, const void *ptr), + TP_ARGS(call_site, ptr)); +DECLARE_TRACE(kmem_cache_free, + TP_PROTO(unsigned long call_site, const void *ptr), + TP_ARGS(call_site, ptr)); + +#endif /* _TRACE_KMEM_H */ diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h deleted file mode 100644 index 28ee69f9cd4..00000000000 --- a/include/trace/kmemtrace.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. 
- */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ - -#include -#include - -#ifdef CONFIG_KMEMTRACE -extern void kmemtrace_init(void); -#else -static inline void kmemtrace_init(void) -{ -} -#endif - -DECLARE_TRACE(kmalloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmem_cache_alloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmalloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kmem_cache_alloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kfree, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); -DECLARE_TRACE(kmem_cache_free, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - diff --git a/init/main.c b/init/main.c index 3585f073d63..eece40cd8a6 100644 --- a/init/main.c +++ b/init/main.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -71,7 +72,6 @@ #include #include #include -#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 5011f4d91e3..7a0aa0e260d 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include "trace_output.h" #include "trace.h" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f76a8f8689d..34b94c3f40a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include enum trace_type { diff --git a/mm/slab.c b/mm/slab.c index 9a90b00d2f9..f85831da908 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/slob.c b/mm/slob.c index a2d4ab32198..494f05f1941 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include /* diff --git a/mm/slub.c b/mm/slub.c index 7ab54ecbd3f..ea9e7160e2e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From fa1b47dd85453ec7d4bcfe4aa4a2d172ba452fc3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Apr 2009 00:09:41 -0400 Subject: ring-buffer: add ring_buffer_discard_commit The ring_buffer_discard_commit is similar to ring_buffer_event_discard but it can only be done on an event that has yet to be committed. Unpredictable results can happen otherwise. The main difference between ring_buffer_discard_commit and ring_buffer_event_discard is that ring_buffer_discard_commit will try to free the data in the ring buffer if nothing has added data after the reserved event. If something did, then it acts almost the same as ring_buffer_event_discard followed by a ring_buffer_unlock_commit.
Note, only one of ring_buffer_discard_commit and ring_buffer_unlock_commit may be called on a given event; never call both. This commit also exports both discard functions for use by GPL modules. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 29 ++++++++++ kernel/trace/ring_buffer.c | 125 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 133 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e1b7b217388..f0aa486d131 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -68,8 +68,37 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } +/* + * ring_buffer_event_discard can discard any event in the ring buffer. + * It is up to the caller to protect against a reader + * consuming it or a writer wrapping and replacing it. + * + * No external protection is needed if this is called before + * the event is committed. But in that case it would be better to + * use ring_buffer_discard_commit. + * + * Note, if an event that has not been committed is discarded + * with ring_buffer_event_discard, it must still be committed. + */ void ring_buffer_event_discard(struct ring_buffer_event *event); +/* + * ring_buffer_discard_commit will remove an event that has not + * been committed yet. If this is used, then ring_buffer_unlock_commit + * must not be called on the discarded event. This function + * will try to remove the event from the ring buffer completely + * if another event has not been written after it. + * + * Example use: + * + * if (some_condition) + * ring_buffer_discard_commit(buffer, event); + * else + * ring_buffer_unlock_commit(buffer, event); + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + /* * size is in bytes for each per CPU buffer. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 74a11808c28..f935bd5ec3e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -205,27 +205,6 @@ static void rb_event_set_padding(struct ring_buffer_event *event) event->time_delta = 0; } -/** - * ring_buffer_event_discard - discard an event in the ring buffer - * @buffer: the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. - */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - static unsigned rb_event_data_length(struct ring_buffer_event *event) { @@ -1570,6 +1549,110 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes an event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races.
If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_discard_commit - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non-committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + int cpu; + + /* The event is discarded regardless */ + ring_buffer_event_discard(event); + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. + */ + RB_WARN_ON(buffer, !preempt_count()); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + goto out; + } + + /* + * The commit is still visible to the reader, so we + * must increment entries. + */ + cpu_buffer->entries++; + out: + /* + * If a write came in and pushed the tail page + * we still need to update the commit pointer + * if we were the commit. + */ + if (rb_is_commit(cpu_buffer, event)) + rb_set_commit_to_write(cpu_buffer); + + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else + preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. -- cgit v1.2.3-70-g09d2 From 6e837fb152410e571a81aaadbd9884f0bc46a55e Mon Sep 17 00:00:00 2001 From: Etienne Basset Date: Wed, 8 Apr 2009 20:39:40 +0200 Subject: smack: implement logging V3 This patch creates auditing functions usable by LSMs to audit security events. It provides standard dumping of FS, NET, task, etc. events (code borrowed from SELinux) and provides two callbacks to define LSM-specific auditing, which should be flexible enough to convert SELinux too.
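As an illustration of those callbacks, a minimal sketch of how an LSM might drive the new helpers (hypothetical code: my_lsm_pre_audit(), subject_label, object_label, rc and file are invented for this sketch; COMMON_AUDIT_DATA_INIT(), common_lsm_audit(), lsm_priv.smack_audit_data and the lsm_pre_audit hook are the interfaces this patch adds):

	static void my_lsm_pre_audit(struct audit_buffer *ab, void *a)
	{
		struct common_audit_data *ad = a;
		struct smack_audit_data *sad = &ad->lsm_priv.smack_audit_data;

		/* LSM-specific prefix, logged before the common FS/NET data */
		audit_log_format(ab, "lsm=SMACK fn=%s action=%s subject=%s object=%s",
				 ad->function,
				 sad->result ? "denied" : "granted",
				 sad->subject, sad->object);
	}

	/* in a hook, once an access decision rc has been made for file: */
	struct common_audit_data ad;

	COMMON_AUDIT_DATA_INIT(&ad, FS);
	ad.u.fs.path = file->f_path;
	ad.lsm_priv.smack_audit_data.subject = subject_label;
	ad.lsm_priv.smack_audit_data.object = object_label;
	ad.lsm_priv.smack_audit_data.result = rc;
	ad.lsm_pre_audit = my_lsm_pre_audit;
	common_lsm_audit(&ad);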
Signed-off-by: Etienne Basset Acked-by: Casey Schaufler Acked-by: Eric Paris Signed-off-by: James Morris --- include/linux/lsm_audit.h | 111 +++++++++++++ security/lsm_audit.c | 386 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 497 insertions(+) create mode 100644 include/linux/lsm_audit.h create mode 100644 security/lsm_audit.c (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h new file mode 100644 index 00000000000..e461b2c3d71 --- /dev/null +++ b/include/linux/lsm_audit.h @@ -0,0 +1,111 @@ +/* + * Common LSM logging functions + * Heavily borrowed from selinux/avc.h + * + * Author : Etienne BASSET + * + * All credits to : Stephen Smalley, + * All BUGS to : Etienne BASSET + */ +#ifndef _LSM_COMMON_LOGGING_ +#define _LSM_COMMON_LOGGING_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Auxiliary data to use in generating the audit record. */ +struct common_audit_data { + char type; +#define LSM_AUDIT_DATA_FS 1 +#define LSM_AUDIT_DATA_NET 2 +#define LSM_AUDIT_DATA_CAP 3 +#define LSM_AUDIT_DATA_IPC 4 +#define LSM_AUDIT_DATA_TASK 5 +#define LSM_AUDIT_DATA_KEY 6 + struct task_struct *tsk; + union { + struct { + struct path path; + struct inode *inode; + } fs; + struct { + int netif; + struct sock *sk; + u16 family; + __be16 dport; + __be16 sport; + union { + struct { + __be32 daddr; + __be32 saddr; + } v4; + struct { + struct in6_addr daddr; + struct in6_addr saddr; + } v6; + } fam; + } net; + int cap; + int ipc_id; + struct task_struct *tsk; +#ifdef CONFIG_KEYS + struct { + key_serial_t key; + char *key_desc; + } key_struct; +#endif + } u; + const char *function; + /* this union contains LSM specific data */ + union { + /* SMACK data */ + struct smack_audit_data { + char *subject; + char *object; + char *request; + int result; + } smack_audit_data; + /* SELinux data */ + struct { + u32 ssid; + u32 tsid; + u16 tclass; + u32 requested; + u32 audited; + struct av_decision *avd; + int result; + } selinux_audit_data; + } lsm_priv; + /* these callbacks will be implemented by a specific LSM */ + void (*lsm_pre_audit)(struct audit_buffer *, void *); + void (*lsm_post_audit)(struct audit_buffer *, void *); +}; + +#define v4info fam.v4 +#define v6info fam.v6 + +int ipv4_skb_to_auditdata(struct sk_buff *skb, + struct common_audit_data *ad, u8 *proto); + +int ipv6_skb_to_auditdata(struct sk_buff *skb, + struct common_audit_data *ad, u8 *proto); + +/* Initialize an LSM audit data structure. */ +#define COMMON_AUDIT_DATA_INIT(_d, _t) \ + { memset((_d), 0, sizeof(struct common_audit_data)); \ + (_d)->type = LSM_AUDIT_DATA_##_t; (_d)->function = __func__; } + +void common_lsm_audit(struct common_audit_data *a); + +#endif diff --git a/security/lsm_audit.c b/security/lsm_audit.c new file mode 100644 index 00000000000..94b868494b3 --- /dev/null +++ b/security/lsm_audit.c @@ -0,0 +1,386 @@ +/* + * common LSM auditing functions + * + * Based on code written for SELinux by : + * Stephen Smalley, + * James Morris + * Author : Etienne Basset, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * ipv4_skb_to_auditdata : fill auditdata from skb + * @skb : the skb + * @ad : the audit data to fill + * @proto : the layer 4 protocol + * + * return 0 on success + */ +int ipv4_skb_to_auditdata(struct sk_buff *skb, + struct common_audit_data *ad, u8 *proto) +{ + int ret = 0; + struct iphdr *ih; + + ih = ip_hdr(skb); + if (ih == NULL) + return -EINVAL; + + ad->u.net.v4info.saddr = ih->saddr; + ad->u.net.v4info.daddr = ih->daddr; + + if (proto) + *proto = ih->protocol; + /* non initial fragment */ + if (ntohs(ih->frag_off) & IP_OFFSET) + return 0; + + switch (ih->protocol) { + case IPPROTO_TCP: { + struct tcphdr *th = tcp_hdr(skb); + if (th == NULL) + break; + + ad->u.net.sport = th->source; + ad->u.net.dport = th->dest; + break; + } + case IPPROTO_UDP: { + struct udphdr *uh = udp_hdr(skb); + if (uh == NULL) + break; + + ad->u.net.sport = uh->source; + ad->u.net.dport = uh->dest; + break; + } + case IPPROTO_DCCP: { + struct dccp_hdr *dh = dccp_hdr(skb); + if (dh == NULL) + break; + + ad->u.net.sport = dh->dccph_sport; + ad->u.net.dport = dh->dccph_dport; + break; + } + case IPPROTO_SCTP: { + struct sctphdr *sh = sctp_hdr(skb); + if (sh == NULL) + break; + ad->u.net.sport = sh->source; + ad->u.net.dport = sh->dest; + break; + } + default: + ret = -EINVAL; + } + return ret; +} +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * ipv6_skb_to_auditdata : fill auditdata from skb + * @skb : the skb + * @ad : the audit data to fill + * @proto : the layer 4 protocol + * + * return 0 on success + */ +int ipv6_skb_to_auditdata(struct sk_buff *skb, + struct common_audit_data *ad, u8 *proto) +{ + int offset, ret = 0; + struct ipv6hdr *ip6; + u8 nexthdr; + + ip6 = ipv6_hdr(skb); + if (ip6 == NULL) + return -EINVAL; + ipv6_addr_copy(&ad->u.net.v6info.saddr, &ip6->saddr); + ipv6_addr_copy(&ad->u.net.v6info.daddr, &ip6->daddr); + ret = 0; + /* IPv6 can have several extension header before the Transport header + * skip them */ + offset = skb_network_offset(skb); + offset += sizeof(*ip6); + nexthdr = ip6->nexthdr; + offset = ipv6_skip_exthdr(skb, offset, &nexthdr); + if (offset < 0) + return 0; + if (proto) + *proto = nexthdr; + switch (nexthdr) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (th == NULL) + break; + + ad->u.net.sport = th->source; + ad->u.net.dport = th->dest; + break; + } + case IPPROTO_UDP: { + struct udphdr _udph, *uh; + + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (uh == NULL) + break; + + ad->u.net.sport = uh->source; + ad->u.net.dport = uh->dest; + break; + } + case IPPROTO_DCCP: { + struct dccp_hdr _dccph, *dh; + + dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); + if (dh == NULL) + break; + + ad->u.net.sport = dh->dccph_sport; + ad->u.net.dport = dh->dccph_dport; + break; + } + case IPPROTO_SCTP: { + struct sctphdr _sctph, *sh; + + sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); + if (sh == NULL) + break; + ad->u.net.sport = sh->source; + ad->u.net.dport = sh->dest; + break; + } + default: + ret = -EINVAL; + } + return ret; +} +#endif + + +static inline void print_ipv6_addr(struct audit_buffer *ab, + struct in6_addr *addr, __be16 port, + char *name1, char *name2) +{ + if (!ipv6_addr_any(addr)) + audit_log_format(ab, " %s=%pI6", name1, addr); + if (port) 
+ audit_log_format(ab, " %s=%d", name2, ntohs(port)); +} + +static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr, + __be16 port, char *name1, char *name2) +{ + if (addr) + audit_log_format(ab, " %s=%pI4", name1, &addr); + if (port) + audit_log_format(ab, " %s=%d", name2, ntohs(port)); +} + +/** + * dump_common_audit_data - helper to dump common audit data + * @a : common audit data + * + */ +static void dump_common_audit_data(struct audit_buffer *ab, + struct common_audit_data *a) +{ + struct inode *inode = NULL; + struct task_struct *tsk = current; + + if (a->tsk) + tsk = a->tsk; + if (tsk && tsk->pid) { + audit_log_format(ab, " pid=%d comm=", tsk->pid); + audit_log_untrustedstring(ab, tsk->comm); + } + + switch (a->type) { + case LSM_AUDIT_DATA_IPC: + audit_log_format(ab, " key=%d ", a->u.ipc_id); + break; + case LSM_AUDIT_DATA_CAP: + audit_log_format(ab, " capability=%d ", a->u.cap); + break; + case LSM_AUDIT_DATA_FS: + if (a->u.fs.path.dentry) { + struct dentry *dentry = a->u.fs.path.dentry; + if (a->u.fs.path.mnt) { + audit_log_d_path(ab, "path=", &a->u.fs.path); + } else { + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, + dentry->d_name.name); + } + inode = dentry->d_inode; + } else if (a->u.fs.inode) { + struct dentry *dentry; + inode = a->u.fs.inode; + dentry = d_find_alias(inode); + if (dentry) { + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, + dentry->d_name.name); + dput(dentry); + } + } + if (inode) + audit_log_format(ab, " dev=%s ino=%lu", + inode->i_sb->s_id, + inode->i_ino); + break; + case LSM_AUDIT_DATA_TASK: + tsk = a->u.tsk; + if (tsk && tsk->pid) { + audit_log_format(ab, " pid=%d comm=", tsk->pid); + audit_log_untrustedstring(ab, tsk->comm); + } + break; + case LSM_AUDIT_DATA_NET: + if (a->u.net.sk) { + struct sock *sk = a->u.net.sk; + struct unix_sock *u; + int len = 0; + char *p = NULL; + + switch (sk->sk_family) { + case AF_INET: { + struct inet_sock *inet = inet_sk(sk); + + print_ipv4_addr(ab, inet->rcv_saddr, + inet->sport, + "laddr", "lport"); + print_ipv4_addr(ab, inet->daddr, + inet->dport, + "faddr", "fport"); + break; + } + case AF_INET6: { + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *inet6 = inet6_sk(sk); + + print_ipv6_addr(ab, &inet6->rcv_saddr, + inet->sport, + "laddr", "lport"); + print_ipv6_addr(ab, &inet6->daddr, + inet->dport, + "faddr", "fport"); + break; + } + case AF_UNIX: + u = unix_sk(sk); + if (u->dentry) { + struct path path = { + .dentry = u->dentry, + .mnt = u->mnt + }; + audit_log_d_path(ab, "path=", &path); + break; + } + if (!u->addr) + break; + len = u->addr->len-sizeof(short); + p = &u->addr->name->sun_path[0]; + audit_log_format(ab, " path="); + if (*p) + audit_log_untrustedstring(ab, p); + else + audit_log_n_hex(ab, p, len); + break; + } + } + + switch (a->u.net.family) { + case AF_INET: + print_ipv4_addr(ab, a->u.net.v4info.saddr, + a->u.net.sport, + "saddr", "src"); + print_ipv4_addr(ab, a->u.net.v4info.daddr, + a->u.net.dport, + "daddr", "dest"); + break; + case AF_INET6: + print_ipv6_addr(ab, &a->u.net.v6info.saddr, + a->u.net.sport, + "saddr", "src"); + print_ipv6_addr(ab, &a->u.net.v6info.daddr, + a->u.net.dport, + "daddr", "dest"); + break; + } + if (a->u.net.netif > 0) { + struct net_device *dev; + + /* NOTE: we always use init's namespace */ + dev = dev_get_by_index(&init_net, a->u.net.netif); + if (dev) { + audit_log_format(ab, " netif=%s", dev->name); + dev_put(dev); + } + } + break; +#ifdef CONFIG_KEYS + case LSM_AUDIT_DATA_KEY: + audit_log_format(ab, " 
key_serial=%u", a->u.key_struct.key); + if (a->u.key_struct.key_desc) { + audit_log_format(ab, " key_desc="); + audit_log_untrustedstring(ab, a->u.key_struct.key_desc); + } + break; +#endif + } /* switch (a->type) */ +} + +/** + * common_lsm_audit - generic LSM auditing function + * @a: auxiliary audit data + * + * setup the audit buffer for common security information + * uses callback to print LSM specific information + */ +void common_lsm_audit(struct common_audit_data *a) +{ + struct audit_buffer *ab; + + if (a == NULL) + return; + /* we use GFP_ATOMIC so we won't sleep */ + ab = audit_log_start(current->audit_context, GFP_ATOMIC, AUDIT_AVC); + + if (ab == NULL) + return; + + if (a->lsm_pre_audit) + a->lsm_pre_audit(ab, a); + + dump_common_audit_data(ab, a); + + if (a->lsm_post_audit) + a->lsm_post_audit(ab, a); + + audit_log_end(ab); +} -- cgit v1.2.3-70-g09d2 From 7ba5c840e64d4a967379f1ae3eca73278180b11d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2009 21:31:17 -0700 Subject: rcu: Add __rcu_pending tracing to hierarchical RCU Add tracing to __rcu_pending() to provide information on why RCU processing was kicked off. This is helpful for debugging hierarchical RCU, and might also be helpful in learning how hierarchical RCU operates. Located-by: Anton Blanchard Tested-by: Anton Blanchard Signed-off-by: Paul E. McKenney Cc: anton@samba.org Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: manfred@colorfullife.com Cc: cl@linux-foundation.org Cc: josht@linux.vnet.ibm.com Cc: schamp@sgi.com Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: ego@in.ibm.com Cc: laijs@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: peterz@infradead.org Cc: penberg@cs.helsinki.fi Cc: andi@firstfloor.org Cc: "Paul E. McKenney" LKML-Reference: <1239683479943-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcutree.h | 9 ++++++- kernel/rcutree.c | 25 ++++++++++++++----- kernel/rcutree_trace.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 90 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 58b2aa5312b..5a5153806c4 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -161,8 +161,15 @@ struct rcu_data { unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long resched_ipi; /* Sent a resched IPI. */ - /* 5) For future __rcu_pending statistics. */ + /* 5) __rcu_pending() statistics. */ long n_rcu_pending; /* rcu_pending() calls since boot. */ + long n_rp_qs_pending; + long n_rp_cb_ready; + long n_rp_cpu_needs_gp; + long n_rp_gp_completed; + long n_rp_gp_started; + long n_rp_need_fqs; + long n_rp_need_nothing; int cpu; }; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d2a372fb0b9..0dccfbba6d2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) + if (rdp->qs_pending) { + rdp->n_rp_qs_pending++; return 1; + } /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (cpu_has_callbacks_ready_to_invoke(rdp)) { + rdp->n_rp_cb_ready++; return 1; + } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (cpu_needs_another_gp(rsp, rdp)) { + rdp->n_rp_cpu_needs_gp++; return 1; + } /* Has another RCU grace period completed? 
*/ - if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ + if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ + rdp->n_rp_gp_completed++; return 1; + } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ + if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ + rdp->n_rp_gp_started++; return 1; + } /* Has an RCU GP gone long enough to send resched IPIs &c? */ if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { + rdp->n_rp_need_fqs++; return 1; + } /* nothing to do */ + rdp->n_rp_need_nothing++; return 0; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4b1875ba940..fe1dcdbf1ca 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = { .release = single_release, }; -static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; +static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) +{ + seq_printf(m, "%3d%cnp=%ld " + "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' : ' ', + rdp->n_rcu_pending, + rdp->n_rp_qs_pending, + rdp->n_rp_cb_ready, + rdp->n_rp_cpu_needs_gp, + rdp->n_rp_gp_completed, + rdp->n_rp_gp_started, + rdp->n_rp_need_fqs, + rdp->n_rp_need_nothing); +} + +static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + + for_each_possible_cpu(cpu) { + rdp = rsp->rda[cpu]; + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } +} + +static int show_rcu_pending(struct seq_file *m, void *unused) +{ + seq_puts(m, "rcu:\n"); + print_rcu_pendings(m, &rcu_state); + seq_puts(m, "rcu_bh:\n"); + print_rcu_pendings(m, &rcu_bh_state); + return 0; +} + +static int rcu_pending_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_pending, NULL); +} + +static struct file_operations rcu_pending_fops = { + .owner = THIS_MODULE, + .open = rcu_pending_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; +static struct dentry *datadir; +static struct dentry *datadir_csv; +static struct dentry *gpdir; +static struct dentry *hierdir; +static struct dentry *rcu_pendingdir; + static int __init rcuclassic_trace_init(void) { rcudir = debugfs_create_dir("rcu", NULL); @@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void) NULL, &rcuhier_fops); if (!hierdir) goto free_out; + + rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, + NULL, &rcu_pending_fops); + if (!rcu_pendingdir) + goto free_out; return 0; free_out: if (datadir) @@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void) debugfs_remove(datadir_csv); debugfs_remove(gpdir); debugfs_remove(hierdir); + debugfs_remove(rcu_pendingdir); debugfs_remove(rcudir); } -- cgit v1.2.3-70-g09d2 From 02bec490450836ebbd628e97ec03f10b57def8ce Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Tue, 24 Mar 2009 12:24:35 +0100 Subject: ALSA: lx6464es - driver for the digigram lx6464es interface prototype of a driver for the digigram lx6464es 64 channel ethersound interface. 
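One implementation detail worth noting up front: the board only accepts power-of-two block sizes, so the driver snaps the ALSA period size to the nearest supported value. A minimal sketch of that rounding, restating the lx_set_granularity() logic from the patch below (lx_snap_blocksize is an invented name for this sketch; MICROBLAZE_IBL_MIN and MICROBLAZE_IBL_MAX come from lx_defs.h):

	/* round a requested period size up to the next power-of-two
	 * blocksize within the limits the MicroBlaze firmware accepts */
	static u32 lx_snap_blocksize(u32 gran)
	{
		u32 snapped = MICROBLAZE_IBL_MIN;

		while (snapped < gran && snapped < MICROBLAZE_IBL_MAX)
			snapped *= 2;

		return snapped;
	}

A request for 384 frames, for instance, would be snapped up to 512, assuming 512 lies within those limits.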
Signed-off-by: Tim Blechmann Signed-off-by: Takashi Iwai --- include/linux/pci_ids.h | 5 + sound/pci/Kconfig | 10 + sound/pci/Makefile | 1 + sound/pci/lx6464es/Makefile | 2 + sound/pci/lx6464es/lx6464es.c | 1152 ++++++++++++++++++++++++++++++++ sound/pci/lx6464es/lx6464es.h | 114 ++++ sound/pci/lx6464es/lx_core.c | 1442 +++++++++++++++++++++++++++++++++++++++++ sound/pci/lx6464es/lx_core.h | 242 +++++++ sound/pci/lx6464es/lx_defs.h | 376 +++++++++++ 9 files changed, 3344 insertions(+) create mode 100644 sound/pci/lx6464es/Makefile create mode 100644 sound/pci/lx6464es/lx6464es.c create mode 100644 sound/pci/lx6464es/lx6464es.h create mode 100644 sound/pci/lx6464es/lx_core.c create mode 100644 sound/pci/lx6464es/lx_core.h create mode 100644 sound/pci/lx6464es/lx_defs.h (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ee98cd57088..2b1a69598e7 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1005,6 +1005,7 @@ #define PCI_DEVICE_ID_PLX_PCI200SYN 0x3196 #define PCI_DEVICE_ID_PLX_9030 0x9030 #define PCI_DEVICE_ID_PLX_9050 0x9050 +#define PCI_DEVICE_ID_PLX_9056 0x9056 #define PCI_DEVICE_ID_PLX_9080 0x9080 #define PCI_DEVICE_ID_PLX_GTEK_SERIAL2 0xa001 @@ -1847,6 +1848,10 @@ #define PCI_SUBDEVICE_ID_HYPERCOPE_METRO 0x0107 #define PCI_SUBDEVICE_ID_HYPERCOPE_CHAMP2 0x0108 +#define PCI_VENDOR_ID_DIGIGRAM 0x1369 +#define PCI_SUBDEVICE_ID_DIGIGRAM_LX6464ES_SERIAL_SUBSYSTEM 0xc001 +#define PCI_SUBDEVICE_ID_DIGIGRAM_LX6464ES_CAE_SERIAL_SUBSYSTEM 0xc002 + #define PCI_VENDOR_ID_KAWASAKI 0x136b #define PCI_DEVICE_ID_MCHIP_KL5A72002 0xff01 diff --git a/sound/pci/Kconfig b/sound/pci/Kconfig index 93422e3a3f0..e912b70b6f9 100644 --- a/sound/pci/Kconfig +++ b/sound/pci/Kconfig @@ -622,6 +622,16 @@ config SND_KORG1212 To compile this driver as a module, choose M here: the module will be called snd-korg1212. +config SND_LX6464ES + tristate "Digigram LX6464ES" + select SND_PCM + help + Say Y here to include support for Digigram LX6464ES boards. + + To compile this driver as a module, choose M here: the module + will be called snd-lx6464es. + + config SND_MAESTRO3 tristate "ESS Allegro/Maestro3" select SND_AC97_CODEC diff --git a/sound/pci/Makefile b/sound/pci/Makefile index 65b25d221cd..7d83e084dcf 100644 --- a/sound/pci/Makefile +++ b/sound/pci/Makefile @@ -62,6 +62,7 @@ obj-$(CONFIG_SND) += \ ca0106/ \ cs46xx/ \ cs5535audio/ \ + lx6464es/ \ echoaudio/ \ emu10k1/ \ hda/ \ diff --git a/sound/pci/lx6464es/Makefile b/sound/pci/lx6464es/Makefile new file mode 100644 index 00000000000..eb04a6c73d8 --- /dev/null +++ b/sound/pci/lx6464es/Makefile @@ -0,0 +1,2 @@ +snd-lx6464es-objs := lx6464es.o lx_core.o +obj-$(CONFIG_SND_LX6464ES) += snd-lx6464es.o diff --git a/sound/pci/lx6464es/lx6464es.c b/sound/pci/lx6464es/lx6464es.c new file mode 100644 index 00000000000..7bc8b8caa99 --- /dev/null +++ b/sound/pci/lx6464es/lx6464es.c @@ -0,0 +1,1152 @@ +/* -*- linux-c -*- * + * + * ALSA driver for the digigram lx6464es interface + * + * Copyright (c) 2008, 2009 Tim Blechmann + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "lx6464es.h" + +MODULE_AUTHOR("Tim Blechmann"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("digigram lx6464es"); +MODULE_SUPPORTED_DEVICE("{digigram lx6464es{}}"); + + +static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; +static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; +static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP; + +static const char card_name[] = "LX6464ES"; + + +#define PCI_DEVICE_ID_PLX_LX6464ES PCI_DEVICE_ID_PLX_9056 + +static struct pci_device_id snd_lx6464es_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_LX6464ES), + .subvendor = PCI_VENDOR_ID_DIGIGRAM, + .subdevice = PCI_SUBDEVICE_ID_DIGIGRAM_LX6464ES_SERIAL_SUBSYSTEM + }, /* LX6464ES */ + { PCI_DEVICE(PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_LX6464ES), + .subvendor = PCI_VENDOR_ID_DIGIGRAM, + .subdevice = PCI_SUBDEVICE_ID_DIGIGRAM_LX6464ES_CAE_SERIAL_SUBSYSTEM + }, /* LX6464ES-CAE */ + { 0, }, +}; + +MODULE_DEVICE_TABLE(pci, snd_lx6464es_ids); + + + +/* PGO for USERo in the pci_0x06/loc_0xEC register */ +#define CHIPSC_RESET_XILINX (1L<<16) + + +/* alsa callbacks */ +static struct snd_pcm_hardware lx_caps = { + .info = (SNDRV_PCM_INFO_MMAP | + SNDRV_PCM_INFO_INTERLEAVED | + SNDRV_PCM_INFO_MMAP_VALID | + SNDRV_PCM_INFO_SYNC_START), + .formats = (SNDRV_PCM_FMTBIT_S16_LE | + SNDRV_PCM_FMTBIT_S16_BE | + SNDRV_PCM_FMTBIT_S24_3LE | + SNDRV_PCM_FMTBIT_S24_3BE), + .rates = (SNDRV_PCM_RATE_CONTINUOUS | + SNDRV_PCM_RATE_8000_192000), + .rate_min = 8000, + .rate_max = 192000, + .channels_min = 2, + .channels_max = 64, + .buffer_bytes_max = 64*2*3*MICROBLAZE_IBL_MAX*MAX_STREAM_BUFFER, + .period_bytes_min = (2*2*MICROBLAZE_IBL_MIN*2), + .period_bytes_max = (4*64*MICROBLAZE_IBL_MAX*MAX_STREAM_BUFFER), + .periods_min = 2, + .periods_max = MAX_STREAM_BUFFER, +}; + +static int lx_set_granularity(struct lx6464es *chip, u32 gran); + + +static int lx_hardware_open(struct lx6464es *chip, + struct snd_pcm_substream *substream) +{ + int err = 0; + struct snd_pcm_runtime *runtime = substream->runtime; + int channels = runtime->channels; + int is_capture = (substream->stream == SNDRV_PCM_STREAM_CAPTURE); + + snd_pcm_uframes_t period_size = runtime->period_size; + + snd_printd(LXP "allocating pipe for %d channels\n", channels); + err = lx_pipe_allocate(chip, 0, is_capture, channels); + if (err < 0) { + snd_printk(KERN_ERR LXP "allocating pipe failed\n"); + return err; + } + + err = lx_set_granularity(chip, period_size); + if (err < 0) { + snd_printk(KERN_ERR LXP "setting granularity to %ld failed\n", + period_size); + return err; + } + + return 0; +} + +static int lx_hardware_start(struct lx6464es *chip, + struct snd_pcm_substream *substream) +{ + int err = 0; + struct snd_pcm_runtime *runtime = substream->runtime; + int is_capture = (substream->stream == SNDRV_PCM_STREAM_CAPTURE); + + snd_printd(LXP "setting stream format\n"); + err = lx_stream_set_format(chip, runtime, 0, is_capture); + if (err < 0) { + snd_printk(KERN_ERR LXP "setting stream format failed\n"); + return err; + } + + snd_printd(LXP "starting pipe\n"); + err = lx_pipe_start(chip, 0, is_capture); + if (err < 0) { + snd_printk(KERN_ERR LXP "starting pipe failed\n"); + return
err; + } + + snd_printd(LXP "waiting for pipe to start\n"); + err = lx_pipe_wait_for_start(chip, 0, is_capture); + if (err < 0) { + snd_printk(KERN_ERR LXP "waiting for pipe failed\n"); + return err; + } + + return err; +} + + +static int lx_hardware_stop(struct lx6464es *chip, + struct snd_pcm_substream *substream) +{ + int err = 0; + int is_capture = (substream->stream == SNDRV_PCM_STREAM_CAPTURE); + + snd_printd(LXP "pausing pipe\n"); + err = lx_pipe_pause(chip, 0, is_capture); + if (err < 0) { + snd_printk(KERN_ERR LXP "pausing pipe failed\n"); + return err; + } + + snd_printd(LXP "waiting for pipe to become idle\n"); + err = lx_pipe_wait_for_idle(chip, 0, is_capture); + if (err < 0) { + snd_printk(KERN_ERR LXP "waiting for pipe failed\n"); + return err; + } + + snd_printd(LXP "stopping pipe\n"); + err = lx_pipe_stop(chip, 0, is_capture); + if (err < 0) { + snd_printk(LXP "stopping pipe failed\n"); + return err; + } + + return err; +} + + +static int lx_hardware_close(struct lx6464es *chip, + struct snd_pcm_substream *substream) +{ + int err = 0; + int is_capture = (substream->stream == SNDRV_PCM_STREAM_CAPTURE); + + snd_printd(LXP "releasing pipe\n"); + err = lx_pipe_release(chip, 0, is_capture); + if (err < 0) { + snd_printk(LXP "releasing pipe failed\n"); + return err; + } + + return err; +} + + +static int lx_pcm_open(struct snd_pcm_substream *substream) +{ + struct lx6464es *chip = snd_pcm_substream_chip(substream); + struct snd_pcm_runtime *runtime = substream->runtime; + int err = 0; + int board_rate; + + snd_printdd("->lx_pcm_open\n"); + mutex_lock(&chip->setup_mutex); + + /* copy the struct snd_pcm_hardware struct */ + runtime->hw = lx_caps; + +#if 0 + /* buffer-size should better be multiple of period-size */ + err = snd_pcm_hw_constraint_integer(runtime, + SNDRV_PCM_HW_PARAM_PERIODS); + if (err < 0) { + snd_printk(KERN_WARNING LXP "could not constrain periods\n"); + goto exit; + } +#endif + + /* the clock rate cannot be changed */ + board_rate = chip->board_sample_rate; + err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_RATE, + board_rate, board_rate); + + if (err < 0) { + snd_printk(KERN_WARNING LXP "could not constrain periods\n"); + goto exit; + } + + /* constrain period size */ + err = snd_pcm_hw_constraint_minmax(runtime, + SNDRV_PCM_HW_PARAM_PERIOD_SIZE, + MICROBLAZE_IBL_MIN, + MICROBLAZE_IBL_MAX); + if (err < 0) { + snd_printk(KERN_WARNING LXP + "could not constrain period size\n"); + goto exit; + } + + snd_pcm_hw_constraint_step(runtime, 0, + SNDRV_PCM_HW_PARAM_BUFFER_SIZE, 32); + + snd_pcm_set_sync(substream); + err = 0; + +exit: + runtime->private_data = chip; + + mutex_unlock(&chip->setup_mutex); + snd_printdd("<-lx_pcm_open, %d\n", err); + return err; +} + +static int lx_pcm_close(struct snd_pcm_substream *substream) +{ + int err = 0; + snd_printdd("->lx_pcm_close\n"); + return err; +} + +static snd_pcm_uframes_t lx_pcm_stream_pointer(struct snd_pcm_substream + *substream) +{ + struct lx6464es *chip = snd_pcm_substream_chip(substream); + snd_pcm_uframes_t pos; + unsigned long flags; + int is_capture = (substream->stream == SNDRV_PCM_STREAM_CAPTURE); + + struct lx_stream *lx_stream = is_capture ? 
&chip->capture_stream : + &chip->playback_stream; + + snd_printdd("->lx_pcm_stream_pointer\n"); + + spin_lock_irqsave(&chip->lock, flags); + pos = lx_stream->frame_pos * substream->runtime->period_size; + spin_unlock_irqrestore(&chip->lock, flags); + + snd_printdd(LXP "stream_pointer at %ld\n", pos); + return pos; +} + +static int lx_pcm_prepare(struct snd_pcm_substream *substream) +{ + struct lx6464es *chip = snd_pcm_substream_chip(substream); + int err = 0; + const int is_capture = (substream->stream == SNDRV_PCM_STREAM_CAPTURE); + + snd_printdd("->lx_pcm_prepare\n"); + + mutex_lock(&chip->setup_mutex); + + if (chip->hardware_running[is_capture]) { + err = lx_hardware_stop(chip, substream); + if (err < 0) { + snd_printk(KERN_ERR LXP "failed to stop hardware. " + "Error code %d\n", err); + goto exit; + } + + err = lx_hardware_close(chip, substream); + if (err < 0) { + snd_printk(KERN_ERR LXP "failed to close hardware. " + "Error code %d\n", err); + goto exit; + } + } + + snd_printd(LXP "opening hardware\n"); + err = lx_hardware_open(chip, substream); + if (err < 0) { + snd_printk(KERN_ERR LXP "failed to open hardware. " + "Error code %d\n", err); + goto exit; + } + + err = lx_hardware_start(chip, substream); + if (err < 0) { + snd_printk(KERN_ERR LXP "failed to start hardware. " + "Error code %d\n", err); + goto exit; + } + + chip->hardware_running[is_capture] = 1; + + if (chip->board_sample_rate != substream->runtime->rate) { + if (!err) + chip->board_sample_rate = substream->runtime->rate; + } + +exit: + mutex_unlock(&chip->setup_mutex); + return err; +} + +static int lx_pcm_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *hw_params, int is_capture) +{ + struct lx6464es *chip = snd_pcm_substream_chip(substream); + int err = 0; + + snd_printdd("->lx_pcm_hw_params\n"); + + mutex_lock(&chip->setup_mutex); + + /* set dma buffer */ + err = snd_pcm_lib_malloc_pages(substream, + params_buffer_bytes(hw_params)); + + if (is_capture) + chip->capture_stream.stream = substream; + else + chip->playback_stream.stream = substream; + + mutex_unlock(&chip->setup_mutex); + return err; +} + +static int lx_pcm_hw_params_playback(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *hw_params) +{ + return lx_pcm_hw_params(substream, hw_params, 0); +} + +static int lx_pcm_hw_params_capture(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *hw_params) +{ + return lx_pcm_hw_params(substream, hw_params, 1); +} + +static int lx_pcm_hw_free(struct snd_pcm_substream *substream) +{ + struct lx6464es *chip = snd_pcm_substream_chip(substream); + int err = 0; + int is_capture = (substream->stream == SNDRV_PCM_STREAM_CAPTURE); + + snd_printdd("->lx_pcm_hw_free\n"); + mutex_lock(&chip->setup_mutex); + + if (chip->hardware_running[is_capture]) { + err = lx_hardware_stop(chip, substream); + if (err < 0) { + snd_printk(KERN_ERR LXP "failed to stop hardware. " + "Error code %d\n", err); + goto exit; + } + + err = lx_hardware_close(chip, substream); + if (err < 0) { + snd_printk(KERN_ERR LXP "failed to close hardware. 
" + "Error code %d\n", err); + goto exit; + } + + chip->hardware_running[is_capture] = 0; + } + + err = snd_pcm_lib_free_pages(substream); + + if (is_capture) + chip->capture_stream.stream = 0; + else + chip->playback_stream.stream = 0; + +exit: + mutex_unlock(&chip->setup_mutex); + return err; +} + +static void lx_trigger_start(struct lx6464es *chip, struct lx_stream *lx_stream) +{ + struct snd_pcm_substream *substream = lx_stream->stream; + const int is_capture = lx_stream->is_capture; + + int err; + + const u32 channels = substream->runtime->channels; + const u32 bytes_per_frame = channels * 3; + const u32 period_size = substream->runtime->period_size; + const u32 periods = substream->runtime->periods; + const u32 period_bytes = period_size * bytes_per_frame; + + dma_addr_t buf = substream->dma_buffer.addr; + int i; + + u32 needed, freed; + u32 size_array[5]; + + for (i = 0; i != periods; ++i) { + u32 buffer_index = 0; + + err = lx_buffer_ask(chip, 0, is_capture, &needed, &freed, + size_array); + snd_printdd(LXP "starting: needed %d, freed %d\n", + needed, freed); + + err = lx_buffer_give(chip, 0, is_capture, period_bytes, + lower_32_bits(buf), upper_32_bits(buf), + &buffer_index); + + snd_printdd(LXP "starting: buffer index %x on %p (%d bytes)\n", + buffer_index, (void *)buf, period_bytes); + buf += period_bytes; + } + + err = lx_buffer_ask(chip, 0, is_capture, &needed, &freed, size_array); + snd_printdd(LXP "starting: needed %d, freed %d\n", needed, freed); + + snd_printd(LXP "starting: starting stream\n"); + err = lx_stream_start(chip, 0, is_capture); + if (err < 0) + snd_printk(KERN_ERR LXP "couldn't start stream\n"); + else + lx_stream->status = LX_STREAM_STATUS_RUNNING; + + lx_stream->frame_pos = 0; +} + +static void lx_trigger_stop(struct lx6464es *chip, struct lx_stream *lx_stream) +{ + const int is_capture = lx_stream->is_capture; + int err; + + snd_printd(LXP "stopping: stopping stream\n"); + err = lx_stream_stop(chip, 0, is_capture); + if (err < 0) + snd_printk(KERN_ERR LXP "couldn't stop stream\n"); + else + lx_stream->status = LX_STREAM_STATUS_FREE; + +} + +static void lx_trigger_tasklet_dispatch_stream(struct lx6464es *chip, + struct lx_stream *lx_stream) +{ + switch (lx_stream->status) { + case LX_STREAM_STATUS_SCHEDULE_RUN: + lx_trigger_start(chip, lx_stream); + break; + + case LX_STREAM_STATUS_SCHEDULE_STOP: + lx_trigger_stop(chip, lx_stream); + break; + + default: + break; + } +} + +static void lx_trigger_tasklet(unsigned long data) +{ + struct lx6464es *chip = (struct lx6464es *)data; + unsigned long flags; + + snd_printdd("->lx_trigger_tasklet\n"); + + spin_lock_irqsave(&chip->lock, flags); + lx_trigger_tasklet_dispatch_stream(chip, &chip->capture_stream); + lx_trigger_tasklet_dispatch_stream(chip, &chip->playback_stream); + spin_unlock_irqrestore(&chip->lock, flags); +} + +static int lx_pcm_trigger_dispatch(struct lx6464es *chip, + struct lx_stream *lx_stream, int cmd) +{ + int err = 0; + + switch (cmd) { + case SNDRV_PCM_TRIGGER_START: + lx_stream->status = LX_STREAM_STATUS_SCHEDULE_RUN; + break; + + case SNDRV_PCM_TRIGGER_STOP: + lx_stream->status = LX_STREAM_STATUS_SCHEDULE_STOP; + break; + + default: + err = -EINVAL; + goto exit; + } + tasklet_schedule(&chip->trigger_tasklet); + +exit: + return err; +} + + +static int lx_pcm_trigger(struct snd_pcm_substream *substream, int cmd) +{ + struct lx6464es *chip = snd_pcm_substream_chip(substream); + const int is_capture = (substream->stream == SNDRV_PCM_STREAM_CAPTURE); + struct lx_stream *stream = is_capture ? 
&chip->capture_stream : + &chip->playback_stream; + + snd_printdd("->lx_pcm_trigger\n"); + + return lx_pcm_trigger_dispatch(chip, stream, cmd); +} + +static int snd_lx6464es_free(struct lx6464es *chip) +{ + snd_printdd("->snd_lx6464es_free\n"); + + lx_irq_disable(chip); + + if (chip->irq >= 0) + free_irq(chip->irq, chip); + + iounmap(chip->port_dsp_bar); + ioport_unmap(chip->port_plx_remapped); + + pci_release_regions(chip->pci); + pci_disable_device(chip->pci); + + kfree(chip); + + return 0; +} + +static int snd_lx6464es_dev_free(struct snd_device *device) +{ + return snd_lx6464es_free(device->device_data); +} + +/* reset the dsp during initialization */ +static int __devinit lx_init_xilinx_reset(struct lx6464es *chip) +{ + int i; + u32 plx_reg = lx_plx_reg_read(chip, ePLX_CHIPSC); + + snd_printdd("->lx_init_xilinx_reset\n"); + + /* activate reset of xilinx */ + plx_reg &= ~CHIPSC_RESET_XILINX; + + lx_plx_reg_write(chip, ePLX_CHIPSC, plx_reg); + msleep(1); + + lx_plx_reg_write(chip, ePLX_MBOX3, 0); + msleep(1); + + plx_reg |= CHIPSC_RESET_XILINX; + lx_plx_reg_write(chip, ePLX_CHIPSC, plx_reg); + + /* deactivate reset of xilinx */ + for (i = 0; i != 100; ++i) { + u32 reg_mbox3; + msleep(10); + reg_mbox3 = lx_plx_reg_read(chip, ePLX_MBOX3); + if (reg_mbox3) { + snd_printd(LXP "xilinx reset done\n"); + snd_printdd(LXP "xilinx took %d loops\n", i); + break; + } + } + + /* todo: add some error handling? */ + + /* clear mr */ + lx_dsp_reg_write(chip, eReg_CSM, 0); + + /* the ES xilinx may not be ready yet, so wait for it. */ + msleep(600); + + return 0; +} + +static int __devinit lx_init_xilinx_test(struct lx6464es *chip) +{ + u32 reg; + + snd_printdd("->lx_init_xilinx_test\n"); + + /* TEST if we have access to Xilinx/MicroBlaze */ + lx_dsp_reg_write(chip, eReg_CSM, 0); + + reg = lx_dsp_reg_read(chip, eReg_CSM); + + if (reg) { + snd_printk(KERN_ERR LXP "Problem: Reg_CSM %x.\n", reg); + + /* PCI9056_SPACE0_REMAP */ + lx_plx_reg_write(chip, ePLX_PCICR, 1); + + reg = lx_dsp_reg_read(chip, eReg_CSM); + if (reg) { + snd_printk(KERN_ERR LXP "Error: Reg_CSM %x.\n", reg); + return -EAGAIN; /* seems to be appropriate */ + } + } + + snd_printd(LXP "Xilinx/MicroBlaze access test successful\n"); + + return 0; +} + +/* initialize ethersound */ +static int __devinit lx_init_ethersound_config(struct lx6464es *chip) +{ + int i; + u32 orig_conf_es = lx_dsp_reg_read(chip, eReg_CONFES); + + u32 default_conf_es = (64 << IOCR_OUTPUTS_OFFSET) | + (64 << IOCR_INPUTS_OFFSET) | + (FREQ_RATIO_SINGLE_MODE << FREQ_RATIO_OFFSET); + + u32 conf_es = (orig_conf_es & CONFES_READ_PART_MASK) + | (default_conf_es & CONFES_WRITE_PART_MASK); + + snd_printdd("->lx_init_ethersound\n"); + + chip->freq_ratio = FREQ_RATIO_SINGLE_MODE; + + /* + * write it to the card ! + * this actually kicks the ES xilinx, the first time since poweron. + * the MAC address in the Reg_ADMACESMSB Reg_ADMACESLSB registers + * is not ready before this is done, and the bit 2 in Reg_CSES is set.
+ * */ + lx_dsp_reg_write(chip, eReg_CONFES, conf_es); + + for (i = 0; i != 1000; ++i) { + if (lx_dsp_reg_read(chip, eReg_CSES) & 4) { + snd_printd(LXP "ethersound initialized after %dms\n", + i); + goto ethersound_initialized; + } + msleep(1); + } + snd_printk(KERN_WARNING LXP + "ethersound could not be initialized after %dms\n", i); + return -ETIMEDOUT; + + ethersound_initialized: + snd_printd(LXP "ethersound initialized\n"); + return 0; +} + +static int __devinit lx_init_get_version_features(struct lx6464es *chip) +{ + u32 dsp_version; + + int err; + + snd_printdd("->lx_init_get_version_features\n"); + + err = lx_dsp_get_version(chip, &dsp_version); + + if (err == 0) { + u32 freq; + + snd_printk(LXP "DSP version: V%02d.%02d #%d\n", + (dsp_version>>16) & 0xff, (dsp_version>>8) & 0xff, + dsp_version & 0xff); + + /* later: what firmware version do we expect? */ + + /* retrieve Play/Rec features */ + /* done here because we may have to handle alternate + * DSP files. */ + /* later */ + + /* init the EtherSound sample rate */ + err = lx_dsp_get_clock_frequency(chip, &freq); + if (err == 0) + chip->board_sample_rate = freq; + snd_printd(LXP "actual clock frequency %d\n", freq); + } else { + snd_printk(KERN_ERR LXP "DSP corrupted\n"); + err = -EAGAIN; + } + + return err; +} + +static int lx_set_granularity(struct lx6464es *chip, u32 gran) +{ + int err = 0; + u32 snapped_gran = MICROBLAZE_IBL_MIN; + + snd_printdd("->lx_set_granularity\n"); + + /* blocksize is a power of 2 */ + while ((snapped_gran < gran) && + (snapped_gran < MICROBLAZE_IBL_MAX)) { + snapped_gran *= 2; + } + + if (snapped_gran == chip->pcm_granularity) + return 0; + + err = lx_dsp_set_granularity(chip, snapped_gran); + if (err < 0) { + snd_printk(KERN_WARNING LXP "could not set granularity\n"); + err = -EAGAIN; + } + + if (snapped_gran != gran) + snd_printk(LXP "snapped blocksize to %d\n", snapped_gran); + + snd_printd(LXP "set blocksize on board %d\n", snapped_gran); + chip->pcm_granularity = snapped_gran; + + return err; +} + +/* initialize and test the xilinx dsp chip */ +static int __devinit lx_init_dsp(struct lx6464es *chip) +{ + int err; + u8 mac_address[6]; + int i; + + snd_printdd("->lx_init_dsp\n"); + + snd_printd(LXP "initialize board\n"); + err = lx_init_xilinx_reset(chip); + if (err) + return err; + + snd_printd(LXP "testing board\n"); + err = lx_init_xilinx_test(chip); + if (err) + return err; + + snd_printd(LXP "initialize ethersound configuration\n"); + err = lx_init_ethersound_config(chip); + if (err) + return err; + + lx_irq_enable(chip); + + /** \todo the mac address should be ready by now, but it isn't, + * so we wait for it */ + for (i = 0; i != 1000; ++i) { + err = lx_dsp_get_mac(chip, mac_address); + if (err) + return err; + if (mac_address[0] || mac_address[1] || mac_address[2] || + mac_address[3] || mac_address[4] || mac_address[5]) + goto mac_ready; + msleep(1); + } + return -ETIMEDOUT; + +mac_ready: + snd_printd(LXP "mac address ready after %dms\n", i); + snd_printk(LXP "mac address: %02X.%02X.%02X.%02X.%02X.%02X\n", + mac_address[0], mac_address[1], mac_address[2], + mac_address[3], mac_address[4], mac_address[5]); + + err = lx_init_get_version_features(chip); + if (err) + return err; + + lx_set_granularity(chip, MICROBLAZE_IBL_DEFAULT); + + chip->playback_mute = 0; + + return err; +} + +static struct snd_pcm_ops lx_ops_playback = { + .open = lx_pcm_open, + .close = lx_pcm_close, + .ioctl = snd_pcm_lib_ioctl, + .prepare = lx_pcm_prepare, + .hw_params = lx_pcm_hw_params_playback, + .hw_free =
lx_pcm_hw_free, + .trigger = lx_pcm_trigger, + .pointer = lx_pcm_stream_pointer, +}; + +static struct snd_pcm_ops lx_ops_capture = { + .open = lx_pcm_open, + .close = lx_pcm_close, + .ioctl = snd_pcm_lib_ioctl, + .prepare = lx_pcm_prepare, + .hw_params = lx_pcm_hw_params_capture, + .hw_free = lx_pcm_hw_free, + .trigger = lx_pcm_trigger, + .pointer = lx_pcm_stream_pointer, +}; + +static int __devinit lx_pcm_create(struct lx6464es *chip) +{ + int err; + struct snd_pcm *pcm; + + u32 size = 64 * /* channels */ + 3 * /* 24 bit samples */ + MAX_STREAM_BUFFER * /* periods */ + MICROBLAZE_IBL_MAX * /* frames per period */ + 2; /* duplex */ + + size = PAGE_ALIGN(size); + + /* hardcoded device name & channel count */ + err = snd_pcm_new(chip->card, (char *)card_name, 0, + 1, 1, &pcm); + + pcm->private_data = chip; + + snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &lx_ops_playback); + snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, &lx_ops_capture); + + pcm->info_flags = 0; + strcpy(pcm->name, card_name); + + err = snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV, + snd_dma_pci_data(chip->pci), + size, size); + if (err < 0) + return err; + + chip->pcm = pcm; + chip->capture_stream.is_capture = 1; + + return 0; +} + +static int lx_control_playback_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN; + uinfo->count = 1; + uinfo->value.integer.min = 0; + uinfo->value.integer.max = 1; + return 0; +} + +static int lx_control_playback_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct lx6464es *chip = snd_kcontrol_chip(kcontrol); + ucontrol->value.integer.value[0] = chip->playback_mute; + return 0; +} + +static int lx_control_playback_put(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct lx6464es *chip = snd_kcontrol_chip(kcontrol); + int changed = 0; + int current_value = chip->playback_mute; + + if (current_value != ucontrol->value.integer.value[0]) { + lx_level_unmute(chip, 0, !current_value); + chip->playback_mute = !current_value; + changed = 1; + } + return changed; +} + +static struct snd_kcontrol_new lx_control_playback_switch __devinitdata = { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "PCM Playback Switch", + .index = 0, + .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, + .private_value = 0, + .info = lx_control_playback_info, + .get = lx_control_playback_get, + .put = lx_control_playback_put +}; + + + +static void lx_proc_levels_read(struct snd_info_entry *entry, + struct snd_info_buffer *buffer) +{ + u32 levels[64]; + int err; + int i, j; + struct lx6464es *chip = entry->private_data; + + snd_iprintf(buffer, "capture levels:\n"); + err = lx_level_peaks(chip, 1, 64, levels); + if (err < 0) + return; + + for (i = 0; i != 8; ++i) { + for (j = 0; j != 8; ++j) + snd_iprintf(buffer, "%08x ", levels[i*8+j]); + snd_iprintf(buffer, "\n"); + } + + snd_iprintf(buffer, "\nplayback levels:\n"); + + err = lx_level_peaks(chip, 0, 64, levels); + if (err < 0) + return; + + for (i = 0; i != 8; ++i) { + for (j = 0; j != 8; ++j) + snd_iprintf(buffer, "%08x ", levels[i*8+j]); + snd_iprintf(buffer, "\n"); + } + + snd_iprintf(buffer, "\n"); +} + +static int __devinit lx_proc_create(struct snd_card *card, struct lx6464es *chip) +{ + struct snd_info_entry *entry; + int err = snd_card_proc_new(card, "levels", &entry); + if (err < 0) + return err; + + snd_info_set_text_ops(entry, chip, lx_proc_levels_read); + return 0; +} + + +static int __devinit snd_lx6464es_create(struct 
snd_card *card, + struct pci_dev *pci, + struct lx6464es **rchip) +{ + struct lx6464es *chip; + int err; + + static struct snd_device_ops ops = { + .dev_free = snd_lx6464es_dev_free, + }; + + snd_printdd("->snd_lx6464es_create\n"); + + *rchip = NULL; + + /* enable PCI device */ + err = pci_enable_device(pci); + if (err < 0) + return err; + + pci_set_master(pci); + + /* check if we can restrict PCI DMA transfers to 32 bits */ + err = pci_set_dma_mask(pci, DMA_32BIT_MASK); + if (err < 0) { + snd_printk(KERN_ERR "architecture does not support " + "32bit PCI busmaster DMA\n"); + pci_disable_device(pci); + return -ENXIO; + } + + chip = kzalloc(sizeof(*chip), GFP_KERNEL); + if (chip == NULL) { + err = -ENOMEM; + goto alloc_failed; + } + + chip->card = card; + chip->pci = pci; + chip->irq = -1; + + /* initialize synchronization structs */ + spin_lock_init(&chip->lock); + spin_lock_init(&chip->msg_lock); + mutex_init(&chip->setup_mutex); + tasklet_init(&chip->trigger_tasklet, lx_trigger_tasklet, + (unsigned long)chip); + tasklet_init(&chip->tasklet_capture, lx_tasklet_capture, + (unsigned long)chip); + tasklet_init(&chip->tasklet_playback, lx_tasklet_playback, + (unsigned long)chip); + + /* request resources */ + err = pci_request_regions(pci, card_name); + if (err < 0) + goto request_regions_failed; + + /* plx port */ + chip->port_plx = pci_resource_start(pci, 1); + chip->port_plx_remapped = ioport_map(chip->port_plx, + pci_resource_len(pci, 1)); + + /* dsp port */ + chip->port_dsp_bar = pci_ioremap_bar(pci, 2); + + err = request_irq(pci->irq, lx_interrupt, IRQF_SHARED, + card_name, chip); + if (err) { + snd_printk(KERN_ERR LXP "unable to grab IRQ %d\n", pci->irq); + goto request_irq_failed; + } + chip->irq = pci->irq; + + err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops); + if (err < 0) + goto device_new_failed; + + err = lx_init_dsp(chip); + if (err < 0) { + snd_printk(KERN_ERR LXP "error during DSP initialization\n"); + return err; + } + + err = lx_pcm_create(chip); + if (err < 0) + return err; + + err = lx_proc_create(card, chip); + if (err < 0) + return err; + + err = snd_ctl_add(card, snd_ctl_new1(&lx_control_playback_switch, + chip)); + if (err < 0) + return err; + + snd_card_set_dev(card, &pci->dev); + + *rchip = chip; + return 0; + +device_new_failed: + free_irq(pci->irq, chip); + +request_irq_failed: + pci_release_regions(pci); + +request_regions_failed: + kfree(chip); + +alloc_failed: + pci_disable_device(pci); + + return err; +} + +static int __devinit snd_lx6464es_probe(struct pci_dev *pci, + const struct pci_device_id *pci_id) +{ + static int dev; + struct snd_card *card; + struct lx6464es *chip; + int err; + + snd_printdd("->snd_lx6464es_probe\n"); + + if (dev >= SNDRV_CARDS) + return -ENODEV; + if (!enable[dev]) { + dev++; + return -ENOENT; + } + + card = snd_card_new(index[dev], id[dev], THIS_MODULE, 0); + if (card == NULL) + return -ENOMEM; + + err = snd_lx6464es_create(card, pci, &chip); + if (err < 0) { + snd_printk(KERN_ERR LXP "error during snd_lx6464es_create\n"); + goto out_free; + } + + strcpy(card->driver, "lx6464es"); + strcpy(card->shortname, "Digigram LX6464ES"); + sprintf(card->longname, "%s at 0x%lx, 0x%p, irq %i", + card->shortname, chip->port_plx, + chip->port_dsp_bar, chip->irq); + + err = snd_card_register(card); + if (err < 0) + goto out_free; + + snd_printdd(LXP "initialization successful\n"); + pci_set_drvdata(pci, card); + dev++; + return 0; + +out_free: + snd_card_free(card); + return err; + +} + +static void __devexit snd_lx6464es_remove(struct 
pci_dev *pci) +{ + snd_card_free(pci_get_drvdata(pci)); + pci_set_drvdata(pci, NULL); +} + + +static struct pci_driver driver = { + .name = "Digigram LX6464ES", + .id_table = snd_lx6464es_ids, + .probe = snd_lx6464es_probe, + .remove = __devexit_p(snd_lx6464es_remove), +}; + + +/* module initialization */ +static int __init mod_init(void) +{ + return pci_register_driver(&driver); +} + +static void __exit mod_exit(void) +{ + pci_unregister_driver(&driver); +} + +module_init(mod_init); +module_exit(mod_exit); diff --git a/sound/pci/lx6464es/lx6464es.h b/sound/pci/lx6464es/lx6464es.h new file mode 100644 index 00000000000..012c010c8c8 --- /dev/null +++ b/sound/pci/lx6464es/lx6464es.h @@ -0,0 +1,114 @@ +/* -*- linux-c -*- * + * + * ALSA driver for the digigram lx6464es interface + * + * Copyright (c) 2009 Tim Blechmann + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +#ifndef LX6464ES_H +#define LX6464ES_H + +#include +#include + +#include +#include + +#include "lx_core.h" + +#define LXP "LX6464ES: " + +enum { + ES_cmd_free = 0, /* no command executing */ + ES_cmd_processing = 1, /* execution of a read/write command */ + ES_read_pending = 2, /* a asynchron read command is pending */ + ES_read_finishing = 3, /* a read command has finished waiting (set by + * Interrupt or CancelIrp) */ +}; + +enum lx_stream_status { + LX_STREAM_STATUS_FREE, +/* LX_STREAM_STATUS_OPEN, */ + LX_STREAM_STATUS_SCHEDULE_RUN, +/* LX_STREAM_STATUS_STARTED, */ + LX_STREAM_STATUS_RUNNING, + LX_STREAM_STATUS_SCHEDULE_STOP, +/* LX_STREAM_STATUS_STOPPED, */ +/* LX_STREAM_STATUS_PAUSED */ +}; + + +struct lx_stream { + struct snd_pcm_substream *stream; + snd_pcm_uframes_t frame_pos; + enum lx_stream_status status; /* free, open, running, draining + * pause */ + int is_capture:1; +}; + + +struct lx6464es { + struct snd_card *card; + struct pci_dev *pci; + int irq; + + spinlock_t lock; /* interrupt spinlock */ + struct mutex setup_mutex; /* mutex used in hw_params, open + * and close */ + + struct tasklet_struct trigger_tasklet; /* trigger tasklet */ + struct tasklet_struct tasklet_capture; + struct tasklet_struct tasklet_playback; + + /* ports */ + unsigned long port_plx; /* io port (size=256) */ + void __iomem *port_plx_remapped; /* remapped plx port */ + void __iomem *port_dsp_bar; /* memory port (32-bit, + * non-prefetchable, + * size=8K) */ + + /* messaging */ + spinlock_t msg_lock; /* message spinlock */ + atomic_t send_message_locked; + struct lx_rmh rmh; + + /* configuration */ + uint freq_ratio : 2; + uint playback_mute : 1; + uint hardware_running[2]; + u32 board_sample_rate; /* sample rate read from + * board */ + u32 sample_rate; /* our sample rate */ + u16 pcm_granularity; /* board blocksize */ + + /* dma */ + struct snd_dma_buffer capture_dma_buf; + struct snd_dma_buffer playback_dma_buf; + + /* pcm */ + 
struct snd_pcm *pcm; + + /* streams */ + struct lx_stream capture_stream; + struct lx_stream playback_stream; +}; + + +#endif /* LX6464ES_H */ diff --git a/sound/pci/lx6464es/lx_core.c b/sound/pci/lx6464es/lx_core.c new file mode 100644 index 00000000000..a9f8f882b10 --- /dev/null +++ b/sound/pci/lx6464es/lx_core.c @@ -0,0 +1,1442 @@ +/* -*- linux-c -*- * + * + * ALSA driver for the digigram lx6464es interface + * low-level interface + * + * Copyright (c) 2009 Tim Blechmann + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +/* #define RMH_DEBUG 1 */ + +#include +#include +#include + +#include "lx6464es.h" +#include "lx_core.h" + +/* low-level register access */ + +static const unsigned long dsp_port_offsets[] = { + 0, + 0x400, + 0x401, + 0x402, + 0x403, + 0x404, + 0x405, + 0x406, + 0x407, + 0x408, + 0x409, + 0x40a, + 0x40b, + 0x40c, + + 0x410, + 0x411, + 0x412, + 0x413, + 0x414, + 0x415, + 0x416, + + 0x420, + 0x430, + 0x431, + 0x432, + 0x433, + 0x434, + 0x440 +}; + +static void __iomem *lx_dsp_register(struct lx6464es *chip, int port) +{ + void __iomem *base_address = chip->port_dsp_bar; + return base_address + dsp_port_offsets[port]*4; +} + +unsigned long lx_dsp_reg_read(struct lx6464es *chip, int port) +{ + void __iomem *address = lx_dsp_register(chip, port); + return ioread32(address); +} + +void lx_dsp_reg_readbuf(struct lx6464es *chip, int port, u32 *data, u32 len) +{ + void __iomem *address = lx_dsp_register(chip, port); + memcpy_fromio(data, address, len*sizeof(u32)); +} + + +void lx_dsp_reg_write(struct lx6464es *chip, int port, unsigned data) +{ + void __iomem *address = lx_dsp_register(chip, port); + iowrite32(data, address); +} + +void lx_dsp_reg_writebuf(struct lx6464es *chip, int port, const u32 *data, + u32 len) +{ + void __iomem *address = lx_dsp_register(chip, port); + memcpy_toio(address, data, len*sizeof(u32)); +} + + +static const unsigned long plx_port_offsets[] = { + 0x04, + 0x40, + 0x44, + 0x48, + 0x4c, + 0x50, + 0x54, + 0x58, + 0x5c, + 0x64, + 0x68, + 0x6C +}; + +static void __iomem *lx_plx_register(struct lx6464es *chip, int port) +{ + void __iomem *base_address = chip->port_plx_remapped; + return base_address + plx_port_offsets[port]; +} + +unsigned long lx_plx_reg_read(struct lx6464es *chip, int port) +{ + void __iomem *address = lx_plx_register(chip, port); + return ioread32(address); +} + +void lx_plx_reg_write(struct lx6464es *chip, int port, u32 data) +{ + void __iomem *address = lx_plx_register(chip, port); + iowrite32(data, address); +} + +u32 lx_plx_mbox_read(struct lx6464es *chip, int mbox_nr) +{ + int index; + + switch (mbox_nr) { + case 1: + index = ePLX_MBOX1; break; + case 2: + index = ePLX_MBOX2; break; + case 3: + index = ePLX_MBOX3; break; + case 4: + index = ePLX_MBOX4; break; + case 5: + index = ePLX_MBOX5; break; + case 6: + index = 
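/*
 * (Design note: a lookup table, e.g.
 *	static const int mbox_map[] = { -1, ePLX_MBOX1, ePLX_MBOX2,
 *					ePLX_MBOX3, ePLX_MBOX4, ePLX_MBOX5,
 *					ePLX_MBOX6, ePLX_MBOX7 };
 * would be more compact than these switches, but the switch form makes
 * the reserved slots explicit: mailbox 0 carries the HF flags, and
 * mailbox 2 additionally mirrors the pipe states on the write side.)
 */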
ePLX_MBOX6; break; + case 7: + index = ePLX_MBOX7; break; + case 0: /* reserved for HF flags */ + snd_BUG(); + default: + return 0xdeadbeef; + } + + return lx_plx_reg_read(chip, index); +} + +int lx_plx_mbox_write(struct lx6464es *chip, int mbox_nr, u32 value) +{ + int index = -1; + + switch (mbox_nr) { + case 1: + index = ePLX_MBOX1; break; + case 3: + index = ePLX_MBOX3; break; + case 4: + index = ePLX_MBOX4; break; + case 5: + index = ePLX_MBOX5; break; + case 6: + index = ePLX_MBOX6; break; + case 7: + index = ePLX_MBOX7; break; + case 0: /* reserved for HF flags */ + case 2: /* reserved for Pipe States + * the DSP keeps an image of it */ + snd_BUG(); + return -EBADRQC; + } + + lx_plx_reg_write(chip, index, value); + return 0; +} + + +/* rmh */ + +#ifdef CONFIG_SND_DEBUG +#define CMD_NAME(a) a +#else +#define CMD_NAME(a) NULL +#endif + +#define Reg_CSM_MR 0x00000002 +#define Reg_CSM_MC 0x00000001 + +struct dsp_cmd_info { + u32 dcCodeOp; /* Op Code of the command (usually 1st 24-bits + * word).*/ + u16 dcCmdLength; /* Command length in words of 24 bits.*/ + u16 dcStatusType; /* Status type: 0 for fixed length, 1 for + * random. */ + u16 dcStatusLength; /* Status length (if fixed).*/ + char *dcOpName; +}; + +/* + Initialization and control data for the Microblaze interface + - OpCode: + the opcode field of the command set at the proper offset + - CmdLength + the number of command words + - StatusType + offset in the status registers: 0 means that the return value may be + different from 0, and must be read + - StatusLength + the number of status words (in addition to the return value) +*/ + +static struct dsp_cmd_info dsp_commands[] = +{ + { (CMD_00_INFO_DEBUG << OPCODE_OFFSET) , 1 /*custom*/ + , 1 , 0 /**/ , CMD_NAME("INFO_DEBUG") }, + { (CMD_01_GET_SYS_CFG << OPCODE_OFFSET) , 1 /**/ + , 1 , 2 /**/ , CMD_NAME("GET_SYS_CFG") }, + { (CMD_02_SET_GRANULARITY << OPCODE_OFFSET) , 1 /**/ + , 1 , 0 /**/ , CMD_NAME("SET_GRANULARITY") }, + { (CMD_03_SET_TIMER_IRQ << OPCODE_OFFSET) , 1 /**/ + , 1 , 0 /**/ , CMD_NAME("SET_TIMER_IRQ") }, + { (CMD_04_GET_EVENT << OPCODE_OFFSET) , 1 /**/ + , 1 , 0 /*up to 10*/ , CMD_NAME("GET_EVENT") }, + { (CMD_05_GET_PIPES << OPCODE_OFFSET) , 1 /**/ + , 1 , 2 /*up to 4*/ , CMD_NAME("GET_PIPES") }, + { (CMD_06_ALLOCATE_PIPE << OPCODE_OFFSET) , 1 /**/ + , 0 , 0 /**/ , CMD_NAME("ALLOCATE_PIPE") }, + { (CMD_07_RELEASE_PIPE << OPCODE_OFFSET) , 1 /**/ + , 0 , 0 /**/ , CMD_NAME("RELEASE_PIPE") }, + { (CMD_08_ASK_BUFFERS << OPCODE_OFFSET) , 1 /**/ + , 1 , MAX_STREAM_BUFFER , CMD_NAME("ASK_BUFFERS") }, + { (CMD_09_STOP_PIPE << OPCODE_OFFSET) , 1 /**/ + , 0 , 0 /*up to 2*/ , CMD_NAME("STOP_PIPE") }, + { (CMD_0A_GET_PIPE_SPL_COUNT << OPCODE_OFFSET) , 1 /**/ + , 1 , 1 /*up to 2*/ , CMD_NAME("GET_PIPE_SPL_COUNT") }, + { (CMD_0B_TOGGLE_PIPE_STATE << OPCODE_OFFSET) , 1 /*up to 5*/ + , 1 , 0 /**/ , CMD_NAME("TOGGLE_PIPE_STATE") }, + { (CMD_0C_DEF_STREAM << OPCODE_OFFSET) , 1 /*up to 4*/ + , 1 , 0 /**/ , CMD_NAME("DEF_STREAM") }, + { (CMD_0D_SET_MUTE << OPCODE_OFFSET) , 3 /**/ + , 1 , 0 /**/ , CMD_NAME("SET_MUTE") }, + { (CMD_0E_GET_STREAM_SPL_COUNT << OPCODE_OFFSET) , 1/**/ + , 1 , 2 /**/ , CMD_NAME("GET_STREAM_SPL_COUNT") }, + { (CMD_0F_UPDATE_BUFFER << OPCODE_OFFSET) , 3 /*up to 4*/ + , 0 , 1 /**/ , CMD_NAME("UPDATE_BUFFER") }, + { (CMD_10_GET_BUFFER << OPCODE_OFFSET) , 1 /**/ + , 1 , 4 /**/ , CMD_NAME("GET_BUFFER") }, + { (CMD_11_CANCEL_BUFFER << OPCODE_OFFSET) , 1 /**/ + , 1 , 1 /*up to 4*/ , CMD_NAME("CANCEL_BUFFER") }, + { (CMD_12_GET_PEAK << OPCODE_OFFSET) , 1 /**/ + , 1 , 1 
/**/ , CMD_NAME("GET_PEAK") }, + { (CMD_13_SET_STREAM_STATE << OPCODE_OFFSET) , 1 /**/ + , 1 , 0 /**/ , CMD_NAME("SET_STREAM_STATE") }, +}; + +static void lx_message_init(struct lx_rmh *rmh, enum cmd_mb_opcodes cmd) +{ + snd_BUG_ON(cmd >= CMD_14_INVALID); + + rmh->cmd[0] = dsp_commands[cmd].dcCodeOp; + rmh->cmd_len = dsp_commands[cmd].dcCmdLength; + rmh->stat_len = dsp_commands[cmd].dcStatusLength; + rmh->dsp_stat = dsp_commands[cmd].dcStatusType; + rmh->cmd_idx = cmd; + memset(&rmh->cmd[1], 0, (REG_CRM_NUMBER - 1) * sizeof(u32)); + +#ifdef CONFIG_SND_DEBUG + memset(rmh->stat, 0, REG_CRM_NUMBER * sizeof(u32)); +#endif +#ifdef RMH_DEBUG + rmh->cmd_idx = cmd; +#endif +} + +#ifdef RMH_DEBUG +#define LXRMH "lx6464es rmh: " +static void lx_message_dump(struct lx_rmh *rmh) +{ + u8 idx = rmh->cmd_idx; + int i; + + snd_printk(LXRMH "command %s\n", dsp_commands[idx].dcOpName); + + for (i = 0; i != rmh->cmd_len; ++i) + snd_printk(LXRMH "\tcmd[%d] %08x\n", i, rmh->cmd[i]); + + for (i = 0; i != rmh->stat_len; ++i) + snd_printk(LXRMH "\tstat[%d]: %08x\n", i, rmh->stat[i]); + snd_printk("\n"); +} +#else +static inline void lx_message_dump(struct lx_rmh *rmh) +{} +#endif + + + +/* sleep 500 - 100 = 400 times 100us -> the timeout is >= 40 ms */ +#define XILINX_TIMEOUT_MS 40 +#define XILINX_POLL_NO_SLEEP 100 +#define XILINX_POLL_ITERATIONS 150 + +static int lx_message_send(struct lx6464es *chip, struct lx_rmh *rmh) +{ + u32 reg = ED_DSP_TIMED_OUT; + int dwloop; + int answer_received; + + if (lx_dsp_reg_read(chip, eReg_CSM) & (Reg_CSM_MC | Reg_CSM_MR)) { + snd_printk(KERN_ERR LXP "PIOSendMessage eReg_CSM %x\n", reg); + return -EBUSY; + } + + /* write command */ + lx_dsp_reg_writebuf(chip, eReg_CRM1, rmh->cmd, rmh->cmd_len); + + snd_BUG_ON(atomic_read(&chip->send_message_locked) != 0); + atomic_set(&chip->send_message_locked, 1); + + /* MicoBlaze gogogo */ + lx_dsp_reg_write(chip, eReg_CSM, Reg_CSM_MC); + + /* wait for interrupt to answer */ + for (dwloop = 0; dwloop != XILINX_TIMEOUT_MS; ++dwloop) { + answer_received = atomic_read(&chip->send_message_locked); + if (answer_received == 0) + break; + msleep(1); + } + + if (answer_received == 0) { + /* in Debug mode verify Reg_CSM_MR */ + snd_BUG_ON(!(lx_dsp_reg_read(chip, eReg_CSM) & Reg_CSM_MR)); + + /* command finished, read status */ + if (rmh->dsp_stat == 0) + reg = lx_dsp_reg_read(chip, eReg_CRM1); + else + reg = 0; + } else { + int i; + snd_printk(KERN_WARNING LXP "TIMEOUT lx_message_send! " + "Interrupts disabled?\n"); + + /* attente bit Reg_CSM_MR */ + for (i = 0; i != XILINX_POLL_ITERATIONS; i++) { + if ((lx_dsp_reg_read(chip, eReg_CSM) & Reg_CSM_MR)) { + if (rmh->dsp_stat == 0) + reg = lx_dsp_reg_read(chip, eReg_CRM1); + else + reg = 0; + goto polling_successful; + } + + if (i > XILINX_POLL_NO_SLEEP) + msleep(1); + } + snd_printk(KERN_WARNING LXP "TIMEOUT lx_message_send! 
" + "polling failed\n"); + +polling_successful: + atomic_set(&chip->send_message_locked, 0); + } + + if ((reg & ERROR_VALUE) == 0) { + /* read response */ + if (rmh->stat_len) { + snd_BUG_ON(rmh->stat_len >= (REG_CRM_NUMBER-1)); + + lx_dsp_reg_readbuf(chip, eReg_CRM2, rmh->stat, + rmh->stat_len); + } + } else + snd_printk(KERN_WARNING LXP "lx_message_send: error_value %x\n", + reg); + + /* clear Reg_CSM_MR */ + lx_dsp_reg_write(chip, eReg_CSM, 0); + + switch (reg) { + case ED_DSP_TIMED_OUT: + snd_printk(KERN_WARNING LXP "lx_message_send: dsp timeout\n"); + return -ETIMEDOUT; + + case ED_DSP_CRASHED: + snd_printk(KERN_WARNING LXP "lx_message_send: dsp crashed\n"); + return -EAGAIN; + } + + lx_message_dump(rmh); + return 0; +} + +static int lx_message_send_atomic(struct lx6464es *chip, struct lx_rmh *rmh) +{ + u32 reg = ED_DSP_TIMED_OUT; + int dwloop; + + if (lx_dsp_reg_read(chip, eReg_CSM) & (Reg_CSM_MC | Reg_CSM_MR)) { + snd_printk(KERN_ERR LXP "PIOSendMessage eReg_CSM %x\n", reg); + return -EBUSY; + } + + /* write command */ + lx_dsp_reg_writebuf(chip, eReg_CRM1, rmh->cmd, rmh->cmd_len); + + /* MicoBlaze gogogo */ + lx_dsp_reg_write(chip, eReg_CSM, Reg_CSM_MC); + + /* wait for interrupt to answer */ + for (dwloop = 0; dwloop != XILINX_TIMEOUT_MS * 1000; ++dwloop) { + if (lx_dsp_reg_read(chip, eReg_CSM) & Reg_CSM_MR) { + if (rmh->dsp_stat == 0) + reg = lx_dsp_reg_read(chip, eReg_CRM1); + else + reg = 0; + goto polling_successful; + } else + udelay(1); + } + snd_printk(KERN_WARNING LXP "TIMEOUT lx_message_send_atomic! " + "polling failed\n"); + +polling_successful: + if ((reg & ERROR_VALUE) == 0) { + /* read response */ + if (rmh->stat_len) { + snd_BUG_ON(rmh->stat_len >= (REG_CRM_NUMBER-1)); + lx_dsp_reg_readbuf(chip, eReg_CRM2, rmh->stat, + rmh->stat_len); + } + } else + snd_printk(LXP "rmh error: %08x\n", reg); + + /* clear Reg_CSM_MR */ + lx_dsp_reg_write(chip, eReg_CSM, 0); + + switch (reg) { + case ED_DSP_TIMED_OUT: + snd_printk(KERN_WARNING LXP "lx_message_send: dsp timeout\n"); + return -ETIMEDOUT; + + case ED_DSP_CRASHED: + snd_printk(KERN_WARNING LXP "lx_message_send: dsp crashed\n"); + return -EAGAIN; + } + + lx_message_dump(rmh); + + return reg; +} + + +/* low-level dsp access */ +int __devinit lx_dsp_get_version(struct lx6464es *chip, u32 *rdsp_version) +{ + u16 ret; + unsigned long flags; + + spin_lock_irqsave(&chip->msg_lock, flags); + + lx_message_init(&chip->rmh, CMD_01_GET_SYS_CFG); + ret = lx_message_send_atomic(chip, &chip->rmh); + + *rdsp_version = chip->rmh.stat[1]; + spin_unlock_irqrestore(&chip->msg_lock, flags); + return ret; +} + +int lx_dsp_get_clock_frequency(struct lx6464es *chip, u32 *rfreq) +{ + u16 ret = 0; + unsigned long flags; + u32 freq_raw = 0; + u32 freq = 0; + u32 frequency = 0; + + spin_lock_irqsave(&chip->msg_lock, flags); + + lx_message_init(&chip->rmh, CMD_01_GET_SYS_CFG); + ret = lx_message_send_atomic(chip, &chip->rmh); + + if (ret == 0) { + freq_raw = chip->rmh.stat[0] >> FREQ_FIELD_OFFSET; + freq = freq_raw & XES_FREQ_COUNT8_MASK; + + if ((freq < XES_FREQ_COUNT8_48_MAX) || + (freq > XES_FREQ_COUNT8_44_MIN)) + frequency = 0; /* unknown */ + else if (freq >= XES_FREQ_COUNT8_44_MAX) + frequency = 44100; + else + frequency = 48000; + } + + spin_unlock_irqrestore(&chip->msg_lock, flags); + + *rfreq = frequency * chip->freq_ratio; + + return ret; +} + +int lx_dsp_get_mac(struct lx6464es *chip, u8 *mac_address) +{ + u32 macmsb, maclsb; + + macmsb = lx_dsp_reg_read(chip, eReg_ADMACESMSB) & 0x00FFFFFF; + maclsb = lx_dsp_reg_read(chip, eReg_ADMACESLSB) & 
0x00FFFFFF; + + /* todo: endianess handling */ + mac_address[5] = ((u8 *)(&maclsb))[0]; + mac_address[4] = ((u8 *)(&maclsb))[1]; + mac_address[3] = ((u8 *)(&maclsb))[2]; + mac_address[2] = ((u8 *)(&macmsb))[0]; + mac_address[1] = ((u8 *)(&macmsb))[1]; + mac_address[0] = ((u8 *)(&macmsb))[2]; + + return 0; +} + + +int lx_dsp_set_granularity(struct lx6464es *chip, u32 gran) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&chip->msg_lock, flags); + + lx_message_init(&chip->rmh, CMD_02_SET_GRANULARITY); + chip->rmh.cmd[0] |= gran; + + ret = lx_message_send_atomic(chip, &chip->rmh); + spin_unlock_irqrestore(&chip->msg_lock, flags); + return ret; +} + +int lx_dsp_read_async_events(struct lx6464es *chip, u32 *data) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&chip->msg_lock, flags); + + lx_message_init(&chip->rmh, CMD_04_GET_EVENT); + chip->rmh.stat_len = 9; /* we don't necessarily need the full length */ + + ret = lx_message_send_atomic(chip, &chip->rmh); + + if (!ret) + memcpy(data, chip->rmh.stat, chip->rmh.stat_len * sizeof(u32)); + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return ret; +} + +#define CSES_TIMEOUT 100 /* microseconds */ +#define CSES_CE 0x0001 +#define CSES_BROADCAST 0x0002 +#define CSES_UPDATE_LDSV 0x0004 + +int lx_dsp_es_check_pipeline(struct lx6464es *chip) +{ + int i; + + for (i = 0; i != CSES_TIMEOUT; ++i) { + /* + * le bit CSES_UPDATE_LDSV est à 1 dés que le macprog + * est pret. il re-passe à 0 lorsque le premier read a + * été fait. pour l'instant on retire le test car ce bit + * passe a 1 environ 200 à 400 ms aprés que le registre + * confES à été écrit (kick du xilinx ES). + * + * On ne teste que le bit CE. + * */ + + u32 cses = lx_dsp_reg_read(chip, eReg_CSES); + + if ((cses & CSES_CE) == 0) + return 0; + + udelay(1); + } + + return -ETIMEDOUT; +} + + +#define PIPE_INFO_TO_CMD(capture, pipe) \ + ((u32)((u32)(pipe) | ((capture) ? 
ID_IS_CAPTURE : 0L)) << ID_OFFSET) + + + +/* low-level pipe handling */ +int lx_pipe_allocate(struct lx6464es *chip, u32 pipe, int is_capture, + int channels) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_06_ALLOCATE_PIPE); + + chip->rmh.cmd[0] |= pipe_cmd; + chip->rmh.cmd[0] |= channels; + + err = lx_message_send_atomic(chip, &chip->rmh); + spin_unlock_irqrestore(&chip->msg_lock, flags); + + if (err != 0) + snd_printk(KERN_ERR "lx6464es: could not allocate pipe\n"); + + return err; +} + +int lx_pipe_release(struct lx6464es *chip, u32 pipe, int is_capture) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_07_RELEASE_PIPE); + + chip->rmh.cmd[0] |= pipe_cmd; + + err = lx_message_send_atomic(chip, &chip->rmh); + spin_unlock_irqrestore(&chip->msg_lock, flags); + + return err; +} + +int lx_buffer_ask(struct lx6464es *chip, u32 pipe, int is_capture, + u32 *r_needed, u32 *r_freed, u32 *size_array) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + +#ifdef CONFIG_SND_DEBUG + if (size_array) + memset(size_array, 0, sizeof(u32)*MAX_STREAM_BUFFER); +#endif + + *r_needed = 0; + *r_freed = 0; + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_08_ASK_BUFFERS); + + chip->rmh.cmd[0] |= pipe_cmd; + + err = lx_message_send_atomic(chip, &chip->rmh); + + if (!err) { + int i; + for (i = 0; i < MAX_STREAM_BUFFER; ++i) { + u32 stat = chip->rmh.stat[i]; + if (stat & (BF_EOB << BUFF_FLAGS_OFFSET)) { + /* finished */ + *r_freed += 1; + if (size_array) + size_array[i] = stat & MASK_DATA_SIZE; + } else if ((stat & (BF_VALID << BUFF_FLAGS_OFFSET)) + == 0) + /* free */ + *r_needed += 1; + } + +#if 0 + snd_printdd(LXP "CMD_08_ASK_BUFFERS: needed %d, freed %d\n", + *r_needed, *r_freed); + for (i = 0; i < MAX_STREAM_BUFFER; ++i) { + for (i = 0; i != chip->rmh.stat_len; ++i) + snd_printdd(" stat[%d]: %x, %x\n", i, + chip->rmh.stat[i], + chip->rmh.stat[i] & MASK_DATA_SIZE); + } +#endif + } + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + + +int lx_pipe_stop(struct lx6464es *chip, u32 pipe, int is_capture) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_09_STOP_PIPE); + + chip->rmh.cmd[0] |= pipe_cmd; + + err = lx_message_send_atomic(chip, &chip->rmh); + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + +static int lx_pipe_toggle_state(struct lx6464es *chip, u32 pipe, int is_capture) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_0B_TOGGLE_PIPE_STATE); + + chip->rmh.cmd[0] |= pipe_cmd; + + err = lx_message_send_atomic(chip, &chip->rmh); + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + + +int lx_pipe_start(struct lx6464es *chip, u32 pipe, int is_capture) +{ + int err; + + err = lx_pipe_wait_for_idle(chip, pipe, is_capture); + if (err < 0) + return err; + + err = lx_pipe_toggle_state(chip, pipe, is_capture); + + return err; +} + +int lx_pipe_pause(struct lx6464es *chip, u32 pipe, int is_capture) +{ + int err = 0; + + err = lx_pipe_wait_for_start(chip, pipe, is_capture); + if (err < 0) + return err; + + 
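/*
 * (CMD_0B is a toggle, not an absolute state change: having waited for
 * PSTATE_RUN above guarantees that the toggle below pauses the pipe
 * rather than restarting it. A hypothetical absolute variant would
 * read the state first, e.g.:
 *
 *	u16 state;
 *	if (!lx_pipe_state(chip, pipe, is_capture, &state) &&
 *	    state == PSTATE_RUN)
 *		err = lx_pipe_toggle_state(chip, pipe, is_capture);
 * )
 */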
err = lx_pipe_toggle_state(chip, pipe, is_capture); + + return err; +} + + +int lx_pipe_sample_count(struct lx6464es *chip, u32 pipe, int is_capture, + u64 *rsample_count) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_0A_GET_PIPE_SPL_COUNT); + + chip->rmh.cmd[0] |= pipe_cmd; + chip->rmh.stat_len = 2; /* need all words here! */ + + err = lx_message_send_atomic(chip, &chip->rmh); /* don't sleep! */ + + if (err != 0) + snd_printk(KERN_ERR + "lx6464es: could not query pipe's sample count\n"); + else { + *rsample_count = ((u64)(chip->rmh.stat[0] & MASK_SPL_COUNT_HI) + << 24) /* hi part */ + + chip->rmh.stat[1]; /* lo part */ + } + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + +int lx_pipe_state(struct lx6464es *chip, u32 pipe, int is_capture, u16 *rstate) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_0A_GET_PIPE_SPL_COUNT); + + chip->rmh.cmd[0] |= pipe_cmd; + + err = lx_message_send_atomic(chip, &chip->rmh); + + if (err != 0) + snd_printk(KERN_ERR "lx6464es: could not query pipe's state\n"); + else + *rstate = (chip->rmh.stat[0] >> PSTATE_OFFSET) & 0x0F; + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + +static int lx_pipe_wait_for_state(struct lx6464es *chip, u32 pipe, + int is_capture, u16 state) +{ + int i; + + /* max 2*PCMOnlyGranularity = 2*1024 at 44100 = < 50 ms: + * timeout 50 ms */ + for (i = 0; i != 50; ++i) { + u16 current_state; + int err = lx_pipe_state(chip, pipe, is_capture, ¤t_state); + + if (err < 0) + return err; + + if (current_state == state) + return 0; + + mdelay(1); + } + + return -ETIMEDOUT; +} + +int lx_pipe_wait_for_start(struct lx6464es *chip, u32 pipe, int is_capture) +{ + return lx_pipe_wait_for_state(chip, pipe, is_capture, PSTATE_RUN); +} + +int lx_pipe_wait_for_idle(struct lx6464es *chip, u32 pipe, int is_capture) +{ + return lx_pipe_wait_for_state(chip, pipe, is_capture, PSTATE_IDLE); +} + +/* low-level stream handling */ +int lx_stream_set_state(struct lx6464es *chip, u32 pipe, + int is_capture, enum stream_state_t state) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_13_SET_STREAM_STATE); + + chip->rmh.cmd[0] |= pipe_cmd; + chip->rmh.cmd[0] |= state; + + err = lx_message_send_atomic(chip, &chip->rmh); + spin_unlock_irqrestore(&chip->msg_lock, flags); + + return err; +} + +int lx_stream_set_format(struct lx6464es *chip, struct snd_pcm_runtime *runtime, + u32 pipe, int is_capture) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + u32 channels = runtime->channels; + + if (runtime->channels != channels) + snd_printk(KERN_ERR LXP "channel count mismatch: %d vs %d", + runtime->channels, channels); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_0C_DEF_STREAM); + + chip->rmh.cmd[0] |= pipe_cmd; + + if (runtime->sample_bits == 16) + /* 16 bit format */ + chip->rmh.cmd[0] |= (STREAM_FMT_16b << STREAM_FMT_OFFSET); + + if (snd_pcm_format_little_endian(runtime->format)) + /* little endian/intel format */ + chip->rmh.cmd[0] |= (STREAM_FMT_intel << STREAM_FMT_OFFSET); + + chip->rmh.cmd[0] |= channels-1; + + err = lx_message_send_atomic(chip, &chip->rmh); + 
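/*
 * (Worked example: a 16-bit little-endian stereo stream sets
 *	(STREAM_FMT_16b   << STREAM_FMT_OFFSET)	which is 0x02 << 10
 *	(STREAM_FMT_intel << STREAM_FMT_OFFSET)	which is 0x01 << 10
 *	channels - 1				which is 1
 * so the low bits of cmd[0] read 0x0c01, on top of the opcode and
 * pipe bits set above.)
 */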
spin_unlock_irqrestore(&chip->msg_lock, flags); + + return err; +} + +int lx_stream_state(struct lx6464es *chip, u32 pipe, int is_capture, + int *rstate) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_0E_GET_STREAM_SPL_COUNT); + + chip->rmh.cmd[0] |= pipe_cmd; + + err = lx_message_send_atomic(chip, &chip->rmh); + + *rstate = (chip->rmh.stat[0] & SF_START) ? START_STATE : PAUSE_STATE; + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + +int lx_stream_sample_position(struct lx6464es *chip, u32 pipe, int is_capture, + u64 *r_bytepos) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_0E_GET_STREAM_SPL_COUNT); + + chip->rmh.cmd[0] |= pipe_cmd; + + err = lx_message_send_atomic(chip, &chip->rmh); + + *r_bytepos = ((u64) (chip->rmh.stat[0] & MASK_SPL_COUNT_HI) + << 32) /* hi part */ + + chip->rmh.stat[1]; /* lo part */ + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + +/* low-level buffer handling */ +int lx_buffer_give(struct lx6464es *chip, u32 pipe, int is_capture, + u32 buffer_size, u32 buf_address_lo, u32 buf_address_hi, + u32 *r_buffer_index) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_0F_UPDATE_BUFFER); + + chip->rmh.cmd[0] |= pipe_cmd; + chip->rmh.cmd[0] |= BF_NOTIFY_EOB; /* request interrupt notification */ + + /* todo: pause request, circular buffer */ + + chip->rmh.cmd[1] = buffer_size & MASK_DATA_SIZE; + chip->rmh.cmd[2] = buf_address_lo; + + if (buf_address_hi) { + chip->rmh.cmd_len = 4; + chip->rmh.cmd[3] = buf_address_hi; + chip->rmh.cmd[0] |= BF_64BITS_ADR; + } + + err = lx_message_send_atomic(chip, &chip->rmh); + + if (err == 0) { + *r_buffer_index = chip->rmh.stat[0]; + goto done; + } + + if (err == EB_RBUFFERS_TABLE_OVERFLOW) + snd_printk(LXP "lx_buffer_give EB_RBUFFERS_TABLE_OVERFLOW\n"); + + if (err == EB_INVALID_STREAM) + snd_printk(LXP "lx_buffer_give EB_INVALID_STREAM\n"); + + if (err == EB_CMD_REFUSED) + snd_printk(LXP "lx_buffer_give EB_CMD_REFUSED\n"); + + done: + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + +int lx_buffer_free(struct lx6464es *chip, u32 pipe, int is_capture, + u32 *r_buffer_size) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_11_CANCEL_BUFFER); + + chip->rmh.cmd[0] |= pipe_cmd; + chip->rmh.cmd[0] |= MASK_BUFFER_ID; /* ask for the current buffer: the + * microblaze will seek for it */ + + err = lx_message_send_atomic(chip, &chip->rmh); + + if (err == 0) + *r_buffer_size = chip->rmh.stat[0] & MASK_DATA_SIZE; + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + +int lx_buffer_cancel(struct lx6464es *chip, u32 pipe, int is_capture, + u32 buffer_index) +{ + int err; + unsigned long flags; + + u32 pipe_cmd = PIPE_INFO_TO_CMD(is_capture, pipe); + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_11_CANCEL_BUFFER); + + chip->rmh.cmd[0] |= pipe_cmd; + chip->rmh.cmd[0] |= buffer_index; + + err = lx_message_send_atomic(chip, &chip->rmh); + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + + +/* low-level gain/peak handling + * + * \todo: can we 
unmute capture/playback channels independently? + * + * */ +int lx_level_unmute(struct lx6464es *chip, int is_capture, int unmute) +{ + int err; + unsigned long flags; + + /* bit set to 1: channel muted */ + u64 mute_mask = unmute ? 0 : 0xFFFFFFFFFFFFFFFFLLU; + + spin_lock_irqsave(&chip->msg_lock, flags); + lx_message_init(&chip->rmh, CMD_0D_SET_MUTE); + + chip->rmh.cmd[0] |= PIPE_INFO_TO_CMD(is_capture, 0); + + chip->rmh.cmd[1] = (u32)(mute_mask >> (u64)32); /* hi part */ + chip->rmh.cmd[2] = (u32)(mute_mask & (u64)0xFFFFFFFF); /* lo part */ + + snd_printk("mute %x %x %x\n", chip->rmh.cmd[0], chip->rmh.cmd[1], + chip->rmh.cmd[2]); + + err = lx_message_send_atomic(chip, &chip->rmh); + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + +static u32 peak_map[] = { + 0x00000109, /* -90.308dB */ + 0x0000083B, /* -72.247dB */ + 0x000020C4, /* -60.205dB */ + 0x00008273, /* -48.030dB */ + 0x00020756, /* -36.005dB */ + 0x00040C37, /* -30.001dB */ + 0x00081385, /* -24.002dB */ + 0x00101D3F, /* -18.000dB */ + 0x0016C310, /* -15.000dB */ + 0x002026F2, /* -12.001dB */ + 0x002D6A86, /* -9.000dB */ + 0x004026E6, /* -6.004dB */ + 0x005A9DF6, /* -3.000dB */ + 0x0065AC8B, /* -2.000dB */ + 0x00721481, /* -1.000dB */ + 0x007FFFFF, /* FS */ +}; + +int lx_level_peaks(struct lx6464es *chip, int is_capture, int channels, + u32 *r_levels) +{ + int err = 0; + unsigned long flags; + int i; + spin_lock_irqsave(&chip->msg_lock, flags); + + for (i = 0; i < channels; i += 4) { + u32 s0, s1, s2, s3; + + lx_message_init(&chip->rmh, CMD_12_GET_PEAK); + chip->rmh.cmd[0] |= PIPE_INFO_TO_CMD(is_capture, i); + + err = lx_message_send_atomic(chip, &chip->rmh); + + if (err == 0) { + s0 = peak_map[chip->rmh.stat[0] & 0x0F]; + s1 = peak_map[(chip->rmh.stat[0] >> 4) & 0xf]; + s2 = peak_map[(chip->rmh.stat[0] >> 8) & 0xf]; + s3 = peak_map[(chip->rmh.stat[0] >> 12) & 0xf]; + } else + s0 = s1 = s2 = s3 = 0; + + r_levels[0] = s0; + r_levels[1] = s1; + r_levels[2] = s2; + r_levels[3] = s3; + + r_levels += 4; + } + + spin_unlock_irqrestore(&chip->msg_lock, flags); + return err; +} + +/* interrupt handling */ +#define PCX_IRQ_NONE 0 +#define IRQCS_ACTIVE_PCIDB 0x00002000L /* Bit nø 13 */ +#define IRQCS_ENABLE_PCIIRQ 0x00000100L /* Bit nø 08 */ +#define IRQCS_ENABLE_PCIDB 0x00000200L /* Bit nø 09 */ + +static u32 lx_interrupt_test_ack(struct lx6464es *chip) +{ + u32 irqcs = lx_plx_reg_read(chip, ePLX_IRQCS); + + /* Test if PCI Doorbell interrupt is active */ + if (irqcs & IRQCS_ACTIVE_PCIDB) { + u32 temp; + irqcs = PCX_IRQ_NONE; + + while ((temp = lx_plx_reg_read(chip, ePLX_L2PCIDB))) { + /* RAZ interrupt */ + irqcs |= temp; + lx_plx_reg_write(chip, ePLX_L2PCIDB, temp); + } + + return irqcs; + } + return PCX_IRQ_NONE; +} + +static int lx_interrupt_ack(struct lx6464es *chip, u32 *r_irqsrc, + int *r_async_pending, int *r_async_escmd) +{ + u32 irq_async; + u32 irqsrc = lx_interrupt_test_ack(chip); + + if (irqsrc == PCX_IRQ_NONE) + return 0; + + *r_irqsrc = irqsrc; + + irq_async = irqsrc & MASK_SYS_ASYNC_EVENTS; /* + EtherSound response + * (set by xilinx) + EOB */ + + if (irq_async & MASK_SYS_STATUS_ESA) { + irq_async &= ~MASK_SYS_STATUS_ESA; + *r_async_escmd = 1; + } + + if (irqsrc & MASK_SYS_STATUS_CMD_DONE) + /* xilinx command notification */ + atomic_set(&chip->send_message_locked, 0); + + if (irq_async) { + /* snd_printd("interrupt: async event pending\n"); */ + *r_async_pending = 1; + } + + return 1; +} + +static int lx_interrupt_handle_async_events(struct lx6464es *chip, u32 irqsrc, + int *r_freq_changed, + u64 
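/*
 * (English rendering of the French comment below, for reference:
 * reading could be optimized to skip empty events; the response words
 * arrive in this order: stat[0] general status, stat[1]/stat[2] OUT
 * end-of-buffer high/low words, stat[3]/stat[4] IN end-of-buffer
 * high/low words, stat[5]/stat[6] underrun high/low words,
 * stat[7]/stat[8] overrun high/low words.)
 */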
*r_notified_in_pipe_mask, + u64 *r_notified_out_pipe_mask) +{ + int err; + u32 stat[9]; /* answer from CMD_04_GET_EVENT */ + + /* On peut optimiser pour ne pas lire les evenements vides + * les mots de réponse sont dans l'ordre suivant : + * Stat[0] mot de status général + * Stat[1] fin de buffer OUT pF + * Stat[2] fin de buffer OUT pf + * Stat[3] fin de buffer IN pF + * Stat[4] fin de buffer IN pf + * Stat[5] underrun poid fort + * Stat[6] underrun poid faible + * Stat[7] overrun poid fort + * Stat[8] overrun poid faible + * */ + + u64 orun_mask; + u64 urun_mask; +#if 0 + int has_underrun = (irqsrc & MASK_SYS_STATUS_URUN) ? 1 : 0; + int has_overrun = (irqsrc & MASK_SYS_STATUS_ORUN) ? 1 : 0; +#endif + int eb_pending_out = (irqsrc & MASK_SYS_STATUS_EOBO) ? 1 : 0; + int eb_pending_in = (irqsrc & MASK_SYS_STATUS_EOBI) ? 1 : 0; + + *r_freq_changed = (irqsrc & MASK_SYS_STATUS_FREQ) ? 1 : 0; + + err = lx_dsp_read_async_events(chip, stat); + if (err < 0) + return err; + + if (eb_pending_in) { + *r_notified_in_pipe_mask = ((u64)stat[3] << 32) + + stat[4]; + snd_printdd(LXP "interrupt: EOBI pending %llx\n", + *r_notified_in_pipe_mask); + } + if (eb_pending_out) { + *r_notified_out_pipe_mask = ((u64)stat[1] << 32) + + stat[2]; + snd_printdd(LXP "interrupt: EOBO pending %llx\n", + *r_notified_out_pipe_mask); + } + + orun_mask = ((u64)stat[7] << 32) + stat[8]; + urun_mask = ((u64)stat[5] << 32) + stat[6]; + + /* todo: handle xrun notification */ + + return err; +} + +static int lx_interrupt_request_new_buffer(struct lx6464es *chip, + struct lx_stream *lx_stream) +{ + struct snd_pcm_substream *substream = lx_stream->stream; + int is_capture = lx_stream->is_capture; + int err; + unsigned long flags; + + const u32 channels = substream->runtime->channels; + const u32 bytes_per_frame = channels * 3; + const u32 period_size = substream->runtime->period_size; + const u32 period_bytes = period_size * bytes_per_frame; + const u32 pos = lx_stream->frame_pos; + const u32 next_pos = ((pos+1) == substream->runtime->periods) ? 
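/* i.e. next_pos = (pos + 1) % runtime->periods; the ring wraps at the period count */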
+ 0 : pos + 1; + + dma_addr_t buf = substream->dma_buffer.addr + pos * period_bytes; + u32 buf_hi = 0; + u32 buf_lo = 0; + u32 buffer_index = 0; + + u32 needed, freed; + u32 size_array[MAX_STREAM_BUFFER]; + + snd_printdd("->lx_interrupt_request_new_buffer\n"); + + spin_lock_irqsave(&chip->lock, flags); + + err = lx_buffer_ask(chip, 0, is_capture, &needed, &freed, size_array); + snd_printdd(LXP "interrupt: needed %d, freed %d\n", needed, freed); + + unpack_pointer(buf, &buf_lo, &buf_hi); + err = lx_buffer_give(chip, 0, is_capture, period_bytes, buf_lo, buf_hi, + &buffer_index); + snd_printdd(LXP "interrupt: gave buffer index %x on %p (%d bytes)\n", + buffer_index, (void *)buf, period_bytes); + + lx_stream->frame_pos = next_pos; + spin_unlock_irqrestore(&chip->lock, flags); + + return err; +} + +void lx_tasklet_playback(unsigned long data) +{ + struct lx6464es *chip = (struct lx6464es *)data; + struct lx_stream *lx_stream = &chip->playback_stream; + int err; + + snd_printdd("->lx_tasklet_playback\n"); + + err = lx_interrupt_request_new_buffer(chip, lx_stream); + if (err < 0) + snd_printk(KERN_ERR LXP + "cannot request new buffer for playback\n"); + + snd_pcm_period_elapsed(lx_stream->stream); +} + +void lx_tasklet_capture(unsigned long data) +{ + struct lx6464es *chip = (struct lx6464es *)data; + struct lx_stream *lx_stream = &chip->capture_stream; + int err; + + snd_printdd("->lx_tasklet_capture\n"); + err = lx_interrupt_request_new_buffer(chip, lx_stream); + if (err < 0) + snd_printk(KERN_ERR LXP + "cannot request new buffer for capture\n"); + + snd_pcm_period_elapsed(lx_stream->stream); +} + + + +static int lx_interrupt_handle_audio_transfer(struct lx6464es *chip, + u64 notified_in_pipe_mask, + u64 notified_out_pipe_mask) +{ + int err = 0; + + if (notified_in_pipe_mask) { + snd_printdd(LXP "requesting audio transfer for capture\n"); + tasklet_hi_schedule(&chip->tasklet_capture); + } + + if (notified_out_pipe_mask) { + snd_printdd(LXP "requesting audio transfer for playback\n"); + tasklet_hi_schedule(&chip->tasklet_playback); + } + + return err; +} + + +irqreturn_t lx_interrupt(int irq, void *dev_id) +{ + struct lx6464es *chip = dev_id; + int async_pending, async_escmd; + u32 irqsrc; + + spin_lock(&chip->lock); + + snd_printdd("**************************************************\n"); + + if (!lx_interrupt_ack(chip, &irqsrc, &async_pending, &async_escmd)) { + spin_unlock(&chip->lock); + snd_printdd("IRQ_NONE\n"); + return IRQ_NONE; /* this device did not cause the interrupt */ + } + + if (irqsrc & MASK_SYS_STATUS_CMD_DONE) + goto exit; + +#if 0 + if (irqsrc & MASK_SYS_STATUS_EOBI) + snd_printdd(LXP "interrupt: EOBI\n"); + + if (irqsrc & MASK_SYS_STATUS_EOBO) + snd_printdd(LXP "interrupt: EOBO\n"); + + if (irqsrc & MASK_SYS_STATUS_URUN) + snd_printdd(LXP "interrupt: URUN\n"); + + if (irqsrc & MASK_SYS_STATUS_ORUN) + snd_printdd(LXP "interrupt: ORUN\n"); +#endif + + if (async_pending) { + u64 notified_in_pipe_mask = 0; + u64 notified_out_pipe_mask = 0; + int freq_changed; + int err; + + /* handle async events */ + err = lx_interrupt_handle_async_events(chip, irqsrc, + &freq_changed, + ¬ified_in_pipe_mask, + ¬ified_out_pipe_mask); + if (err) + snd_printk(KERN_ERR LXP + "error handling async events\n"); + + err = lx_interrupt_handle_audio_transfer(chip, + notified_in_pipe_mask, + notified_out_pipe_mask + ); + if (err) + snd_printk(KERN_ERR LXP + "error during audio transfer\n"); + } + + if (async_escmd) { +#if 0 + /* backdoor for ethersound commands + * + * for now, we do not need this + * + * 
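 * (Were it needed, the handler would presumably fetch the EtherSound
 * command words at this point, e.g. via lx_plx_mbox_read(chip, n),
 * while still holding chip->lock.)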
*/ + + snd_printdd("lx6464es: interrupt requests escmd handling\n"); +#endif + } + +exit: + spin_unlock(&chip->lock); + return IRQ_HANDLED; /* this device caused the interrupt */ +} + + +static void lx_irq_set(struct lx6464es *chip, int enable) +{ + u32 reg = lx_plx_reg_read(chip, ePLX_IRQCS); + + /* enable/disable interrupts + * + * Set the Doorbell and PCI interrupt enable bits + * + * */ + if (enable) + reg |= (IRQCS_ENABLE_PCIIRQ | IRQCS_ENABLE_PCIDB); + else + reg &= ~(IRQCS_ENABLE_PCIIRQ | IRQCS_ENABLE_PCIDB); + lx_plx_reg_write(chip, ePLX_IRQCS, reg); +} + +void lx_irq_enable(struct lx6464es *chip) +{ + snd_printdd("->lx_irq_enable\n"); + lx_irq_set(chip, 1); +} + +void lx_irq_disable(struct lx6464es *chip) +{ + snd_printdd("->lx_irq_disable\n"); + lx_irq_set(chip, 0); +} diff --git a/sound/pci/lx6464es/lx_core.h b/sound/pci/lx6464es/lx_core.h new file mode 100644 index 00000000000..6bd9cbbbc68 --- /dev/null +++ b/sound/pci/lx6464es/lx_core.h @@ -0,0 +1,242 @@ +/* -*- linux-c -*- * + * + * ALSA driver for the digigram lx6464es interface + * low-level interface + * + * Copyright (c) 2009 Tim Blechmann + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + */ + +#ifndef LX_CORE_H +#define LX_CORE_H + +#include + +#include "lx_defs.h" + +#define REG_CRM_NUMBER 12 + +struct lx6464es; + +/* low-level register access */ + +/* dsp register access */ +enum { + eReg_BASE, + eReg_CSM, + eReg_CRM1, + eReg_CRM2, + eReg_CRM3, + eReg_CRM4, + eReg_CRM5, + eReg_CRM6, + eReg_CRM7, + eReg_CRM8, + eReg_CRM9, + eReg_CRM10, + eReg_CRM11, + eReg_CRM12, + + eReg_ICR, + eReg_CVR, + eReg_ISR, + eReg_RXHTXH, + eReg_RXMTXM, + eReg_RHLTXL, + eReg_RESETDSP, + + eReg_CSUF, + eReg_CSES, + eReg_CRESMSB, + eReg_CRESLSB, + eReg_ADMACESMSB, + eReg_ADMACESLSB, + eReg_CONFES, + + eMaxPortLx +}; + +unsigned long lx_dsp_reg_read(struct lx6464es *chip, int port); +void lx_dsp_reg_readbuf(struct lx6464es *chip, int port, u32 *data, u32 len); +void lx_dsp_reg_write(struct lx6464es *chip, int port, unsigned data); +void lx_dsp_reg_writebuf(struct lx6464es *chip, int port, const u32 *data, + u32 len); + +/* plx register access */ +enum { + ePLX_PCICR, + + ePLX_MBOX0, + ePLX_MBOX1, + ePLX_MBOX2, + ePLX_MBOX3, + ePLX_MBOX4, + ePLX_MBOX5, + ePLX_MBOX6, + ePLX_MBOX7, + + ePLX_L2PCIDB, + ePLX_IRQCS, + ePLX_CHIPSC, + + eMaxPort +}; + +unsigned long lx_plx_reg_read(struct lx6464es *chip, int port); +void lx_plx_reg_write(struct lx6464es *chip, int port, u32 data); + +/* rhm */ +struct lx_rmh { + u16 cmd_len; /* length of the command to send (WORDs) */ + u16 stat_len; /* length of the status received (WORDs) */ + u16 dsp_stat; /* status type, RMP_SSIZE_XXX */ + u16 cmd_idx; /* index of the command */ + u32 cmd[REG_CRM_NUMBER]; + u32 stat[REG_CRM_NUMBER]; +}; + + +/* low-level dsp access */ +int __devinit lx_dsp_get_version(struct lx6464es *chip, u32 *rdsp_version); +int lx_dsp_get_clock_frequency(struct lx6464es *chip, u32 *rfreq); +int lx_dsp_set_granularity(struct lx6464es *chip, u32 gran); +int lx_dsp_read_async_events(struct lx6464es *chip, u32 *data); +int lx_dsp_get_mac(struct lx6464es *chip, u8 *mac_address); + + +/* low-level pipe handling */ +int lx_pipe_allocate(struct lx6464es *chip, u32 pipe, int is_capture, + int channels); +int lx_pipe_release(struct lx6464es *chip, u32 pipe, int is_capture); +int lx_pipe_sample_count(struct lx6464es *chip, u32 pipe, int is_capture, + u64 *rsample_count); +int lx_pipe_state(struct lx6464es *chip, u32 pipe, int is_capture, u16 *rstate); +int lx_pipe_stop(struct lx6464es *chip, u32 pipe, int is_capture); +int lx_pipe_start(struct lx6464es *chip, u32 pipe, int is_capture); +int lx_pipe_pause(struct lx6464es *chip, u32 pipe, int is_capture); + +int lx_pipe_wait_for_start(struct lx6464es *chip, u32 pipe, int is_capture); +int lx_pipe_wait_for_idle(struct lx6464es *chip, u32 pipe, int is_capture); + +/* low-level stream handling */ +int lx_stream_set_format(struct lx6464es *chip, struct snd_pcm_runtime *runtime, + u32 pipe, int is_capture); +int lx_stream_state(struct lx6464es *chip, u32 pipe, int is_capture, + int *rstate); +int lx_stream_sample_position(struct lx6464es *chip, u32 pipe, int is_capture, + u64 *r_bytepos); + +int lx_stream_set_state(struct lx6464es *chip, u32 pipe, + int is_capture, enum stream_state_t state); + +static inline int lx_stream_start(struct lx6464es *chip, u32 pipe, + int is_capture) +{ + snd_printdd("->lx_stream_start\n"); + return lx_stream_set_state(chip, pipe, is_capture, SSTATE_RUN); +} + +static inline int lx_stream_pause(struct lx6464es *chip, u32 pipe, + int is_capture) +{ + snd_printdd("->lx_stream_pause\n"); + return lx_stream_set_state(chip, pipe, is_capture, SSTATE_PAUSE); +} + +static inline int 
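/*
 * (Illustration: a PCM .trigger callback would typically map
 * SNDRV_PCM_TRIGGER_START, _STOP and _PAUSE_PUSH onto
 * lx_stream_start(), lx_stream_stop() and lx_stream_pause() for the
 * substream's pipe; the actual mapping presumably lives in the
 * trigger handling of lx6464es.c, not in this hunk.)
 */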
lx_stream_stop(struct lx6464es *chip, u32 pipe, + int is_capture) +{ + snd_printdd("->lx_stream_stop\n"); + return lx_stream_set_state(chip, pipe, is_capture, SSTATE_STOP); +} + +/* low-level buffer handling */ +int lx_buffer_ask(struct lx6464es *chip, u32 pipe, int is_capture, + u32 *r_needed, u32 *r_freed, u32 *size_array); +int lx_buffer_give(struct lx6464es *chip, u32 pipe, int is_capture, + u32 buffer_size, u32 buf_address_lo, u32 buf_address_hi, + u32 *r_buffer_index); +int lx_buffer_free(struct lx6464es *chip, u32 pipe, int is_capture, + u32 *r_buffer_size); +int lx_buffer_cancel(struct lx6464es *chip, u32 pipe, int is_capture, + u32 buffer_index); + +/* low-level gain/peak handling */ +int lx_level_unmute(struct lx6464es *chip, int is_capture, int unmute); +int lx_level_peaks(struct lx6464es *chip, int is_capture, int channels, + u32 *r_levels); + + +/* interrupt handling */ +irqreturn_t lx_interrupt(int irq, void *dev_id); +void lx_irq_enable(struct lx6464es *chip); +void lx_irq_disable(struct lx6464es *chip); + +void lx_tasklet_capture(unsigned long data); +void lx_tasklet_playback(unsigned long data); + + +/* Stream Format Header Defines (for LIN and IEEE754) */ +#define HEADER_FMT_BASE HEADER_FMT_BASE_LIN +#define HEADER_FMT_BASE_LIN 0xFED00000 +#define HEADER_FMT_BASE_FLOAT 0xFAD00000 +#define HEADER_FMT_MONO 0x00000080 /* bit 23 in header_lo. WARNING: old + * bit 22 is ignored in float + * format */ +#define HEADER_FMT_INTEL 0x00008000 +#define HEADER_FMT_16BITS 0x00002000 +#define HEADER_FMT_24BITS 0x00004000 +#define HEADER_FMT_UPTO11 0x00000200 /* frequency is less or equ. to 11k. + * */ +#define HEADER_FMT_UPTO32 0x00000100 /* frequency is over 11k and less + * then 32k.*/ + + +#define BIT_FMP_HEADER 23 +#define BIT_FMP_SD 22 +#define BIT_FMP_MULTICHANNEL 19 + +#define START_STATE 1 +#define PAUSE_STATE 0 + + + + + +/* from PcxAll_e.h */ +/* Start/Pause condition for pipes (PCXStartPipe, PCXPausePipe) */ +#define START_PAUSE_IMMEDIATE 0 +#define START_PAUSE_ON_SYNCHRO 1 +#define START_PAUSE_ON_TIME_CODE 2 + + +/* Pipe / Stream state */ +#define START_STATE 1 +#define PAUSE_STATE 0 + +static inline void unpack_pointer(dma_addr_t ptr, u32 *r_low, u32 *r_high) +{ + *r_low = (u32)(ptr & 0xffffffff); +#if BITS_PER_LONG == 32 + *r_high = 0; +#else + *r_high = (u32)((u64)ptr>>32); +#endif +} + +#endif /* LX_CORE_H */ diff --git a/sound/pci/lx6464es/lx_defs.h b/sound/pci/lx6464es/lx_defs.h new file mode 100644 index 00000000000..49d36bdd512 --- /dev/null +++ b/sound/pci/lx6464es/lx_defs.h @@ -0,0 +1,376 @@ +/* -*- linux-c -*- * + * + * ALSA driver for the digigram lx6464es interface + * adapted upstream headers + * + * Copyright (c) 2009 Tim Blechmann + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + */ + +#ifndef LX_DEFS_H +#define LX_DEFS_H + +/* code adapted from ethersound.h */ +#define XES_FREQ_COUNT8_MASK 0x00001FFF /* compteur 25MHz entre 8 ech. */ +#define XES_FREQ_COUNT8_44_MIN 0x00001288 /* 25M / + * [ 44k - ( 44.1k + 48k ) / 2 ] + * * 8 */ +#define XES_FREQ_COUNT8_44_MAX 0x000010F0 /* 25M / [ ( 44.1k + 48k ) / 2 ] + * * 8 */ +#define XES_FREQ_COUNT8_48_MAX 0x00000F08 /* 25M / + * [ 48k + ( 44.1k + 48k ) / 2 ] + * * 8 */ + +/* code adapted from LXES_registers.h */ + +#define IOCR_OUTPUTS_OFFSET 0 /* (rw) offset for the number of OUTs in the + * ConfES register. */ +#define IOCR_INPUTS_OFFSET 8 /* (rw) offset for the number of INs in the + * ConfES register. */ +#define FREQ_RATIO_OFFSET 19 /* (rw) offset for frequency ratio in the + * ConfES register. */ +#define FREQ_RATIO_SINGLE_MODE 0x01 /* value for single mode frequency ratio: + * sample rate = frequency rate. */ + +#define CONFES_READ_PART_MASK 0x00070000 +#define CONFES_WRITE_PART_MASK 0x00F80000 + +/* code adapted from if_drv_mb.h */ + +#define MASK_SYS_STATUS_ERROR (1L << 31) /* events that lead to a PCI irq if + * not yet pending */ +#define MASK_SYS_STATUS_URUN (1L << 30) +#define MASK_SYS_STATUS_ORUN (1L << 29) +#define MASK_SYS_STATUS_EOBO (1L << 28) +#define MASK_SYS_STATUS_EOBI (1L << 27) +#define MASK_SYS_STATUS_FREQ (1L << 26) +#define MASK_SYS_STATUS_ESA (1L << 25) /* reserved, this is set by the + * XES */ +#define MASK_SYS_STATUS_TIMER (1L << 24) + +#define MASK_SYS_ASYNC_EVENTS (MASK_SYS_STATUS_ERROR | \ + MASK_SYS_STATUS_URUN | \ + MASK_SYS_STATUS_ORUN | \ + MASK_SYS_STATUS_EOBO | \ + MASK_SYS_STATUS_EOBI | \ + MASK_SYS_STATUS_FREQ | \ + MASK_SYS_STATUS_ESA) + +#define MASK_SYS_PCI_EVENTS (MASK_SYS_ASYNC_EVENTS | \ + MASK_SYS_STATUS_TIMER) + +#define MASK_SYS_TIMER_COUNT 0x0000FFFF + +#define MASK_SYS_STATUS_EOT_PLX (1L << 22) /* event that remains + * internal: reserved fo end + * of plx dma */ +#define MASK_SYS_STATUS_XES (1L << 21) /* event that remains + * internal: pending XES + * IRQ */ +#define MASK_SYS_STATUS_CMD_DONE (1L << 20) /* alternate command + * management: notify driver + * instead of polling */ + + +#define MAX_STREAM_BUFFER 5 /* max amount of stream buffers. */ + +#define MICROBLAZE_IBL_MIN 32 +#define MICROBLAZE_IBL_DEFAULT 128 +#define MICROBLAZE_IBL_MAX 512 +/* #define MASK_GRANULARITY (2*MICROBLAZE_IBL_MAX-1) */ + + + +/* command opcodes, see reference for details */ + +/* + the capture bit position in the object_id field in driver commands + depends upon the number of managed channels. For now, 64 IN + 64 OUT are + supported. HOwever, the communication protocol forsees 1024 channels, hence + bit 10 indicates a capture (input) object). 
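   Example: capture pipe 3 is encoded as
	((3 | ID_IS_CAPTURE) << ID_OFFSET) == ((3 | (1 << 10)) << 13),
   which is exactly what PIPE_INFO_TO_CMD() in lx_core.c produces.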
+*/ +#define ID_IS_CAPTURE (1L << 10) +#define ID_OFFSET 13 /* object ID is at the 13th bit in the + * 1st command word.*/ +#define ID_CH_MASK 0x3F +#define OPCODE_OFFSET 24 /* offset of the command opcode in the first + * command word.*/ + +enum cmd_mb_opcodes { + CMD_00_INFO_DEBUG = 0x00, + CMD_01_GET_SYS_CFG = 0x01, + CMD_02_SET_GRANULARITY = 0x02, + CMD_03_SET_TIMER_IRQ = 0x03, + CMD_04_GET_EVENT = 0x04, + CMD_05_GET_PIPES = 0x05, + + CMD_06_ALLOCATE_PIPE = 0x06, + CMD_07_RELEASE_PIPE = 0x07, + CMD_08_ASK_BUFFERS = 0x08, + CMD_09_STOP_PIPE = 0x09, + CMD_0A_GET_PIPE_SPL_COUNT = 0x0a, + CMD_0B_TOGGLE_PIPE_STATE = 0x0b, + + CMD_0C_DEF_STREAM = 0x0c, + CMD_0D_SET_MUTE = 0x0d, + CMD_0E_GET_STREAM_SPL_COUNT = 0x0e, + CMD_0F_UPDATE_BUFFER = 0x0f, + CMD_10_GET_BUFFER = 0x10, + CMD_11_CANCEL_BUFFER = 0x11, + CMD_12_GET_PEAK = 0x12, + CMD_13_SET_STREAM_STATE = 0x13, + CMD_14_INVALID = 0x14, +}; + +/* pipe states */ +enum pipe_state_t { + PSTATE_IDLE = 0, /* the pipe is not processed in the XES_IRQ + * (free or stopped, or paused). */ + PSTATE_RUN = 1, /* sustained play/record state. */ + PSTATE_PURGE = 2, /* the ES channels are now off, render pipes do + * not DMA, record pipe do a last DMA. */ + PSTATE_ACQUIRE = 3, /* the ES channels are now on, render pipes do + * not yet increase their sample count, record + * pipes do not DMA. */ + PSTATE_CLOSING = 4, /* the pipe is releasing, and may not yet + * receive an "alloc" command. */ +}; + +/* stream states */ +enum stream_state_t { + SSTATE_STOP = 0x00, /* setting to stop resets the stream spl + * count.*/ + SSTATE_RUN = (0x01 << 0), /* start DMA and spl count handling. */ + SSTATE_PAUSE = (0x01 << 1), /* pause DMA and spl count handling. */ +}; + +/* buffer flags */ +enum buffer_flags { + BF_VALID = 0x80, /* set if the buffer is valid, clear if free.*/ + BF_CURRENT = 0x40, /* set if this is the current buffer (there is + * always a current buffer).*/ + BF_NOTIFY_EOB = 0x20, /* set if this buffer must cause a PCI event + * when finished.*/ + BF_CIRCULAR = 0x10, /* set if buffer[1] must be copied to buffer[0] + * by the end of this buffer.*/ + BF_64BITS_ADR = 0x08, /* set if the hi part of the address is valid.*/ + BF_xx = 0x04, /* future extension.*/ + BF_EOB = 0x02, /* set if finished, but not yet free.*/ + BF_PAUSE = 0x01, /* pause stream at buffer end.*/ + BF_ZERO = 0x00, /* no flags (init).*/ +}; + +/** +* Stream Flags definitions +*/ +enum stream_flags { + SF_ZERO = 0x00000000, /* no flags (stream invalid). */ + SF_VALID = 0x10000000, /* the stream has a valid DMA_conf + * info (setstreamformat). */ + SF_XRUN = 0x20000000, /* the stream is un x-run state. */ + SF_START = 0x40000000, /* the DMA is running.*/ + SF_ASIO = 0x80000000, /* ASIO.*/ +}; + + +#define MASK_SPL_COUNT_HI 0x00FFFFFF /* 4 MSBits are status bits */ +#define PSTATE_OFFSET 28 /* 4 MSBits are status bits */ + + +#define MASK_STREAM_HAS_MAPPING (1L << 12) +#define MASK_STREAM_IS_ASIO (1L << 9) +#define STREAM_FMT_OFFSET 10 /* the stream fmt bits start at the 10th + * bit in the command word. */ + +#define STREAM_FMT_16b 0x02 +#define STREAM_FMT_intel 0x01 + +#define FREQ_FIELD_OFFSET 15 /* offset of the freq field in the response + * word */ + +#define BUFF_FLAGS_OFFSET 24 /* offset of the buffer flags in the + * response word. */ +#define MASK_DATA_SIZE 0x00FFFFFF /* this must match the field size of + * datasize in the buffer_t structure. */ + +#define MASK_BUFFER_ID 0xFF /* the cancel command awaits a buffer ID, + * may be 0xFF for "current". 
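 * (lx_buffer_free() relies on exactly this: it ORs MASK_BUFFER_ID
 * into the command word so that the microblaze cancels whichever
 * buffer is current.)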
*/ + + +/* code adapted from PcxErr_e.h */ + +/* Bits masks */ + +#define ERROR_MASK 0x8000 + +#define SOURCE_MASK 0x7800 + +#define E_SOURCE_BOARD 0x4000 /* 8 >> 1 */ +#define E_SOURCE_DRV 0x2000 /* 4 >> 1 */ +#define E_SOURCE_API 0x1000 /* 2 >> 1 */ +/* Error tools */ +#define E_SOURCE_TOOLS 0x0800 /* 1 >> 1 */ +/* Error pcxaudio */ +#define E_SOURCE_AUDIO 0x1800 /* 3 >> 1 */ +/* Error virtual pcx */ +#define E_SOURCE_VPCX 0x2800 /* 5 >> 1 */ +/* Error dispatcher */ +#define E_SOURCE_DISPATCHER 0x3000 /* 6 >> 1 */ +/* Error from CobraNet firmware */ +#define E_SOURCE_COBRANET 0x3800 /* 7 >> 1 */ + +#define E_SOURCE_USER 0x7800 + +#define CLASS_MASK 0x0700 + +#define CODE_MASK 0x00FF + +/* Bits values */ + +/* Values for the error/warning bit */ +#define ERROR_VALUE 0x8000 +#define WARNING_VALUE 0x0000 + +/* Class values */ +#define E_CLASS_GENERAL 0x0000 +#define E_CLASS_INVALID_CMD 0x0100 +#define E_CLASS_INVALID_STD_OBJECT 0x0200 +#define E_CLASS_RSRC_IMPOSSIBLE 0x0300 +#define E_CLASS_WRONG_CONTEXT 0x0400 +#define E_CLASS_BAD_SPECIFIC_PARAMETER 0x0500 +#define E_CLASS_REAL_TIME_ERROR 0x0600 +#define E_CLASS_DIRECTSHOW 0x0700 +#define E_CLASS_FREE 0x0700 + + +/* Complete DRV error code for the general class */ +#define ED_GN (ERROR_VALUE | E_SOURCE_DRV | E_CLASS_GENERAL) +#define ED_CONCURRENCY (ED_GN | 0x01) +#define ED_DSP_CRASHED (ED_GN | 0x02) +#define ED_UNKNOWN_BOARD (ED_GN | 0x03) +#define ED_NOT_INSTALLED (ED_GN | 0x04) +#define ED_CANNOT_OPEN_SVC_MANAGER (ED_GN | 0x05) +#define ED_CANNOT_READ_REGISTRY (ED_GN | 0x06) +#define ED_DSP_VERSION_MISMATCH (ED_GN | 0x07) +#define ED_UNAVAILABLE_FEATURE (ED_GN | 0x08) +#define ED_CANCELLED (ED_GN | 0x09) +#define ED_NO_RESPONSE_AT_IRQA (ED_GN | 0x10) +#define ED_INVALID_ADDRESS (ED_GN | 0x11) +#define ED_DSP_CORRUPTED (ED_GN | 0x12) +#define ED_PENDING_OPERATION (ED_GN | 0x13) +#define ED_NET_ALLOCATE_MEMORY_IMPOSSIBLE (ED_GN | 0x14) +#define ED_NET_REGISTER_ERROR (ED_GN | 0x15) +#define ED_NET_THREAD_ERROR (ED_GN | 0x16) +#define ED_NET_OPEN_ERROR (ED_GN | 0x17) +#define ED_NET_CLOSE_ERROR (ED_GN | 0x18) +#define ED_NET_NO_MORE_PACKET (ED_GN | 0x19) +#define ED_NET_NO_MORE_BUFFER (ED_GN | 0x1A) +#define ED_NET_SEND_ERROR (ED_GN | 0x1B) +#define ED_NET_RECEIVE_ERROR (ED_GN | 0x1C) +#define ED_NET_WRONG_MSG_SIZE (ED_GN | 0x1D) +#define ED_NET_WAIT_ERROR (ED_GN | 0x1E) +#define ED_NET_EEPROM_ERROR (ED_GN | 0x1F) +#define ED_INVALID_RS232_COM_NUMBER (ED_GN | 0x20) +#define ED_INVALID_RS232_INIT (ED_GN | 0x21) +#define ED_FILE_ERROR (ED_GN | 0x22) +#define ED_INVALID_GPIO_CMD (ED_GN | 0x23) +#define ED_RS232_ALREADY_OPENED (ED_GN | 0x24) +#define ED_RS232_NOT_OPENED (ED_GN | 0x25) +#define ED_GPIO_ALREADY_OPENED (ED_GN | 0x26) +#define ED_GPIO_NOT_OPENED (ED_GN | 0x27) +#define ED_REGISTRY_ERROR (ED_GN | 0x28) /* <- NCX */ +#define ED_INVALID_SERVICE (ED_GN | 0x29) /* <- NCX */ + +#define ED_READ_FILE_ALREADY_OPENED (ED_GN | 0x2a) /* <- Decalage + * pour RCX + * (old 0x28) + * */ +#define ED_READ_FILE_INVALID_COMMAND (ED_GN | 0x2b) /* ~ */ +#define ED_READ_FILE_INVALID_PARAMETER (ED_GN | 0x2c) /* ~ */ +#define ED_READ_FILE_ALREADY_CLOSED (ED_GN | 0x2d) /* ~ */ +#define ED_READ_FILE_NO_INFORMATION (ED_GN | 0x2e) /* ~ */ +#define ED_READ_FILE_INVALID_HANDLE (ED_GN | 0x2f) /* ~ */ +#define ED_READ_FILE_END_OF_FILE (ED_GN | 0x30) /* ~ */ +#define ED_READ_FILE_ERROR (ED_GN | 0x31) /* ~ */ + +#define ED_DSP_CRASHED_EXC_DSPSTACK_OVERFLOW (ED_GN | 0x32) /* <- Decalage pour + * PCX (old 0x14) */ +#define ED_DSP_CRASHED_EXC_SYSSTACK_OVERFLOW (ED_GN 
| 0x33) /* ~ */ +#define ED_DSP_CRASHED_EXC_ILLEGAL (ED_GN | 0x34) /* ~ */ +#define ED_DSP_CRASHED_EXC_TIMER_REENTRY (ED_GN | 0x35) /* ~ */ +#define ED_DSP_CRASHED_EXC_FATAL_ERROR (ED_GN | 0x36) /* ~ */ + +#define ED_FLASH_PCCARD_NOT_PRESENT (ED_GN | 0x37) + +#define ED_NO_CURRENT_CLOCK (ED_GN | 0x38) + +/* Complete DRV error code for real time class */ +#define ED_RT (ERROR_VALUE | E_SOURCE_DRV | E_CLASS_REAL_TIME_ERROR) +#define ED_DSP_TIMED_OUT (ED_RT | 0x01) +#define ED_DSP_CHK_TIMED_OUT (ED_RT | 0x02) +#define ED_STREAM_OVERRUN (ED_RT | 0x03) +#define ED_DSP_BUSY (ED_RT | 0x04) +#define ED_DSP_SEMAPHORE_TIME_OUT (ED_RT | 0x05) +#define ED_BOARD_TIME_OUT (ED_RT | 0x06) +#define ED_XILINX_ERROR (ED_RT | 0x07) +#define ED_COBRANET_ITF_NOT_RESPONDING (ED_RT | 0x08) + +/* Complete BOARD error code for the invaid standard object class */ +#define EB_ISO (ERROR_VALUE | E_SOURCE_BOARD | \ + E_CLASS_INVALID_STD_OBJECT) +#define EB_INVALID_EFFECT (EB_ISO | 0x00) +#define EB_INVALID_PIPE (EB_ISO | 0x40) +#define EB_INVALID_STREAM (EB_ISO | 0x80) +#define EB_INVALID_AUDIO (EB_ISO | 0xC0) + +/* Complete BOARD error code for impossible resource allocation class */ +#define EB_RI (ERROR_VALUE | E_SOURCE_BOARD | E_CLASS_RSRC_IMPOSSIBLE) +#define EB_ALLOCATE_ALL_STREAM_TRANSFERT_BUFFERS_IMPOSSIBLE (EB_RI | 0x01) +#define EB_ALLOCATE_PIPE_SAMPLE_BUFFER_IMPOSSIBLE (EB_RI | 0x02) + +#define EB_ALLOCATE_MEM_STREAM_IMPOSSIBLE \ + EB_ALLOCATE_ALL_STREAM_TRANSFERT_BUFFERS_IMPOSSIBLE +#define EB_ALLOCATE_MEM_PIPE_IMPOSSIBLE \ + EB_ALLOCATE_PIPE_SAMPLE_BUFFER_IMPOSSIBLE + +#define EB_ALLOCATE_DIFFERED_CMD_IMPOSSIBLE (EB_RI | 0x03) +#define EB_TOO_MANY_DIFFERED_CMD (EB_RI | 0x04) +#define EB_RBUFFERS_TABLE_OVERFLOW (EB_RI | 0x05) +#define EB_ALLOCATE_EFFECTS_IMPOSSIBLE (EB_RI | 0x08) +#define EB_ALLOCATE_EFFECT_POS_IMPOSSIBLE (EB_RI | 0x09) +#define EB_RBUFFER_NOT_AVAILABLE (EB_RI | 0x0A) +#define EB_ALLOCATE_CONTEXT_LIII_IMPOSSIBLE (EB_RI | 0x0B) +#define EB_STATUS_DIALOG_IMPOSSIBLE (EB_RI | 0x1D) +#define EB_CONTROL_CMD_IMPOSSIBLE (EB_RI | 0x1E) +#define EB_STATUS_SEND_IMPOSSIBLE (EB_RI | 0x1F) +#define EB_ALLOCATE_PIPE_IMPOSSIBLE (EB_RI | 0x40) +#define EB_ALLOCATE_STREAM_IMPOSSIBLE (EB_RI | 0x80) +#define EB_ALLOCATE_AUDIO_IMPOSSIBLE (EB_RI | 0xC0) + +/* Complete BOARD error code for wrong call context class */ +#define EB_WCC (ERROR_VALUE | E_SOURCE_BOARD | E_CLASS_WRONG_CONTEXT) +#define EB_CMD_REFUSED (EB_WCC | 0x00) +#define EB_START_STREAM_REFUSED (EB_WCC | 0xFC) +#define EB_SPC_REFUSED (EB_WCC | 0xFD) +#define EB_CSN_REFUSED (EB_WCC | 0xFE) +#define EB_CSE_REFUSED (EB_WCC | 0xFF) + + + + +#endif /* LX_DEFS_H */ -- cgit v1.2.3-70-g09d2 From ea20d9293ce423a39717ed4375393129a2e701f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 08:54:16 -0400 Subject: tracing: consolidate trace and trace_event headers Impact: clean up Neil Horman (et. al.) criticized the way the trace events were broken up into two files. The reason for that was that ftrace needed to separate out the declarations from where the #include was used. It then dawned on me that the tracepoint.h header only needs to define the TRACE_EVENT macro if it is not already defined. The solution is simply to test if TRACE_EVENT is defined, and if it is not then the linux/tracepoint.h header can define it. This change consolidates all the .h and _event_types.h into the .h file. 
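The guard idiom is the one visible in the tracepoint.h hunk below;
schematically:

	#ifndef TRACE_EVENT
	#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
		DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
	#endif

A consumer that wants to re-expand the events (as ftrace does) can
pre-define TRACE_EVENT to its own code-generating version, define
TRACE_HEADER_MULTI_READ and include the trace header a second time;
the !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) guards
in the per-subsystem headers exist for exactly that second pass.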
Reported-by: Neil Horman Reported-by: Theodore Tso Reported-by: Jiaying Zhang Cc: Zhaolei Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Jason Baron Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 9 +- include/trace/irq.h | 51 +++++- include/trace/irq_event_types.h | 55 ------ include/trace/kmem.h | 189 +++++++++++++++++++- include/trace/lockdep.h | 52 +++++- include/trace/lockdep_event_types.h | 57 ------ include/trace/sched.h | 333 ++++++++++++++++++++++++++++++++++- include/trace/sched_event_types.h | 337 ------------------------------------ include/trace/skb.h | 36 +++- include/trace/skb_event_types.h | 38 ---- include/trace/trace_event_types.h | 7 - kernel/trace/events.c | 1 + kernel/trace/trace_events_stage_1.h | 4 +- kernel/trace/trace_events_stage_2.h | 8 +- kernel/trace/trace_events_stage_3.h | 4 +- 15 files changed, 663 insertions(+), 518 deletions(-) delete mode 100644 include/trace/irq_event_types.h delete mode 100644 include/trace/lockdep_event_types.h delete mode 100644 include/trace/sched_event_types.h delete mode 100644 include/trace/skb_event_types.h delete mode 100644 include/trace/trace_event_types.h (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d35a7ee7611..4353f3f7e62 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -31,6 +31,8 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ +#ifndef DECLARE_TRACE + #define TP_PROTO(args...) args #define TP_ARGS(args...) args @@ -114,6 +116,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) { } #endif /* CONFIG_TRACEPOINTS */ +#endif /* DECLARE_TRACE */ /* * Connect a probe to a tracepoint. @@ -154,10 +157,13 @@ static inline void tracepoint_synchronize_unregister(void) } #define PARAMS(args...) args + +#ifndef TRACE_FORMAT #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#endif - +#ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: * @@ -262,5 +268,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#endif #endif diff --git a/include/trace/irq.h b/include/trace/irq.h index ff5d4495dc3..04ab4c65222 100644 --- a/include/trace/irq.h +++ b/include/trace/irq.h @@ -1,9 +1,54 @@ -#ifndef _TRACE_IRQ_H +#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_H -#include #include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq + +/* + * Tracepoint for entry of interrupt handler: + */ +TRACE_FORMAT(irq_handler_entry, + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name) + ); + +/* + * Tracepoint for return of an interrupt handler: + */ +TRACE_EVENT(irq_handler_exit, + + TP_PROTO(int irq, struct irqaction *action, int ret), + + TP_ARGS(irq, action, ret), + + TP_STRUCT__entry( + __field( int, irq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->ret = ret; + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? 
"handled" : "unhandled") +); + +TRACE_FORMAT(softirq_entry, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); -#include +TRACE_FORMAT(softirq_exit, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h deleted file mode 100644 index 85964ebd47e..00000000000 --- a/include/trace/irq_event_types.h +++ /dev/null @@ -1,55 +0,0 @@ - -/* use instead */ -#ifndef TRACE_FORMAT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM irq - -/* - * Tracepoint for entry of interrupt handler: - */ -TRACE_FORMAT(irq_handler_entry, - TP_PROTO(int irq, struct irqaction *action), - TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name) - ); - -/* - * Tracepoint for return of an interrupt handler: - */ -TRACE_EVENT(irq_handler_exit, - - TP_PROTO(int irq, struct irqaction *action, int ret), - - TP_ARGS(irq, action, ret), - - TP_STRUCT__entry( - __field( int, irq ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->irq = irq; - __entry->ret = ret; - ), - - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? "handled" : "unhandled") -); - -TRACE_FORMAT(softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -TRACE_FORMAT(softirq_exit, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -#undef TRACE_SYSTEM diff --git a/include/trace/kmem.h b/include/trace/kmem.h index 46efc2423f0..d7d12189e5c 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -1,9 +1,192 @@ -#ifndef _TRACE_KMEM_H +#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KMEM_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem -#endif /* _TRACE_KMEM_H */ +TRACE_EVENT(kmalloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmem_cache_alloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + 
__entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmalloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kmem_cache_alloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kfree, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +TRACE_EVENT(kmem_cache_free, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +#endif diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 5ca67df87f2..8ee7900b38c 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -1,9 +1,57 @@ -#ifndef _TRACE_LOCKDEP_H +#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_LOCKDEP_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lock + +#ifdef CONFIG_LOCKDEP + +TRACE_FORMAT(lock_acquire, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", + read ? 
"read " : "", lock->name) + ); + +TRACE_FORMAT(lock_release, + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) + ); + +#ifdef CONFIG_LOCK_STAT + +TRACE_FORMAT(lock_contended, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) + ); + +TRACE_EVENT(lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + + TP_ARGS(lock, ip, waittime), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, wait_usec) + __field(unsigned long, wait_nsec_rem) + ), + TP_fast_assign( + __entry->name = lock->name; + __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); + __entry->wait_usec = (unsigned long) waittime; + ), + TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + __entry->wait_nsec_rem) +); #endif +#endif + +#endif /* _TRACE_LOCKDEP_H */ diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h deleted file mode 100644 index 863f1e4583a..00000000000 --- a/include/trace/lockdep_event_types.h +++ /dev/null @@ -1,57 +0,0 @@ - -#ifndef TRACE_FORMAT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM lock - -#ifdef CONFIG_LOCKDEP - -TRACE_FORMAT(lock_acquire, - TP_PROTO(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, - struct lockdep_map *next_lock, unsigned long ip), - TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), - TP_FMT("%s%s%s", trylock ? "try " : "", - read ? "read " : "", lock->name) - ); - -TRACE_FORMAT(lock_release, - TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TP_ARGS(lock, nested, ip), - TP_FMT("%s", lock->name) - ); - -#ifdef CONFIG_LOCK_STAT - -TRACE_FORMAT(lock_contended, - TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); - -TRACE_EVENT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), - - TP_ARGS(lock, ip, waittime), - - TP_STRUCT__entry( - __field(const char *, name) - __field(unsigned long, wait_usec) - __field(unsigned long, wait_nsec_rem) - ), - TP_fast_assign( - __entry->name = lock->name; - __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); - __entry->wait_usec = (unsigned long) waittime; - ), - TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, - __entry->wait_nsec_rem) -); - -#endif -#endif - -#undef TRACE_SYSTEM diff --git a/include/trace/sched.h b/include/trace/sched.h index 4e372a1a29b..5b1cf4a2846 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -1,9 +1,336 @@ -#ifndef _TRACE_SCHED_H +#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched -#endif +/* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +TRACE_EVENT(sched_kthread_stop, + + TP_PROTO(struct task_struct *t), + + TP_ARGS(t), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) +); + +/* + * Tracepoint for the return value of the kthread stopping: + */ +TRACE_EVENT(sched_kthread_stop_ret, + + TP_PROTO(int ret), + + TP_ARGS(ret), + + TP_STRUCT__entry( + __field( int, ret ) + ), + + 
TP_fast_assign( + __entry->ret = ret; + ), + + TP_printk("ret %d", __entry->ret) +); + +/* + * Tracepoint for waiting on task to unschedule: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wait_task, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for waking up a task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for waking up a new task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup_new, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. 
) + */ +TRACE_EVENT(sched_switch, + + TP_PROTO(struct rq *rq, struct task_struct *prev, + struct task_struct *next), + + TP_ARGS(rq, prev, next), + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) + ), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) +); + +/* + * Tracepoint for a task being migrated: + */ +TRACE_EVENT(sched_migrate_task, + + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + + TP_ARGS(p, orig_cpu, dest_cpu), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, orig_cpu ) + __field( int, dest_cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->orig_cpu = orig_cpu; + __entry->dest_cpu = dest_cpu; + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) +); + +/* + * Tracepoint for freeing a task: + */ +TRACE_EVENT(sched_process_free, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a task exiting: + */ +TRACE_EVENT(sched_process_exit, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a waiting task: + */ +TRACE_EVENT(sched_process_wait, + + TP_PROTO(struct pid *pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->pid = pid_nr(pid); + __entry->prio = current->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for do_fork: + */ +TRACE_EVENT(sched_process_fork, + + TP_PROTO(struct task_struct *parent, struct task_struct *child), + + TP_ARGS(parent, child), + + TP_STRUCT__entry( + __array( char, parent_comm, TASK_COMM_LEN ) + __field( pid_t, parent_pid ) + __array( char, child_comm, TASK_COMM_LEN ) + __field( pid_t, child_pid ) + ), + + TP_fast_assign( + memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); + __entry->parent_pid = parent->pid; + memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); + __entry->child_pid = child->pid; + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + 
__entry->child_comm, __entry->child_pid) +); + +/* + * Tracepoint for sending a signal: + */ +TRACE_EVENT(sched_signal_send, + + TP_PROTO(int sig, struct task_struct *p), + + TP_ARGS(sig, p), + + TP_STRUCT__entry( + __field( int, sig ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->sig = sig; + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) +); + +#endif /* _TRACE_SCHED_H */ diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h deleted file mode 100644 index 63547dc1125..00000000000 --- a/include/trace/sched_event_types.h +++ /dev/null @@ -1,337 +0,0 @@ - -/* use <trace/sched.h> instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM sched - -/* - * Tracepoint for calling kthread_stop, performed to end a kthread: - */ -TRACE_EVENT(sched_kthread_stop, - - TP_PROTO(struct task_struct *t), - - TP_ARGS(t), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, t->comm, TASK_COMM_LEN); - __entry->pid = t->pid; - ), - - TP_printk("task %s:%d", __entry->comm, __entry->pid) -); - -/* - * Tracepoint for the return value of the kthread stopping: - */ -TRACE_EVENT(sched_kthread_stop_ret, - - TP_PROTO(int ret), - - TP_ARGS(ret), - - TP_STRUCT__entry( - __field( int, ret ) - ), - - TP_fast_assign( - __entry->ret = ret; - ), - - TP_printk("ret %d", __entry->ret) -); - -/* - * Tracepoint for waiting on task to unschedule: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wait_task, - - TP_PROTO(struct rq *rq, struct task_struct *p), - - TP_ARGS(rq, p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for waking up a task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. 
) - */ -TRACE_EVENT(sched_wakeup_new, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for task switches, performed by the scheduler: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_switch, - - TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), - - TP_ARGS(rq, prev, next), - - TP_STRUCT__entry( - __array( char, prev_comm, TASK_COMM_LEN ) - __field( pid_t, prev_pid ) - __field( int, prev_prio ) - __array( char, next_comm, TASK_COMM_LEN ) - __field( pid_t, next_pid ) - __field( int, next_prio ) - ), - - TP_fast_assign( - memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - __entry->prev_pid = prev->pid; - __entry->prev_prio = prev->prio; - memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); - __entry->next_pid = next->pid; - __entry->next_prio = next->prio; - ), - - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio) -); - -/* - * Tracepoint for a task being migrated: - */ -TRACE_EVENT(sched_migrate_task, - - TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - - TP_ARGS(p, orig_cpu, dest_cpu), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, orig_cpu ) - __field( int, dest_cpu ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->orig_cpu = orig_cpu; - __entry->dest_cpu = dest_cpu; - ), - - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu) -); - -/* - * Tracepoint for freeing a task: - */ -TRACE_EVENT(sched_process_free, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a task exiting: - */ -TRACE_EVENT(sched_process_exit, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a waiting task: - */ -TRACE_EVENT(sched_process_wait, - - TP_PROTO(struct pid *pid), - - TP_ARGS(pid), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - __entry->pid = pid_nr(pid); - __entry->prio = current->prio; - ), - - TP_printk("task %s:%d [%d]", 
- __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for do_fork: - */ -TRACE_EVENT(sched_process_fork, - - TP_PROTO(struct task_struct *parent, struct task_struct *child), - - TP_ARGS(parent, child), - - TP_STRUCT__entry( - __array( char, parent_comm, TASK_COMM_LEN ) - __field( pid_t, parent_pid ) - __array( char, child_comm, TASK_COMM_LEN ) - __field( pid_t, child_pid ) - ), - - TP_fast_assign( - memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); - __entry->parent_pid = parent->pid; - memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); - __entry->child_pid = child->pid; - ), - - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid) -); - -/* - * Tracepoint for sending a signal: - */ -TRACE_EVENT(sched_signal_send, - - TP_PROTO(int sig, struct task_struct *p), - - TP_ARGS(sig, p), - - TP_STRUCT__entry( - __field( int, sig ) - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->sig = sig; - ), - - TP_printk("sig: %d task %s:%d", - __entry->sig, __entry->comm, __entry->pid) -); - -#undef TRACE_SYSTEM diff --git a/include/trace/skb.h b/include/trace/skb.h index d2de7174a6e..e6fd281f7f8 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -1,9 +1,37 @@ -#ifndef _TRACE_SKB_H_ -#define _TRACE_SKB_H_ +#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SKB_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb -#endif +/* + * Tracepoint for freeing an sk_buff: + */ +TRACE_EVENT(kfree_skb, + + TP_PROTO(struct sk_buff *skb, void *location), + + TP_ARGS(skb, location), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned short, protocol ) + __field( void *, location ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + if (skb) { + __entry->protocol = ntohs(skb->protocol); + } + __entry->location = location; + ), + + TP_printk("skbaddr=%p protocol=%u location=%p", + __entry->skbaddr, __entry->protocol, __entry->location) +); + +#endif /* _TRACE_SKB_H */ diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h deleted file mode 100644 index 4a1c504c0e1..00000000000 --- a/include/trace/skb_event_types.h +++ /dev/null @@ -1,38 +0,0 @@ - -/* use <trace/skb.h> instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. 
-#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM skb - -/* - * Tracepoint for free an sk_buff: - */ -TRACE_EVENT(kfree_skb, - - TP_PROTO(struct sk_buff *skb, void *location), - - TP_ARGS(skb, location), - - TP_STRUCT__entry( - __field( void *, skbaddr ) - __field( unsigned short, protocol ) - __field( void *, location ) - ), - - TP_fast_assign( - __entry->skbaddr = skb; - if (skb) { - __entry->protocol = ntohs(skb->protocol); - } - __entry->location = location; - ), - - TP_printk("skbaddr=%p protocol=%u location=%p", - __entry->skbaddr, __entry->protocol, __entry->location) -); - -#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h deleted file mode 100644 index 552a50e169a..00000000000 --- a/include/trace/trace_event_types.h +++ /dev/null @@ -1,7 +0,0 @@ -/* trace/<type>_event_types.h here */ - -#include -#include -#include -#include -#include diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 246f2aa6dc4..5a35a914f0e 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -8,6 +8,7 @@ #include "trace_output.h" +#define TRACE_HEADER_MULTI_READ #include "trace_events_stage_1.h" #include "trace_events_stage_2.h" #include "trace_events_stage_3.h" diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 38985f9b379..475f46a047a 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -1,7 +1,7 @@ /* * Stage 1 of the trace events. * - * Override the macros in <trace/trace_event_types.h> to include the following: + * Override the macros in <trace/trace_events.h> to include the following: * * struct ftrace_raw_<call> { * struct trace_entry ent; @@ -36,4 +36,4 @@ }; \ static struct ftrace_event_call event_##name -#include <trace/trace_event_types.h> +#include <trace/trace_events.h> diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 59cfd7dfe68..aa4a67a0656 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -1,7 +1,7 @@ /* * Stage 2 of the trace events. * - * Override the macros in <trace/trace_event_types.h> to include the following: + * Override the macros in <trace/trace_events.h> to include the following: * * enum print_line_t * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags) @@ -64,7 +64,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ return TRACE_TYPE_HANDLED; \ } -#include <trace/trace_event_types.h> +#include <trace/trace_events.h> /* * Setup the showing format of trace point. @@ -128,7 +128,7 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } -#include <trace/trace_event_types.h> +#include <trace/trace_events.h> #undef __field #define __field(type, item) \ @@ -167,4 +167,4 @@ ftrace_define_fields_##call(void) \ return ret; \ } -#include <trace/trace_event_types.h> +#include <trace/trace_events.h> diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 5bb1b7ffbdb..45c04e1f38d 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -1,7 +1,7 @@ /* * Stage 3 of the trace events. * - * Override the macros in <trace/trace_event_types.h> to include the following: + * Override the macros in <trace/trace_events.h> to include the following: * * static void ftrace_event_<call>(proto) * { @@ -272,7 +272,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ _TRACE_PROFILE_INIT(call) \ } -#include <trace/trace_event_types.h> +#include <trace/trace_events.h> #undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT -- cgit v1.2.3-70-g09d2 From 78ddb08feb7d4fbe3c0a9931804c51ee58be4023 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 14 Apr 2009 16:53:05 +0200 Subject: wait: don't use __wake_up_common() '777c6c5 wait: prevent exclusive waiter starvation' made __wake_up_common() global to be used from abort_exclusive_wait(). 
It was needed to do a wake-up with the waitqueue lock held while passing down a key to the wake-up function. Since '4ede816 epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()' there is an appropriate wrapper for this case: __wake_up_locked_key(). Use it here and make __wake_up_common() private to the scheduler again. Signed-off-by: Johannes Weiner Cc: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <1239720785-19661-1-git-send-email-hannes@cmpxchg.org> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 2 -- kernel/sched.c | 2 +- kernel/wait.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 5d631c17eae..67d4e89b62d 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, list_del(&old->task_list); } -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, diff --git a/kernel/sched.c b/kernel/sched.c index 36d213bca47..92b4b56ad09 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5345,7 +5345,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc181c..ea7c3b4275c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); -- cgit v1.2.3-70-g09d2 From 72c6a9870f901045f2464c3dc6ee8914bfdc07aa Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 14 Apr 2009 17:33:57 +0200 Subject: rculist.h: introduce list_entry_rcu() and list_first_entry_rcu() I've run into the situation where I need to use list_first_entry with an RCU-guarded list. This patch introduces it. Also simplify list_for_each_entry_rcu() to use the new list_entry_rcu() instead of list_entry(). Signed-off-by: Jiri Pirko Reviewed-by: Paul E. McKenney Cc: dipankar@in.ibm.com LKML-Reference: <20090414153356.GC3999@psychotron.englab.brq.redhat.com> Signed-off-by: Ingo Molnar --- include/linux/rculist.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e649bd3f2c9..5710f43bbc9 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -198,6 +198,32 @@ static inline void list_splice_init_rcu(struct list_head *list, at->prev = last; } +/** + * list_entry_rcu - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. 
+ * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ +#define list_entry_rcu(ptr, type, member) \ + container_of(rcu_dereference(ptr), type, member) + +/** + * list_first_entry_rcu - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note that the list is expected to be non-empty. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ +#define list_first_entry_rcu(ptr, type, member) \ + list_entry_rcu((ptr)->next, type, member) + #define __list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ pos != (head); \ @@ -214,9 +240,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_entry_rcu(pos, head, member) \ - for (pos = list_entry(rcu_dereference((head)->next), typeof(*pos), member); \ + for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \ prefetch(pos->member.next), &pos->member != (head); \ - pos = list_entry(rcu_dereference(pos->member.next), typeof(*pos), member)) + pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** -- cgit v1.2.3-70-g09d2 From 9504504cbab29ecb694186b1c5b15d3579c43c51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 11 Apr 2009 12:59:57 -0400 Subject: tracing: make trace_seq operations available for core kernel In the process to make TRACE_EVENT macro work for modules, the trace_seq operations must be available for core kernel code. These operations are quite useful and can be used for other implementations. The main idea is that we create a trace_seq handle that acts very much like the seq_file handle. struct trace_seq *s = kmalloc(sizeof(*s), GFP_KERNEL); trace_seq_init(s); trace_seq_printf(s, "some data %d\n", variable); printk("%s", s->buffer); The main use is to allow a top level function to call several other functions that may store printf-like data into the buffer. Then at the end, the top level function can process all the data with any method it would like to. It could be passed to userspace, output via printk or even use seq_file: trace_seq_to_user(s, ubuf, cnt); seq_puts(m, s->buffer); Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 89 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 15 +------- kernel/trace/trace_output.h | 16 +------- 3 files changed, 92 insertions(+), 28 deletions(-) create mode 100644 include/linux/trace_seq.h (limited to 'include/linux') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h new file mode 100644 index 00000000000..28051da876d --- /dev/null +++ b/include/linux/trace_seq.h @@ -0,0 +1,89 @@ +#ifndef _LINUX_TRACE_SEQ_H +#define _LINUX_TRACE_SEQ_H + +/* + * Trace sequences are used to allow a function to call several other functions + * to create a string of data to use (up to a max of PAGE_SIZE). + */ + +struct trace_seq { + unsigned char buffer[PAGE_SIZE]; + unsigned int len; + unsigned int readpos; +}; + +static inline void +trace_seq_init(struct trace_seq *s) +{ + s->len = 0; + s->readpos = 0; +} + +/* + * Currently only defined when tracing is enabled. + */ +#ifdef CONFIG_TRACING +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
+ __attribute__ ((format (printf, 2, 3))); +extern int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +extern int trace_seq_puts(struct trace_seq *s, const char *str); +extern int trace_seq_putc(struct trace_seq *s, unsigned char c); +extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); +extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len); +extern void *trace_seq_reserve(struct trace_seq *s, size_t len); +extern int trace_seq_path(struct trace_seq *s, struct path *path); + +#else /* CONFIG_TRACING */ +static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))) +{ + return 0; +} +static inline int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + return 0; +} + +static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ +} +static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt) +{ + return 0; +} +static inline int trace_seq_puts(struct trace_seq *s, const char *str) +{ + return 0; +} +static inline int trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + return 0; +} +static inline int +trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) +{ + return 0; +} +static inline int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len) +{ + return 0; +} +static inline void *trace_seq_reserve(struct trace_seq *s, size_t len) +{ + return NULL; +} +static inline int trace_seq_path(struct trace_seq *s, struct path *path) +{ + return 0; +} +#endif /* CONFIG_TRACING */ + +#endif /* _LINUX_TRACE_SEQ_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b05b6ac982a..1882846b738 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -12,6 +12,8 @@ #include #include +#include <linux/trace_seq.h> + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -423,19 +425,6 @@ struct tracer { struct tracer_stat *stats; }; -struct trace_seq { - unsigned char buffer[PAGE_SIZE]; - unsigned int len; - unsigned int readpos; -}; - -static inline void -trace_seq_init(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - #define TRACE_PIPE_ALL_CPU -1 diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 91630217fb4..5c7cbfb65c7 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,6 +1,7 @@ #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H +#include <linux/trace_seq.h> #include "trace.h" typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, int flags); @@ -20,24 +21,9 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); -extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); - -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
- __attribute__ ((format (printf, 2, 3))); -extern int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); -extern int trace_seq_puts(struct trace_seq *s, const char *str); -extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, - size_t len); -extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path); extern int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags); extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, -- cgit v1.2.3-70-g09d2 From 97f2025153499faa17267a0d4e18c7afaf73f39d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Apr 2009 11:20:49 -0400 Subject: tracing/events: move declarations from trace directory to core include In preparation for allowing trace events to happen in modules, we need to move some of the local declarations in the kernel/trace directory into include/linux. This patch simply moves the declarations and performs no context changes. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 146 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 120 +---------------------------------- kernel/trace/trace_output.h | 14 ----- 3 files changed, 147 insertions(+), 133 deletions(-) create mode 100644 include/linux/ftrace_event.h (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h new file mode 100644 index 00000000000..496b76d9f9d --- /dev/null +++ b/include/linux/ftrace_event.h @@ -0,0 +1,146 @@ +#ifndef _LINUX_FTRACE_EVENT_H +#define _LINUX_FTRACE_EVENT_H + +#include <linux/trace_seq.h> +#include <linux/ring_buffer.h> + + +struct trace_array; +struct tracer; + +/* + * The trace entry - the most basic unit of tracing. This is what 
This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + unsigned char type; + unsigned char flags; + unsigned char preempt_count; + int pid; + int tgid; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + void *private; + int cpu_file; + struct mutex mutex; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; + + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; + struct trace_entry *ent; + int cpu; + u64 ts; + + unsigned long iter_flags; + loff_t pos; + long idx; + + cpumask_var_t started; +}; + + +typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, + int flags); +struct trace_event { + struct hlist_node node; + int type; + trace_print_func trace; + trace_print_func raw; + trace_print_func hex; + trace_print_func binary; +}; + +extern int register_ftrace_event(struct trace_event *event); +extern int unregister_ftrace_event(struct trace_event *event); + +/* Return values for print_line callback */ +enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + TRACE_TYPE_HANDLED = 1, + TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ + TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ +}; + + +struct ring_buffer_event * +trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, + unsigned long flags, int pc); +void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_current_buffer_discard_commit(struct ring_buffer_event *event); + +void tracing_record_cmdline(struct task_struct *tsk); + +struct ftrace_event_call { + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + int (*raw_init)(void); + int (*show_format)(struct trace_seq *s); + int (*define_fields)(void); + struct list_head fields; + int n_preds; + struct filter_pred **preds; + +#ifdef CONFIG_EVENT_PROFILE + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); +#endif +}; + +#define MAX_FILTER_PRED 8 +#define MAX_FILTER_STR_VAL 128 + +extern int init_preds(struct ftrace_event_call *call); +extern int filter_match_preds(struct ftrace_event_call *call, void *rec); +extern int filter_current_check_discard(struct ftrace_event_call *call, + void *rec, + struct ring_buffer_event *event); + +extern int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size); + + +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement optimizing out. + */ +#define event_trace_printk(ip, fmt, args...) \ +do { \ + __trace_printk_check_format(fmt, ##args); \ + tracing_record_cmdline(current); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? 
fmt : NULL; \ + \ + __trace_bprintk(ip, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(ip, fmt, ##args); \ +} while (0) + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + +#endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1882846b738..6bcdf4af9b2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -43,20 +44,6 @@ enum trace_type { __TRACE_LAST_TYPE, }; -/* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { - unsigned char type; - unsigned char flags; - unsigned char preempt_count; - int pid; - int tgid; -}; - /* * Function trace entry - function address and parent function addres: */ @@ -265,8 +252,6 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; -struct trace_iterator; - /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -341,15 +326,6 @@ extern void __ftrace_bad_type(void); __ftrace_bad_type(); \ } while (0) -/* Return values for print_line callback */ -enum print_line_t { - TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ - TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ - TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ -}; - - /* * An option specific to a tracer. This is a boolean value. * The bit is the bit index that sets its value on the @@ -428,31 +404,6 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { - struct trace_array *tr; - struct tracer *trace; - void *private; - int cpu_file; - struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; - - /* The below is zeroed out in pipe_read */ - struct trace_seq seq; - struct trace_entry *ent; - int cpu; - u64 ts; - - unsigned long iter_flags; - loff_t pos; - long idx; - - cpumask_var_t started; -}; - int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); @@ -479,15 +430,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, - unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_current_buffer_discard_commit(struct ring_buffer_event *event); - struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -510,7 +452,6 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, @@ -790,28 +731,6 @@ struct ftrace_event_field { int size; }; -struct 
ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); - int (*define_fields)(void); - struct list_head fields; - int n_preds; - struct filter_pred **preds; - -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif -}; - struct event_subsystem { struct list_head list; const char *name; @@ -825,9 +744,6 @@ struct event_subsystem { (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -#define MAX_FILTER_PRED 8 -#define MAX_FILTER_STR_VAL 128 - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -845,9 +761,6 @@ struct filter_pred { int clear; }; -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); -extern int init_preds(struct ftrace_event_call *call); extern void filter_free_pred(struct filter_pred *pred); extern void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s); @@ -855,13 +768,9 @@ extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_disable_preds(struct ftrace_event_call *call); -extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); -extern int filter_current_check_discard(struct ftrace_event_call *call, - void *rec, - struct ring_buffer_event *event); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -876,14 +785,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - -void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; @@ -895,25 +796,6 @@ extern struct ftrace_event_call __stop_ftrace_events[]; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ - __builtin_constant_p(fmt) ? 
fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) - #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ extern struct ftrace_event_call event_##call; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 5c7cbfb65c7..6e220a8e570 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -4,18 +4,6 @@ #include #include "trace.h" -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, - int flags); - -struct trace_event { - struct hlist_node node; - int type; - trace_print_func trace; - trace_print_func raw; - trace_print_func hex; - trace_print_func binary; -}; - extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t @@ -33,8 +21,6 @@ extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); extern struct trace_event *ftrace_find_event(int type); -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); -- cgit v1.2.3-70-g09d2 From a59fd6027218bd7c994e39d14afe0242f895144f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 13:52:20 -0400 Subject: tracing/events: convert event call sites to use a linked list Impact: makes it possible to define events in modules The events are created by reading through the linker section that the macros place them in, but that approach does not scale to modules. This patch converts the manipulations to use a global linked list, and on boot up it adds the items in the section to the list. This change will allow modules to add their tracing events to the list as well. Note, this change alone does not permit modules to use the TRACE_EVENT macros, but the change is needed for them to eventually do so. 
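Condensed from the hunks that follow, the shape of the conversion is roughly this (symbol and field names as in the patch; a sketch, not the complete code):

	/* a global list replaces the linker-section bounds as the registry */
	LIST_HEAD(ftrace_events);

	/* boot: move each linker-section entry onto the global list */
	static __init int event_trace_init(void)
	{
		struct ftrace_event_call *call;

		for_each_event(call) {
			if (!call->name)	/* the linker may leave blanks */
				continue;
			list_add(&call->list, &ftrace_events);
		}
		return 0;
	}

	/* lookups then walk the list instead of the section bounds: */
	int ftrace_profile_enable(int event_id)
	{
		struct ftrace_event_call *event;

		list_for_each_entry(event, &ftrace_events, list) {
			if (event->id == event_id)
				return event->profile_enable(event);
		}
		return -EINVAL;
	}

Modules can later hook their own ftrace_event_call entries onto the same list, which a fixed linker section could never accommodate.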
Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.h | 13 +--------- kernel/trace/trace_event_profile.c | 4 +-- kernel/trace/trace_events.c | 51 +++++++++++++++++++++++--------------- kernel/trace/trace_events_filter.c | 8 +++--- 5 files changed, 39 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 496b76d9f9d..17810853b4f 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -83,6 +83,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event); void tracing_record_cmdline(struct task_struct *tsk); struct ftrace_event_call { + struct list_head list; char *name; char *system; struct dentry *dir; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6bcdf4af9b2..8817c18ef97 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -739,11 +739,6 @@ struct event_subsystem { struct filter_pred **preds; }; -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -785,13 +780,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; - -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) +extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 199de9c7422..7bf2ad65eee 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -11,7 +11,7 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_enable(event); } @@ -23,7 +23,7 @@ void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_disable(event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ead68ac9919..5c66aaff07c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,6 +19,8 @@ static DEFINE_MUTEX(event_mutex); +LIST_HEAD(ftrace_events); + int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size) { @@ -54,16 +56,14 @@ err: static void ftrace_clear_events(void) { - struct ftrace_event_call *call = (void *)__start_ftrace_events; - + struct ftrace_event_call *call; - while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { call->enabled = 0; call->unregfunc(); } - call++; } } @@ -89,7 +89,7 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, static int ftrace_set_clr_event(char *buf, int set) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; char *event = NULL, *sub = NULL, *match; int ret = -EINVAL; @@ -118,7 +118,7 @@ static int ftrace_set_clr_event(char *buf, int set) } mutex_lock(&event_mutex); - 
for_each_event(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; @@ -224,15 +224,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next = call; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; for (;;) { - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. @@ -240,11 +242,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (call->regfunc) break; - call++; - next = call; + list = list->next; } - m->private = ++next; + m->private = list->next; return call; } @@ -257,22 +258,23 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; retry: - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + if (!call->enabled) { - call++; + list = list->next; goto retry; } - next = call; - m->private = ++next; + m->private = list->next; return call; } @@ -312,7 +314,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; - m->private = __start_ftrace_events; + m->private = ftrace_events.next; } return ret; } @@ -797,9 +799,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return 0; } +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; + +#define for_each_event(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + static __init int event_trace_init(void) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; @@ -830,6 +840,7 @@ static __init int event_trace_init(void) /* The linker may leave blanks */ if (!call->name) continue; + list_add(&call->list, &ftrace_events); event_create_dir(call, d_events); } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index de42dad42a8..d30b06b02b4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -223,7 +223,7 @@ oom: void filter_free_subsystem_preds(struct event_subsystem *system) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; int i; if (system->n_preds) { @@ -234,7 +234,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system) system->n_preds = 0; } - events_for_each(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -320,7 +320,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; if (system->n_preds && !pred->compound) 
		filter_free_subsystem_preds(system);
@@ -337,7 +337,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 
 	system->preds[system->n_preds] = pred;
 
-	events_for_each(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
 
 		if (!call->define_fields)
--
cgit v1.2.3-70-g09d2

From 6d723736e472f7a0cd5b62c84152fceead241328 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 10 Apr 2009 14:53:50 -0400
Subject: tracing/events: add support for modules to TRACE_EVENT

Impact: allow modules to add TRACE_EVENTS on load

This patch adds the final hooks to allow modules to use the TRACE_EVENT
macro. A notifier and a data structure are used to link the TRACE_EVENTs
defined in the module with the ftrace event tracing system.

It also adds the necessary automated cleanups of the trace events when a
module is removed.

Cc: Rusty Russell
Signed-off-by: Steven Rostedt
---
 include/linux/ftrace_event.h |   3 +
 include/linux/module.h       |   4 ++
 include/linux/trace_seq.h    |   2 +
 include/trace/ftrace.h       |   1 +
 kernel/module.c              |   7 +++
 kernel/trace/trace_events.c  | 128 ++++++++++++++++++++++++++++++++-----------
 6 files changed, 113 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 17810853b4f..75f3ac01a87 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -7,6 +7,7 @@
 struct trace_array;
 struct tracer;
+struct dentry;
 
 /*
  * The trace entry - the most basic unit of tracing. This is what
@@ -87,6 +88,7 @@ struct ftrace_event_call {
 	char			*name;
 	char			*system;
 	struct dentry		*dir;
+	struct trace_event	*event;
 	int			enabled;
 	int			(*regfunc)(void);
 	void			(*unregfunc)(void);
@@ -97,6 +99,7 @@ struct ftrace_event_call {
 	struct list_head	fields;
 	int			n_preds;
 	struct filter_pred	**preds;
+	void			*mod;
 
 #ifdef CONFIG_EVENT_PROFILE
 	atomic_t	profile_count;
diff --git a/include/linux/module.h b/include/linux/module.h
index 627ac082e2a..6155fa44168 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -337,6 +337,10 @@ struct module
 	const char **trace_bprintk_fmt_start;
 	unsigned int num_trace_bprintk_fmt;
 #endif
+#ifdef CONFIG_EVENT_TRACING
+	struct ftrace_event_call *trace_events;
+	unsigned int num_trace_events;
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
 	/* What modules depend on me? */
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 28051da876d..15ca2c71af1 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_TRACE_SEQ_H
 #define _LINUX_TRACE_SEQ_H
 
+#include
+
 /*
  * Trace sequences are used to allow a function to call several other functions
  * to create a string of data to use (up to a max of PAGE_SIZE.
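
For orientation, here is the module author's side of the mechanism these
hooks enable: the TRACE_EVENT expansion places a struct ftrace_event_call
in the module's _ftrace_events section, load_module() collects that
section (see the kernel/module.c hunk below), and the MODULE_STATE_COMING
notifier (in the trace_events.c hunks below) links each entry into
ftrace_events. A hypothetical module-side tracepoint header might look
like the following sketch; the my_module/my_event names and the
include-path boilerplate are illustrative assumptions, not part of this
patch:

	/* my_trace.h -- hypothetical tracepoint header shipped inside a module */
	#if !defined(_MY_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
	#define _MY_TRACE_H

	#include <linux/tracepoint.h>

	#undef TRACE_SYSTEM
	#define TRACE_SYSTEM my_module

	TRACE_EVENT(my_event,

		TP_PROTO(int value),

		TP_ARGS(value),

		TP_STRUCT__entry(
			__field(	int,	value	)
		),

		TP_fast_assign(
			__entry->value = value;
		),

		TP_printk("value=%d", __entry->value)
	);

	#endif /* _MY_TRACE_H */

	/* This part must be outside protection */
	#undef TRACE_INCLUDE_PATH
	#undef TRACE_INCLUDE_FILE
	#define TRACE_INCLUDE_PATH .
	#define TRACE_INCLUDE_FILE my_trace
	#include <trace/define_trace.h>

The module's .c file would include this header once with
CREATE_TRACE_POINTS defined and then emit events with trace_my_event(42);
on MODULE_STATE_GOING the notifier unregisters the module's events again.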
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 955b967acd7..60c5323bee6 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -477,6 +477,7 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ + .event = &ftrace_event_type_##call, \ .raw_init = ftrace_raw_init_event_##call, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ diff --git a/kernel/module.c b/kernel/module.c index e797812a4d9..a0394706f10 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ */ #include #include +#include #include #include #include @@ -2172,6 +2173,12 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(hdr, sechdrs, secstrings, + "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8b9e621b80b..a4b177720a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -713,7 +713,13 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } - system->name = name; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) { + debugfs_remove(system->entry); + kfree(system); + return d_events; + } + list_add(&system->list, &event_subsystems); system->preds = NULL; @@ -738,7 +744,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". 
*/ - if (strcmp(call->system, "TRACE_SYSTEM") != 0) + if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); if (call->raw_init) { @@ -757,21 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return -1; } - if (call->regfunc) { - entry = debugfs_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/enable' entry\n", call->name); - } + if (call->regfunc) + entry = trace_create_file("enable", 0644, call->dir, call, + &ftrace_enable_fops); - if (call->id) { - entry = debugfs_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); - if (!entry) - pr_warning("Could not create debugfs '%s/id' entry\n", - call->name); - } + if (call->id) + entry = trace_create_file("id", 0444, call->dir, call, + &ftrace_event_id_fops); if (call->define_fields) { ret = call->define_fields(); @@ -780,40 +778,102 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); + entry = trace_create_file("filter", 0644, call->dir, call, + &ftrace_event_filter_fops); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = debugfs_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/format' entry\n", call->name); + entry = trace_create_file("format", 0444, call->dir, call, + &ftrace_event_format_fops); + + return 0; +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +static void trace_module_add_events(struct module *mod) +{ + struct ftrace_event_call *call, *start, *end; + struct dentry *d_events; + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + if (start == end) + return; + + d_events = event_trace_events_dir(); + if (!d_events) + return; + + for_each_event(call, start, end) { + /* The linker may leave blanks */ + if (!call->name) + continue; + call->mod = mod; + list_add(&call->list, &ftrace_events); + event_create_dir(call, d_events); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct ftrace_event_call *call, *p; + + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if (call->mod == mod) { + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + if (call->event) + unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + } + } +} + +int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + mutex_lock(&event_mutex); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&event_mutex); return 0; } +struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; + extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - static __init int 
event_trace_init(void) { struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; + int ret; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -837,7 +897,7 @@ static __init int event_trace_init(void) if (!d_events) return 0; - for_each_event(call) { + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) continue; @@ -845,6 +905,10 @@ static __init int event_trace_init(void) event_create_dir(call, d_events); } + ret = register_module_notifier(&trace_module_nb); + if (!ret) + pr_warning("Failed to register trace events module notifier\n"); + return 0; } fs_initcall(event_trace_init); -- cgit v1.2.3-70-g09d2 From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 19:39:12 -0400 Subject: tracing/events: move trace point headers into include/trace/events Impact: clean up Create a sub directory in include/trace called events to keep the trace point headers in their own separate directory. Only headers that declare trace points should be defined in this directory. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- include/linux/kmemtrace.h | 2 +- include/trace/define_trace.h | 2 +- include/trace/events/irq.h | 57 +++++++ include/trace/events/kmem.h | 194 ++++++++++++++++++++++ include/trace/events/lockdep.h | 60 +++++++ include/trace/events/sched.h | 339 ++++++++++++++++++++++++++++++++++++++ include/trace/events/skb.h | 40 +++++ include/trace/irq.h | 57 ------- include/trace/kmem.h | 194 ---------------------- include/trace/lockdep.h | 60 ------- include/trace/sched.h | 339 -------------------------------------- include/trace/skb.h | 40 ----- kernel/exit.c | 2 +- kernel/fork.c | 3 +- kernel/irq/handle.c | 2 +- kernel/kthread.c | 2 +- kernel/lockdep.c | 2 +- kernel/sched.c | 2 +- kernel/signal.c | 2 +- kernel/softirq.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- mm/util.c | 2 +- net/core/drop_monitor.c | 2 +- net/core/net-traces.c | 2 +- net/core/skbuff.c | 2 +- 27 files changed, 708 insertions(+), 707 deletions(-) create mode 100644 include/trace/events/irq.h create mode 100644 include/trace/events/kmem.h create mode 100644 include/trace/events/lockdep.h create mode 100644 include/trace/events/sched.h create mode 100644 include/trace/events/skb.h delete mode 100644 include/trace/irq.h delete mode 100644 include/trace/kmem.h delete mode 100644 include/trace/lockdep.h delete mode 100644 include/trace/sched.h delete mode 100644 include/trace/skb.h (limited to 'include/linux') diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h index 15c45a27a92..b616d3930c3 100644 --- a/include/linux/kmemtrace.h +++ b/include/linux/kmemtrace.h @@ -9,7 +9,7 @@ #ifdef __KERNEL__ -#include +#include #ifdef CONFIG_KMEMTRACE extern void kmemtrace_init(void); diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 980eb66a6e3..18869417109 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -43,7 +43,7 @@ #endif #ifndef TRACE_INCLUDE_PATH -# define __TRACE_INCLUDE(system) +# define __TRACE_INCLUDE(system) # define UNDEF_TRACE_INCLUDE_FILE #else # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h new file mode 100644 index 
00000000000..75e3468e449 --- /dev/null +++ b/include/trace/events/irq.h @@ -0,0 +1,57 @@ +#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IRQ_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq + +/* + * Tracepoint for entry of interrupt handler: + */ +TRACE_FORMAT(irq_handler_entry, + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name) + ); + +/* + * Tracepoint for return of an interrupt handler: + */ +TRACE_EVENT(irq_handler_exit, + + TP_PROTO(int irq, struct irqaction *action, int ret), + + TP_ARGS(irq, action, ret), + + TP_STRUCT__entry( + __field( int, irq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->ret = ret; + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? "handled" : "unhandled") +); + +TRACE_FORMAT(softirq_entry, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + +TRACE_FORMAT(softirq_exit, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + +#endif /* _TRACE_IRQ_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h new file mode 100644 index 00000000000..c22c42f980b --- /dev/null +++ b/include/trace/events/kmem.h @@ -0,0 +1,194 @@ +#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KMEM_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem + +TRACE_EVENT(kmalloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmem_cache_alloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmalloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + 
__field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kmem_cache_alloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kfree, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +TRACE_EVENT(kmem_cache_free, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); +#endif /* _TRACE_KMEM_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h new file mode 100644 index 00000000000..45e326b5c7f --- /dev/null +++ b/include/trace/events/lockdep.h @@ -0,0 +1,60 @@ +#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_LOCKDEP_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lockdep + +#ifdef CONFIG_LOCKDEP + +TRACE_FORMAT(lock_acquire, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", + read ? 
"read " : "", lock->name) + ); + +TRACE_FORMAT(lock_release, + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) + ); + +#ifdef CONFIG_LOCK_STAT + +TRACE_FORMAT(lock_contended, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) + ); + +TRACE_EVENT(lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + + TP_ARGS(lock, ip, waittime), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, wait_usec) + __field(unsigned long, wait_nsec_rem) + ), + TP_fast_assign( + __entry->name = lock->name; + __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); + __entry->wait_usec = (unsigned long) waittime; + ), + TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + __entry->wait_nsec_rem) +); + +#endif +#endif + +#endif /* _TRACE_LOCKDEP_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h new file mode 100644 index 00000000000..ffa1cab586b --- /dev/null +++ b/include/trace/events/sched.h @@ -0,0 +1,339 @@ +#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SCHED_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched + +/* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +TRACE_EVENT(sched_kthread_stop, + + TP_PROTO(struct task_struct *t), + + TP_ARGS(t), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) +); + +/* + * Tracepoint for the return value of the kthread stopping: + */ +TRACE_EVENT(sched_kthread_stop_ret, + + TP_PROTO(int ret), + + TP_ARGS(ret), + + TP_STRUCT__entry( + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ret = ret; + ), + + TP_printk("ret %d", __entry->ret) +); + +/* + * Tracepoint for waiting on task to unschedule: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wait_task, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for waking up a task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for waking up a new task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. 
) + */ +TRACE_EVENT(sched_wakeup_new, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_switch, + + TP_PROTO(struct rq *rq, struct task_struct *prev, + struct task_struct *next), + + TP_ARGS(rq, prev, next), + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) + ), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) +); + +/* + * Tracepoint for a task being migrated: + */ +TRACE_EVENT(sched_migrate_task, + + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + + TP_ARGS(p, orig_cpu, dest_cpu), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, orig_cpu ) + __field( int, dest_cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->orig_cpu = orig_cpu; + __entry->dest_cpu = dest_cpu; + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) +); + +/* + * Tracepoint for freeing a task: + */ +TRACE_EVENT(sched_process_free, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a task exiting: + */ +TRACE_EVENT(sched_process_exit, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a waiting task: + */ +TRACE_EVENT(sched_process_wait, + + TP_PROTO(struct pid *pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->pid = pid_nr(pid); + __entry->prio = current->prio; + ), + + TP_printk("task %s:%d [%d]", 
+ __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for do_fork: + */ +TRACE_EVENT(sched_process_fork, + + TP_PROTO(struct task_struct *parent, struct task_struct *child), + + TP_ARGS(parent, child), + + TP_STRUCT__entry( + __array( char, parent_comm, TASK_COMM_LEN ) + __field( pid_t, parent_pid ) + __array( char, child_comm, TASK_COMM_LEN ) + __field( pid_t, child_pid ) + ), + + TP_fast_assign( + memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); + __entry->parent_pid = parent->pid; + memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); + __entry->child_pid = child->pid; + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid) +); + +/* + * Tracepoint for sending a signal: + */ +TRACE_EVENT(sched_signal_send, + + TP_PROTO(int sig, struct task_struct *p), + + TP_ARGS(sig, p), + + TP_STRUCT__entry( + __field( int, sig ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->sig = sig; + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) +); + +#endif /* _TRACE_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h new file mode 100644 index 00000000000..1e8fabb57c0 --- /dev/null +++ b/include/trace/events/skb.h @@ -0,0 +1,40 @@ +#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SKB_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb + +/* + * Tracepoint for free an sk_buff: + */ +TRACE_EVENT(kfree_skb, + + TP_PROTO(struct sk_buff *skb, void *location), + + TP_ARGS(skb, location), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned short, protocol ) + __field( void *, location ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + if (skb) { + __entry->protocol = ntohs(skb->protocol); + } + __entry->location = location; + ), + + TP_printk("skbaddr=%p protocol=%u location=%p", + __entry->skbaddr, __entry->protocol, __entry->location) +); + +#endif /* _TRACE_SKB_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/irq.h b/include/trace/irq.h deleted file mode 100644 index 75e3468e449..00000000000 --- a/include/trace/irq.h +++ /dev/null @@ -1,57 +0,0 @@ -#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_IRQ_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM irq - -/* - * Tracepoint for entry of interrupt handler: - */ -TRACE_FORMAT(irq_handler_entry, - TP_PROTO(int irq, struct irqaction *action), - TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name) - ); - -/* - * Tracepoint for return of an interrupt handler: - */ -TRACE_EVENT(irq_handler_exit, - - TP_PROTO(int irq, struct irqaction *action, int ret), - - TP_ARGS(irq, action, ret), - - TP_STRUCT__entry( - __field( int, irq ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->irq = irq; - __entry->ret = ret; - ), - - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? 
"handled" : "unhandled") -); - -TRACE_FORMAT(softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -TRACE_FORMAT(softirq_exit, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -#endif /* _TRACE_IRQ_H */ - -/* This part must be outside protection */ -#include diff --git a/include/trace/kmem.h b/include/trace/kmem.h deleted file mode 100644 index c22c42f980b..00000000000 --- a/include/trace/kmem.h +++ /dev/null @@ -1,194 +0,0 @@ -#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KMEM_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kmem - -TRACE_EVENT(kmalloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmem_cache_alloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmalloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kmem_cache_alloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned long, 
call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kfree, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); - -TRACE_EVENT(kmem_cache_free, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); -#endif /* _TRACE_KMEM_H */ - -/* This part must be outside protection */ -#include diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h deleted file mode 100644 index 45e326b5c7f..00000000000 --- a/include/trace/lockdep.h +++ /dev/null @@ -1,60 +0,0 @@ -#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_LOCKDEP_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM lockdep - -#ifdef CONFIG_LOCKDEP - -TRACE_FORMAT(lock_acquire, - TP_PROTO(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, - struct lockdep_map *next_lock, unsigned long ip), - TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), - TP_FMT("%s%s%s", trylock ? "try " : "", - read ? 
"read " : "", lock->name) - ); - -TRACE_FORMAT(lock_release, - TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TP_ARGS(lock, nested, ip), - TP_FMT("%s", lock->name) - ); - -#ifdef CONFIG_LOCK_STAT - -TRACE_FORMAT(lock_contended, - TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); - -TRACE_EVENT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), - - TP_ARGS(lock, ip, waittime), - - TP_STRUCT__entry( - __field(const char *, name) - __field(unsigned long, wait_usec) - __field(unsigned long, wait_nsec_rem) - ), - TP_fast_assign( - __entry->name = lock->name; - __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); - __entry->wait_usec = (unsigned long) waittime; - ), - TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, - __entry->wait_nsec_rem) -); - -#endif -#endif - -#endif /* _TRACE_LOCKDEP_H */ - -/* This part must be outside protection */ -#include diff --git a/include/trace/sched.h b/include/trace/sched.h deleted file mode 100644 index ffa1cab586b..00000000000 --- a/include/trace/sched.h +++ /dev/null @@ -1,339 +0,0 @@ -#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_SCHED_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM sched - -/* - * Tracepoint for calling kthread_stop, performed to end a kthread: - */ -TRACE_EVENT(sched_kthread_stop, - - TP_PROTO(struct task_struct *t), - - TP_ARGS(t), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, t->comm, TASK_COMM_LEN); - __entry->pid = t->pid; - ), - - TP_printk("task %s:%d", __entry->comm, __entry->pid) -); - -/* - * Tracepoint for the return value of the kthread stopping: - */ -TRACE_EVENT(sched_kthread_stop_ret, - - TP_PROTO(int ret), - - TP_ARGS(ret), - - TP_STRUCT__entry( - __field( int, ret ) - ), - - TP_fast_assign( - __entry->ret = ret; - ), - - TP_printk("ret %d", __entry->ret) -); - -/* - * Tracepoint for waiting on task to unschedule: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wait_task, - - TP_PROTO(struct rq *rq, struct task_struct *p), - - TP_ARGS(rq, p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for waking up a task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wakeup, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for waking up a new task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. 
) - */ -TRACE_EVENT(sched_wakeup_new, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for task switches, performed by the scheduler: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_switch, - - TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), - - TP_ARGS(rq, prev, next), - - TP_STRUCT__entry( - __array( char, prev_comm, TASK_COMM_LEN ) - __field( pid_t, prev_pid ) - __field( int, prev_prio ) - __array( char, next_comm, TASK_COMM_LEN ) - __field( pid_t, next_pid ) - __field( int, next_prio ) - ), - - TP_fast_assign( - memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - __entry->prev_pid = prev->pid; - __entry->prev_prio = prev->prio; - memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); - __entry->next_pid = next->pid; - __entry->next_prio = next->prio; - ), - - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio) -); - -/* - * Tracepoint for a task being migrated: - */ -TRACE_EVENT(sched_migrate_task, - - TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - - TP_ARGS(p, orig_cpu, dest_cpu), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, orig_cpu ) - __field( int, dest_cpu ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->orig_cpu = orig_cpu; - __entry->dest_cpu = dest_cpu; - ), - - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu) -); - -/* - * Tracepoint for freeing a task: - */ -TRACE_EVENT(sched_process_free, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a task exiting: - */ -TRACE_EVENT(sched_process_exit, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a waiting task: - */ -TRACE_EVENT(sched_process_wait, - - TP_PROTO(struct pid *pid), - - TP_ARGS(pid), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - __entry->pid = pid_nr(pid); - __entry->prio = current->prio; - ), - - TP_printk("task %s:%d [%d]", 
- __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for do_fork: - */ -TRACE_EVENT(sched_process_fork, - - TP_PROTO(struct task_struct *parent, struct task_struct *child), - - TP_ARGS(parent, child), - - TP_STRUCT__entry( - __array( char, parent_comm, TASK_COMM_LEN ) - __field( pid_t, parent_pid ) - __array( char, child_comm, TASK_COMM_LEN ) - __field( pid_t, child_pid ) - ), - - TP_fast_assign( - memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); - __entry->parent_pid = parent->pid; - memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); - __entry->child_pid = child->pid; - ), - - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid) -); - -/* - * Tracepoint for sending a signal: - */ -TRACE_EVENT(sched_signal_send, - - TP_PROTO(int sig, struct task_struct *p), - - TP_ARGS(sig, p), - - TP_STRUCT__entry( - __field( int, sig ) - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->sig = sig; - ), - - TP_printk("sig: %d task %s:%d", - __entry->sig, __entry->comm, __entry->pid) -); - -#endif /* _TRACE_SCHED_H */ - -/* This part must be outside protection */ -#include diff --git a/include/trace/skb.h b/include/trace/skb.h deleted file mode 100644 index 1e8fabb57c0..00000000000 --- a/include/trace/skb.h +++ /dev/null @@ -1,40 +0,0 @@ -#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_SKB_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM skb - -/* - * Tracepoint for free an sk_buff: - */ -TRACE_EVENT(kfree_skb, - - TP_PROTO(struct sk_buff *skb, void *location), - - TP_ARGS(skb, location), - - TP_STRUCT__entry( - __field( void *, skbaddr ) - __field( unsigned short, protocol ) - __field( void *, location ) - ), - - TP_fast_assign( - __entry->skbaddr = skb; - if (skb) { - __entry->protocol = ntohs(skb->protocol); - } - __entry->location = location; - ), - - TP_printk("skbaddr=%p protocol=%u location=%p", - __entry->skbaddr, __entry->protocol, __entry->location) -); - -#endif /* _TRACE_SKB_H */ - -/* This part must be outside protection */ -#include diff --git a/kernel/exit.c b/kernel/exit.c index 2fe9d2c7eee..cab535c427b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 4bebf263923..085f73ebcea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include @@ -71,6 +70,8 @@ #include #include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 983d8be8dff..37c63633e78 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -20,7 +20,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include #include "internals.h" diff --git a/kernel/kthread.c b/kernel/kthread.c index e1c76924545..41c88fe4050 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #define KTHREAD_NICE_LEVEL (-5) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 257f21a76c5..47b201ecc6d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -48,7 +48,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include +#include #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; diff --git a/kernel/sched.c b/kernel/sched.c index 
e6d4518d47e..9f7ffd00b6e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -79,7 +79,7 @@ #include "sched_cpupri.h" #define CREATE_TRACE_POINTS -#include +#include /* * Convert user-nice values [ -20 ... 0 ... 19 ] diff --git a/kernel/signal.c b/kernel/signal.c index 1d5703ff003..94ec0a4dde0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/softirq.c b/kernel/softirq.c index a2d9b458ac2..7ab9dfd8d08 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8e6a0b5c994..a2348898858 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9d8cccdfaa0..a98106dd979 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include "trace.h" diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5bc00e8f153..b8b13c5540f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "trace.h" diff --git a/mm/util.c b/mm/util.c index 0e74a22791c..6794a336e9a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -7,7 +7,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include /** * kstrdup - allocate space for and copy an existing string diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9fd0dc3cca9..b75b6cea49d 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 80177205947..499a67eaf3a 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -24,6 +24,6 @@ #include #define CREATE_TRACE_POINTS -#include +#include EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ce6356cd9f7..12806b84445 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,7 +65,7 @@ #include #include -#include +#include #include "kmap_skb.h" -- cgit v1.2.3-70-g09d2 From 05725f7eb4b8acb147c5fc7b91397b1f6bcab00d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 14 Apr 2009 20:17:16 +0200 Subject: rculist: use list_entry_rcu in places where it's appropriate Use previously introduced list_entry_rcu instead of an open-coded list_entry + rcu_dereference combination. Signed-off-by: Jiri Pirko Reviewed-by: Paul E. 
McKenney Cc: dipankar@in.ibm.com LKML-Reference: <20090414181715.GA3634@psychotron.englab.brq.redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++--- ipc/sem.c | 4 ++-- security/integrity/ima/ima_fs.c | 4 ++-- security/smack/smackfs.c | 8 ++++---- 4 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..886df41e745 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -77,6 +77,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -2010,7 +2011,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif -#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) +#define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) @@ -2049,8 +2051,8 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2) static inline struct task_struct *next_thread(const struct task_struct *p) { - return list_entry(rcu_dereference(p->thread_group.next), - struct task_struct, thread_group); + return list_entry_rcu(p->thread_group.next, + struct task_struct, thread_group); } static inline int thread_group_empty(struct task_struct *p) diff --git a/ipc/sem.c b/ipc/sem.c index 16a2189e96f..87c2b641fd7 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1290,8 +1290,8 @@ void exit_sem(struct task_struct *tsk) int i; rcu_read_lock(); - un = list_entry(rcu_dereference(ulp->list_proc.next), - struct sem_undo, list_proc); + un = list_entry_rcu(ulp->list_proc.next, + struct sem_undo, list_proc); if (&un->list_proc == &ulp->list_proc) semid = -1; else diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index ffbe259700b..510186f0b72 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -84,8 +84,8 @@ static void *ima_measurements_next(struct seq_file *m, void *v, loff_t *pos) * against concurrent list-extension */ rcu_read_lock(); - qe = list_entry(rcu_dereference(qe->later.next), - struct ima_queue_entry, later); + qe = list_entry_rcu(qe->later.next, + struct ima_queue_entry, later); rcu_read_unlock(); (*pos)++; diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index e03a7e19c73..11d2cb19d7a 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -734,8 +734,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new) return; } - m = list_entry(rcu_dereference(smk_netlbladdr_list.next), - struct smk_netlbladdr, list); + m = list_entry_rcu(smk_netlbladdr_list.next, + struct smk_netlbladdr, list); /* the comparison '>' is a bit hacky, but works */ if (new->smk_mask.s_addr > m->smk_mask.s_addr) { @@ -748,8 +748,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new) list_add_rcu(&new->list, &m->list); return; } - m_next = list_entry(rcu_dereference(m->list.next), - struct smk_netlbladdr, list); + m_next = list_entry_rcu(m->list.next, + struct smk_netlbladdr, list); if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) { list_add_rcu(&new->list, &m->list); return; -- cgit v1.2.3-70-g09d2 From e6a1a89d572c31b62d6dcf11a371c7323852d9b2 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 15 Apr 2009 18:22:41 +0900 Subject: dma-debug: add dma_debug_resize_entries() to adjust the number of dma_debug_entries We use a static value for the number of dma_debug_entries. 
It can be overwritten by a kernel command line option. Some IOMMUs
(e.g. GART) can't set an appropriate value by a kernel command line
option because they can't know the right value until they finish
initializing their hardware.

This patch adds dma_debug_resize_entries(), which enables IOMMUs to
adjust the number of dma_debug_entries at any time.

Signed-off-by: FUJITA Tomonori
Acked-by: Joerg Roedel
Cc: fujita.tomonori@lab.ntt.co.jp
Cc: akpm@linux-foundation.org
LKML-Reference: <20090415182234R.fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar
---
 include/linux/dma-debug.h |  7 +++++
 lib/dma-debug.c           | 72 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 73 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index 28d53cb7b5a..171ad8aedc8 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -32,6 +32,8 @@ extern void dma_debug_add_bus(struct bus_type *bus);
 
 extern void dma_debug_init(u32 num_entries);
 
+extern int dma_debug_resize_entries(u32 num_entries);
+
 extern void debug_dma_map_page(struct device *dev, struct page *page,
 			       size_t offset, size_t size,
 			       int direction, dma_addr_t dma_addr,
@@ -91,6 +93,11 @@ static inline void dma_debug_init(u32 num_entries)
 {
 }
 
+static inline int dma_debug_resize_entries(u32 num_entries)
+{
+	return 0;
+}
+
 static inline void debug_dma_map_page(struct device *dev, struct page *page,
 				      size_t offset, size_t size,
 				      int direction, dma_addr_t dma_addr,
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index d3da7edc034..5d61019330c 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -85,6 +85,7 @@ static u32 show_num_errors = 1;
 
 static u32 num_free_entries;
 static u32 min_free_entries;
+static u32 nr_total_entries;
 
 /* number of preallocated entries requested by kernel cmdline */
 static u32 req_entries;
@@ -257,6 +258,21 @@ static void add_dma_entry(struct dma_debug_entry *entry)
 	put_hash_bucket(bucket, &flags);
 }
 
+static struct dma_debug_entry *__dma_entry_alloc(void)
+{
+	struct dma_debug_entry *entry;
+
+	entry = list_entry(free_entries.next, struct dma_debug_entry, list);
+	list_del(&entry->list);
+	memset(entry, 0, sizeof(*entry));
+
+	num_free_entries -= 1;
+	if (num_free_entries < min_free_entries)
+		min_free_entries = num_free_entries;
+
+	return entry;
+}
+
 /* struct dma_entry allocator
  *
  * The next two functions implement the allocator for
@@ -276,9 +292,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 		goto out;
 	}
 
-	entry = list_entry(free_entries.next, struct dma_debug_entry, list);
-	list_del(&entry->list);
-	memset(entry, 0, sizeof(*entry));
+	entry = __dma_entry_alloc();
 
 #ifdef CONFIG_STACKTRACE
 	entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
@@ -286,9 +300,6 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	entry->stacktrace.skip = 2;
 	save_stack_trace(&entry->stacktrace);
 #endif
-	num_free_entries -= 1;
-	if (num_free_entries < min_free_entries)
-		min_free_entries = num_free_entries;
 
 out:
 	spin_unlock_irqrestore(&free_entries_lock, flags);
@@ -310,6 +321,53 @@ static void dma_entry_free(struct dma_debug_entry *entry)
 	spin_unlock_irqrestore(&free_entries_lock, flags);
 }
 
+int dma_debug_resize_entries(u32 num_entries)
+{
+	int i, delta, ret = 0;
+	unsigned long flags;
+	struct dma_debug_entry *entry;
+	LIST_HEAD(tmp);
+
+	spin_lock_irqsave(&free_entries_lock, flags);
+
+	if (nr_total_entries < num_entries) {
+		delta = num_entries - nr_total_entries;
+
+		spin_unlock_irqrestore(&free_entries_lock, flags);
+
+		for (i = 0; i <
delta; i++) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + break; + + list_add_tail(&entry->list, &tmp); + } + + spin_lock_irqsave(&free_entries_lock, flags); + + list_splice(&tmp, &free_entries); + nr_total_entries += i; + num_free_entries += i; + } else { + delta = nr_total_entries - num_entries; + + for (i = 0; i < delta && !list_empty(&free_entries); i++) { + entry = __dma_entry_alloc(); + kfree(entry); + } + + nr_total_entries -= i; + } + + if (nr_total_entries != num_entries) + ret = 1; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + return ret; +} +EXPORT_SYMBOL(dma_debug_resize_entries); + /* * DMA-API debugging init code * @@ -490,6 +548,8 @@ void dma_debug_init(u32 num_entries) return; } + nr_total_entries = num_free_entries; + printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n"); } -- cgit v1.2.3-70-g09d2 From d0deef5b14af7d5bbd0003a0a2a1a32326e20a6d Mon Sep 17 00:00:00 2001 From: Shawn Du Date: Tue, 14 Apr 2009 13:58:56 +0800 Subject: blktrace: support per-partition tracing Though one can specify '-d /dev/sda1' when using blktrace, it still traces the whole sda. To support per-partition tracing, when we start tracing, we initialize bt->start_lba and bt->end_lba to the start and end sector of that partition. Note some actions are per device, thus we don't filter 0-sector events. The original patch and discussion can be found here: http://marc.info/?l=linux-btrace&m=122949374214540&w=2 Signed-off-by: Shawn Du Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Jens Axboe LKML-Reference: <49E42620.4050701@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- block/compat_ioctl.c | 2 +- drivers/scsi/sg.c | 1 + include/linux/blktrace_api.h | 24 +++++++++++++----------- kernel/trace/blktrace.c | 29 +++++++++++++++++++++-------- 4 files changed, 36 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index f87615dea46..f8c218cd08e 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -568,7 +568,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg) memcpy(&buts.name, &cbuts.name, 32); mutex_lock(&bdev->bd_mutex); - ret = do_blk_trace_setup(q, b, bdev->bd_dev, &buts); + ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts); mutex_unlock(&bdev->bd_mutex); if (ret) return ret; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 82312df9b0b..49c98730bb8 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1065,6 +1065,7 @@ sg_ioctl(struct inode *inode, struct file *filp, return blk_trace_setup(sdp->device->request_queue, sdp->disk->disk_name, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), + NULL, (char *)arg); case BLKTRACESTART: return blk_trace_startstop(sdp->device->request_queue, 1); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index d960889e92e..267edc4017e 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -165,8 +165,9 @@ struct blk_trace { extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern int do_blk_trace_setup(struct request_queue *q, - char *name, dev_t dev, struct blk_user_trace_setup *buts); +extern int do_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + struct blk_user_trace_setup *buts); extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); /** @@ -193,6 +194,7 @@ extern void 
__trace_note_message(struct blk_trace *, const char *fmt, ...); extern void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); @@ -200,15 +202,15 @@ extern int blk_trace_remove(struct request_queue *q); extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ -#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) -#define blk_trace_shutdown(q) do { } while (0) -#define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY) -#define blk_add_driver_data(q, rq, data, len) do {} while (0) -#define blk_trace_setup(q, name, dev, arg) (-ENOTTY) -#define blk_trace_startstop(q, start) (-ENOTTY) -#define blk_trace_remove(q) (-ENOTTY) -#define blk_add_trace_msg(q, fmt, ...) do { } while (0) - +# define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) +# define blk_trace_shutdown(q) do { } while (0) +# define do_blk_trace_setup(q, name, dev, bdev, buts) (-ENOTTY) +# define blk_add_driver_data(q, rq, data, len) do {} while (0) +# define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) +# define blk_trace_startstop(q, start) (-ENOTTY) +# define blk_trace_remove(q) (-ENOTTY) +# define blk_add_trace_msg(q, fmt, ...) do { } while (0) #endif /* CONFIG_BLK_DEV_IO_TRACE */ + #endif /* __KERNEL__ */ #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2b98195b338..e932654cf59 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; - if (sector < bt->start_lba || sector > bt->end_lba) + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; @@ -192,7 +192,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(rw, DISCARD); pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) + if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); @@ -407,11 +407,13 @@ static struct rchan_callbacks blk_relay_callbacks = { * Setup everything required to start tracing */ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; int ret, i; + struct hd_struct *part = NULL; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -480,11 +482,21 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else bt->end_lba = -1ULL; + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; + bt->pid = buts->pid; bt->trace_state = Blktrace_setup; @@ -505,6 +517,7 @@ err: } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; @@ -514,7 +527,7 @@ int 
blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; - ret = do_blk_trace_setup(q, name, dev, &buts); + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; @@ -582,7 +595,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; case BLKTRACESTART: start = 1; -- cgit v1.2.3-70-g09d2 From 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Apr 2009 14:00:05 +0800 Subject: blktrace: add trace/ to /sys/block/sda Impact: allow ftrace-plugin blktrace to trace device-mapper devices To trace a single partition: # echo 1 > /sys/block/sda/sda1/enable To trace the whole sda instead: # echo 1 > /sys/block/sda/enable Thus we also fix an issue reported by Ted, that ftrace-plugin blktrace can't be used to trace device-mapper devices. Now: # echo 1 > /sys/block/dm-0/trace/enable echo: write error: No such device or address # mount -t ext4 /dev/dm-0 /mnt # echo 1 > /sys/block/dm-0/trace/enable # echo blk > /debug/tracing/current_tracer Reported-by: Theodore Tso Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Shawn Du Cc: Jens Axboe LKML-Reference: <49E42665.6020506@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- block/blk-sysfs.c | 7 ++++++- include/linux/blktrace_api.h | 6 ++++++ kernel/trace/blktrace.c | 5 +++++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 73f36beff5c..8653d710b39 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -387,16 +387,21 @@ struct kobj_type blk_queue_ktype = { int blk_register_queue(struct gendisk *disk) { int ret; + struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; if (WARN_ON(!q)) return -ENXIO; + ret = blk_trace_init_sysfs(dev); + if (ret) + return ret; + if (!q->request_fn) return 0; - ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj), + ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) return ret; diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 267edc4017e..62763c95285 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -198,6 +198,7 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); +extern int blk_trace_init_sysfs(struct device *dev); extern struct attribute_group blk_trace_attr_group; @@ -210,6 +211,11 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) +static inline int blk_trace_init_sysfs(struct device *dev) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #endif /* __KERNEL__ */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d1098988052..8e7c5da3a3e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1644,3 +1644,8 @@ out: return ret ? 
ret : count; } +int blk_trace_init_sysfs(struct device *dev) +{ + return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + -- cgit v1.2.3-70-g09d2 From 76620aafd66f0004829764940c5466144969cffc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 16 Apr 2009 02:02:07 -0700 Subject: gro: New frags interface to avoid copying shinfo It turns out that copying a 16-byte area at ~800k times a second can be really expensive :) This patch redesigns the frags GRO interface to avoid copying that area twice. The two disciples of the frags interface have been converted. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/net/cxgb3/adapter.h | 2 +- drivers/net/cxgb3/sge.c | 53 +++++++++++++++++------------ drivers/net/sfc/rx.c | 26 ++++++++++----- include/linux/if_vlan.h | 6 ++-- include/linux/netdevice.h | 22 ++++++------ net/8021q/vlan_core.c | 4 +-- net/core/dev.c | 81 ++++++++++++++++++++------------------------- 7 files changed, 101 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h index 714df2b675e..322434ac42f 100644 --- a/drivers/net/cxgb3/adapter.h +++ b/drivers/net/cxgb3/adapter.h @@ -195,7 +195,7 @@ struct sge_qset { /* an SGE queue set */ struct sge_rspq rspq; struct sge_fl fl[SGE_RXQ_PER_SET]; struct sge_txq txq[SGE_TXQ_PER_SET]; - struct napi_gro_fraginfo lro_frag_tbl; + int nomem; int lro_enabled; void *lro_va; struct net_device *netdev; diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c index 26d3587f339..73d569e758e 100644 --- a/drivers/net/cxgb3/sge.c +++ b/drivers/net/cxgb3/sge.c @@ -654,7 +654,8 @@ static void t3_reset_qset(struct sge_qset *q) q->txq_stopped = 0; q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */ q->rx_reclaim_timer.function = NULL; - q->lro_frag_tbl.nr_frags = q->lro_frag_tbl.len = 0; + q->nomem = 0; + napi_free_frags(&q->napi); } @@ -2074,20 +2075,19 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs, struct sge_fl *fl, int len, int complete) { struct rx_sw_desc *sd = &fl->sdesc[fl->cidx]; + struct sk_buff *skb = NULL; struct cpl_rx_pkt *cpl; - struct skb_frag_struct *rx_frag = qs->lro_frag_tbl.frags; - int nr_frags = qs->lro_frag_tbl.nr_frags; - int frag_len = qs->lro_frag_tbl.len; + struct skb_frag_struct *rx_frag; + int nr_frags; int offset = 0; - if (!nr_frags) { - offset = 2 + sizeof(struct cpl_rx_pkt); - qs->lro_va = cpl = sd->pg_chunk.va + 2; + if (!qs->nomem) { + skb = napi_get_frags(&qs->napi); + qs->nomem = !skb; } fl->credits--; - len -= offset; pci_dma_sync_single_for_cpu(adap->pdev, pci_unmap_addr(sd, dma_addr), fl->buf_size - SGE_PG_RSVD, @@ -2100,21 +2100,38 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs, fl->alloc_size, PCI_DMA_FROMDEVICE); + if (!skb) { + put_page(sd->pg_chunk.page); + if (complete) + qs->nomem = 0; + return; + } + + rx_frag = skb_shinfo(skb)->frags; + nr_frags = skb_shinfo(skb)->nr_frags; + + if (!nr_frags) { + offset = 2 + sizeof(struct cpl_rx_pkt); + qs->lro_va = sd->pg_chunk.va + 2; + } + len -= offset; + prefetch(qs->lro_va); rx_frag += nr_frags; rx_frag->page = sd->pg_chunk.page; rx_frag->page_offset = sd->pg_chunk.offset + offset; rx_frag->size = len; - frag_len += len; - qs->lro_frag_tbl.nr_frags++; - qs->lro_frag_tbl.len = frag_len; + skb->len += len; + skb->data_len += len; + skb->truesize += len; + skb_shinfo(skb)->nr_frags++; if (!complete) return; - qs->lro_frag_tbl.ip_summed = CHECKSUM_UNNECESSARY; + skb->ip_summed = CHECKSUM_UNNECESSARY; cpl 
= qs->lro_va; if (unlikely(cpl->vlan_valid)) { @@ -2123,15 +2140,11 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs, struct vlan_group *grp = pi->vlan_grp; if (likely(grp != NULL)) { - vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan), - &qs->lro_frag_tbl); - goto out; + vlan_gro_frags(&qs->napi, grp, ntohs(cpl->vlan)); + return; } } - napi_gro_frags(&qs->napi, &qs->lro_frag_tbl); - -out: - qs->lro_frag_tbl.nr_frags = qs->lro_frag_tbl.len = 0; + napi_gro_frags(&qs->napi); } /** @@ -2300,8 +2313,6 @@ no_mem: if (fl->use_pages) { void *addr = fl->sdesc[fl->cidx].pg_chunk.va; - prefetch(&qs->lro_frag_tbl); - prefetch(addr); #if L1_CACHE_BYTES < 128 prefetch(addr + L1_CACHE_BYTES); diff --git a/drivers/net/sfc/rx.c b/drivers/net/sfc/rx.c index 66d7fe3db3e..01f9432c31e 100644 --- a/drivers/net/sfc/rx.c +++ b/drivers/net/sfc/rx.c @@ -450,17 +450,27 @@ static void efx_rx_packet_lro(struct efx_channel *channel, /* Pass the skb/page into the LRO engine */ if (rx_buf->page) { - struct napi_gro_fraginfo info; + struct sk_buff *skb = napi_get_frags(napi); - info.frags[0].page = rx_buf->page; - info.frags[0].page_offset = efx_rx_buf_offset(rx_buf); - info.frags[0].size = rx_buf->len; - info.nr_frags = 1; - info.ip_summed = CHECKSUM_UNNECESSARY; - info.len = rx_buf->len; + if (!skb) { + put_page(rx_buf->page); + goto out; + } + + skb_shinfo(skb)->frags[0].page = rx_buf->page; + skb_shinfo(skb)->frags[0].page_offset = + efx_rx_buf_offset(rx_buf); + skb_shinfo(skb)->frags[0].size = rx_buf->len; + skb_shinfo(skb)->nr_frags = 1; + + skb->len = rx_buf->len; + skb->data_len = rx_buf->len; + skb->truesize += rx_buf->len; + skb->ip_summed = CHECKSUM_UNNECESSARY; - napi_gro_frags(napi, &info); + napi_gro_frags(napi); +out: EFX_BUG_ON_PARANOID(rx_buf->skb); rx_buf->page = NULL; } else { diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e1ff5b14310..7ff9af1d0f0 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -118,8 +118,7 @@ extern int vlan_hwaccel_do_receive(struct sk_buff *skb); extern int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct sk_buff *skb); extern int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, - unsigned int vlan_tci, - struct napi_gro_fraginfo *info); + unsigned int vlan_tci); #else static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev) @@ -154,8 +153,7 @@ static inline int vlan_gro_receive(struct napi_struct *napi, } static inline int vlan_gro_frags(struct napi_struct *napi, - struct vlan_group *grp, unsigned int vlan_tci, - struct napi_gro_fraginfo *info) + struct vlan_group *grp, unsigned int vlan_tci) { return NET_RX_DROP; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2e7783f4a75..54db3ebf219 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1047,14 +1047,6 @@ struct packet_type { struct list_head list; }; -struct napi_gro_fraginfo { - skb_frag_t frags[MAX_SKB_FRAGS]; - unsigned int nr_frags; - unsigned int ip_summed; - unsigned int len; - __wsum csum; -}; - #include #include @@ -1442,12 +1434,18 @@ extern int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); extern void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb); -extern struct sk_buff * napi_fraginfo_skb(struct napi_struct *napi, - struct napi_gro_fraginfo *info); +extern struct sk_buff * napi_get_frags(struct napi_struct *napi); extern int napi_frags_finish(struct napi_struct *napi, struct sk_buff 
*skb, int ret); -extern int napi_gro_frags(struct napi_struct *napi, - struct napi_gro_fraginfo *info); +extern struct sk_buff * napi_frags_skb(struct napi_struct *napi); +extern int napi_gro_frags(struct napi_struct *napi); + +static inline void napi_free_frags(struct napi_struct *napi) +{ + kfree_skb(napi->skb); + napi->skb = NULL; +} + extern void netif_nit_deliver(struct sk_buff *skb); extern int dev_valid_name(const char *name); extern int dev_ioctl(struct net *net, unsigned int cmd, void __user *); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 654e45f5719..c1f51e4a01b 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -114,9 +114,9 @@ int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, EXPORT_SYMBOL(vlan_gro_receive); int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, - unsigned int vlan_tci, struct napi_gro_fraginfo *info) + unsigned int vlan_tci) { - struct sk_buff *skb = napi_fraginfo_skb(napi, info); + struct sk_buff *skb = napi_frags_skb(napi); if (!skb) return NET_RX_DROP; diff --git a/net/core/dev.c b/net/core/dev.c index 91d792d17e0..619fa141b8f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2519,16 +2519,10 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) } EXPORT_SYMBOL(napi_reuse_skb); -struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, - struct napi_gro_fraginfo *info) +struct sk_buff *napi_get_frags(struct napi_struct *napi) { struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - skb_frag_t *frag; - int i; - - napi->skb = NULL; if (!skb) { skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); @@ -2536,47 +2530,14 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, goto out; skb_reserve(skb, NET_IP_ALIGN); - } - - BUG_ON(info->nr_frags > MAX_SKB_FRAGS); - frag = &info->frags[info->nr_frags - 1]; - for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) { - skb_fill_page_desc(skb, i, frag->page, frag->page_offset, - frag->size); - frag++; + napi->skb = skb; } - skb_shinfo(skb)->nr_frags = info->nr_frags; - - skb->data_len = info->len; - skb->len += info->len; - skb->truesize += info->len; - - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); - - eth = skb_gro_header(skb, sizeof(*eth)); - if (!eth) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; - } - - skb_gro_pull(skb, sizeof(*eth)); - - /* - * This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. - */ - skb->protocol = eth->h_proto; - - skb->ip_summed = info->ip_summed; - skb->csum = info->csum; out: return skb; } -EXPORT_SYMBOL(napi_fraginfo_skb); +EXPORT_SYMBOL(napi_get_frags); int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { @@ -2606,9 +2567,39 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) } EXPORT_SYMBOL(napi_frags_finish); -int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + struct ethhdr *eth; + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + eth = skb_gro_header(skb, sizeof(*eth)); + if (!eth) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } + + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. 
+ */ + skb->protocol = eth->h_proto; + +out: + return skb; +} +EXPORT_SYMBOL(napi_frags_skb); + +int napi_gro_frags(struct napi_struct *napi) { - struct sk_buff *skb = napi_fraginfo_skb(napi, info); + struct sk_buff *skb = napi_frags_skb(napi); if (!skb) return NET_RX_DROP; @@ -2712,7 +2703,7 @@ void netif_napi_del(struct napi_struct *napi) struct sk_buff *skb, *next; list_del_init(&napi->dev_list); - kfree_skb(napi->skb); + napi_free_frags(napi); for (skb = napi->gro_list; skb; skb = next) { next = skb->next; -- cgit v1.2.3-70-g09d2 From 93eb677d74a4f7d3edfb678c94f6c0544d9fbad2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 13:24:06 -0400 Subject: ftrace: use module notifier for function tracer The hooks in the module code for the function tracer must be called before any of that module code runs. The function tracer hooks modify the module (replacing calls to mcount with nops). If the code is executed while the change occurs, then the CPU can take a GPF. To handle the above with a bit of paranoia, I originally implemented the hooks as calls directly from the module code. After examining the notifier calls, it looks as though the startup notifier is called before any of the module's code is executed. This makes the use of the notifier safe with ftrace. Only the startup notifier is required to be "safe". The shutdown simply removes the entries from the ftrace function list, and does not modify any code. This change has another benefit. It removes an issue with a reverse dependency in the mutexes of ftrace_lock and module_mutex. [ Impact: fix lock dependency bug, cleanup ] Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 ---- include/linux/module.h | 4 +++ kernel/module.c | 19 ++++------- kernel/trace/ftrace.c | 90 +++++++++++++++++++++++++++++++++++--------------- 4 files changed, 75 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 53869bef610..97c83e1bc58 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -233,8 +233,6 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); -extern void ftrace_release(void *start, unsigned long size); - extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); #else @@ -325,13 +323,8 @@ static inline void __ftrace_enabled_restore(int enabled) #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); -extern void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } -static inline void -ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) { } #endif /* diff --git a/include/linux/module.h b/include/linux/module.h index 6155fa44168..a8f2c0aa4c3 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -341,6 +341,10 @@ struct module struct ftrace_event_call *trace_events; unsigned int num_trace_events; #endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + unsigned long *ftrace_callsites; + unsigned int num_ftrace_callsites; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ diff --git a/kernel/module.c b/kernel/module.c index a0394706f10..2383e60fcf3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1490,9 +1490,6 @@ static void free_module(struct module *mod) /* Free any allocated parameters.
*/ destroy_params(mod->kp, mod->num_kp); - /* release any pointers to mcount in this module */ - ftrace_release(mod->module_core, mod->core_size); - /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1893,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2179,7 +2174,13 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->trace_events), &mod->num_trace_events); #endif - +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, + "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2244,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod, dynamic_debug_setup(debug, num_debug); } - /* sechdrs[0].sh_size is always zero */ - mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", - sizeof(*mseg), &num_mcount); - ftrace_init_module(mod, mseg, mseg + num_mcount); - err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2309,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); - ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a2348898858..5b606f45b6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -916,30 +916,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec) rec->flags |= FTRACE_FL_FREE; } -void ftrace_release(void *start, unsigned long size) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = s + size; - - if (ftrace_disabled || !start) - return; - - mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { - /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. - */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); - mutex_unlock(&ftrace_lock); -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -2752,14 +2728,72 @@ static int ftrace_convert_nops(struct module *mod, return 0; } -void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +#ifdef CONFIG_MODULES +void ftrace_release(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = (unsigned long)end; + + if (ftrace_disabled || !start || start == end) + return; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. 
+ */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); + ftrace_free_rec(rec); + } + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; ftrace_convert_nops(mod, start, end); } +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + case MODULE_STATE_GOING: + ftrace_release(mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + } + + return 0; +} +#else +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, + .priority = 0, +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -2791,6 +2825,10 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + ret = register_module_notifier(&ftrace_module_nb); + if (!ret) + pr_warning("Failed to register trace ftrace module notifier\n"); + return; failed: ftrace_disabled = 1; -- cgit v1.2.3-70-g09d2 From d1b182a8d49ed6416325b4e0a1cb0f17cd4e702a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 16:53:47 -0400 Subject: tracing/events/ring-buffer: expose format of ring buffer headers to users Currently, everything needed to read the binary output from the ring buffers is available, with the exception of the way the ring buffer handles itself internally. This patch creates two special files in the debugfs/tracing/events directory: # cat /debug/tracing/events/header_page field: u64 timestamp; offset:0; size:8; field: local_t commit; offset:8; size:8; field: char data; offset:16; size:4080; # cat /debug/tracing/events/header_event type : 2 bits len : 3 bits time_delta : 27 bits array : 32 bits padding : type == 0 time_extend : type == 1 data : type == 3 This is to allow a userspace app to see if the ring buffer format changes or not. [ Impact: allow userspace apps to know of ringbuffer format changes ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 5 +++++ kernel/trace/ring_buffer.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 38 ++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f0aa486d131..fac8f1ac6f4 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -166,6 +166,11 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); +struct trace_seq; + +int ring_buffer_print_entry_header(struct trace_seq *s); +int ring_buffer_print_page_header(struct trace_seq *s); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f935bd5ec3e..84a6055f37c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -21,6 +21,28 @@ #include "trace.h" +/* + * The ring buffer header is special. We must manually up keep it.
+ */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + int ret; + + ret = trace_seq_printf(s, "\ttype : 2 bits\n"); + ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_printf(s, "\tarray : 32 bits\n"); + ret = trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + ret = trace_seq_printf(s, "\tdata : type == %d\n", + RINGBUF_TYPE_DATA); + + return ret; +} + /* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is @@ -340,6 +362,28 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + int ret; + + ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\n", + (unsigned int)sizeof(field.time_stamp)); + + ret = trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit)); + + ret = trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE); + + return ret; +} + /* * head_page == tail_page && head == tail then buffer is empty. */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f81d6eec4e4..7163a2bb021 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -610,6 +610,30 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -667,6 +691,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -909,6 +938,15 @@ static __init int event_trace_init(void) if (!d_events) return 0; + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) -- cgit v1.2.3-70-g09d2 From 261842b7c9099f56de2eb969c8ad65402d68e00e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 21:41:52 -0400 Subject: tracing: add same level recursion detection The tracing infrastructure allows for recursion. That is, an interrupt may interrupt the act of tracing an event, and that interrupt may very well perform its own trace. This is a recursive trace, and is fine to do. 
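To make the legitimate case concrete, the two situations can be pictured as follows (an illustrative sketch, not code from this patch):

	/*
	 * Legitimate recursion -- each nested trace runs at a different
	 * interrupt level:
	 *
	 *   ring_buffer_lock_reserve()      <- level 0 (process context)
	 *     ... hardirq arrives, handler traces an event ...
	 *     ring_buffer_lock_reserve()    <- level 1 (hardirq): allowed
	 *
	 * Buggy recursion -- the same level is re-entered:
	 *
	 *   ring_buffer_lock_reserve()      <- level 0
	 *     -> calls a helper that is itself traced
	 *     ring_buffer_lock_reserve()    <- still level 0: the level-0
	 *                                      bit is already set, so warn
	 *                                      and drop the write
	 */
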
The problem arises when there is a bug, and the utility doing the trace calls something that recurses back into the tracer. This recursion is not caused by an external event like an interrupt, but by code that is not expected to recurse. The result could be a lockup. This patch adds a bitmask to the task structure that keeps track of the trace recursion. To find the interrupt depth, the following algorithm is used: level = hardirq_count() + softirq_count() + in_nmi(); Here, level will be the depth of interrupts and softirqs, and even handles the NMI. Then the corresponding bit is set in the recursion bitmask. If the bit was already set, we know we had a recursion at the same level and we warn about it and fail the write to the buffer. After the data has been committed to the buffer, we clear the bit. No atomics are needed. The only races are with interrupts, and they reset the bitmask before returning anyway. [ Impact: detect same irq level trace recursion ] Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 +++++++ include/linux/init_task.h | 1 + include/linux/sched.h | 4 +++- kernel/trace/ring_buffer.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 97c83e1bc58..39b95c56587 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -488,8 +488,15 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) extern int ftrace_dump_on_oops; +#ifdef CONFIG_PREEMPT +#define INIT_TRACE_RECURSION .trace_recursion = 0, +#endif + #endif /* CONFIG_TRACING */ +#ifndef INIT_TRACE_RECURSION +#define INIT_TRACE_RECURSION +#endif #ifdef CONFIG_HW_BRANCH_TRACER diff --git a/include/linux/init_task.h b/include/linux/init_task.h index dcfb93337e9..6fc21852986 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -187,6 +187,7 @@ extern struct cred init_cred; INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ + INIT_TRACE_RECURSION \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..7ede5e49091 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1428,7 +1428,9 @@ struct task_struct { #ifdef CONFIG_TRACING /* state flags for use by tracers */ unsigned long trace; -#endif + /* bitmask of trace recursion */ + unsigned long trace_recursion; +#endif /* CONFIG_TRACING */ }; /* Future-safe accessor for struct task_struct's cpus_allowed.
*/ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 84a6055f37c..b421b0ea911 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1481,6 +1481,40 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static int trace_irq_level(void) +{ + return hardirq_count() + softirq_count() + in_nmi(); +} + +static int trace_recursive_lock(void) +{ + int level; + + level = trace_irq_level(); + + if (unlikely(current->trace_recursion & (1 << level))) { + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); + WARN_ON_ONCE(1); + return -1; + } + + current->trace_recursion |= 1 << level; + + return 0; +} + +static void trace_recursive_unlock(void) +{ + int level; + + level = trace_irq_level(); + + WARN_ON_ONCE(!current->trace_recursion & (1 << level)); + + current->trace_recursion &= ~(1 << level); +} + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -1514,6 +1548,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (trace_recursive_lock()) + goto out_nocheck; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1543,6 +1580,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) return event; out: + trace_recursive_unlock(); + + out_nocheck: ftrace_preempt_enable(resched); return NULL; } @@ -1581,6 +1621,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ -- cgit v1.2.3-70-g09d2 From bd3ce6556072bdc8ea66dfd5448e184f189bdc7f Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 17 Apr 2009 20:12:35 -0700 Subject: Input: rotary_encoder - add support for REL_* axes The rotary encoder driver only supports returning input events for ABS_* axes, this adds support for REL_* axes. The relative axis input event is reported as -1 for each counter-clockwise step and +1 for each clockwise step. The ability to clamp the position of ABS_* axes between 0 and a maximum of "steps" has also been added. Signed-off-by: H Hartley Sweeten Signed-off-by: Daniel Mack Signed-off-by: Dmitry Torokhov --- Documentation/input/rotary-encoder.txt | 9 ++++- drivers/input/misc/rotary_encoder.c | 61 +++++++++++++++++++++++----------- include/linux/rotary_encoder.h | 2 ++ 3 files changed, 51 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt index 435102a26d9..3a6aec40c0b 100644 --- a/Documentation/input/rotary-encoder.txt +++ b/Documentation/input/rotary-encoder.txt @@ -67,7 +67,12 @@ data with it. struct rotary_encoder_platform_data is declared in include/linux/rotary-encoder.h and needs to be filled with the number of steps the encoder has and can carry information about externally inverted -signals (because of used invertig buffer or other reasons). +signals (because of an inverting buffer or other reasons). The encoder +can be set up to deliver input information as either an absolute or relative +axes. For relative axes the input event returns +/-1 for each step. For +absolute axes the position of the encoder can either roll over between zero +and the number of steps or will clamp at the maximum and zero depending on +the configuration. 
Because GPIO to IRQ mapping is platform specific, this information must be given in seperately to the driver. See the example below. @@ -85,6 +90,8 @@ be given in seperately to the driver. See the example below. static struct rotary_encoder_platform_data my_rotary_encoder_info = { .steps = 24, .axis = ABS_X, + .relative_axis = false, + .rollover = false, .gpio_a = GPIO_ROTARY_A, .gpio_b = GPIO_ROTARY_B, .inverted_a = 0, diff --git a/drivers/input/misc/rotary_encoder.c b/drivers/input/misc/rotary_encoder.c index 5bb3ab51b8c..c806fbf1e17 100644 --- a/drivers/input/misc/rotary_encoder.c +++ b/drivers/input/misc/rotary_encoder.c @@ -26,13 +26,17 @@ #define DRV_NAME "rotary-encoder" struct rotary_encoder { - unsigned int irq_a; - unsigned int irq_b; - unsigned int pos; - unsigned int armed; - unsigned int dir; struct input_dev *input; struct rotary_encoder_platform_data *pdata; + + unsigned int axis; + unsigned int pos; + + unsigned int irq_a; + unsigned int irq_b; + + bool armed; + unsigned char dir; /* 0 - clockwise, 1 - CCW */ }; static irqreturn_t rotary_encoder_irq(int irq, void *dev_id) @@ -53,21 +57,32 @@ static irqreturn_t rotary_encoder_irq(int irq, void *dev_id) if (!encoder->armed) break; - if (encoder->dir) { - /* turning counter-clockwise */ - encoder->pos += pdata->steps; - encoder->pos--; - encoder->pos %= pdata->steps; + if (pdata->relative_axis) { + input_report_rel(encoder->input, pdata->axis, + encoder->dir ? -1 : 1); } else { - /* turning clockwise */ - encoder->pos++; - encoder->pos %= pdata->steps; + unsigned int pos = encoder->pos; + + if (encoder->dir) { + /* turning counter-clockwise */ + if (pdata->rollover) + pos += pdata->steps; + if (pos) + pos--; + } else { + /* turning clockwise */ + if (pdata->rollover || pos < pdata->steps) + pos++; + } + if (pdata->rollover) + pos %= pdata->steps; + encoder->pos = pos; + input_report_abs(encoder->input, pdata->axis, + encoder->pos); } - - input_report_abs(encoder->input, pdata->axis, encoder->pos); input_sync(encoder->input); - encoder->armed = 0; + encoder->armed = false; break; case 0x1: @@ -77,7 +92,7 @@ static irqreturn_t rotary_encoder_irq(int irq, void *dev_id) break; case 0x3: - encoder->armed = 1; + encoder->armed = true; break; } @@ -113,9 +128,15 @@ static int __devinit rotary_encoder_probe(struct platform_device *pdev) input->name = pdev->name; input->id.bustype = BUS_HOST; input->dev.parent = &pdev->dev; - input->evbit[0] = BIT_MASK(EV_ABS); - input_set_abs_params(encoder->input, - pdata->axis, 0, pdata->steps, 0, 1); + + if (pdata->relative_axis) { + input->evbit[0] = BIT_MASK(EV_REL); + input->relbit[0] = BIT_MASK(pdata->axis); + } else { + input->evbit[0] = BIT_MASK(EV_ABS); + input_set_abs_params(encoder->input, + pdata->axis, 0, pdata->steps, 0, 1); + } err = input_register_device(input); if (err) { diff --git a/include/linux/rotary_encoder.h b/include/linux/rotary_encoder.h index 12d63a30c34..215278b8df2 100644 --- a/include/linux/rotary_encoder.h +++ b/include/linux/rotary_encoder.h @@ -8,6 +8,8 @@ struct rotary_encoder_platform_data { unsigned int gpio_b; unsigned int inverted_a; unsigned int inverted_b; + bool relative_axis; + bool rollover; }; #endif /* __ROTARY_ENCODER_H__ */ -- cgit v1.2.3-70-g09d2 From 8e668b5b3455207e4540fc7ccab9ecf70142f288 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 17:17:55 -0400 Subject: tracing: remove format attribute of inline function Due to a cut and paste error, I added the gcc attribute for printf format to the static inline stub of trace_seq_printf. 
This will cause a compile failure. [ Impact: fix compiler error when CONFIG_TRACING is off ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: =?ISO-8859-15?Q?Fr=E9d=E9ric_Weisbecker?= LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/trace_seq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 15ca2c71af1..37db9bdfbc1 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -42,7 +42,6 @@ extern int trace_seq_path(struct trace_seq *s, struct path *path); #else /* CONFIG_TRACING */ static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))) { return 0; } -- cgit v1.2.3-70-g09d2 From 46a802e852c2106d755f697819ea80cd3ffdc222 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: ide kill unused ide_cmd->special Impact: removal of unused field No one uses ide_cmd->special anymore. Kill it. Signed-off-by: Tejun Heo --- include/linux/ide.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index ff65fffb078..846a1e13240 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -324,7 +324,6 @@ struct ide_cmd { unsigned int cursg_ofs; struct request *rq; /* copy of request */ - void *special; /* valid_t generally */ }; /* ATAPI packet command flags */ -- cgit v1.2.3-70-g09d2 From a1df5169f9bf08f6067029bfb840a05e282b1b97 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: ide: add helpers for preparing sense requests This is in preparation of removing the queueing of a sense request out of the IRQ handler path. Use struct request_sense as a general sense buffer for all ATAPI devices ide-{floppy,tape,cd}. tj: * blk_get_request(__GFP_WAIT) can't be called from do_request() as it can cause deadlock. Converted to use inline struct request and blk_rq_init(). * Added xfer / cdb len selection depending on device type. * All sense prep logics folded into ide_prep_sense() which never fails. * hwif->rq clearing and sense_rq used handling moved into ide_queue_sense_rq(). * blk_rq_map_kern() conversion is moved to later patch. 
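Roughly, a driver request path is then expected to use the new helpers as in the sketch below (a simplified illustration only; the example_* names are hypothetical and not part of the patch):

	static ide_startstop_t example_do_request(ide_drive_t *drive,
						  struct request *rq,
						  sector_t block)
	{
		/* arm drive->sense_rq for this command; never fails */
		ide_prep_sense(drive, rq);

		/* ... set up and issue the packet command as before ... */
		return ide_started;
	}

	static void example_on_check_condition(ide_drive_t *drive,
					       struct ide_atapi_pc *pc)
	{
		/* requeue the armed sense request at the head of the queue */
		ide_queue_sense_rq(drive, pc);
	}
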
CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/ide.h | 11 +++++++++ 2 files changed, 72 insertions(+) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2894577237b..c6e03485a63 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -191,6 +191,67 @@ void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc) } EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd); +void ide_prep_sense(ide_drive_t *drive, struct request *rq) +{ + struct request_sense *sense = &drive->sense_data; + struct request *sense_rq = &drive->sense_rq; + unsigned int cmd_len, sense_len; + + debug_log("%s: enter\n", __func__); + + switch (drive->media) { + case ide_floppy: + cmd_len = 255; + sense_len = 18; + break; + case ide_tape: + cmd_len = 20; + sense_len = 20; + break; + default: + cmd_len = 18; + sense_len = 18; + } + + BUG_ON(sense_len > sizeof(*sense)); + + if (blk_sense_request(rq) || drive->sense_rq_armed) + return; + + memset(sense, 0, sizeof(*sense)); + + blk_rq_init(rq->q, sense_rq); + sense_rq->rq_disk = rq->rq_disk; + + sense_rq->data = sense; + sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; + sense_rq->cmd[4] = cmd_len; + sense_rq->data_len = sense_len; + + sense_rq->cmd_type = REQ_TYPE_SENSE; + sense_rq->cmd_flags |= REQ_PREEMPT; + + if (drive->media == ide_tape) + sense_rq->cmd[13] = REQ_IDETAPE_PC1; + + drive->sense_rq_armed = true; +} +EXPORT_SYMBOL_GPL(ide_prep_sense); + +void ide_queue_sense_rq(ide_drive_t *drive, void *special) +{ + BUG_ON(!drive->sense_rq_armed); + + drive->sense_rq.special = special; + drive->sense_rq_armed = false; + + drive->hwif->rq = NULL; + + elv_add_request(drive->queue, &drive->sense_rq, + ELEVATOR_INSERT_FRONT, 0); +} +EXPORT_SYMBOL_GPL(ide_queue_sense_rq); + /* * Called when an error was detected during the last packet command. * We queue a request sense packet command in the head of the request list. diff --git a/include/linux/ide.h b/include/linux/ide.h index 846a1e13240..a69ccac5641 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -26,6 +26,9 @@ #include #include +/* for request_sense */ +#include + #if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300) # define SUPPORT_VLB_SYNC 0 #else @@ -602,6 +605,11 @@ struct ide_drive_s { struct ide_atapi_pc request_sense_pc; struct request request_sense_rq; + + /* current sense rq and buffer */ + bool sense_rq_armed; + struct request sense_rq; + struct request_sense sense_data; }; typedef struct ide_drive_s ide_drive_t; @@ -1175,6 +1183,9 @@ int ide_set_media_lock(ide_drive_t *, struct gendisk *, int); void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); void ide_retry_pc(ide_drive_t *, struct gendisk *); +void ide_prep_sense(ide_drive_t *drive, struct request *rq); +void ide_queue_sense_rq(ide_drive_t *drive, void *special); + int ide_cd_expiry(ide_drive_t *); int ide_cd_get_xferlen(struct request *); -- cgit v1.2.3-70-g09d2 From 6b544fcc8cd0a04eb42de9d1ecdd345e979d6ada Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: ide-atapi: convert ide-{floppy,tape} to using preallocated sense buffer Since we're issuing REQ_TYPE_SENSE now we need to allow those types of rqs in the ->do_request callbacks. As a future improvement, sense_len assignment might be unified across all ATAPI devices. 
Borislav to check with specs and test. As a result, get rid of ide_queue_pc_head() and drive->request_sense_rq. tj: * Init request sense ide_atapi_pc from sense request. In the longer term, it would probably be better to fold ide_create_request_sense_cmd() into its only current user - ide_floppy_get_format_progress(). * ide_retry_pc() no longer takes @disk. CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 48 ++++++++++++++---------------------------------- drivers/ide/ide-floppy.c | 4 +++- drivers/ide/ide-tape.c | 7 +++++-- include/linux/ide.h | 3 +-- 4 files changed, 23 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index c6e03485a63..972c522516f 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -79,34 +79,6 @@ void ide_init_pc(struct ide_atapi_pc *pc) } EXPORT_SYMBOL_GPL(ide_init_pc); -/* - * Generate a new packet command request in front of the request queue, before - * the current request, so that it will be processed immediately, on the next - * pass through the driver. - */ -static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk, - struct ide_atapi_pc *pc, struct request *rq) -{ - blk_rq_init(NULL, rq); - rq->cmd_type = REQ_TYPE_SPECIAL; - rq->cmd_flags |= REQ_PREEMPT; - rq->special = (char *)pc; - rq->rq_disk = disk; - - if (pc->req_xfer) { - rq->data = pc->buf; - rq->data_len = pc->req_xfer; - } - - memcpy(rq->cmd, pc->c, 12); - if (drive->media == ide_tape) - rq->cmd[13] = REQ_IDETAPE_PC1; - - drive->hwif->rq = NULL; - - elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); -} - /* * Add a special packet command request to the tail of the request queue, * and wait for it to be serviced. @@ -254,18 +226,26 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq); /* * Called when an error was detected during the last packet command. - * We queue a request sense packet command in the head of the request list. + * We queue a request sense packet command at the head of the request + * queue.
*/ -void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk) +void ide_retry_pc(ide_drive_t *drive) { - struct request *rq = &drive->request_sense_rq; + struct request *sense_rq = &drive->sense_rq; struct ide_atapi_pc *pc = &drive->request_sense_pc; (void)ide_read_error(drive); - ide_create_request_sense_cmd(drive, pc); + + /* init pc from sense_rq */ + ide_init_pc(pc); + memcpy(pc->c, sense_rq->cmd, 12); + pc->buf = sense_rq->data; + pc->req_xfer = sense_rq->data_len; + if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); - ide_queue_pc_head(drive, disk, pc, rq); + + ide_queue_sense_rq(drive, pc); } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -404,7 +384,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) debug_log("[cmd %x]: check condition\n", rq->cmd[0]); /* Retry operation */ - ide_retry_pc(drive, rq->rq_disk); + ide_retry_pc(drive); /* queued, but not started */ return ide_stopped; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 94600331a27..d3302cc891e 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -263,7 +263,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, } pc = &floppy->queued_pc; idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); - } else if (blk_special_request(rq)) { + } else if (blk_special_request(rq) || blk_sense_request(rq)) { pc = (struct ide_atapi_pc *)rq->special; } else if (blk_pc_request(rq)) { pc = &floppy->queued_pc; @@ -273,6 +273,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, goto out_end; } + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index aadf53cfac6..8324dfa78a3 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -695,7 +695,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) printk(KERN_ERR "ide-tape: %s: I/O error, ", tape->name); /* Retry operation */ - ide_retry_pc(drive, tape->disk); + ide_retry_pc(drive); return ide_stopped; } pc->error = 0; @@ -752,7 +752,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, (unsigned long long)rq->sector, rq->nr_sectors, rq->current_nr_sectors); - if (!blk_special_request(rq)) { + if (!(blk_special_request(rq) || blk_sense_request(rq))) { /* We do not support buffer cache originated requests. 
*/ printk(KERN_NOTICE "ide-tape: %s: Unsupported request in " "request queue (%d)\n", drive->name, rq->cmd_type); @@ -840,6 +840,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, BUG(); out: + /* prepare sense request for this command */ + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/include/linux/ide.h b/include/linux/ide.h index a69ccac5641..9e67ccac3c1 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -604,7 +604,6 @@ struct ide_drive_s { unsigned long atapi_flags; struct ide_atapi_pc request_sense_pc; - struct request request_sense_rq; /* current sense rq and buffer */ bool sense_rq_armed; @@ -1181,7 +1180,7 @@ int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *); int ide_do_start_stop(ide_drive_t *, struct gendisk *, int); int ide_set_media_lock(ide_drive_t *, struct gendisk *, int); void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); -void ide_retry_pc(ide_drive_t *, struct gendisk *); +void ide_retry_pc(ide_drive_t *drive); void ide_prep_sense(ide_drive_t *drive, struct request *rq); void ide_queue_sense_rq(ide_drive_t *drive, void *special); -- cgit v1.2.3-70-g09d2 From 5c4be57249e2e09136446597d2fe2a967c6ffef0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: ide-cd,atapi: use bio for internal commands Impact: unify request data buffer handling rq->data is used mostly to pass kernel buffer through request queue without using bio. There are only a couple of places which still do this in kernel and converting to bio isn't difficult. This patch converts ide-cd and atapi to use bio instead of rq->data for request sense and internal pc commands. With previous change to unify sense request handling, this is relatively easily achieved by adding blk_rq_map_kern() during sense_rq prep and PC issue. If blk_rq_map_kern() fails for sense, the error is deferred till sense issue and aborts the failed command which triggered the sense. Note that this is a slim possibility as sense prep is done on each command issue, so for the above condition to actually trigger, all preps since the last sense issue till the issue of the request which would require a sense should fail. * do_request functions might sleep now. This should be okay as ide request_fn - do_ide_request() - is invoked only from make_request and plug work. Make sure this is the case by adding might_sleep() to do_ide_request(). * Functions which access the read sense data before the sense request is complete now should access bio_data(sense_rq->bio) as the sense buffer might have been copied during blk_rq_map_kern(). * ide-tape updated to map sg. * cdrom_do_block_pc() now doesn't have to deal with REQ_TYPE_ATA_PC special case. Simplified. * tp_ops->output/input_data path dropped from ide_pc_intr(). 
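The heart of the conversion is replacing the bare rq->data hand-off with a bio mapping; schematically (a condensed sketch of the pattern used below, not the complete code):

	/* before: kernel buffer handed over outside the bio layer */
	rq->data = pc->buf;
	rq->data_len = pc->req_xfer;

	/* after: buffer mapped through a bio; this can fail, so check */
	error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer,
				GFP_NOIO);
	if (error)
		goto put_req;

	/* readers now go through the mapping, e.g. for the sense data */
	pc->buf = bio_data(drive->sense_rq.bio);
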
Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 52 +++++++++++++++++++++++++++++-------------------- drivers/ide/ide-cd.c | 28 +++++++++++++------------- drivers/ide/ide-io.c | 3 +++ drivers/ide/ide-tape.c | 3 +++ include/linux/ide.h | 2 +- 5 files changed, 52 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 972c522516f..5cefe12f562 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -94,16 +94,18 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq->special = (char *)pc; if (pc->req_xfer) { - rq->data = pc->buf; - rq->data_len = pc->req_xfer; + error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer, + GFP_NOIO); + if (error) + goto put_req; } memcpy(rq->cmd, pc->c, 12); if (drive->media == ide_tape) rq->cmd[13] = REQ_IDETAPE_PC1; error = blk_execute_rq(drive->queue, disk, rq, 0); +put_req: blk_put_request(rq); - return error; } EXPORT_SYMBOL_GPL(ide_queue_pc_tail); @@ -168,6 +170,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) struct request_sense *sense = &drive->sense_data; struct request *sense_rq = &drive->sense_rq; unsigned int cmd_len, sense_len; + int err; debug_log("%s: enter\n", __func__); @@ -193,13 +196,19 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) memset(sense, 0, sizeof(*sense)); blk_rq_init(rq->q, sense_rq); - sense_rq->rq_disk = rq->rq_disk; - sense_rq->data = sense; + err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len, + GFP_NOIO); + if (unlikely(err)) { + if (printk_ratelimit()) + printk(KERN_WARNING "%s: failed to map sense buffer\n", + drive->name); + return; + } + + sense_rq->rq_disk = rq->rq_disk; sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; sense_rq->cmd[4] = cmd_len; - sense_rq->data_len = sense_len; - sense_rq->cmd_type = REQ_TYPE_SENSE; sense_rq->cmd_flags |= REQ_PREEMPT; @@ -210,9 +219,14 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) } EXPORT_SYMBOL_GPL(ide_prep_sense); -void ide_queue_sense_rq(ide_drive_t *drive, void *special) +int ide_queue_sense_rq(ide_drive_t *drive, void *special) { - BUG_ON(!drive->sense_rq_armed); + /* deferred failure from ide_prep_sense() */ + if (!drive->sense_rq_armed) { + printk(KERN_WARNING "%s: failed queue sense request\n", + drive->name); + return -ENOMEM; + } drive->sense_rq.special = special; drive->sense_rq_armed = false; @@ -221,6 +235,7 @@ void ide_queue_sense_rq(ide_drive_t *drive, void *special) elv_add_request(drive->queue, &drive->sense_rq, ELEVATOR_INSERT_FRONT, 0); + return 0; } EXPORT_SYMBOL_GPL(ide_queue_sense_rq); @@ -239,13 +254,14 @@ void ide_retry_pc(ide_drive_t *drive) /* init pc from sense_rq */ ide_init_pc(pc); memcpy(pc->c, sense_rq->cmd, 12); - pc->buf = sense_rq->data; + pc->buf = bio_data(sense_rq->bio); /* pointer to mapped address */ pc->req_xfer = sense_rq->data_len; if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); - ide_queue_sense_rq(drive, pc); + if (ide_queue_sense_rq(drive, pc)) + ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -317,7 +333,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; const struct ide_tp_ops *tp_ops = hwif->tp_ops; - xfer_func_t *xferfunc; unsigned int timeout, done; u16 bcount; u8 stat, ireason, dsc = 0; @@ -411,7 +426,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->errors = -EIO; } - if (drive->media == ide_tape) + 
if (drive->media == ide_tape && !rq->bio) done = ide_rq_bytes(rq); /* FIXME */ else done = blk_rq_bytes(rq); @@ -448,16 +463,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - xferfunc = write ? tp_ops->output_data : tp_ops->input_data; - - if (drive->media == ide_floppy && pc->buf == NULL) { + if (drive->media == ide_tape && pc->bh) + done = drive->pc_io_buffers(drive, pc, bcount, write); + else { done = min_t(unsigned int, bcount, cmd->nleft); ide_pio_bytes(drive, cmd, write, done); - } else if (drive->media == ide_tape && pc->bh) { - done = drive->pc_io_buffers(drive, pc, bcount, write); - } else { - done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred); - xferfunc(drive, NULL, pc->cur_pos, done); } /* Update the current position */ diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 7b21c7eac5b..392a5bdf2fd 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -210,10 +210,12 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* * For REQ_TYPE_SENSE, "rq->special" points to the original - * failed request + * failed request. Also, the sense data should be read + * directly from rq which might be different from the original + * sense buffer if it got copied during mapping. */ struct request *failed = (struct request *)rq->special; - struct request_sense *sense = &drive->sense_data; + void *sense = bio_data(rq->bio); if (failed) { if (failed->sense) { @@ -398,7 +400,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) /* if we got a CHECK_CONDITION status, queue a request sense command */ if (stat & ATA_ERR) - ide_queue_sense_rq(drive, NULL); + return ide_queue_sense_rq(drive, NULL) ? 2 : 1; return 1; end_request: @@ -412,8 +414,7 @@ end_request: hwif->rq = NULL; - ide_queue_sense_rq(drive, rq); - return 1; + return ide_queue_sense_rq(drive, rq) ? 
2 : 1; } else return 2; } @@ -507,8 +508,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, rq->cmd_flags |= cmd_flags; rq->timeout = timeout; if (buffer) { - rq->data = buffer; - rq->data_len = *bufflen; + error = blk_rq_map_kern(drive->queue, rq, buffer, + *bufflen, GFP_NOIO); + if (error) { + blk_put_request(rq); + return error; + } } error = blk_execute_rq(drive->queue, info->disk, rq, 0); @@ -802,15 +807,10 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) drive->dma = 0; /* sg request */ - if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) { + if (rq->bio) { struct request_queue *q = drive->queue; + char *buf = bio_data(rq->bio); unsigned int alignment; - char *buf; - - if (rq->bio) - buf = bio_data(rq->bio); - else - buf = rq->data; drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9b9e8b1aae5..3245c2dbda3 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -481,6 +481,9 @@ void do_ide_request(struct request_queue *q) spin_unlock_irq(q->queue_lock); + /* HLD do_request() callback might sleep, make sure it's okay */ + might_sleep(); + if (ide_lock_host(host, hwif)) goto plug_device_2; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 8324dfa78a3..9b762a2d5d9 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -850,6 +850,9 @@ out: cmd.rq = rq; + ide_init_sg_cmd(&cmd, pc->req_xfer); + ide_map_sg(drive, &cmd); + return ide_tape_issue_pc(drive, &cmd, pc); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 9e67ccac3c1..1957461ac76 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1183,7 +1183,7 @@ void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); void ide_retry_pc(ide_drive_t *drive); void ide_prep_sense(ide_drive_t *drive, struct request *rq); -void ide_queue_sense_rq(ide_drive_t *drive, void *special); +int ide_queue_sense_rq(ide_drive_t *drive, void *special); int ide_cd_expiry(ide_drive_t *); -- cgit v1.2.3-70-g09d2 From 6d7003877c2f0578f1c08f66d05c3f72ef4ae596 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: ide-atapi: kill unused fields and callbacks Impact: remove fields and code paths which are no longer necessary Now that ide-tape uses standard mechanisms to transfer data, special case handling for bh handling can be dropped from ide-atapi. Drop the followings. * pc->cur_pos, b_count, bh and b_data * drive->pc_update_buffers() and pc_io_buffers(). Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 17 ++++------------- drivers/ide/ide-tape.c | 1 - include/linux/ide.h | 12 ------------ 3 files changed, 4 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index b9dd4503cbc..afe5a432387 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -359,11 +359,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) drive->name, rq_data_dir(pc->rq) ? 
"write" : "read"); pc->flags |= PC_FLAG_DMA_ERROR; - } else { + } else pc->xferred = pc->req_xfer; - if (drive->pc_update_buffers) - drive->pc_update_buffers(drive, pc); - } debug_log("%s: DMA finished\n", drive->name); } @@ -463,16 +460,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - if (drive->media == ide_tape && pc->bh) - done = drive->pc_io_buffers(drive, pc, bcount, write); - else { - done = min_t(unsigned int, bcount, cmd->nleft); - ide_pio_bytes(drive, cmd, write, done); - } + done = min_t(unsigned int, bcount, cmd->nleft); + ide_pio_bytes(drive, cmd, write, done); - /* Update the current position */ + /* Update transferred byte count */ pc->xferred += done; - pc->cur_pos += done; bcount -= done; @@ -650,7 +642,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) /* We haven't transferred any data yet */ pc->xferred = 0; - pc->cur_pos = pc->buf; valid_tf = IDE_VALID_DEVICE; bcount = ((drive->media == ide_tape) ? diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2599579e417..8dfc68892d6 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -591,7 +591,6 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); pc->c[1] = 1; - pc->bh = NULL; pc->buf = NULL; pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; diff --git a/include/linux/ide.h b/include/linux/ide.h index 1957461ac76..34c128f0a33 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -362,11 +362,7 @@ struct ide_atapi_pc { /* data buffer */ u8 *buf; - /* current buffer position */ - u8 *cur_pos; int buf_size; - /* missing/available data on the current buffer */ - int b_count; /* the corresponding request */ struct request *rq; @@ -379,10 +375,6 @@ struct ide_atapi_pc { */ u8 pc_buf[IDE_PC_BUFFER_SIZE]; - /* idetape only */ - struct idetape_bh *bh; - char *b_data; - unsigned long timeout; }; @@ -595,10 +587,6 @@ struct ide_drive_s { /* callback for packet commands */ int (*pc_callback)(struct ide_drive_s *, int); - void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *); - int (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *, - unsigned int, int); - ide_startstop_t (*irq_handler)(struct ide_drive_s *); unsigned long atapi_flags; -- cgit v1.2.3-70-g09d2 From 937582382c71b75b29fbb92615629494e1a05ac0 Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:14 +0800 Subject: x86, intr-remap: enable interrupt remapping early Currently, when x2apic is not enabled, interrupt remapping will be enabled in init_dmars(), where it is too late to remap ioapic interrupts, that is, ioapic interrupts are really in compatibility mode, not remappable mode. This patch always enables interrupt remapping before ioapic setup, it guarantees all interrupts will be remapped when interrupt remapping is enabled. Thus it doesn't need to set the compatibility interrupt bit. 
[ Impact: refactor intr-remap init sequence, enable fuller remap mode ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-4-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 7 ++-- arch/x86/kernel/apic/apic.c | 76 +++++++++++++++++++++----------------------- drivers/pci/intel-iommu.c | 9 ------ drivers/pci/intr_remapping.c | 28 ++++++++-------- include/linux/dmar.h | 1 + 5 files changed, 54 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d4cb7e590c0..fbdd65446c7 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -169,7 +169,6 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic, x2apic_phys; extern void check_x2apic(void); extern void enable_x2apic(void); -extern void enable_IR_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); static inline int x2apic_enabled(void) { @@ -190,18 +189,18 @@ static inline void check_x2apic(void) static inline void enable_x2apic(void) { } -static inline void enable_IR_x2apic(void) -{ -} static inline int x2apic_enabled(void) { return 0; } #define x2apic 0 +#define x2apic_preenabled 0 #endif +extern void enable_IR_x2apic(void); + extern int get_physical_broadcast(void); extern void apic_disable(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 83e47febcc8..0cf1eea750c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -141,6 +141,8 @@ static int x2apic_preenabled; static int disable_x2apic; static __init int setup_nox2apic(char *str) { + if (x2apic_enabled()) + panic("Bios already enabled x2apic, can't enforce nox2apic"); disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; @@ -1345,6 +1347,7 @@ void enable_x2apic(void) wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } +#endif /* CONFIG_X86_X2APIC */ void __init enable_IR_x2apic(void) { @@ -1353,32 +1356,21 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; - if (!cpu_has_x2apic) - return; - - if (!x2apic_preenabled && disable_x2apic) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); - return; + ret = dmar_table_init(); + if (ret) { + pr_debug("dmar_table_init() failed with %d:\n", ret); + goto ir_failed; } - if (x2apic_preenabled && disable_x2apic) - panic("Bios already enabled x2apic, can't enforce nox2apic"); - - if (!x2apic_preenabled && skip_ioapic_setup) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); - return; + if (!intr_remapping_supported()) { + pr_debug("intr-remapping not supported\n"); + goto ir_failed; } - ret = dmar_table_init(); - if (ret) { - pr_info("dmar_table_init() failed with %d:\n", ret); - if (x2apic_preenabled) - panic("x2apic enabled by bios. 
But IR enabling failed"); - else - pr_info("Not enabling x2apic,Intr-remapping\n"); + if (!x2apic_preenabled && skip_ioapic_setup) { + pr_info("Skipped enabling intr-remap because of skipping " + "io-apic setup\n"); return; } @@ -1398,20 +1390,25 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - - if (ret && x2apic_preenabled) { - local_irq_restore(flags); - panic("x2apic enabled by bios. But IR enabling failed"); - } +#ifdef CONFIG_X86_X2APIC + if (cpu_has_x2apic) + ret = enable_intr_remapping(EIM_32BIT_APIC_ID); + else +#endif + ret = enable_intr_remapping(EIM_8BIT_APIC_ID); if (ret) goto end_restore; - if (!x2apic) { + pr_info("Enabled Interrupt-remapping\n"); + +#ifdef CONFIG_X86_X2APIC + if (cpu_has_x2apic && !x2apic) { x2apic = 1; enable_x2apic(); + pr_info("Enabled x2apic\n"); } +#endif end_restore: if (ret) @@ -1426,30 +1423,29 @@ end_restore: local_irq_restore(flags); end: - if (!ret) { - if (!x2apic_preenabled) - pr_info("Enabled x2apic and interrupt-remapping\n"); - else - pr_info("Enabled Interrupt-remapping\n"); - } else - pr_err("Failed to enable Interrupt-remapping and x2apic\n"); if (ioapic_entries) free_ioapic_entries(ioapic_entries); + + if (!ret) + return; + +ir_failed: + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else if (cpu_has_x2apic) + pr_info("Not enabling x2apic,Intr-remapping\n"); #else if (!cpu_has_x2apic) return; if (x2apic_preenabled) panic("x2apic enabled prior OS handover," - " enable CONFIG_INTR_REMAP"); - - pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); #endif return; } -#endif /* CONFIG_X86_X2APIC */ + #ifdef CONFIG_X86_64 /* diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 001b328adf8..9ce8f0764be 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1968,15 +1968,6 @@ static int __init init_dmars(void) } } -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) { - ret = enable_intr_remapping(0); - if (ret) - printk(KERN_ERR - "IOMMU: enable interrupt remapping failed\n"); - } -#endif - /* * For each rmrr * for each dev attached to rmrr diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index f5e0ea724a6..5c2142656e9 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -423,20 +423,6 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) readl, (sts & DMA_GSTS_IRTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); - if (mode == 0) { - spin_lock_irqsave(&iommu->register_lock, flags); - - /* enable comaptiblity format interrupt pass through */ - cmd = iommu->gcmd | DMA_GCMD_CFI; - iommu->gcmd |= DMA_GCMD_CFI; - writel(cmd, iommu->reg + DMAR_GCMD_REG); - - IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_CFIS), sts); - - spin_unlock_irqrestore(&iommu->register_lock, flags); - } - /* * global invalidation of interrupt entry cache before enabling * interrupt-remapping. 
@@ -516,6 +502,20 @@ end: spin_unlock_irqrestore(&iommu->register_lock, flags); } +int __init intr_remapping_supported(void) +{ + struct dmar_drhd_unit *drhd; + + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; + + if (!ecap_ir_support(iommu->ecap)) + return 0; + } + + return 1; +} + int __init enable_intr_remapping(int eim) { struct dmar_drhd_unit *drhd; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e397dc342cd..06f592a7f73 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -108,6 +108,7 @@ struct irte { }; #ifdef CONFIG_INTR_REMAP extern int intr_remapping_enabled; +extern int intr_remapping_supported(void); extern int enable_intr_remapping(int); extern void disable_intr_remapping(void); extern int reenable_intr_remapping(int); -- cgit v1.2.3-70-g09d2 From a0f82f64e26929776c58a5c93c2ecb38e3d82815 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 19 Apr 2009 09:43:48 +0000 Subject: syncookies: remove last_synq_overflow from struct tcp_sock last_synq_overflow eats 4 or 8 bytes in struct tcp_sock, even though it is only used when a listening sockets syn queue is full. We can (ab)use rx_opt.ts_recent_stamp to store the same information; it is not used otherwise as long as a socket is in listen state. Move linger2 around to avoid splitting struct mtu_probe across cacheline boundary on 32 bit arches. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 +--- include/net/tcp.h | 13 +++++++++++++ net/ipv4/syncookies.c | 5 ++--- net/ipv6/syncookies.c | 4 ++-- 4 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9d5078bd23a..8afac76cd74 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -377,7 +377,7 @@ struct tcp_sock { unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ - unsigned long last_synq_overflow; + int linger2; /* Receiver side RTT estimation */ struct { @@ -406,8 +406,6 @@ struct tcp_sock { /* TCP MD5 Signagure Option information */ struct tcp_md5sig_info *md5sig_info; #endif - - int linger2; }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) diff --git a/include/net/tcp.h b/include/net/tcp.h index 1b94b9bfe2d..b55b4891029 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -265,6 +265,19 @@ static inline int tcp_too_many_orphans(struct sock *sk, int num) atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]); } +/* syncookies: remember time of last synqueue overflow */ +static inline void tcp_synq_overflow(struct sock *sk) +{ + tcp_sk(sk)->rx_opt.ts_recent_stamp = jiffies; +} + +/* syncookies: no recent synqueue overflow on this listening socket? 
*/ +static inline int tcp_synq_no_recent_overflow(const struct sock *sk) +{ + unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + return time_after(jiffies, last_overflow + TCP_TIMEOUT_INIT); +} + extern struct proto tcp_prot; #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b35a950d2e0..cd2b97f1b6e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -161,13 +161,12 @@ static __u16 const msstab[] = { */ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) { - struct tcp_sock *tp = tcp_sk(sk); const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tp->last_synq_overflow = jiffies; + tcp_synq_overflow(sk); /* XXX sort msstab[] by probability? Binary search? */ for (mssind = 0; mss > msstab[mssind + 1]; mssind++) @@ -268,7 +267,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, if (!sysctl_tcp_syncookies || !th->ack) goto out; - if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || + if (tcp_synq_no_recent_overflow(sk) || (mss = cookie_check(skb, cookie)) == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 711175e0571..8c2513982b6 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -131,7 +131,7 @@ __u32 cookie_v6_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) int mssind; const __u16 mss = *mssp; - tcp_sk(sk)->last_synq_overflow = jiffies; + tcp_synq_overflow(sk); for (mssind = 0; mss > msstab[mssind + 1]; mssind++) ; @@ -175,7 +175,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) if (!sysctl_tcp_syncookies || !th->ack) goto out; - if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || + if (tcp_synq_no_recent_overflow(sk) || (mss = cookie_check(skb, cookie)) == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; -- cgit v1.2.3-70-g09d2 From 23de29de2d8b227943be191d59fb6d983996d55e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 12:59:29 -0400 Subject: tracing: remove dangling semicolon Due to a cut and paste error, the trace_seq_putc had a semicolon after the prototype but before the stub function when tracing is disabled. [Impact: fix compile error ] Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 37db9bdfbc1..ba9627f00d3 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -63,7 +63,7 @@ static inline int trace_seq_puts(struct trace_seq *s, const char *str) { return 0; } -static inline int trace_seq_putc(struct trace_seq *s, unsigned char c); +static inline int trace_seq_putc(struct trace_seq *s, unsigned char c) { return 0; } -- cgit v1.2.3-70-g09d2 From f1f9b3b1795da8625e0e6096813c9d18d4a344ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 20:38:21 +0200 Subject: perfcounters, sched: remove __task_delta_exec() This function was left orphan by the latest round of sw-counter cleanups. 
[ Impact: remove unused kernel function ] Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 1 - kernel/sched.c | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 080d1fd461d..a77c6007dc9 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -85,7 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq) /* * Lock/unlock the current runqueue - to extract task statistics: */ -extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); extern void account_user_time(struct task_struct *, cputime_t, cputime_t); diff --git a/kernel/sched.c b/kernel/sched.c index b66a08c2480..a69278eef42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4546,29 +4546,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); -/* - * Return any ns on the sched_clock that have not yet been banked in - * @p in case that task is currently running. - */ -unsigned long long __task_delta_exec(struct task_struct *p, int update) -{ - s64 delta_exec; - struct rq *rq; - - rq = task_rq(p); - WARN_ON_ONCE(!runqueue_is_locked()); - WARN_ON_ONCE(!task_current(rq, p)); - - if (update) - update_rq_clock(rq); - - delta_exec = rq->clock - p->se.exec_start; - - WARN_ON_ONCE(delta_exec < 0); - - return delta_exec; -} - /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. -- cgit v1.2.3-70-g09d2 From fc1edaf9e7cc4d4696f83dee495b8f158d01c4eb Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:27 -0700 Subject: x86: x2apic, IR: Clean up X86_X2APIC and INTR_REMAP config checks Add x2apic_supported() to clean up CONFIG_X86_X2APIC checks. Fix CONFIG_INTR_REMAP checks. 
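The check cleanup follows a common kernel pattern, sketched here from the hunks below: make the capability test a macro that collapses to a compile-time constant when the feature is configured out, so call sites need no #ifdef:

	#ifdef CONFIG_X86_X2APIC
	#define x2apic_supported()	(cpu_has_x2apic)
	#else
	#define x2apic_supported()	0
	#endif

	/* call sites compile either way; the dead branch is discarded: */
	ret = enable_intr_remapping(x2apic_supported());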
[ Impact: cleanup ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Suresh Siddha Cc: Weidong Han LKML-Reference: <20090420200450.128993000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 10 ++++---- arch/x86/include/asm/io_apic.h | 2 -- arch/x86/include/asm/irq_remapping.h | 2 +- arch/x86/kernel/apic/apic.c | 49 +++++++++--------------------------- arch/x86/kernel/apic/io_apic.c | 2 -- arch/x86/kernel/apic/probe_64.c | 2 +- include/linux/dmar.h | 2 ++ 7 files changed, 21 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index fbdd65446c7..3738438a91f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); -#define EIM_8BIT_APIC_ID 0 -#define EIM_32BIT_APIC_ID 1 +extern int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* @@ -166,7 +165,7 @@ static inline u64 native_x2apic_icr_read(void) return val; } -extern int x2apic, x2apic_phys; +extern int x2apic_phys; extern void check_x2apic(void); extern void enable_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); @@ -182,6 +181,8 @@ static inline int x2apic_enabled(void) return 1; return 0; } + +#define x2apic_supported() (cpu_has_x2apic) #else static inline void check_x2apic(void) { @@ -194,9 +195,8 @@ static inline int x2apic_enabled(void) return 0; } -#define x2apic 0 #define x2apic_preenabled 0 - +#define x2apic_supported() 0 #endif extern void enable_IR_x2apic(void); diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9d826e43601..34eaa37f7ad 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -161,7 +161,6 @@ extern int io_apic_set_pci_routing(int ioapic, int pin, int irq, extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); -#ifdef CONFIG_X86_64 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); @@ -169,7 +168,6 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void reinit_intr_remapped_IO_APIC(int intr_remapping, struct IO_APIC_route_entry **ioapic_entries); -#endif extern void probe_nr_irqs_gsi(void); diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 0396760fccb..f275e224450 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -1,6 +1,6 @@ #ifndef _ASM_X86_IRQ_REMAPPING_H #define _ASM_X86_IRQ_REMAPPING_H -#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) +#define IRTE_DEST(dest) ((x2apic_mode) ? 
dest : dest << 8) #endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7b41a32339e..2b30e520dce 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -134,8 +134,8 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif +int x2apic_mode; #ifdef CONFIG_X86_X2APIC -int x2apic; /* x2apic enabled before OS handover */ static int x2apic_preenabled; static int disable_x2apic; @@ -858,7 +858,7 @@ void clear_local_APIC(void) u32 v; /* APIC hasn't been mapped yet */ - if (!x2apic && !apic_phys) + if (!x2apic_mode && !apic_phys) return; maxlvt = lapic_get_maxlvt(); @@ -1330,7 +1330,7 @@ void check_x2apic(void) { if (x2apic_enabled()) { pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); - x2apic_preenabled = x2apic = 1; + x2apic_preenabled = x2apic_mode = 1; } } @@ -1338,7 +1338,7 @@ void enable_x2apic(void) { int msr, msr2; - if (!x2apic) + if (!x2apic_mode) return; rdmsr(MSR_IA32_APICBASE, msr, msr2); @@ -1390,25 +1390,17 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); -#ifdef CONFIG_X86_X2APIC - if (cpu_has_x2apic) - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - else -#endif - ret = enable_intr_remapping(EIM_8BIT_APIC_ID); - + ret = enable_intr_remapping(x2apic_supported()); if (ret) goto end_restore; pr_info("Enabled Interrupt-remapping\n"); -#ifdef CONFIG_X86_X2APIC - if (cpu_has_x2apic && !x2apic) { - x2apic = 1; + if (x2apic_supported() && !x2apic_mode) { + x2apic_mode = 1; enable_x2apic(); pr_info("Enabled x2apic\n"); } -#endif end_restore: if (ret) @@ -1576,7 +1568,7 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { - if (x2apic) { + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; } @@ -2010,10 +2002,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) local_irq_save(flags); disable_local_APIC(); -#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) disable_intr_remapping(); -#endif + local_irq_restore(flags); return 0; } @@ -2023,8 +2015,6 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - -#ifdef CONFIG_INTR_REMAP int ret; struct IO_APIC_route_entry **ioapic_entries = NULL; @@ -2050,17 +2040,8 @@ static int lapic_resume(struct sys_device *dev) mask_8259A(); } - if (x2apic) + if (x2apic_mode) enable_x2apic(); -#else - if (!apic_pm_state.active) - return 0; - - local_irq_save(flags); - if (x2apic) - enable_x2apic(); -#endif - else { /* * Make sure the APICBASE points to the right address @@ -2098,18 +2079,12 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) { - if (x2apic) - reenable_intr_remapping(EIM_32BIT_APIC_ID); - else - reenable_intr_remapping(EIM_8BIT_APIC_ID); - + reenable_intr_remapping(x2apic_mode); unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } -#endif local_irq_restore(flags); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ea22a86e3cd..3a45d2ec974 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -736,7 +736,6 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_INTR_REMAP struct IO_APIC_route_entry **alloc_ioapic_entries(void) { int apic; @@ -857,7 +856,6 @@ void 
free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) kfree(ioapic_entries); } -#endif /* * Find the IRQ entry number of a certain pin. diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1783652bb0e..bc3e880f9b8 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic && (apic != &apic_x2apic_phys && + if (x2apic_mode && (apic != &apic_x2apic_phys && #ifdef CONFIG_X86_UV apic != &apic_x2apic_uv_x && #endif diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 06f592a7f73..10ff5c49882 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -158,6 +158,8 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic) } #define irq_remapped(irq) (0) #define enable_intr_remapping(mode) (-1) +#define disable_intr_remapping() (0) +#define reenable_intr_remapping(mode) (0) #define intr_remapping_enabled (0) #endif -- cgit v1.2.3-70-g09d2 From 03ad032bb78b2732b607ed198e951240e1d21e59 Mon Sep 17 00:00:00 2001 From: Peter Holik Date: Sat, 18 Apr 2009 07:24:17 +0000 Subject: export usbnet_get_ethernet_addr from usbnet and fixed cdc_ether.c because of using the same function get_ethernet_addr as cdc_ether.c i export usbnet_get_ethernet_addr from usbnet and fixed cdc_ether (suggested by Oliver Neukum). Signed-off-by: Peter Holik Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 33 +-------------------------------- drivers/net/usb/usbnet.c | 31 +++++++++++++++++++++++++++++++ include/linux/usb/usbnet.h | 1 + 3 files changed, 33 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 55e8ecc3a9e..01fd528306e 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -389,36 +388,6 @@ static void cdc_status(struct usbnet *dev, struct urb *urb) } } -static u8 nibble(unsigned char c) -{ - if (likely(isdigit(c))) - return c - '0'; - c = toupper(c); - if (likely(isxdigit(c))) - return 10 + c - 'A'; - return 0; -} - -static inline int -get_ethernet_addr(struct usbnet *dev, struct usb_cdc_ether_desc *e) -{ - int tmp, i; - unsigned char buf [13]; - - tmp = usb_string(dev->udev, e->iMACAddress, buf, sizeof buf); - if (tmp != 12) { - dev_dbg(&dev->udev->dev, - "bad MAC string %d fetch, %d\n", e->iMACAddress, tmp); - if (tmp >= 0) - tmp = -EINVAL; - return tmp; - } - for (i = tmp = 0; i < 6; i++, tmp += 2) - dev->net->dev_addr [i] = - (nibble(buf [tmp]) << 4) + nibble(buf [tmp + 1]); - return 0; -} - static int cdc_bind(struct usbnet *dev, struct usb_interface *intf) { int status; @@ -428,7 +397,7 @@ static int cdc_bind(struct usbnet *dev, struct usb_interface *intf) if (status < 0) return status; - status = get_ethernet_addr(dev, info->ether); + status = usbnet_get_ethernet_addr(dev, info->ether->iMACAddress); if (status < 0) { usb_set_intfdata(info->data, NULL); usb_driver_release_interface(driver_of(intf), info->data); diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 2b8b9036aff..c94de624314 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -156,6 +157,36 @@ int usbnet_get_endpoints(struct usbnet *dev, struct usb_interface *intf) } 
EXPORT_SYMBOL_GPL(usbnet_get_endpoints); +static u8 nibble(unsigned char c) +{ + if (likely(isdigit(c))) + return c - '0'; + c = toupper(c); + if (likely(isxdigit(c))) + return 10 + c - 'A'; + return 0; +} + +int usbnet_get_ethernet_addr(struct usbnet *dev, int iMACAddress) +{ + int tmp, i; + unsigned char buf [13]; + + tmp = usb_string(dev->udev, iMACAddress, buf, sizeof buf); + if (tmp != 12) { + dev_dbg(&dev->udev->dev, + "bad MAC string %d fetch, %d\n", iMACAddress, tmp); + if (tmp >= 0) + tmp = -EINVAL; + return tmp; + } + for (i = tmp = 0; i < 6; i++, tmp += 2) + dev->net->dev_addr [i] = + (nibble(buf [tmp]) << 4) + nibble(buf [tmp + 1]); + return 0; +} +EXPORT_SYMBOL_GPL(usbnet_get_ethernet_addr); + static void intr_complete (struct urb *urb); static int init_status (struct usbnet *dev, struct usb_interface *intf) diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 36fabb95c7d..5d44059f6d6 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -183,6 +183,7 @@ extern void usbnet_tx_timeout (struct net_device *net); extern int usbnet_change_mtu (struct net_device *net, int new_mtu); extern int usbnet_get_endpoints(struct usbnet *, struct usb_interface *); +extern int usbnet_get_ethernet_addr(struct usbnet *, int); extern void usbnet_defer_kevent (struct usbnet *, int); extern void usbnet_skb_return (struct usbnet *, struct sk_buff *); extern void usbnet_unlink_rx_urbs(struct usbnet *); -- cgit v1.2.3-70-g09d2 From b1b67dd45a6b629eb41553856805aaa1614fbb83 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 20 Apr 2009 04:49:28 +0000 Subject: net: factor out ethtool invocation of vlan/macvlan drivers Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 22 +++------------------- include/linux/netdevice.h | 24 +++++++++++++++++++++++- net/8021q/vlan_dev.c | 23 +++-------------------- 3 files changed, 29 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 214a8cf2b70..329cd50d0e2 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -374,36 +374,20 @@ static void macvlan_ethtool_get_drvinfo(struct net_device *dev, static u32 macvlan_ethtool_get_rx_csum(struct net_device *dev) { const struct macvlan_dev *vlan = netdev_priv(dev); - struct net_device *lowerdev = vlan->lowerdev; - - if (lowerdev->ethtool_ops == NULL || - lowerdev->ethtool_ops->get_rx_csum == NULL) - return 0; - return lowerdev->ethtool_ops->get_rx_csum(lowerdev); + return dev_ethtool_get_rx_csum(vlan->lowerdev); } static int macvlan_ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) { const struct macvlan_dev *vlan = netdev_priv(dev); - struct net_device *lowerdev = vlan->lowerdev; - - if (!lowerdev->ethtool_ops || - !lowerdev->ethtool_ops->get_settings) - return -EOPNOTSUPP; - - return lowerdev->ethtool_ops->get_settings(lowerdev, cmd); + return dev_ethtool_get_settings(vlan->lowerdev, cmd); } static u32 macvlan_ethtool_get_flags(struct net_device *dev) { const struct macvlan_dev *vlan = netdev_priv(dev); - struct net_device *lowerdev = vlan->lowerdev; - - if (!lowerdev->ethtool_ops || - !lowerdev->ethtool_ops->get_flags) - return 0; - return lowerdev->ethtool_ops->get_flags(lowerdev); + return dev_ethtool_get_flags(vlan->lowerdev); } static const struct ethtool_ops macvlan_ethtool_ops = { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 54db3ebf219..31167451d08 100644 --- a/include/linux/netdevice.h +++ 
b/include/linux/netdevice.h @@ -42,6 +42,7 @@ #include #include +#include #include #include #ifdef CONFIG_DCB @@ -49,7 +50,6 @@ #endif struct vlan_group; -struct ethtool_ops; struct netpoll_info; /* 802.11 specific */ struct wireless_dev; @@ -1906,6 +1906,28 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) } extern struct pernet_operations __net_initdata loopback_net_ops; + +static inline int dev_ethtool_get_settings(struct net_device *dev, + struct ethtool_cmd *cmd) +{ + if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + return dev->ethtool_ops->get_settings(dev, cmd); +} + +static inline u32 dev_ethtool_get_rx_csum(struct net_device *dev) +{ + if (!dev->ethtool_ops || !dev->ethtool_ops->get_rx_csum) + return 0; + return dev->ethtool_ops->get_rx_csum(dev); +} + +static inline u32 dev_ethtool_get_flags(struct net_device *dev) +{ + if (!dev->ethtool_ops || !dev->ethtool_ops->get_flags) + return 0; + return dev->ethtool_ops->get_flags(dev); +} #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 6b092136401..04dc8c8a685 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -666,13 +666,7 @@ static int vlan_ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) { const struct vlan_dev_info *vlan = vlan_dev_info(dev); - struct net_device *real_dev = vlan->real_dev; - - if (!real_dev->ethtool_ops || - !real_dev->ethtool_ops->get_settings) - return -EOPNOTSUPP; - - return real_dev->ethtool_ops->get_settings(real_dev, cmd); + return dev_ethtool_get_settings(vlan->real_dev, cmd); } static void vlan_ethtool_get_drvinfo(struct net_device *dev, @@ -686,24 +680,13 @@ static void vlan_ethtool_get_drvinfo(struct net_device *dev, static u32 vlan_ethtool_get_rx_csum(struct net_device *dev) { const struct vlan_dev_info *vlan = vlan_dev_info(dev); - struct net_device *real_dev = vlan->real_dev; - - if (real_dev->ethtool_ops == NULL || - real_dev->ethtool_ops->get_rx_csum == NULL) - return 0; - return real_dev->ethtool_ops->get_rx_csum(real_dev); + return dev_ethtool_get_rx_csum(vlan->real_dev); } static u32 vlan_ethtool_get_flags(struct net_device *dev) { const struct vlan_dev_info *vlan = vlan_dev_info(dev); - struct net_device *real_dev = vlan->real_dev; - - if (!(real_dev->features & NETIF_F_HW_VLAN_RX) || - real_dev->ethtool_ops == NULL || - real_dev->ethtool_ops->get_flags == NULL) - return 0; - return real_dev->ethtool_ops->get_flags(real_dev); + return dev_ethtool_get_flags(vlan->real_dev); } static const struct ethtool_ops vlan_ethtool_ops = { -- cgit v1.2.3-70-g09d2 From 0a1ec07a67bd8b0033dace237249654d015efa21 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 20 Apr 2009 01:25:46 +0000 Subject: net: skb_copy_datagram_const_iovec() There's an skb_copy_datagram_iovec() to copy out of a paged skb, but it modifies the iovec, and does not support starting at an offset in the destination. We want both in tun.c, so let's add the function. It's a carbon copy of skb_copy_datagram_iovec() with enough changes to be annoying. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 5 +++ include/linux/socket.h | 2 ++ net/core/datagram.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/core/iovec.c | 26 ++++++++++++++ 4 files changed, 125 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5fd389162f0..af2b21bdda8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1717,6 +1717,11 @@ extern int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, struct iovec *from, int len); +extern int skb_copy_datagram_const_iovec(const struct sk_buff *from, + int offset, + const struct iovec *to, + int to_offset, + int size); extern void skb_free_datagram(struct sock *sk, struct sk_buff *skb); extern int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); diff --git a/include/linux/socket.h b/include/linux/socket.h index 421afb4d29b..171b08db9c4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -318,6 +318,8 @@ extern int csum_partial_copy_fromiovecend(unsigned char *kdata, extern int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode); extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len); +extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, + int offset, int len); extern int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, int __user *ulen); extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); diff --git a/net/core/datagram.c b/net/core/datagram.c index d0de644b378..4dbb05cd572 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -338,6 +338,98 @@ fault: return -EFAULT; } +/** + * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: io vector to copy to + * @to_offset: offset in the io vector to start copying to + * @len: amount of data to copy from buffer to iovec + * + * Returns 0 or -EFAULT. + * Note: the iovec is not modified during the copy. + */ +int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, + const struct iovec *to, int to_offset, + int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_toiovecend(to, vaddr + frag->page_offset + + offset - start, to_offset, copy); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + WARN_ON(start > offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_const_iovec(list, + offset - start, + to, to_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_const_iovec); + /** * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. * @skb: buffer to copy diff --git a/net/core/iovec.c b/net/core/iovec.c index 4c9c0121c9d..a215545c0a3 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -97,6 +97,31 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) return 0; } +/* + * Copy kernel to iovec. Returns -EFAULT on error. + */ + +int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, + int offset, int len) +{ + int copy; + for (; len > 0; ++iov) { + /* Skip over the finished iovecs */ + if (unlikely(offset >= iov->iov_len)) { + offset -= iov->iov_len; + continue; + } + copy = min_t(unsigned int, iov->iov_len - offset, len); + offset = 0; + if (copy_to_user(iov->iov_base, kdata, copy)) + return -EFAULT; + kdata += copy; + len -= copy; + } + + return 0; +} + /* * Copy iovec to kernel. Returns -EFAULT on error. * @@ -236,3 +261,4 @@ EXPORT_SYMBOL(csum_partial_copy_fromiovecend); EXPORT_SYMBOL(memcpy_fromiovec); EXPORT_SYMBOL(memcpy_fromiovecend); EXPORT_SYMBOL(memcpy_toiovec); +EXPORT_SYMBOL(memcpy_toiovecend); -- cgit v1.2.3-70-g09d2 From 6f26c9a7555e5bcca3560919db9b852015077dae Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 20 Apr 2009 01:26:11 +0000 Subject: tun: fix tun_chr_aio_write so that aio works aio_write gets const struct iovec * but tun_chr_aio_write casts this to struct iovec * and modifies the iovec. As a result, attempts to use io_submit to send packets to a tun device fail with weird errors such as EINVAL. Since tun is the only user of skb_copy_datagram_from_iovec, we can fix this simply by changing the later so that it does not touch the iovec passed to it. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 13 ++++++++----- include/linux/skbuff.h | 3 ++- include/linux/socket.h | 4 ++-- net/core/datagram.c | 20 ++++++++++++++------ net/core/iovec.c | 7 ++++--- 5 files changed, 30 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3b513e29d39..589f0ca668d 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -540,31 +540,34 @@ static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun, /* Get packet from user space buffer */ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, - struct iovec *iv, size_t count, + const struct iovec *iv, size_t count, int noblock) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t len = count, align = 0; struct virtio_net_hdr gso = { 0 }; + int offset = 0; if (!(tun->flags & TUN_NO_PI)) { if ((len -= sizeof(pi)) > count) return -EINVAL; - if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) + if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) return -EFAULT; + offset += sizeof(pi); } if (tun->flags & TUN_VNET_HDR) { if ((len -= sizeof(gso)) > count) return -EINVAL; - if (memcpy_fromiovec((void *)&gso, iv, sizeof(gso))) + if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) return -EFAULT; if (gso.hdr_len > len) return -EINVAL; + offset += sizeof(pi); } if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) { @@ -581,7 +584,7 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, return PTR_ERR(skb); } - if (skb_copy_datagram_from_iovec(skb, 0, iv, len)) { + if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) { tun->dev->stats.rx_dropped++; kfree_skb(skb); return -EFAULT; @@ -673,7 +676,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count); - result = tun_get_user(tun, (struct iovec *)iv, iov_length(iv, count), + result = tun_get_user(tun, iv, iov_length(iv, count), file->f_flags & O_NONBLOCK); tun_put(tun); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index af2b21bdda8..1b5c3d298f4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1715,7 +1715,8 @@ extern int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, struct iovec *iov); extern int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, - struct iovec *from, + const struct iovec *from, + int from_offset, int len); extern int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, diff --git a/include/linux/socket.h b/include/linux/socket.h index 171b08db9c4..42a0396f2c5 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -309,8 +309,8 @@ struct ucred { #ifdef __KERNEL__ extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); -extern int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, - int offset, int len); +extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, + int offset, int len); extern int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, diff --git a/net/core/datagram.c b/net/core/datagram.c index 4dbb05cd572..914d5fa773b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -435,13 +435,15 @@ EXPORT_SYMBOL(skb_copy_datagram_const_iovec); * @skb: buffer to copy * @offset: offset in the buffer to start copying to * @from: io vector to copy to + * @from_offset: offset in the io vector to start copying from * @len: amount of data to copy to buffer from iovec * * 
Returns 0 or -EFAULT. - * Note: the iovec is modified during the copy. + * Note: the iovec is not modified during the copy. */ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, - struct iovec *from, int len) + const struct iovec *from, int from_offset, + int len) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -450,11 +452,12 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (memcpy_fromiovec(skb->data + offset, from, copy)) + if (memcpy_fromiovecend(skb->data + offset, from, 0, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; + from_offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? */ @@ -473,8 +476,9 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > len) copy = len; vaddr = kmap(page); - err = memcpy_fromiovec(vaddr + frag->page_offset + - offset - start, from, copy); + err = memcpy_fromiovecend(vaddr + frag->page_offset + + offset - start, + from, from_offset, copy); kunmap(page); if (err) goto fault; @@ -482,6 +486,7 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (!(len -= copy)) return 0; offset += copy; + from_offset += copy; } start = end; } @@ -500,11 +505,14 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, copy = len; if (skb_copy_datagram_from_iovec(list, offset - start, - from, copy)) + from, + from_offset, + copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; + from_offset += copy; } start = end; } diff --git a/net/core/iovec.c b/net/core/iovec.c index a215545c0a3..40a76ce19d9 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -147,10 +147,11 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) } /* - * For use with ip_build_xmit + * Copy iovec from kernel. Returns -EFAULT on error. */ -int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, - int len) + +int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, + int offset, int len) { /* Skip over the finished iovecs */ while (offset >= iov->iov_len) { -- cgit v1.2.3-70-g09d2 From 7a4f453b6d7379a7c380825949977c5a838aa012 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 22 Apr 2009 16:53:34 +0800 Subject: tracing/events: make struct trace_entry->type to be int type struct trace_entry->type is unsigned char, while trace event's id is int type, thus for a event with id >= 256, it's entry->type is cast to (id % 256), and then we can't see the trace output of this event. 
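The truncation is plain C integer narrowing; a minimal user-space illustration of the same effect (not kernel code):

	#include <stdio.h>

	int main(void)
	{
		int id = 256;			/* first id beyond the char range */
		unsigned char type = id;	/* what the old entry->type did: 256 % 256 == 0 */

		printf("id=%d, stored type=%u\n", id, type);
		return 0;
	}

In-kernel, the wrapped type 0 matches no registered event, hence: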
# insmod trace-events-sample.ko # echo foo_bar > /mnt/tracing/set_event # cat /debug/tracing/events/trace-events-sample/foo_bar/id 256 # cat /mnt/tracing/trace_pipe <...>-3548 [001] 215.091142: Unknown type 0 <...>-3548 [001] 216.089207: Unknown type 0 <...>-3548 [001] 217.087271: Unknown type 0 <...>-3548 [001] 218.085332: Unknown type 0 [ Impact: fix output for trace events with id >= 256 ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <49EEDB0E.5070207@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 ++-- include/trace/ftrace.h | 2 +- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 75f3ac01a87..2a4a4074991 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -16,7 +16,7 @@ struct dentry; * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { - unsigned char type; + int type; unsigned char flags; unsigned char preempt_count; int pid; @@ -73,7 +73,7 @@ enum print_line_t { struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 39a3351f2e7..15ef08d9add 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -198,7 +198,7 @@ ftrace_define_fields_##call(void) \ struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ - __common_field(unsigned char, type); \ + __common_field(int, type); \ __common_field(unsigned char, flags); \ __common_field(unsigned char, preempt_count); \ __common_field(int, pid); \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b9a3adce922..b6183bc9eca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -838,7 +838,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc) { @@ -881,7 +881,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr, } struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc) { return trace_buffer_lock_reserve(&global_trace, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 247948e81b0..7d55bcf50e4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -422,7 +422,7 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9ea55a7dfde..5d6e879cf87 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(unsigned char, type), + FIELD(int, type), FIELD(unsigned char, 
flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), -- cgit v1.2.3-70-g09d2 From c1c6b14b22af0f85d05a70405dc3fba5de840c7b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 30 Mar 2009 11:32:46 +0200 Subject: rfkill: remove deprecated state constants I only did a superficial review, but these constants are stupid to have, and without proper warnings nobody will review the code anyway; no amount of shouting will help. Also fix wimax to use the correct states. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- arch/arm/mach-pxa/tosa-bt.c | 4 ++-- drivers/net/usb/hso.c | 4 ++-- include/linux/rfkill.h | 8 -------- net/wimax/op-rfkill.c | 8 ++++---- 4 files changed, 8 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-pxa/tosa-bt.c b/arch/arm/mach-pxa/tosa-bt.c index fb0294bd431..bde42aa2937 100644 --- a/arch/arm/mach-pxa/tosa-bt.c +++ b/arch/arm/mach-pxa/tosa-bt.c @@ -38,9 +38,9 @@ static void tosa_bt_off(struct tosa_bt_data *data) static int tosa_bt_toggle_radio(void *data, enum rfkill_state state) { pr_info("BT_RADIO going: %s\n", - state == RFKILL_STATE_ON ? "on" : "off"); + state == RFKILL_STATE_UNBLOCKED ? "on" : "off"); - if (state == RFKILL_STATE_ON) { + if (state == RFKILL_STATE_UNBLOCKED) { pr_info("TOSA_BT: going ON\n"); tosa_bt_on(data); } else { diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index f84b78d94c4..d696e5fbc17 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2484,7 +2484,7 @@ static int add_net_device(struct hso_device *hso_dev) static int hso_radio_toggle(void *data, enum rfkill_state state) { struct hso_device *hso_dev = data; - int enabled = (state == RFKILL_STATE_ON); + int enabled = (state == RFKILL_STATE_UNBLOCKED); int rv; mutex_lock(&hso_dev->mutex); @@ -2522,7 +2522,7 @@ static void hso_create_rfkill(struct hso_device *hso_dev, snprintf(rfkn, 20, "hso-%d", interface->altsetting->desc.bInterfaceNumber); hso_net->rfkill->name = rfkn; - hso_net->rfkill->state = RFKILL_STATE_ON; + hso_net->rfkill->state = RFKILL_STATE_UNBLOCKED; hso_net->rfkill->data = hso_dev; hso_net->rfkill->toggle_radio = hso_radio_toggle; if (rfkill_register(hso_net->rfkill) < 0) { diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 164332cbb77..e1ec7d9aa49 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -52,14 +52,6 @@ enum rfkill_state { RFKILL_STATE_MAX, /* marker for last valid state */ }; -/* - * These are DEPRECATED, drivers using them should be verified to - * comply with the rfkill usage guidelines in Documentation/rfkill.txt - * and then converted to use the new names for rfkill_state - */ -#define RFKILL_STATE_OFF RFKILL_STATE_SOFT_BLOCKED -#define RFKILL_STATE_ON RFKILL_STATE_UNBLOCKED - /** * struct rfkill - rfkill control structure. * @name: Name of the switch. diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c index 2b75aee0421..870032faece 100644 --- a/net/wimax/op-rfkill.c +++ b/net/wimax/op-rfkill.c @@ -113,7 +113,7 @@ void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev, if (state != wimax_dev->rf_hw) { wimax_dev->rf_hw = state; rfkill_state = state == WIMAX_RF_ON ?
- RFKILL_STATE_OFF : RFKILL_STATE_ON; + RFKILL_STATE_UNBLOCKED : RFKILL_STATE_SOFT_BLOCKED; if (wimax_dev->rf_hw == WIMAX_RF_ON && wimax_dev->rf_sw == WIMAX_RF_ON) wimax_state = WIMAX_ST_READY; @@ -259,10 +259,10 @@ int wimax_rfkill_toggle_radio(void *data, enum rfkill_state state) d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); switch (state) { - case RFKILL_STATE_ON: + case RFKILL_STATE_SOFT_BLOCKED: rf_state = WIMAX_RF_OFF; break; - case RFKILL_STATE_OFF: + case RFKILL_STATE_UNBLOCKED: rf_state = WIMAX_RF_ON; break; default: @@ -361,7 +361,7 @@ int wimax_rfkill_add(struct wimax_dev *wimax_dev) wimax_dev->rfkill = rfkill; rfkill->name = wimax_dev->name; - rfkill->state = RFKILL_STATE_OFF; + rfkill->state = RFKILL_STATE_UNBLOCKED; rfkill->data = wimax_dev; rfkill->toggle_radio = wimax_rfkill_toggle_radio; rfkill->user_claim_unsupported = 1; -- cgit v1.2.3-70-g09d2 From 621cac85297de5ba655e3430b007dd2e0da91da6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Mar 2009 14:14:31 +0100 Subject: rfkill: remove user_claim stuff Almost all drivers do not support user_claim, so remove it completely and always report -EOPNOTSUPP to userspace. Since userspace cannot really drive rfkill _anyway_ (due to the odd restrictions imposed by the documentation), having this code is just pointless. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- Documentation/rfkill.txt | 16 +++++------ drivers/net/wireless/ath9k/main.c | 1 - drivers/net/wireless/b43/rfkill.c | 1 - drivers/net/wireless/b43legacy/rfkill.c | 1 - drivers/net/wireless/iwlwifi/iwl-rfkill.c | 1 - drivers/platform/x86/acer-wmi.c | 1 - drivers/platform/x86/hp-wmi.c | 3 --- drivers/platform/x86/sony-laptop.c | 4 --- drivers/platform/x86/toshiba_acpi.c | 1 - include/linux/rfkill.h | 6 ----- net/rfkill/rfkill.c | 45 +++---------------------------- net/wimax/op-rfkill.c | 1 - 12 files changed, 9 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt index 4d3ee317a4a..40c3a3f1081 100644 --- a/Documentation/rfkill.txt +++ b/Documentation/rfkill.txt @@ -521,16 +521,12 @@ status of the system. Input devices may issue events that are related to rfkill. These are the various KEY_* events and SW_* events supported by rfkill-input.c. -******IMPORTANT****** -When rfkill-input is ACTIVE, userspace is NOT TO CHANGE THE STATE OF AN RFKILL -SWITCH IN RESPONSE TO AN INPUT EVENT also handled by rfkill-input, unless it -has set to true the user_claim attribute for that particular switch. This rule -is *absolute*; do NOT violate it. -******IMPORTANT****** - -Userspace must not assume it is the only source of control for rfkill switches. -Their state CAN and WILL change due to firmware actions, direct user actions, -and the rfkill-input EPO override for *_RFKILL_ALL. +Userspace may not change the state of an rfkill switch in response to an +input event; it should refrain from changing states entirely. + +Userspace cannot assume it is the only source of control for rfkill switches. +Their state can change due to firmware actions, direct user actions, and the +rfkill-input EPO override for *_RFKILL_ALL. When rfkill-input is not active, userspace must initiate a rfkill status change by writing to the "state" attribute in order for anything to happen.
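For illustration, a minimal userspace sketch in C of such a "state" write (an editorial example, not part of the patch; it assumes a switch registered as rfkill0, i.e. the conventional sysfs file /sys/class/rfkill/rfkill0/state, where "0" requests RFKILL_STATE_SOFT_BLOCKED and "1" requests RFKILL_STATE_UNBLOCKED):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/class/rfkill/rfkill0/state", O_WRONLY);

	if (fd < 0)
		return 1;
	/* request the radio to be unblocked */
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}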
diff --git a/drivers/net/wireless/ath9k/main.c b/drivers/net/wireless/ath9k/main.c index 13d4e6756c9..0607df20e49 100644 --- a/drivers/net/wireless/ath9k/main.c +++ b/drivers/net/wireless/ath9k/main.c @@ -1267,7 +1267,6 @@ static int ath_init_sw_rfkill(struct ath_softc *sc) sc->rf_kill.rfkill->data = sc; sc->rf_kill.rfkill->toggle_radio = ath_sw_toggle_radio; sc->rf_kill.rfkill->state = RFKILL_STATE_UNBLOCKED; - sc->rf_kill.rfkill->user_claim_unsupported = 1; return 0; } diff --git a/drivers/net/wireless/b43/rfkill.c b/drivers/net/wireless/b43/rfkill.c index afad4235869..9e1d00bc24d 100644 --- a/drivers/net/wireless/b43/rfkill.c +++ b/drivers/net/wireless/b43/rfkill.c @@ -139,7 +139,6 @@ void b43_rfkill_init(struct b43_wldev *dev) rfk->rfkill->state = RFKILL_STATE_UNBLOCKED; rfk->rfkill->data = dev; rfk->rfkill->toggle_radio = b43_rfkill_soft_toggle; - rfk->rfkill->user_claim_unsupported = 1; rfk->poll_dev = input_allocate_polled_device(); if (!rfk->poll_dev) { diff --git a/drivers/net/wireless/b43legacy/rfkill.c b/drivers/net/wireless/b43legacy/rfkill.c index b32bf6a94f1..4b0c7d27a51 100644 --- a/drivers/net/wireless/b43legacy/rfkill.c +++ b/drivers/net/wireless/b43legacy/rfkill.c @@ -142,7 +142,6 @@ void b43legacy_rfkill_init(struct b43legacy_wldev *dev) rfk->rfkill->state = RFKILL_STATE_UNBLOCKED; rfk->rfkill->data = dev; rfk->rfkill->toggle_radio = b43legacy_rfkill_soft_toggle; - rfk->rfkill->user_claim_unsupported = 1; rfk->poll_dev = input_allocate_polled_device(); if (!rfk->poll_dev) { diff --git a/drivers/net/wireless/iwlwifi/iwl-rfkill.c b/drivers/net/wireless/iwlwifi/iwl-rfkill.c index 2ad9faf1508..65605ad44e4 100644 --- a/drivers/net/wireless/iwlwifi/iwl-rfkill.c +++ b/drivers/net/wireless/iwlwifi/iwl-rfkill.c @@ -91,7 +91,6 @@ int iwl_rfkill_init(struct iwl_priv *priv) priv->rfkill->data = priv; priv->rfkill->state = RFKILL_STATE_UNBLOCKED; priv->rfkill->toggle_radio = iwl_rfkill_soft_rf_kill; - priv->rfkill->user_claim_unsupported = 1; priv->rfkill->dev.class->suspend = NULL; priv->rfkill->dev.class->resume = NULL; diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index 0f6e43bf4fc..62d02b3c998 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -1005,7 +1005,6 @@ enum rfkill_type type, char *name, u32 cap) *data = cap; rfkill_dev->data = data; rfkill_dev->toggle_radio = acer_rfkill_set; - rfkill_dev->user_claim_unsupported = 1; err = rfkill_register(rfkill_dev); if (err) { diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index 50d9019de2b..fe171fad12c 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -434,7 +434,6 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) wifi_rfkill->name = "hp-wifi"; wifi_rfkill->state = hp_wmi_wifi_state(); wifi_rfkill->toggle_radio = hp_wmi_wifi_set; - wifi_rfkill->user_claim_unsupported = 1; err = rfkill_register(wifi_rfkill); if (err) goto add_sysfs_error; @@ -446,7 +445,6 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) bluetooth_rfkill->name = "hp-bluetooth"; bluetooth_rfkill->state = hp_wmi_bluetooth_state(); bluetooth_rfkill->toggle_radio = hp_wmi_bluetooth_set; - bluetooth_rfkill->user_claim_unsupported = 1; err = rfkill_register(bluetooth_rfkill); if (err) goto register_bluetooth_error; @@ -457,7 +455,6 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) wwan_rfkill->name = "hp-wwan"; wwan_rfkill->state = hp_wmi_wwan_state(); wwan_rfkill->toggle_radio = 
hp_wmi_wwan_set; - wwan_rfkill->user_claim_unsupported = 1; err = rfkill_register(wwan_rfkill); if (err) goto register_wwan_err; diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index d3c92d777bd..184e99e7268 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -1097,7 +1097,6 @@ static int sony_nc_setup_wifi_rfkill(struct acpi_device *device) sony_wifi_rfkill->name = "sony-wifi"; sony_wifi_rfkill->toggle_radio = sony_nc_rfkill_set; sony_wifi_rfkill->get_state = sony_nc_rfkill_get; - sony_wifi_rfkill->user_claim_unsupported = 1; sony_wifi_rfkill->data = (void *)SONY_WIFI; err = rfkill_register(sony_wifi_rfkill); if (err) @@ -1119,7 +1118,6 @@ static int sony_nc_setup_bluetooth_rfkill(struct acpi_device *device) sony_bluetooth_rfkill->name = "sony-bluetooth"; sony_bluetooth_rfkill->toggle_radio = sony_nc_rfkill_set; sony_bluetooth_rfkill->get_state = sony_nc_rfkill_get; - sony_bluetooth_rfkill->user_claim_unsupported = 1; sony_bluetooth_rfkill->data = (void *)SONY_BLUETOOTH; err = rfkill_register(sony_bluetooth_rfkill); if (err) @@ -1140,7 +1138,6 @@ static int sony_nc_setup_wwan_rfkill(struct acpi_device *device) sony_wwan_rfkill->name = "sony-wwan"; sony_wwan_rfkill->toggle_radio = sony_nc_rfkill_set; sony_wwan_rfkill->get_state = sony_nc_rfkill_get; - sony_wwan_rfkill->user_claim_unsupported = 1; sony_wwan_rfkill->data = (void *)SONY_WWAN; err = rfkill_register(sony_wwan_rfkill); if (err) @@ -1161,7 +1158,6 @@ static int sony_nc_setup_wimax_rfkill(struct acpi_device *device) sony_wimax_rfkill->name = "sony-wimax"; sony_wimax_rfkill->toggle_radio = sony_nc_rfkill_set; sony_wimax_rfkill->get_state = sony_nc_rfkill_get; - sony_wimax_rfkill->user_claim_unsupported = 1; sony_wimax_rfkill->data = (void *)SONY_WIMAX; err = rfkill_register(sony_wimax_rfkill); if (err) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 9f187265db8..4345089f517 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -803,7 +803,6 @@ static int __init toshiba_acpi_init(void) toshiba_acpi.rfk_dev->name = toshiba_acpi.bt_name; toshiba_acpi.rfk_dev->toggle_radio = bt_rfkill_toggle_radio; - toshiba_acpi.rfk_dev->user_claim_unsupported = 1; toshiba_acpi.rfk_dev->data = &toshiba_acpi; if (hci_get_bt_on(&bt_on) == HCI_SUCCESS && bt_on) { diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index e1ec7d9aa49..de18ef227e0 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -58,9 +58,6 @@ enum rfkill_state { * @type: Radio type which the button controls, the value stored * here should be a value from enum rfkill_type. * @state: State of the switch, "UNBLOCKED" means radio can operate. - * @user_claim_unsupported: Whether the hardware supports exclusive - * RF-kill control by userspace. Set this before registering. - * @user_claim: Set when the switch is controlled exlusively by userspace. * @mutex: Guards switch state transitions. It serializes callbacks * and also protects the state. 
* @data: Pointer to the RF button drivers private data which will be @@ -83,9 +80,6 @@ struct rfkill { const char *name; enum rfkill_type type; - bool user_claim_unsupported; - bool user_claim; - /* the mutex serializes callbacks and also protects * the state */ struct mutex mutex; diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c index 3eaa39403c1..df1269c5ca7 100644 --- a/net/rfkill/rfkill.c +++ b/net/rfkill/rfkill.c @@ -200,7 +200,7 @@ static void __rfkill_switch_all(const enum rfkill_type type, rfkill_global_states[type].current_state = state; list_for_each_entry(rfkill, &rfkill_list, node) { - if ((!rfkill->user_claim) && (rfkill->type == type)) { + if (rfkill->type == type) { mutex_lock(&rfkill->mutex); rfkill_toggle_radio(rfkill, state, 0); mutex_unlock(&rfkill->mutex); @@ -447,53 +447,14 @@ static ssize_t rfkill_claim_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rfkill *rfkill = to_rfkill(dev); - - return sprintf(buf, "%d\n", rfkill->user_claim); + return sprintf(buf, "%d\n", 0); } static ssize_t rfkill_claim_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct rfkill *rfkill = to_rfkill(dev); - unsigned long claim_tmp; - bool claim; - int error; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (rfkill->user_claim_unsupported) - return -EOPNOTSUPP; - - error = strict_strtoul(buf, 0, &claim_tmp); - if (error) - return error; - claim = !!claim_tmp; - - /* - * Take the global lock to make sure the kernel is not in - * the middle of rfkill_switch_all - */ - error = mutex_lock_killable(&rfkill_global_mutex); - if (error) - return error; - - if (rfkill->user_claim != claim) { - if (!claim && !rfkill_epo_lock_active) { - mutex_lock(&rfkill->mutex); - rfkill_toggle_radio(rfkill, - rfkill_global_states[rfkill->type].current_state, - 0); - mutex_unlock(&rfkill->mutex); - } - rfkill->user_claim = claim; - } - - mutex_unlock(&rfkill_global_mutex); - - return error ? error : count; + return -EOPNOTSUPP; } static struct device_attribute rfkill_dev_attrs[] = { diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c index 870032faece..a3616e2ccb8 100644 --- a/net/wimax/op-rfkill.c +++ b/net/wimax/op-rfkill.c @@ -364,7 +364,6 @@ int wimax_rfkill_add(struct wimax_dev *wimax_dev) rfkill->state = RFKILL_STATE_UNBLOCKED; rfkill->data = wimax_dev; rfkill->toggle_radio = wimax_rfkill_toggle_radio; - rfkill->user_claim_unsupported = 1; /* Initialize the input device for the hw key */ input_dev = input_allocate_device(); -- cgit v1.2.3-70-g09d2 From a3b8b0569fbef725597f05278ec58083321f6e9d Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Fri, 27 Mar 2009 21:59:49 +0200 Subject: nl80211: Add Michael MIC failure event Define a new nl80211 event, NL80211_CMD_MICHAEL_MIC_FAILURE, to be used to notify user space about locally detected Michael MIC failures. This matches with the MLME-MICHAELMICFAILURE.indication() primitive. Since we do not actually have TSC in the skb anymore when mac80211_ev_michael_mic_failure() is called, that function is changed to take in the TSC as an optional parameter instead of as a requirement to include the TSC after the hdr field (which we did not really follow). For now, TSC is not included in the events from mac80211, but it could be added at some point. Signed-off-by: Jouni Malinen Signed-off-by: John W. 
Linville --- include/linux/nl80211.h | 28 ++++++++++++++++++++++++++++ include/net/cfg80211.h | 16 ++++++++++++++++ net/mac80211/event.c | 17 +++++++++-------- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/rx.c | 2 +- net/mac80211/wpa.c | 2 +- net/wireless/mlme.c | 10 ++++++++++ net/wireless/nl80211.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.h | 5 +++++ 9 files changed, 111 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index cbe8ce3bf48..27f230f063b 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -199,6 +199,14 @@ * NL80211_CMD_AUTHENTICATE but for Disassociation frames (similar to * MLME-DISASSOCIATE.request and MLME-DISASSOCIATE.indication primitives). * + * @NL80211_CMD_MICHAEL_MIC_FAILURE: notification of a locally detected Michael + * MIC (part of TKIP) failure; sent on the "mlme" multicast group; the + * event includes %NL80211_ATTR_MAC to describe the source MAC address of + * the frame with invalid MIC, %NL80211_ATTR_KEY_TYPE to show the key + * type, %NL80211_ATTR_KEY_IDX to indicate the key identifier, and + * %NL80211_ATTR_KEY_SEQ to indicate the TSC value of the frame; this + * event matches with MLME-MICHAELMICFAILURE.indication() primitive + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -260,6 +268,8 @@ enum nl80211_commands { NL80211_CMD_DEAUTHENTICATE, NL80211_CMD_DISASSOCIATE, + NL80211_CMD_MICHAEL_MIC_FAILURE, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -408,6 +418,9 @@ enum nl80211_commands { * @NL80211_ATTR_REASON_CODE: ReasonCode for %NL80211_CMD_DEAUTHENTICATE and * %NL80211_CMD_DISASSOCIATE, u16 * + * @NL80211_ATTR_KEY_TYPE: Key Type, see &enum nl80211_key_type, represented as + * a u32 + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -492,6 +505,8 @@ enum nl80211_attrs { NL80211_ATTR_AUTH_TYPE, NL80211_ATTR_REASON_CODE, + NL80211_ATTR_KEY_TYPE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -1062,4 +1077,17 @@ enum nl80211_auth_type { NL80211_AUTHTYPE_FT, NL80211_AUTHTYPE_NETWORK_EAP, }; + +/** + * enum nl80211_key_type - Key Type + * @NL80211_KEYTYPE_GROUP: Group (broadcast/multicast) key + * @NL80211_KEYTYPE_PAIRWISE: Pairwise (unicast/individual) key + * @NL80211_KEYTYPE_PEERKEY: PeerKey (DLS) + */ +enum nl80211_key_type { + NL80211_KEYTYPE_GROUP, + NL80211_KEYTYPE_PAIRWISE, + NL80211_KEYTYPE_PEERKEY, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ec33096fc65..f8bf0c86650 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -957,4 +957,20 @@ void cfg80211_hold_bss(struct cfg80211_bss *bss); */ void cfg80211_unhold_bss(struct cfg80211_bss *bss); +/** + * cfg80211_michael_mic_failure - notification of Michael MIC failure (TKIP) + * @dev: network device + * @addr: The source MAC address of the frame + * @key_type: The key type that the received frame used + * @key_id: Key identifier (0..3) + * @tsc: The TSC value of the frame that generated the MIC failure (6 octets) + * + * This function is called whenever the local MAC detects a MIC failure in a + * received frame. This matches with MLME-MICHAELMICFAILURE.indication() + * primitive. 
+ */ +void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, + enum nl80211_key_type key_type, int key_id, + const u8 *tsc); + #endif /* __NET_CFG80211_H */ diff --git a/net/mac80211/event.c b/net/mac80211/event.c index 0d95561c0ee..f288d01a634 100644 --- a/net/mac80211/event.c +++ b/net/mac80211/event.c @@ -12,12 +12,12 @@ #include "ieee80211_i.h" /* - * indicate a failed Michael MIC to userspace; the passed packet - * (in the variable hdr) must be long enough to extract the TKIP - * fields like TSC + * Indicate a failed Michael MIC to userspace. If the caller knows the TSC of + * the frame that generated the MIC failure (i.e., if it was provided by the + * driver or is still in the frame), it should provide that information. */ void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx, - struct ieee80211_hdr *hdr) + struct ieee80211_hdr *hdr, const u8 *tsc) { union iwreq_data wrqu; char *buf = kmalloc(128, GFP_ATOMIC); @@ -34,8 +34,9 @@ void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int ke kfree(buf); } - /* - * TODO: re-add support for sending MIC failure indication - * with all info via nl80211 - */ + cfg80211_michael_mic_failure(sdata->dev, hdr->addr2, + (hdr->addr1[0] & 0x01) ? + NL80211_KEYTYPE_GROUP : + NL80211_KEYTYPE_PAIRWISE, + keyidx, tsc); } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index e6ed78cb16b..312347d102c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1060,7 +1060,7 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, int ieee80211_frame_duration(struct ieee80211_local *local, size_t len, int rate, int erp, int short_preamble); void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx, - struct ieee80211_hdr *hdr); + struct ieee80211_hdr *hdr, const u8 *tsc); void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata); void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int encrypt); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5fa7aedd90e..19c4b4589fe 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1932,7 +1932,7 @@ static void ieee80211_rx_michael_mic_report(struct net_device *dev, !ieee80211_is_auth(hdr->frame_control)) goto ignore; - mac80211_ev_michael_mic_failure(rx->sdata, keyidx, hdr); + mac80211_ev_michael_mic_failure(rx->sdata, keyidx, hdr, NULL); ignore: dev_kfree_skb(rx->skb); rx->skb = NULL; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 4f8bfea278f..dcfae8884b8 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -122,7 +122,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; mac80211_ev_michael_mic_failure(rx->sdata, rx->key->conf.keyidx, - (void *) skb->data); + (void *) skb->data, NULL); return RX_DROP_UNUSABLE; } diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 33ff7cca496..1407244a647 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -43,3 +43,13 @@ void cfg80211_send_disassoc(struct net_device *dev, const u8 *buf, size_t len) nl80211_send_disassoc(rdev, dev, buf, len); } EXPORT_SYMBOL(cfg80211_send_disassoc); + +void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, + enum nl80211_key_type key_type, int key_id, + const u8 *tsc) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + nl80211_michael_mic_failure(rdev, dev, addr, key_type, key_id, tsc); +} 
+EXPORT_SYMBOL(cfg80211_michael_mic_failure); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 195424eee77..1394115cde9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3430,6 +3430,46 @@ void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, NL80211_CMD_DISASSOCIATE); } +void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *addr, + enum nl80211_key_type key_type, int key_id, + const u8 *tsc) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_MICHAEL_MIC_FAILURE); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + if (addr) + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, addr); + NLA_PUT_U32(msg, NL80211_ATTR_KEY_TYPE, key_type); + NLA_PUT_U8(msg, NL80211_ATTR_KEY_IDX, key_id); + if (tsc) + NLA_PUT(msg, NL80211_ATTR_KEY_SEQ, 6, tsc); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_mlme_mcgrp.id, GFP_KERNEL); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + /* initialisation/exit functions */ int nl80211_init(void) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 55686fc264f..e4b92cccd15 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -23,5 +23,10 @@ extern void nl80211_send_deauth(struct cfg80211_registered_device *rdev, extern void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len); +extern void +nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *addr, + enum nl80211_key_type key_type, + int key_id, const u8 *tsc); #endif /* __NET_WIRELESS_NL80211_H */ -- cgit v1.2.3-70-g09d2 From 18a8365992a8041aa178ae9ad5f0d951d0457230 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 31 Mar 2009 12:12:05 +0200 Subject: cfg80211: introduce scan IE limit attribute This patch introduces a new attribute for a wiphy that tells userspace how long the information elements added to a probe request frame can be at most. It also updates the at76 to advertise that it cannot support that, and, for now until I can fix that, iwlwifi too. Signed-off-by: Johannes Berg Signed-off-by: John W. 
Linville --- drivers/net/wireless/at76c50x-usb.c | 1 + drivers/net/wireless/iwlwifi/iwl-core.c | 1 + include/linux/nl80211.h | 4 ++++ include/net/wireless.h | 1 + net/mac80211/main.c | 13 ++++++++++++- net/mac80211/util.c | 2 ++ net/wireless/nl80211.c | 7 +++++++ 7 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/at76c50x-usb.c b/drivers/net/wireless/at76c50x-usb.c index 69248ded510..55f947ac56d 100644 --- a/drivers/net/wireless/at76c50x-usb.c +++ b/drivers/net/wireless/at76c50x-usb.c @@ -2250,6 +2250,7 @@ static int at76_init_new_device(struct at76_priv *priv, /* mac80211 initialisation */ priv->hw->wiphy->max_scan_ssids = 1; + priv->hw->wiphy->max_scan_ie_len = 0; priv->hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION); priv->hw->wiphy->bands[IEEE80211_BAND_2GHZ] = &at76_supported_band; priv->hw->flags = IEEE80211_HW_RX_INCLUDES_FCS | diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c index 82abb1f9087..ef55f91374a 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.c +++ b/drivers/net/wireless/iwlwifi/iwl-core.c @@ -1306,6 +1306,7 @@ int iwl_setup_mac(struct iwl_priv *priv) hw->wiphy->custom_regulatory = true; hw->wiphy->max_scan_ssids = 1; + hw->wiphy->max_scan_ie_len = 0; /* XXX for now */ /* Default value; 4 EDCA QOS priorities */ hw->queues = 4; diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 27f230f063b..209cacee528 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -390,6 +390,8 @@ enum nl80211_commands { * * @NL80211_ATTR_MAX_NUM_SCAN_SSIDS: number of SSIDs you can scan with * a single scan request, a wiphy attribute. + * @NL80211_ATTR_MAX_SCAN_IE_LEN: maximum length of information elements + * that can be added to a scan request * * @NL80211_ATTR_SCAN_FREQUENCIES: nested attribute with frequencies (in MHz) * @NL80211_ATTR_SCAN_SSIDS: nested attribute with SSIDs, leave out for passive @@ -507,6 +509,8 @@ enum nl80211_attrs { NL80211_ATTR_KEY_TYPE, + NL80211_ATTR_MAX_SCAN_IE_LEN, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/wireless.h b/include/net/wireless.h index 64a76208580..2bcdeda46d8 100644 --- a/include/net/wireless.h +++ b/include/net/wireless.h @@ -222,6 +222,7 @@ struct wiphy { int bss_priv_size; u8 max_scan_ssids; + u16 max_scan_ie_len; /* If multiple wiphys are registered and you're handed e.g. * a regular netdev with assigned ieee80211_ptr, you won't diff --git a/net/mac80211/main.c b/net/mac80211/main.c index fbcbed6cad0..ee58a787369 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -728,7 +728,18 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, return NULL; wiphy->privid = mac80211_wiphy_privid; - wiphy->max_scan_ssids = 4; + + if (!ops->hw_scan) { + /* For hw_scan, driver needs to set these up. 
*/ + wiphy->max_scan_ssids = 4; + + /* we support a maximum of 32 rates in cfg80211 */ + wiphy->max_scan_ie_len = IEEE80211_MAX_DATA_LEN + - 2 - 32 /* SSID */ + - 4 - 32 /* (ext) supp rates */; + + } + /* Yes, putting cfg80211_bss into ieee80211_bss is a hack */ wiphy->bss_priv_size = sizeof(struct ieee80211_bss) - sizeof(struct cfg80211_bss); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index fdf432f1455..05caf34f31d 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -890,6 +890,8 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, *pos = rate->bitrate / 5; } + /* if adding more here, adjust max_scan_ie_len */ + if (ie) memcpy(skb_put(skb, ie_len), ie, ie_len); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1394115cde9..447fa1790b4 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -181,6 +181,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy)); NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, dev->wiphy.max_scan_ssids); + NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN, + dev->wiphy.max_scan_ie_len); nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES); if (!nl_modes) @@ -2528,6 +2530,11 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) else ie_len = 0; + if (ie_len > wiphy->max_scan_ie_len) { + err = -EINVAL; + goto out; + } + request = kzalloc(sizeof(*request) + sizeof(*ssid) * n_ssids + sizeof(channel) * n_channels -- cgit v1.2.3-70-g09d2 From 6bad8766620a3c8b64afa981502fdb543e3cfd6c Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 2 Apr 2009 14:08:09 -0400 Subject: cfg80211: send regulatory beacon hint events to userspace This informs userspace when a change has occurred on a world-roaming wiphy's channel, lifting some restrictions due to a regulatory beacon hint. Because this is now sent to userspace through the regulatory multicast group, we remove the debug prints we used to use, as they are no longer necessary. Acked-by: Johannes Berg Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- include/linux/nl80211.h | 34 ++++++++++++++++++++++++++++++- net/wireless/nl80211.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.h | 5 +++++ net/wireless/reg.c | 28 ++++++++++++------------- 4 files changed, 106 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 209cacee528..05ba3539b77 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -7,7 +7,7 @@ * Copyright 2008 Michael Wu * Copyright 2008 Luis Carlos Cobo * Copyright 2008 Michael Buesch - * Copyright 2008 Luis R. Rodriguez + * Copyright 2008, 2009 Luis R. Rodriguez * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * @@ -166,6 +166,22 @@ * set (%NL80211_ATTR_REG_TYPE), if the type of regulatory domain is * %NL80211_REG_TYPE_COUNTRY the alpha2 to which we have moved on * to (%NL80211_ATTR_REG_ALPHA2). + * @NL80211_CMD_REG_BEACON_HINT: indicates to userspace that an AP beacon + * has been found while world roaming thus enabling active scan or + * any mode of operation that initiates TX (beacons) on a channel + * where we would not have been able to do either before.
As an example + * if you are world roaming (regulatory domain set to world or if your + * driver is using a custom world roaming regulatory domain) and while + * doing a passive scan on the 5 GHz band you find an AP there (if not + * on a DFS channel) you will now be able to actively scan for that AP + * or use AP mode on your card on that same channel. Note that this will + * never be used for channels 1-11 on the 2 GHz band as they are always + * enabled worldwide. This beacon hint is only sent if your device had + * either disabled active scanning or beaconing on a channel. We send to + * userspace the wiphy on which we removed a restriction from + * (%NL80211_ATTR_WIPHY) and the channel on which this occurred + * before (%NL80211_ATTR_FREQ_BEFORE) and after (%NL80211_ATTR_FREQ_AFTER) + * the beacon hint was processed. * * @NL80211_CMD_AUTHENTICATE: authentication request and notification. * This command is used both as a command (request to authenticate) and @@ -270,6 +286,8 @@ enum nl80211_commands { NL80211_CMD_MICHAEL_MIC_FAILURE, + NL80211_CMD_REG_BEACON_HINT, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -288,6 +306,7 @@ enum nl80211_commands { #define NL80211_CMD_ASSOCIATE NL80211_CMD_ASSOCIATE #define NL80211_CMD_DEAUTHENTICATE NL80211_CMD_DEAUTHENTICATE #define NL80211_CMD_DISASSOCIATE NL80211_CMD_DISASSOCIATE +#define NL80211_CMD_REG_BEACON_HINT NL80211_CMD_REG_BEACON_HINT /** * enum nl80211_attrs - nl80211 netlink attributes @@ -423,6 +442,17 @@ enum nl80211_commands { * @NL80211_ATTR_KEY_TYPE: Key Type, see &enum nl80211_key_type, represented as * a u32 * + * @NL80211_ATTR_FREQ_BEFORE: A channel which has suffered a regulatory change + * due to considerations from a beacon hint. This attribute reflects + * the state of the channel _before_ the beacon hint processing. This + * attribute consists of a nested attribute containing + * NL80211_FREQUENCY_ATTR_* + * @NL80211_ATTR_FREQ_AFTER: A channel which has suffered a regulatory change + * due to considerations from a beacon hint. This attribute reflects + * the state of the channel _after_ the beacon hint processing.
This + * attribute consists of a nested attribute containing + * NL80211_FREQUENCY_ATTR_* + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -511,6 +541,8 @@ enum nl80211_attrs { NL80211_ATTR_MAX_SCAN_IE_LEN, + NL80211_ATTR_FREQ_BEFORE, + NL80211_ATTR_FREQ_AFTER, /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7285bdc4e59..85b5aa3c76f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3491,6 +3491,60 @@ void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } +void nl80211_send_beacon_hint_event(struct wiphy *wiphy, + struct ieee80211_channel *channel_before, + struct ieee80211_channel *channel_after) +{ + struct sk_buff *msg; + void *hdr; + struct nlattr *nl_freq; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_BEACON_HINT); + if (!hdr) { + nlmsg_free(msg); + return; + } + + /* + * Since we are applying the beacon hint to a wiphy we know its + * wiphy_idx is valid + */ + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, get_wiphy_idx(wiphy)); + + /* Before */ + nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE); + if (!nl_freq) + goto nla_put_failure; + if (nl80211_msg_put_channel(msg, channel_before)) + goto nla_put_failure; + nla_nest_end(msg, nl_freq); + + /* After */ + nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER); + if (!nl_freq) + goto nla_put_failure; + if (nl80211_msg_put_channel(msg, channel_after)) + goto nla_put_failure; + nla_nest_end(msg, nl_freq); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_regulatory_mcgrp.id, GFP_ATOMIC); + + return; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + /* initialisation/exit functions */ int nl80211_init(void) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index e4b92cccd15..b3aaa59afa0 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -29,4 +29,9 @@ nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, enum nl80211_key_type key_type, int key_id, const u8 *tsc); +extern void +nl80211_send_beacon_hint_event(struct wiphy *wiphy, + struct ieee80211_channel *channel_before, + struct ieee80211_channel *channel_after); + #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 4af4304cec3..574e217bcc8 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1049,18 +1049,10 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, struct reg_beacon *reg_beacon) { -#ifdef CONFIG_CFG80211_REG_DEBUG -#define REG_DEBUG_BEACON_FLAG(desc) \ - printk(KERN_DEBUG "cfg80211: Enabling " desc " on " \ - "frequency: %d MHz (Ch %d) on %s\n", \ - reg_beacon->chan.center_freq, \ - ieee80211_frequency_to_channel(reg_beacon->chan.center_freq), \ - wiphy_name(wiphy)); -#else -#define REG_DEBUG_BEACON_FLAG(desc) do {} while (0) -#endif struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; + bool channel_changed = false; + struct ieee80211_channel chan_before; assert_cfg80211_lock(); @@ -1070,20 +1062,28 @@ static void handle_reg_beacon(struct wiphy *wiphy, if (likely(chan->center_freq != reg_beacon->chan.center_freq)) return; + if (chan->beacon_found) + return; + + chan->beacon_found = true; + + chan_before.center_freq = chan->center_freq; + chan_before.flags
= chan->flags; + if ((chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) && !(chan->orig_flags & IEEE80211_CHAN_PASSIVE_SCAN)) { chan->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN; - REG_DEBUG_BEACON_FLAG("active scanning"); + channel_changed = true; } if ((chan->flags & IEEE80211_CHAN_NO_IBSS) && !(chan->orig_flags & IEEE80211_CHAN_NO_IBSS)) { chan->flags &= ~IEEE80211_CHAN_NO_IBSS; - REG_DEBUG_BEACON_FLAG("beaconing"); + channel_changed = true; } - chan->beacon_found = true; -#undef REG_DEBUG_BEACON_FLAG + if (channel_changed) + nl80211_send_beacon_hint_event(wiphy, &chan_before, chan); } /* -- cgit v1.2.3-70-g09d2 From 25e47c18ac4d8ad09c2ed4b99c1dbbcb7e3d2c51 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 2 Apr 2009 20:14:06 +0200 Subject: cfg80211: add cipher capabilities This adds the necessary code and fields to let drivers specify their cipher capabilities and exports them to userspace. Also update mac80211 to export the ciphers it has. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 4 ++++ include/net/wireless.h | 5 +++++ net/mac80211/main.c | 14 ++++++++++++++ net/wireless/nl80211.c | 14 +++++++++++++- 4 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 05ba3539b77..c01423888db 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -453,6 +453,9 @@ enum nl80211_commands { * attributes consists of a nested attribute containing * NL80211_FREQUENCY_ATTR_* * + * @NL80211_ATTR_CIPHER_SUITES: a set of u32 values indicating the supported + * cipher suites + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -540,6 +543,7 @@ enum nl80211_attrs { NL80211_ATTR_KEY_TYPE, NL80211_ATTR_MAX_SCAN_IE_LEN, + NL80211_ATTR_CIPHER_SUITES, NL80211_ATTR_FREQ_BEFORE, NL80211_ATTR_FREQ_AFTER, diff --git a/include/net/wireless.h b/include/net/wireless.h index 2bcdeda46d8..44c2642d3c0 100644 --- a/include/net/wireless.h +++ b/include/net/wireless.h @@ -205,6 +205,8 @@ struct ieee80211_supported_band { * on the reg_notifier() if it chooses to ignore future * regulatory domain changes caused by other drivers. * @signal_type: signal type reported in &struct cfg80211_bss. + * @cipher_suites: supported cipher suites + * @n_cipher_suites: number of supported cipher suites */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -224,6 +226,9 @@ struct wiphy { u8 max_scan_ssids; u16 max_scan_ie_len; + int n_cipher_suites; + const u32 *cipher_suites; + /* If multiple wiphys are registered and you're handed e.g. * a regular netdev with assigned ieee80211_ptr, you won't * know whether it points to a wiphy your driver has registered diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 679b3a14f11..c1145be72da 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -823,6 +823,15 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) struct ieee80211_master_priv *mpriv; int channels, i, j, max_bitrates; bool supp_ht; + static const u32 cipher_suites[] = { + WLAN_CIPHER_SUITE_WEP40, + WLAN_CIPHER_SUITE_WEP104, + WLAN_CIPHER_SUITE_TKIP, + WLAN_CIPHER_SUITE_CCMP, + + /* keep last -- depends on hw flags! 
*/ + WLAN_CIPHER_SUITE_AES_CMAC }; /* * generic code guarantees at least one band, @@ -894,6 +903,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (local->hw.wiphy->max_scan_ie_len) local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len; + local->hw.wiphy->cipher_suites = cipher_suites; + local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); + if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE)) + local->hw.wiphy->n_cipher_suites--; + result = wiphy_register(local->hw.wiphy); if (result < 0) goto fail_wiphy_register; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 85b5aa3c76f..d33cab0e0fb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -208,6 +208,10 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN, dev->wiphy.max_scan_ie_len); + NLA_PUT(msg, NL80211_ATTR_CIPHER_SUITES, + sizeof(u32) * dev->wiphy.n_cipher_suites, + dev->wiphy.cipher_suites); + nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES); if (!nl_modes) goto nla_put_failure; @@ -979,7 +983,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *drv; - int err; + int err, i; struct net_device *dev; struct key_params params; u8 key_idx = 0; @@ -1048,6 +1052,14 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock_rtnl; + for (i = 0; i < drv->wiphy.n_cipher_suites; i++) + if (params.cipher == drv->wiphy.cipher_suites[i]) + break; + if (i == drv->wiphy.n_cipher_suites) { + err = -EINVAL; + goto out; + } + if (!drv->ops->add_key) { err = -EOPNOTSUPP; goto out; -- cgit v1.2.3-70-g09d2 From 6a362bb1c9f900f7e6daeee52ff2d538badae49b Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Sun, 5 Apr 2009 11:01:20 -0400 Subject: ath5k: add support for Fukato Datacask Jupiter LEDs This adds support for the LEDs on the Jupiter netbook. Reported-by: Martin Bammer Signed-off-by: Bob Copeland Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath5k/led.c | 2 ++ include/linux/pci_ids.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath/ath5k/led.c b/drivers/net/wireless/ath/ath5k/led.c index 006f7716168..cbdc0b30842 100644 --- a/drivers/net/wireless/ath/ath5k/led.c +++ b/drivers/net/wireless/ath/ath5k/led.c @@ -67,6 +67,8 @@ static const struct pci_device_id ath5k_led_devices[] = { { ATH_SDEVICE(PCI_VENDOR_ID_AMBIT, 0x0428), ATH_LED(3, 0) }, /* Acer Extensa 5620z (nekoreeve@gmail.com) */ { ATH_SDEVICE(PCI_VENDOR_ID_QMI, 0x0105), ATH_LED(3, 0) }, + /* Fukato Datacask Jupiter 1014a (mrb74@gmx.at) */ + { ATH_SDEVICE(PCI_VENDOR_ID_AZWAVE, 0x1026), ATH_LED(3, 0) }, { } }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ee98cd57088..ea061e290d0 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2313,6 +2313,8 @@ #define PCI_VENDOR_ID_QMI 0x1a32 +#define PCI_VENDOR_ID_AZWAVE 0x1a3b + #define PCI_VENDOR_ID_TEKRAM 0x1de1 #define PCI_DEVICE_ID_TEKRAM_DC290 0xdc29 -- cgit v1.2.3-70-g09d2 From e45d8e534b67580eedd9b4910ccc16d6dd3cceff Mon Sep 17 00:00:00 2001 From: Bing Zhao Date: Mon, 6 Apr 2009 15:50:56 -0700 Subject: libertas: add support for Marvell SD8688 chip Use RxPD->pkt_ptr to locate the eth803 header in the received packet, since SD8688/v10 firmware allows a gap between the RxPD and the eth803 header.
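To picture the layout change: with older firmware the eth803 header immediately followed the RxPD, while SD8688/v10 firmware may leave a gap, so the receive path has to compute the header location from pkt_ptr. A minimal sketch of the lookup, mirroring the rx.c hunk below (field names taken from that hunk):

/*
 * old firmware:  [ rxpd ][ eth803 header ... ]
 * new firmware:  [ rxpd ][ optional gap ][ eth803 header ... ]
 *
 * pkt_ptr is the offset of the packet from the start of the rxpd,
 * so it covers both layouts (presumably just sizeof(struct rxpd)
 * when there is no gap).
 */
struct rxpd *p_rx_pd = (struct rxpd *)skb->data;
struct rxpackethdr *p_rx_pkt = (struct rxpackethdr *)
	((u8 *)p_rx_pd + le32_to_cpu(p_rx_pd->pkt_ptr));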
Set the SDIO block size to 256 for CMD53. The maximum block size for the SD8688 WLAN function is set to 512 in TPLFE_MAX_BLK_SIZE. But using 512 as the block size results in up to 2K bytes of data (4 blocks) being transferred, which causes a buffer overflow in the firmware. Both changes above are backward compatible with earlier firmware versions for SD8385/SD8686. The SDIO_DEVICE_IDs for the SD8688 chip are added in include/linux/mmc/sdio_ids.h. Signed-off-by: Kiran Divekar Signed-off-by: Bing Zhao Acked-by: Dan Williams Signed-off-by: John W. Linville --- drivers/net/wireless/Kconfig | 4 ++-- drivers/net/wireless/libertas/if_sdio.c | 17 +++++++++++++---- drivers/net/wireless/libertas/if_sdio.h | 2 ++ drivers/net/wireless/libertas/rx.c | 15 ++++++++------- include/linux/mmc/sdio_ids.h | 2 ++ 5 files changed, 27 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig index 8f279e7bd04..ad99470ae92 100644 --- a/drivers/net/wireless/Kconfig +++ b/drivers/net/wireless/Kconfig @@ -146,10 +146,10 @@ config LIBERTAS_CS A driver for Marvell Libertas 8385 CompactFlash devices. config LIBERTAS_SDIO - tristate "Marvell Libertas 8385 and 8686 SDIO 802.11b/g cards" + tristate "Marvell Libertas 8385/8686/8688 SDIO 802.11b/g cards" depends on LIBERTAS && MMC ---help--- - A driver for Marvell Libertas 8385 and 8686 SDIO devices. + A driver for Marvell Libertas 8385/8686/8688 SDIO devices. config LIBERTAS_SPI tristate "Marvell Libertas 8686 SPI 802.11b/g cards" diff --git a/drivers/net/wireless/libertas/if_sdio.c b/drivers/net/wireless/libertas/if_sdio.c index 76f4c653d64..55864c10f9f 100644 --- a/drivers/net/wireless/libertas/if_sdio.c +++ b/drivers/net/wireless/libertas/if_sdio.c @@ -48,8 +48,11 @@ static char *lbs_fw_name = NULL; module_param_named(fw_name, lbs_fw_name, charp, 0644); static const struct sdio_device_id if_sdio_ids[] = { - { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, SDIO_DEVICE_ID_MARVELL_LIBERTAS) }, - { /* end: all zeroes */ }, + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, + SDIO_DEVICE_ID_MARVELL_LIBERTAS) }, + { SDIO_DEVICE(SDIO_VENDOR_ID_MARVELL, + SDIO_DEVICE_ID_MARVELL_8688WLAN) }, + { /* end: all zeroes */ }, }; MODULE_DEVICE_TABLE(sdio, if_sdio_ids); @@ -73,6 +76,12 @@ static struct if_sdio_model if_sdio_models[] = { .helper = "sd8686_helper.bin", .firmware = "sd8686.bin", }, + { + /* 8688 */ + .model = 0x10, + .helper = "sd8688_helper.bin", + .firmware = "sd8688.bin", + }, }; struct if_sdio_packet { @@ -488,7 +497,7 @@ static int if_sdio_prog_helper(struct if_sdio_card *card) ret = 0; release: - sdio_set_block_size(card->func, 0); + sdio_set_block_size(card->func, IF_SDIO_BLOCK_SIZE); sdio_release_host(card->func); kfree(chunk_buffer); release_fw: @@ -624,7 +633,7 @@ static int if_sdio_prog_real(struct if_sdio_card *card) ret = 0; release: - sdio_set_block_size(card->func, 0); + sdio_set_block_size(card->func, IF_SDIO_BLOCK_SIZE); sdio_release_host(card->func); kfree(chunk_buffer); release_fw: diff --git a/drivers/net/wireless/libertas/if_sdio.h b/drivers/net/wireless/libertas/if_sdio.h index 533bdfbf5d2..37ada2c29aa 100644 --- a/drivers/net/wireless/libertas/if_sdio.h +++ b/drivers/net/wireless/libertas/if_sdio.h @@ -42,4 +42,6 @@ #define IF_SDIO_EVENT 0x80fc +#define IF_SDIO_BLOCK_SIZE 256 + #endif diff --git a/drivers/net/wireless/libertas/rx.c b/drivers/net/wireless/libertas/rx.c index 820c22d9220..bd845d09f17 100644 --- a/drivers/net/wireless/libertas/rx.c +++ b/drivers/net/wireless/libertas/rx.c @@ -25,7 +25,6 @@ struct rfc1042hdr {
__attribute__ ((packed)); struct rxpackethdr { - struct rxpd rx_pd; struct eth803hdr eth803_hdr; struct rfc1042hdr rfc1042_hdr; } __attribute__ ((packed)); @@ -158,8 +157,9 @@ int lbs_process_rxed_packet(struct lbs_private *priv, struct sk_buff *skb) if (priv->monitormode) return process_rxed_802_11_packet(priv, skb); - p_rx_pkt = (struct rxpackethdr *) skb->data; - p_rx_pd = &p_rx_pkt->rx_pd; + p_rx_pd = (struct rxpd *) skb->data; + p_rx_pkt = (struct rxpackethdr *) ((u8 *)p_rx_pd + + le32_to_cpu(p_rx_pd->pkt_ptr)); if (priv->mesh_dev) { if (priv->mesh_fw_ver == MESH_FW_OLD) { if (p_rx_pd->rx_control & RxPD_MESH_FRAME) @@ -181,8 +181,9 @@ int lbs_process_rxed_packet(struct lbs_private *priv, struct sk_buff *skb) goto done; } - lbs_deb_rx("rx data: skb->len-sizeof(RxPd) = %d-%zd = %zd\n", - skb->len, sizeof(struct rxpd), skb->len - sizeof(struct rxpd)); + lbs_deb_rx("rx data: skb->len - pkt_ptr = %d-%zd = %zd\n", + skb->len, le32_to_cpu(p_rx_pd->pkt_ptr), + skb->len - le32_to_cpu(p_rx_pd->pkt_ptr)); lbs_deb_hex(LBS_DEB_RX, "RX Data: Dest", p_rx_pkt->eth803_hdr.dest_addr, sizeof(p_rx_pkt->eth803_hdr.dest_addr)); @@ -216,14 +217,14 @@ int lbs_process_rxed_packet(struct lbs_private *priv, struct sk_buff *skb) /* Chop off the rxpd + the excess memory from the 802.2/llc/snap header * that was removed */ - hdrchop = (u8 *) p_ethhdr - (u8 *) p_rx_pkt; + hdrchop = (u8 *)p_ethhdr - (u8 *)p_rx_pd; } else { lbs_deb_hex(LBS_DEB_RX, "RX Data: LLC/SNAP", (u8 *) & p_rx_pkt->rfc1042_hdr, sizeof(p_rx_pkt->rfc1042_hdr)); /* Chop off the rxpd */ - hdrchop = (u8 *) & p_rx_pkt->eth803_hdr - (u8 *) p_rx_pkt; + hdrchop = (u8 *)&p_rx_pkt->eth803_hdr - (u8 *)p_rx_pd; } /* Chop off the leading header bytes so the skb points to the start of diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index ea1bf5ba092..c7211ab6dd4 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -25,5 +25,7 @@ #define SDIO_VENDOR_ID_MARVELL 0x02df #define SDIO_DEVICE_ID_MARVELL_LIBERTAS 0x9103 +#define SDIO_DEVICE_ID_MARVELL_8688WLAN 0x9104 +#define SDIO_DEVICE_ID_MARVELL_8688BT 0x9105 #endif -- cgit v1.2.3-70-g09d2 From 10f644a47b76d3e61b98f2d02ce9690b94c51ee5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 16 Apr 2009 13:17:25 +0200 Subject: mac80211: disable powersave if pm_qos asks for low latency When an application asks for a latency lower than the beacon interval there's nothing we can do -- we need to stay awake and not have the AP buffer frames for us. Add code to automatically calculate this constraint in mac80211 so drivers need not concern themselves with it. Signed-off-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/ieee80211.h | 9 +++++++++ net/mac80211/ieee80211_i.h | 5 ++++- net/mac80211/iface.c | 4 ++-- net/mac80211/main.c | 31 ++++++++++++++++++++++++------- net/mac80211/mlme.c | 36 ++++++++++++++++++++++++++++++++---- net/mac80211/wext.c | 2 +- 6 files changed, 72 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 4b501b48ce8..53563d53b5a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1383,4 +1383,13 @@ static inline int ieee80211_freq_to_ofdm_chan(int s_freq, int freq) return -1; } +/** + * ieee80211_tu_to_usec - convert time units (TU) to microseconds + * @tu: the TUs + */ +static inline unsigned long ieee80211_tu_to_usec(unsigned long tu) +{ + return 1024 * tu; +} + #endif /* LINUX_IEEE80211_H */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ff40dd7b523..b1d18d967d8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -750,6 +750,7 @@ struct ieee80211_local { struct work_struct dynamic_ps_enable_work; struct work_struct dynamic_ps_disable_work; struct timer_list dynamic_ps_timer; + struct notifier_block network_latency_notifier; int user_power_level; /* in dBm */ int power_constr_level; /* in dBm */ @@ -938,7 +939,9 @@ int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason); void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); -void ieee80211_recalc_ps(struct ieee80211_local *local); +void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency); +int ieee80211_max_network_latency(struct notifier_block *nb, + unsigned long data, void *dummy); /* IBSS code */ int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 6240f76e2a4..5d60deb219d 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -317,7 +317,7 @@ static int ieee80211_open(struct net_device *dev) ieee80211_set_wmm_default(sdata); } - ieee80211_recalc_ps(local); + ieee80211_recalc_ps(local, -1); /* * ieee80211_sta_work is disabled while network interface @@ -574,7 +574,7 @@ static int ieee80211_stop(struct net_device *dev) hw_reconf_flags = 0; } - ieee80211_recalc_ps(local); + ieee80211_recalc_ps(local, -1); /* do after stop to avoid reconfiguring when we stop anyway */ if (hw_reconf_flags) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 80c0e28bf54..049ce863980 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -1038,25 +1039,38 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) } } + local->network_latency_notifier.notifier_call = + ieee80211_max_network_latency; + result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY, + &local->network_latency_notifier); + + if (result) { + rtnl_lock(); + goto fail_pm_qos; + } + return 0; -fail_wep: + fail_pm_qos: + ieee80211_led_exit(local); + ieee80211_remove_interfaces(local); + fail_wep: rate_control_deinitialize(local); -fail_rate: + fail_rate: unregister_netdevice(local->mdev); local->mdev = NULL; -fail_dev: + fail_dev: rtnl_unlock(); sta_info_stop(local); -fail_sta_info: + fail_sta_info: debugfs_hw_del(local); destroy_workqueue(local->hw.workqueue); -fail_workqueue: + fail_workqueue: if (local->mdev) free_netdev(local->mdev); -fail_mdev_alloc: + fail_mdev_alloc: 
wiphy_unregister(local->hw.wiphy); -fail_wiphy_register: + fail_wiphy_register: kfree(local->int_scan_req.channels); return result; } @@ -1069,6 +1083,9 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) tasklet_kill(&local->tx_pending_tasklet); tasklet_kill(&local->tasklet); + pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY, + &local->network_latency_notifier); + rtnl_lock(); /* diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 06d9a1d2325..c39a214e7ad 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -515,7 +516,7 @@ static void ieee80211_change_ps(struct ieee80211_local *local) } /* need to hold RTNL or interface lock */ -void ieee80211_recalc_ps(struct ieee80211_local *local) +void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency) { struct ieee80211_sub_if_data *sdata, *found = NULL; int count = 0; @@ -534,10 +535,22 @@ void ieee80211_recalc_ps(struct ieee80211_local *local) count++; } - if (count == 1 && found->u.mgd.powersave) - local->ps_sdata = found; - else + if (count == 1 && found->u.mgd.powersave) { + s32 beaconint_us; + + if (latency < 0) + latency = pm_qos_requirement(PM_QOS_NETWORK_LATENCY); + + beaconint_us = ieee80211_tu_to_usec( + found->vif.bss_conf.beacon_int); + + if (beaconint_us > latency) + local->ps_sdata = NULL; + else + local->ps_sdata = found; + } else { local->ps_sdata = NULL; + } ieee80211_change_ps(local); } @@ -2324,3 +2337,18 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) ieee80211_restart_sta_timer(sdata); rcu_read_unlock(); } + +int ieee80211_max_network_latency(struct notifier_block *nb, + unsigned long data, void *dummy) +{ + s32 latency_usec = (s32) data; + struct ieee80211_local *local = + container_of(nb, struct ieee80211_local, + network_latency_notifier); + + mutex_lock(&local->iflist_mtx); + ieee80211_recalc_ps(local, latency_usec); + mutex_unlock(&local->iflist_mtx); + + return 0; +} diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index 81f63e57027..1c4664b8b1a 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -789,7 +789,7 @@ static int ieee80211_ioctl_siwpower(struct net_device *dev, ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_DYNPS_TIMEOUT); - ieee80211_recalc_ps(local); + ieee80211_recalc_ps(local, -1); return 0; } -- cgit v1.2.3-70-g09d2 From 04a773ade0680d862b479d7219973df60f7a3834 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 19 Apr 2009 21:24:32 +0200 Subject: cfg80211/nl80211: add IBSS API This adds the IBSS API along with (preliminary) wext handlers. The wext handlers can only do IBSS, so you need to call them from your own wext handlers if the mode is IBSS. The nl80211 API requires an SSID and a channel (frequency) for the case that a new IBSS has to be created. It optionally supports a flag to fix the channel and a fixed BSSID. The cfg80211 code also takes care to leave the IBSS before the netdev is set down. If wireless extensions are used, it also caches values when the interface is down and instructs the driver to join when the interface is set up. Signed-off-by: Johannes Berg Signed-off-by: John W.
Linville --- include/linux/nl80211.h | 18 +++ include/net/cfg80211.h | 72 +++++++++ include/net/wireless.h | 14 ++ net/wireless/Makefile | 2 +- net/wireless/core.c | 16 ++ net/wireless/core.h | 8 + net/wireless/ibss.c | 360 +++++++++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 182 +++++++++++++++++++++-- net/wireless/nl80211.h | 4 + net/wireless/wext-compat.c | 30 ++++ 10 files changed, 693 insertions(+), 13 deletions(-) create mode 100644 net/wireless/ibss.c (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index c01423888db..25ce3e42bd1 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -223,6 +223,15 @@ * %NL80211_ATTR_KEY_SEQ to indicate the TSC value of the frame; this * event matches with MLME-MICHAELMICFAILURE.indication() primitive * + * @NL80211_CMD_JOIN_IBSS: Join a new IBSS -- given at least an SSID and a + * FREQ attribute (for the initial frequency if no peer can be found) + * and optionally a MAC (as BSSID) and FREQ_FIXED attribute if those + * should be fixed rather than automatically determined. Can only be + * executed on a network interface that is UP, and fixed BSSID/FREQ + * may be rejected. + * @NL80211_CMD_LEAVE_IBSS: Leave the IBSS -- no special arguments, the IBSS is + * determined by the network interface. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -288,6 +297,9 @@ enum nl80211_commands { NL80211_CMD_REG_BEACON_HINT, + NL80211_CMD_JOIN_IBSS, + NL80211_CMD_LEAVE_IBSS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -456,6 +468,9 @@ enum nl80211_commands { * @NL80211_ATTR_CIPHER_SUITES: a set of u32 values indicating the supported * cipher suites * + * @NL80211_ATTR_FREQ_FIXED: a flag indicating the IBSS should not try to look + * for other networks on different channels + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -547,6 +562,9 @@ enum nl80211_attrs { NL80211_ATTR_FREQ_BEFORE, NL80211_ATTR_FREQ_AFTER, + + NL80211_ATTR_FREQ_FIXED, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 019a41efa0b..5287a3e56e7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -657,6 +657,31 @@ struct cfg80211_disassoc_request { size_t ie_len; }; +/** + * struct cfg80211_ibss_params - IBSS parameters + * + * This structure defines the IBSS parameters for the join_ibss() + * method. + * + * @ssid: The SSID, will always be non-null. + * @ssid_len: The length of the SSID, will always be non-zero. + * @bssid: Fixed BSSID requested, may be %NULL; if set, do not + * search for IBSSs with a different BSSID. + * @channel: The channel to use if no IBSS can be found to join. + * @channel_fixed: The channel should be fixed -- do not search for + * IBSSs to join on other channels. 
+ * @ie: information element(s) to include in the beacon + * @ie_len: length of that + */ +struct cfg80211_ibss_params { + u8 *ssid; + u8 *bssid; + struct ieee80211_channel *channel; + u8 *ie; + u8 ssid_len, ie_len; + bool channel_fixed; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -732,6 +757,11 @@ struct cfg80211_disassoc_request { * @assoc: Request to (re)associate with the specified peer * @deauth: Request to deauthenticate from the specified peer * @disassoc: Request to disassociate from the specified peer + * + * @join_ibss: Join the specified IBSS (or create if necessary). Once done, call + * cfg80211_ibss_joined(), also call that function when changing BSSID due + * to a merge. + * @leave_ibss: Leave the IBSS. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy); @@ -817,6 +847,10 @@ struct cfg80211_ops { struct cfg80211_deauth_request *req); int (*disassoc)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_disassoc_request *req); + + int (*join_ibss)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ibss_params *params); + int (*leave_ibss)(struct wiphy *wiphy, struct net_device *dev); }; /* temporary wext handlers */ @@ -839,6 +873,28 @@ int cfg80211_wext_siwmlme(struct net_device *dev, int cfg80211_wext_giwrange(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *extra); +int cfg80211_ibss_wext_siwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra); +int cfg80211_ibss_wext_giwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra); +int cfg80211_ibss_wext_siwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid); +int cfg80211_ibss_wext_giwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid); +int cfg80211_ibss_wext_siwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra); +int cfg80211_ibss_wext_giwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra); + +/* wext helper for now (to be removed) */ +struct ieee80211_channel *cfg80211_wext_freq(struct wiphy *wiphy, + struct iw_freq *freq); /** * cfg80211_scan_done - notify that scan finished @@ -984,4 +1040,20 @@ void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, enum nl80211_key_type key_type, int key_id, const u8 *tsc); +/** + * cfg80211_ibss_joined - notify cfg80211 that device joined an IBSS + * + * @dev: network device + * @bssid: the BSSID of the IBSS joined + * @gfp: allocation flags + * + * This function notifies cfg80211 that the device joined an IBSS or + * switched to a different BSSID. Before this function can be called, + * either a beacon has to have been received from the IBSS, or one of + * the cfg80211_inform_bss{,_frame} functions must have been called + * with the locally generated beacon -- this guarantees that there is + * always a scan result for this IBSS. cfg80211 will handle the rest. 
+ */ +void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, gfp_t gfp); + #endif /* __NET_CFG80211_H */ diff --git a/include/net/wireless.h b/include/net/wireless.h index 44c2642d3c0..abd27b03333 100644 --- a/include/net/wireless.h +++ b/include/net/wireless.h @@ -265,6 +265,8 @@ struct wiphy { * * @wiphy: pointer to hardware description * @iftype: interface type + * @list: (private) + * @netdev: (private) */ struct wireless_dev { struct wiphy *wiphy; @@ -273,6 +275,18 @@ struct wireless_dev { /* private to the generic wireless code */ struct list_head list; struct net_device *netdev; + + /* currently used for IBSS - might be rearranged in the future */ + struct cfg80211_bss *current_bss; + u8 bssid[ETH_ALEN]; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + +#ifdef CONFIG_WIRELESS_EXT + /* wext data */ + struct cfg80211_ibss_params wext; + u8 wext_bssid[ETH_ALEN]; +#endif }; /** diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 6d1e7b27b75..14ea01c4a10 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_LIB80211_CRYPT_WEP) += lib80211_crypt_wep.o obj-$(CONFIG_LIB80211_CRYPT_CCMP) += lib80211_crypt_ccmp.o obj-$(CONFIG_LIB80211_CRYPT_TKIP) += lib80211_crypt_tkip.o -cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o mlme.o +cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o mlme.o ibss.o cfg80211-$(CONFIG_WIRELESS_EXT) += wext-compat.o ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/wireless/core.c b/net/wireless/core.c index d1f556535f6..de1ac51ae4e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -450,6 +450,22 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb, dev->ieee80211_ptr->netdev = dev; mutex_unlock(&rdev->devlist_mtx); break; + case NETDEV_GOING_DOWN: + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) + break; + if (!dev->ieee80211_ptr->ssid_len) + break; + cfg80211_leave_ibss(rdev, dev); + break; + case NETDEV_UP: +#ifdef CONFIG_WIRELESS_EXT + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) + break; + if (!dev->ieee80211_ptr->wext.ssid_len) + break; + cfg80211_join_ibss(rdev, dev, &dev->ieee80211_ptr->wext); + break; +#endif case NETDEV_UNREGISTER: mutex_lock(&rdev->devlist_mtx); if (!list_empty(&dev->ieee80211_ptr->list)) { diff --git a/net/wireless/core.h b/net/wireless/core.h index 02668b02e33..2ef3595fd6e 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -144,4 +144,12 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *dev); void cfg80211_bss_age(struct cfg80211_registered_device *dev, unsigned long age_secs); +/* IBSS */ +int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params); +void cfg80211_clear_ibss(struct net_device *dev); +int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev); + #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c new file mode 100644 index 00000000000..2bf42fdef3a --- /dev/null +++ b/net/wireless/ibss.c @@ -0,0 +1,360 @@ +/* + * Some IBSS support code for cfg80211. 
+ * + * Copyright 2009 Johannes Berg + */ + +#include +#include +#include +#include +#include "nl80211.h" + + +void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_bss *bss; +#ifdef CONFIG_WIRELESS_EXT + union iwreq_data wrqu; +#endif + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return; + + if (WARN_ON(!wdev->ssid_len)) + return; + + if (memcmp(bssid, wdev->bssid, ETH_ALEN) == 0) + return; + + bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid, + wdev->ssid, wdev->ssid_len, + WLAN_CAPABILITY_IBSS, WLAN_CAPABILITY_IBSS); + + if (WARN_ON(!bss)) + return; + + if (wdev->current_bss) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(wdev->current_bss); + } + + cfg80211_hold_bss(bss); + wdev->current_bss = bss; + memcpy(wdev->bssid, bssid, ETH_ALEN); + + nl80211_send_ibss_bssid(wiphy_to_dev(wdev->wiphy), dev, bssid, gfp); +#ifdef CONFIG_WIRELESS_EXT + memset(&wrqu, 0, sizeof(wrqu)); + memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); + wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); +#endif +} +EXPORT_SYMBOL(cfg80211_ibss_joined); + +int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + if (wdev->ssid_len) + return -EALREADY; + +#ifdef CONFIG_WIRELESS_EXT + wdev->wext.channel = params->channel; +#endif + err = rdev->ops->join_ibss(&rdev->wiphy, dev, params); + + if (err) + return err; + + memcpy(wdev->ssid, params->ssid, params->ssid_len); + wdev->ssid_len = params->ssid_len; + + return 0; +} + +void cfg80211_clear_ibss(struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + if (wdev->current_bss) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(wdev->current_bss); + } + + wdev->current_bss = NULL; + wdev->ssid_len = 0; + memset(wdev->bssid, 0, ETH_ALEN); +} + +int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + int err; + + err = rdev->ops->leave_ibss(&rdev->wiphy, dev); + + if (err) + return err; + + cfg80211_clear_ibss(dev); + + return 0; +} + +#ifdef CONFIG_WIRELESS_EXT +static int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + enum ieee80211_band band; + int i; + + /* try to find an IBSS channel if none requested ... */ + if (!wdev->wext.channel) { + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + struct ieee80211_supported_band *sband; + struct ieee80211_channel *chan; + + sband = rdev->wiphy.bands[band]; + if (!sband) + continue; + + for (i = 0; i < sband->n_channels; i++) { + chan = &sband->channels[i]; + if (chan->flags & IEEE80211_CHAN_NO_IBSS) + continue; + if (chan->flags & IEEE80211_CHAN_DISABLED) + continue; + wdev->wext.channel = chan; + break; + } + + if (wdev->wext.channel) + break; + } + + if (!wdev->wext.channel) + return -EINVAL; + } + + /* don't join -- SSID is not there */ + if (!wdev->wext.ssid_len) + return 0; + + if (!netif_running(wdev->netdev)) + return 0; + + return cfg80211_join_ibss(wiphy_to_dev(wdev->wiphy), + wdev->netdev, &wdev->wext); +} + +int cfg80211_ibss_wext_siwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct ieee80211_channel *chan; + int err; + + /* call only for ibss! 
*/ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + if (!wiphy_to_dev(wdev->wiphy)->ops->join_ibss) + return -EOPNOTSUPP; + + chan = cfg80211_wext_freq(wdev->wiphy, freq); + if (chan && IS_ERR(chan)) + return PTR_ERR(chan); + + if (chan && + (chan->flags & IEEE80211_CHAN_NO_IBSS || + chan->flags & IEEE80211_CHAN_DISABLED)) + return -EINVAL; + + if (wdev->wext.channel == chan) + return 0; + + if (wdev->ssid_len) { + err = cfg80211_leave_ibss(wiphy_to_dev(wdev->wiphy), dev); + if (err) + return err; + } + + if (chan) { + wdev->wext.channel = chan; + wdev->wext.channel_fixed = true; + } else { + /* cfg80211_ibss_wext_join will pick one if needed */ + wdev->wext.channel_fixed = false; + } + + return cfg80211_ibss_wext_join(wiphy_to_dev(wdev->wiphy), wdev); +} +/* temporary symbol - mark GPL - in the future the handler won't be */ +EXPORT_SYMBOL_GPL(cfg80211_ibss_wext_siwfreq); + +int cfg80211_ibss_wext_giwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct ieee80211_channel *chan = NULL; + + /* call only for ibss! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + if (wdev->current_bss) + chan = wdev->current_bss->channel; + else if (wdev->wext.channel) + chan = wdev->wext.channel; + + if (chan) { + freq->m = chan->center_freq; + freq->e = 6; + return 0; + } + + /* no channel if not joining */ + return -EINVAL; +} +/* temporary symbol - mark GPL - in the future the handler won't be */ +EXPORT_SYMBOL_GPL(cfg80211_ibss_wext_giwfreq); + +int cfg80211_ibss_wext_siwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + size_t len = data->length; + int err; + + /* call only for ibss! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + if (!wiphy_to_dev(wdev->wiphy)->ops->join_ibss) + return -EOPNOTSUPP; + + if (wdev->ssid_len) { + err = cfg80211_leave_ibss(wiphy_to_dev(wdev->wiphy), dev); + if (err) + return err; + } + + /* iwconfig uses nul termination in SSID.. */ + if (len > 0 && ssid[len - 1] == '\0') + len--; + + wdev->wext.ssid = wdev->ssid; + memcpy(wdev->wext.ssid, ssid, len); + wdev->wext.ssid_len = len; + + return cfg80211_ibss_wext_join(wiphy_to_dev(wdev->wiphy), wdev); +} +/* temporary symbol - mark GPL - in the future the handler won't be */ +EXPORT_SYMBOL_GPL(cfg80211_ibss_wext_siwessid); + +int cfg80211_ibss_wext_giwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + /* call only for ibss! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + data->flags = 0; + + if (wdev->ssid_len) { + data->flags = 1; + data->length = wdev->ssid_len; + memcpy(ssid, wdev->ssid, data->length); + } else if (wdev->wext.ssid) { + data->flags = 1; + data->length = wdev->wext.ssid_len; + memcpy(ssid, wdev->wext.ssid, data->length); + } + + return 0; +} +/* temporary symbol - mark GPL - in the future the handler won't be */ +EXPORT_SYMBOL_GPL(cfg80211_ibss_wext_giwessid); + +int cfg80211_ibss_wext_siwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + u8 *bssid = ap_addr->sa_data; + int err; + + /* call only for ibss! 
*/ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + if (!wiphy_to_dev(wdev->wiphy)->ops->join_ibss) + return -EOPNOTSUPP; + + if (ap_addr->sa_family != ARPHRD_ETHER) + return -EINVAL; + + /* automatic mode */ + if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) + bssid = NULL; + + /* both automatic */ + if (!bssid && !wdev->wext.bssid) + return 0; + + /* fixed already - and no change */ + if (wdev->wext.bssid && bssid && + compare_ether_addr(bssid, wdev->wext.bssid) == 0) + return 0; + + if (wdev->ssid_len) { + err = cfg80211_leave_ibss(wiphy_to_dev(wdev->wiphy), dev); + if (err) + return err; + } + + if (bssid) { + memcpy(wdev->wext_bssid, bssid, ETH_ALEN); + wdev->wext.bssid = wdev->wext_bssid; + } else + wdev->wext.bssid = NULL; + + return cfg80211_ibss_wext_join(wiphy_to_dev(wdev->wiphy), wdev); +} +/* temporary symbol - mark GPL - in the future the handler won't be */ +EXPORT_SYMBOL_GPL(cfg80211_ibss_wext_siwap); + +int cfg80211_ibss_wext_giwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + /* call only for ibss! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + ap_addr->sa_family = ARPHRD_ETHER; + + if (wdev->wext.bssid) { + memcpy(ap_addr->sa_data, wdev->wext.bssid, ETH_ALEN); + return 0; + } + + memcpy(ap_addr->sa_data, wdev->bssid, ETH_ALEN); + return 0; +} +/* temporary symbol - mark GPL - in the future the handler won't be */ +EXPORT_SYMBOL_GPL(cfg80211_ibss_wext_giwap); +#endif diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d2cfde659e7..16f86356ac9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -116,6 +116,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { .len = IEEE80211_MAX_SSID_LEN }, [NL80211_ATTR_AUTH_TYPE] = { .type = NLA_U32 }, [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 }, + [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG }, }; /* IE validation */ @@ -322,6 +323,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, CMD(assoc, ASSOCIATE); CMD(deauth, DEAUTHENTICATE); CMD(disassoc, DISASSOCIATE); + CMD(join_ibss, JOIN_IBSS); #undef CMD nla_nest_end(msg, nl_cmds); @@ -668,7 +670,7 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *drv; struct vif_params params; int err, ifindex; - enum nl80211_iftype type; + enum nl80211_iftype otype, ntype; struct net_device *dev; u32 _flags, *flags = NULL; bool change = false; @@ -682,30 +684,27 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) goto unlock_rtnl; ifindex = dev->ifindex; - type = dev->ieee80211_ptr->iftype; + otype = ntype = dev->ieee80211_ptr->iftype; dev_put(dev); if (info->attrs[NL80211_ATTR_IFTYPE]) { - enum nl80211_iftype ntype; - ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]); - if (type != ntype) + if (otype != ntype) change = true; - type = ntype; - if (type > NL80211_IFTYPE_MAX) { + if (ntype > NL80211_IFTYPE_MAX) { err = -EINVAL; goto unlock; } } if (!drv->ops->change_virtual_intf || - !(drv->wiphy.interface_modes & (1 << type))) { + !(drv->wiphy.interface_modes & (1 << ntype))) { err = -EOPNOTSUPP; goto unlock; } if (info->attrs[NL80211_ATTR_MESH_ID]) { - if (type != NL80211_IFTYPE_MESH_POINT) { + if (ntype != NL80211_IFTYPE_MESH_POINT) { err = -EINVAL; goto unlock; } @@ -715,7 +714,7 @@ static int nl80211_set_interface(struct sk_buff 
*skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) { - if (type != NL80211_IFTYPE_MONITOR) { + if (ntype != NL80211_IFTYPE_MONITOR) { err = -EINVAL; goto unlock; } @@ -730,12 +729,17 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (change) err = drv->ops->change_virtual_intf(&drv->wiphy, ifindex, - type, flags, ¶ms); + ntype, flags, ¶ms); else err = 0; dev = __dev_get_by_index(&init_net, ifindex); - WARN_ON(!dev || (!err && dev->ieee80211_ptr->iftype != type)); + WARN_ON(!dev || (!err && dev->ieee80211_ptr->iftype != ntype)); + + if (dev && !err && (ntype != otype)) { + if (otype == NL80211_IFTYPE_ADHOC) + cfg80211_clear_ibss(dev); + } unlock: cfg80211_put_dev(drv); @@ -3052,6 +3056,114 @@ unlock_rtnl: return err; } +static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + struct net_device *dev; + struct cfg80211_ibss_params ibss; + struct wiphy *wiphy; + int err; + + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || + !info->attrs[NL80211_ATTR_SSID] || + !nla_len(info->attrs[NL80211_ATTR_SSID])) + return -EINVAL; + + rtnl_lock(); + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + goto unlock_rtnl; + + if (!drv->ops->join_ibss) { + err = -EOPNOTSUPP; + goto out; + } + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) { + err = -EOPNOTSUPP; + goto out; + } + + if (!netif_running(dev)) { + err = -ENETDOWN; + goto out; + } + + wiphy = &drv->wiphy; + memset(&ibss, 0, sizeof(ibss)); + + if (info->attrs[NL80211_ATTR_MAC]) + ibss.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + ibss.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + ibss.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + + if (info->attrs[NL80211_ATTR_IE]) { + ibss.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + ibss.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + ibss.channel = ieee80211_get_channel(wiphy, + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (!ibss.channel || + ibss.channel->flags & IEEE80211_CHAN_NO_IBSS || + ibss.channel->flags & IEEE80211_CHAN_DISABLED) { + err = -EINVAL; + goto out; + } + + ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED]; + + err = cfg80211_join_ibss(drv, dev, &ibss); + +out: + cfg80211_put_dev(drv); + dev_put(dev); +unlock_rtnl: + rtnl_unlock(); + return err; +} + +static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + struct net_device *dev; + int err; + + rtnl_lock(); + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + goto unlock_rtnl; + + if (!drv->ops->leave_ibss) { + err = -EOPNOTSUPP; + goto out; + } + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) { + err = -EOPNOTSUPP; + goto out; + } + + if (!netif_running(dev)) { + err = -ENETDOWN; + goto out; + } + + err = cfg80211_leave_ibss(drv, dev); + +out: + cfg80211_put_dev(drv); + dev_put(dev); +unlock_rtnl: + rtnl_unlock(); + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -3253,6 +3365,18 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_JOIN_IBSS, + .doit = nl80211_join_ibss, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_LEAVE_IBSS, + .doit = nl80211_leave_ibss, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; 
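/*
 * Editor's illustration -- a minimal userspace sketch (not part of this
 * patch) of driving the new NL80211_CMD_JOIN_IBSS command just wired up
 * above, using libnl-3. The SSID "test", the 2412 MHz frequency and the
 * function name are assumptions for the example; return-code and ACK
 * handling are omitted, and the command requires CAP_NET_ADMIN because
 * of the GENL_ADMIN_PERM flag above.
 */
#include <net/if.h>
#include <netlink/genl/ctrl.h>
#include <netlink/genl/genl.h>
#include <linux/nl80211.h>

static int join_ibss_example(const char *ifname)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family;

	if (!sk || genl_connect(sk))
		return -1;

	family = genl_ctrl_resolve(sk, "nl80211");
	msg = nlmsg_alloc();
	genlmsg_put(msg, 0, 0, family, 0, 0, NL80211_CMD_JOIN_IBSS, 0);

	nla_put_u32(msg, NL80211_ATTR_IFINDEX, if_nametoindex(ifname));
	nla_put(msg, NL80211_ATTR_SSID, 4, "test");
	/* initial frequency, only used if no IBSS to join is found */
	nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, 2412);
	/* optional: never leave this channel to search for the IBSS */
	nla_put_flag(msg, NL80211_ATTR_FREQ_FIXED);

	nl_send_auto(sk, msg);
	nlmsg_free(msg);
	nl_socket_free(sk);
	return 0;
}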
static struct genl_multicast_group nl80211_mlme_mcgrp = { .name = "mlme", @@ -3466,6 +3590,40 @@ void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, NL80211_CMD_DISASSOCIATE); } +void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid, + gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_JOIN_IBSS); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, enum nl80211_key_type key_type, int key_id, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index b3aaa59afa0..17d2d8bfaf7 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -34,4 +34,8 @@ nl80211_send_beacon_hint_event(struct wiphy *wiphy, struct ieee80211_channel *channel_before, struct ieee80211_channel *channel_after); +void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid, + gfp_t gfp); + #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 6fd7bf7b448..57eaea26b67 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -285,3 +285,33 @@ int cfg80211_wext_siwmlme(struct net_device *dev, } } EXPORT_SYMBOL(cfg80211_wext_siwmlme); + + +/** + * cfg80211_wext_freq - get wext frequency for non-"auto" + * @wiphy: the wiphy + * @freq: the wext freq encoding + * + * Returns a channel, %NULL for auto, or an ERR_PTR for errors! + */ +struct ieee80211_channel *cfg80211_wext_freq(struct wiphy *wiphy, + struct iw_freq *freq) +{ + if (freq->e == 0) { + if (freq->m < 0) + return NULL; + else + return ieee80211_get_channel(wiphy, + ieee80211_channel_to_frequency(freq->m)); + } else { + int i, div = 1000000; + for (i = 0; i < freq->e; i++) + div /= 10; + if (div > 0) + return ieee80211_get_channel(wiphy, freq->m / div); + else + return ERR_PTR(-EINVAL); + } + +} +EXPORT_SYMBOL(cfg80211_wext_freq); -- cgit v1.2.3-70-g09d2 From b9a5f8cab751d362f7c2d94899ca788c22fcd1ef Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Mon, 20 Apr 2009 18:39:05 +0200 Subject: nl80211: Add set/get for frag/rts threshold and retry limits Add new nl80211 attributes that can be used with NL80211_CMD_SET_WIPHY and NL80211_CMD_GET_WIPHY to manage fragmentation/RTS threshold and retry limits. Since these values are stored in struct wiphy, remove the local copy from mac80211 where feasible (frag & rts threshold). The retry limits are currently needed in struct ieee80211_conf, but these could be eventually removed since the driver should have access to the values in struct wiphy. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/nl80211.h | 24 ++++++- include/net/cfg80211.h | 50 +++++++++++++++ net/mac80211/cfg.c | 27 ++++++++ net/mac80211/debugfs.c | 8 +-- net/mac80211/ieee80211_i.h | 3 - net/mac80211/main.c | 6 +- net/mac80211/tx.c | 9 ++- net/mac80211/util.c | 2 +- net/mac80211/wext.c | 138 ++--------------------------------------- net/wireless/core.c | 10 +++ net/wireless/nl80211.c | 95 ++++++++++++++++++++++++++++ net/wireless/wext-compat.c | 151 +++++++++++++++++++++++++++++++++++++++++++++ 12 files changed, 372 insertions(+), 151 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 25ce3e42bd1..dc9d9ec5d1a 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -46,8 +46,10 @@ * to get a list of all present wiphys. * @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME, - * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, and/or - * %NL80211_ATTR_WIPHY_CHANNEL_TYPE. + * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_CHANNEL_TYPE, %NL80211_ATTR_WIPHY_RETRY_SHORT, + * %NL80211_ATTR_WIPHY_RETRY_LONG, %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, + * and/or %NL80211_ATTR_WIPHY_RTS_THRESHOLD. * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request * or rename notification. Has attributes %NL80211_ATTR_WIPHY and * %NL80211_ATTR_WIPHY_NAME. @@ -337,6 +339,18 @@ enum nl80211_commands { * NL80211_CHAN_HT20 = HT20 only * NL80211_CHAN_HT40MINUS = secondary channel is below the primary channel * NL80211_CHAN_HT40PLUS = secondary channel is above the primary channel + * @NL80211_ATTR_WIPHY_RETRY_SHORT: TX retry limit for frames whose length is + * less than or equal to the RTS threshold; allowed range: 1..255; + * dot11ShortRetryLimit; u8 + * @NL80211_ATTR_WIPHY_RETRY_LONG: TX retry limit for frames whose length is + * greater than the RTS threshold; allowed range: 1..255; + * dot11LongRetryLimit; u8 + * @NL80211_ATTR_WIPHY_FRAG_THRESHOLD: fragmentation threshold, i.e., maximum + * length in octets for frames; allowed range: 256..8000, disable + * fragmentation with (u32)-1; dot11FragmentationThreshold; u32 + * @NL80211_ATTR_WIPHY_RTS_THRESHOLD: RTS threshold (TX frames with length + * larger than or equal to this use RTS/CTS handshake); allowed range: + * 0..65536, disable with (u32)-1; dot11RTSThreshold; u32 * * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on * @NL80211_ATTR_IFNAME: network interface name @@ -565,6 +579,12 @@ enum nl80211_attrs { NL80211_ATTR_FREQ_FIXED, + + NL80211_ATTR_WIPHY_RETRY_SHORT, + NL80211_ATTR_WIPHY_RETRY_LONG, + NL80211_ATTR_WIPHY_FRAG_THRESHOLD, + NL80211_ATTR_WIPHY_RTS_THRESHOLD, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 601eac64b02..54bc69c8369 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -743,6 +743,20 @@ struct cfg80211_ibss_params { bool channel_fixed; }; +/** + * enum wiphy_params_flags - set_wiphy_params bitfield values + * WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed + * WIPHY_PARAM_RETRY_LONG: wiphy->retry_long has changed + * WIPHY_PARAM_FRAG_THRESHOLD: wiphy->frag_threshold has changed + * WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed + */ +enum wiphy_params_flags { + WIPHY_PARAM_RETRY_SHORT = 1 << 0, + WIPHY_PARAM_RETRY_LONG = 1 << 1, 
WIPHY_PARAM_FRAG_THRESHOLD = 1 << 2, + WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -823,6 +837,11 @@ struct cfg80211_ibss_params { * cfg80211_ibss_joined(), also call that function when changing BSSID due * to a merge. * @leave_ibss: Leave the IBSS. + * + * @set_wiphy_params: Notify that wiphy parameters have changed; + * @changed bitfield (see &enum wiphy_params_flags) describes which values + * have changed. The actual parameter values are available in + * struct wiphy. If returning an error, no value should be changed. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy); @@ -912,6 +931,8 @@ struct cfg80211_ops { int (*join_ibss)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ibss_params *params); int (*leave_ibss)(struct wiphy *wiphy, struct net_device *dev); + + int (*set_wiphy_params)(struct wiphy *wiphy, u32 changed); }; /* @@ -945,6 +966,11 @@ struct cfg80211_ops { * @signal_type: signal type reported in &struct cfg80211_bss. * @cipher_suites: supported cipher suites * @n_cipher_suites: number of supported cipher suites + * @retry_short: Retry limit for short frames (dot11ShortRetryLimit) + * @retry_long: Retry limit for long frames (dot11LongRetryLimit) + * @frag_threshold: Fragmentation threshold (dot11FragmentationThreshold); + * -1 = fragmentation disabled, only odd values >= 256 used + * @rts_threshold: RTS threshold (dot11RTSThreshold); -1 = RTS/CTS disabled */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -967,6 +993,11 @@ struct wiphy { int n_cipher_suites; const u32 *cipher_suites; + u8 retry_short; + u8 retry_long; + u32 frag_threshold; + u32 rts_threshold; + /* If multiple wiphys are registered and you're handed e.g. 
* a regular netdev with assigned ieee80211_ptr, you won't * know whether it points to a wiphy your driver has registered @@ -1345,6 +1376,25 @@ int cfg80211_ibss_wext_giwap(struct net_device *dev, struct ieee80211_channel *cfg80211_wext_freq(struct wiphy *wiphy, struct iw_freq *freq); +int cfg80211_wext_siwrts(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *rts, char *extra); +int cfg80211_wext_giwrts(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *rts, char *extra); +int cfg80211_wext_siwfrag(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *frag, char *extra); +int cfg80211_wext_giwfrag(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *frag, char *extra); +int cfg80211_wext_siwretry(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *retry, char *extra); +int cfg80211_wext_giwretry(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *retry, char *extra); + /* * callbacks for asynchronous cfg80211 methods, notification * functions and BSS handling helpers diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 14013dc6447..5e1c230744b 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1298,6 +1298,32 @@ static int ieee80211_leave_ibss(struct wiphy *wiphy, struct net_device *dev) return ieee80211_ibss_leave(sdata); } +static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + if (changed & WIPHY_PARAM_RTS_THRESHOLD) { + int err; + + if (local->ops->set_rts_threshold) { + err = local->ops->set_rts_threshold( + local_to_hw(local), wiphy->rts_threshold); + if (err) + return err; + } + } + + if (changed & WIPHY_PARAM_RETRY_SHORT) + local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; + if (changed & WIPHY_PARAM_RETRY_LONG) + local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; + if (changed & + (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); + + return 0; +} + struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -1336,4 +1362,5 @@ struct cfg80211_ops mac80211_config_ops = { .disassoc = ieee80211_disassoc, .join_ibss = ieee80211_join_ibss, .leave_ibss = ieee80211_leave_ibss, + .set_wiphy_params = ieee80211_set_wiphy_params, }; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 210b9b6fecd..5001328be46 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -52,13 +52,13 @@ static const struct file_operations name## _ops = { \ DEBUGFS_READONLY_FILE(frequency, 20, "%d", local->hw.conf.channel->center_freq); DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", - local->rts_threshold); + local->hw.wiphy->rts_threshold); DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d", - local->fragmentation_threshold); + local->hw.wiphy->frag_threshold); DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d", - local->hw.conf.short_frame_max_tx_count); + local->hw.wiphy->retry_short); DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", - local->hw.conf.long_frame_max_tx_count); + local->hw.wiphy->retry_long); DEBUGFS_READONLY_FILE(total_ps_buffered, 20, "%d", local->total_ps_buffered); DEBUGFS_READONLY_FILE(wep_iv, 20, "%#08x", diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 92a573bf103..dba78d89a10 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ 
-643,9 +643,6 @@ struct ieee80211_local { struct rate_control_ref *rate_ctrl; - int rts_threshold; - int fragmentation_threshold; - struct crypto_blkcipher *wep_tx_tfm; struct crypto_blkcipher *wep_rx_tfm; u32 wep_iv; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index d26fc399285..5320e08434a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -776,10 +776,8 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, /* set up some defaults */ local->hw.queues = 1; local->hw.max_rates = 1; - local->rts_threshold = IEEE80211_MAX_RTS_THRESHOLD; - local->fragmentation_threshold = IEEE80211_MAX_FRAG_THRESHOLD; - local->hw.conf.long_frame_max_tx_count = 4; - local->hw.conf.short_frame_max_tx_count = 7; + local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; + local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; local->hw.conf.radio_enabled = true; INIT_LIST_HEAD(&local->interfaces); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index c53d77db3e4..9ab49826c15 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -516,7 +516,7 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) sband = tx->local->hw.wiphy->bands[tx->channel->band]; len = min_t(int, tx->skb->len + FCS_LEN, - tx->local->fragmentation_threshold); + tx->local->hw.wiphy->frag_threshold); /* set up the tx rate control struct we give the RC algo */ txrc.hw = local_to_hw(tx->local); @@ -527,8 +527,7 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) txrc.max_rate_idx = tx->sdata->max_ratectrl_rateidx; /* set up RTS protection if desired */ - if (tx->local->rts_threshold < IEEE80211_MAX_RTS_THRESHOLD && - len > tx->local->rts_threshold) { + if (len > tx->local->hw.wiphy->rts_threshold) { txrc.rts = rts = true; } @@ -770,7 +769,7 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx) struct sk_buff *skb = tx->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; - int frag_threshold = tx->local->fragmentation_threshold; + int frag_threshold = tx->local->hw.wiphy->frag_threshold; int hdrlen; int fragnum; @@ -1088,7 +1087,7 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx, if (tx->flags & IEEE80211_TX_FRAGMENTED) { if ((tx->flags & IEEE80211_TX_UNICAST) && - skb->len + FCS_LEN > local->fragmentation_threshold && + skb->len + FCS_LEN > local->hw.wiphy->frag_threshold && !(info->flags & IEEE80211_TX_CTL_AMPDU)) tx->flags |= IEEE80211_TX_FRAGMENTED; else diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 3dd490fa4b6..11244212f41 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1044,7 +1044,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* setup RTS threshold */ if (local->ops->set_rts_threshold) - local->ops->set_rts_threshold(hw, local->rts_threshold); + local->ops->set_rts_threshold(hw, hw->wiphy->rts_threshold); /* reconfigure hardware */ ieee80211_hw_config(local, ~0); diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index eb63fc14801..1eb6d8642a7 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -472,132 +472,6 @@ static int ieee80211_ioctl_giwtxpower(struct net_device *dev, return 0; } -static int ieee80211_ioctl_siwrts(struct net_device *dev, - struct iw_request_info *info, - struct iw_param *rts, char *extra) -{ - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - - if (rts->disabled) - local->rts_threshold = IEEE80211_MAX_RTS_THRESHOLD; - else if (!rts->fixed) - /* if the rts value is not fixed, then take default */ - local->rts_threshold = 
IEEE80211_MAX_RTS_THRESHOLD; - else if (rts->value < 0 || rts->value > IEEE80211_MAX_RTS_THRESHOLD) - return -EINVAL; - else - local->rts_threshold = rts->value; - - /* If the wlan card performs RTS/CTS in hardware/firmware, - * configure it here */ - - if (local->ops->set_rts_threshold) - local->ops->set_rts_threshold(local_to_hw(local), - local->rts_threshold); - - return 0; -} - -static int ieee80211_ioctl_giwrts(struct net_device *dev, - struct iw_request_info *info, - struct iw_param *rts, char *extra) -{ - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - - rts->value = local->rts_threshold; - rts->disabled = (rts->value >= IEEE80211_MAX_RTS_THRESHOLD); - rts->fixed = 1; - - return 0; -} - - -static int ieee80211_ioctl_siwfrag(struct net_device *dev, - struct iw_request_info *info, - struct iw_param *frag, char *extra) -{ - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - - if (frag->disabled) - local->fragmentation_threshold = IEEE80211_MAX_FRAG_THRESHOLD; - else if (!frag->fixed) - local->fragmentation_threshold = IEEE80211_MAX_FRAG_THRESHOLD; - else if (frag->value < 256 || - frag->value > IEEE80211_MAX_FRAG_THRESHOLD) - return -EINVAL; - else { - /* Fragment length must be even, so strip LSB. */ - local->fragmentation_threshold = frag->value & ~0x1; - } - - return 0; -} - -static int ieee80211_ioctl_giwfrag(struct net_device *dev, - struct iw_request_info *info, - struct iw_param *frag, char *extra) -{ - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - - frag->value = local->fragmentation_threshold; - frag->disabled = (frag->value >= IEEE80211_MAX_FRAG_THRESHOLD); - frag->fixed = 1; - - return 0; -} - - -static int ieee80211_ioctl_siwretry(struct net_device *dev, - struct iw_request_info *info, - struct iw_param *retry, char *extra) -{ - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - - if (retry->disabled || - (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) - return -EINVAL; - - if (retry->flags & IW_RETRY_MAX) { - local->hw.conf.long_frame_max_tx_count = retry->value; - } else if (retry->flags & IW_RETRY_MIN) { - local->hw.conf.short_frame_max_tx_count = retry->value; - } else { - local->hw.conf.long_frame_max_tx_count = retry->value; - local->hw.conf.short_frame_max_tx_count = retry->value; - } - - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); - - return 0; -} - - -static int ieee80211_ioctl_giwretry(struct net_device *dev, - struct iw_request_info *info, - struct iw_param *retry, char *extra) -{ - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - - retry->disabled = 0; - if (retry->flags == 0 || retry->flags & IW_RETRY_MIN) { - /* first return min value, iwconfig will ask max value - * later if needed */ - retry->flags |= IW_RETRY_LIMIT; - retry->value = local->hw.conf.short_frame_max_tx_count; - if (local->hw.conf.long_frame_max_tx_count != - local->hw.conf.short_frame_max_tx_count) - retry->flags |= IW_RETRY_MIN; - return 0; - } - if (retry->flags & IW_RETRY_MAX) { - retry->flags = IW_RETRY_LIMIT | IW_RETRY_MAX; - retry->value = local->hw.conf.long_frame_max_tx_count; - } - - return 0; -} - - static int ieee80211_ioctl_siwencode(struct net_device *dev, struct iw_request_info *info, struct iw_point *erq, char *keybuf) @@ -1050,14 +924,14 @@ static const iw_handler ieee80211_handler[] = (iw_handler) NULL, /* -- hole -- */ (iw_handler) ieee80211_ioctl_siwrate, /* SIOCSIWRATE */ (iw_handler) ieee80211_ioctl_giwrate, /* SIOCGIWRATE */ - (iw_handler) ieee80211_ioctl_siwrts, /* 
SIOCSIWRTS */ - (iw_handler) ieee80211_ioctl_giwrts, /* SIOCGIWRTS */ - (iw_handler) ieee80211_ioctl_siwfrag, /* SIOCSIWFRAG */ - (iw_handler) ieee80211_ioctl_giwfrag, /* SIOCGIWFRAG */ + (iw_handler) cfg80211_wext_siwrts, /* SIOCSIWRTS */ + (iw_handler) cfg80211_wext_giwrts, /* SIOCGIWRTS */ + (iw_handler) cfg80211_wext_siwfrag, /* SIOCSIWFRAG */ + (iw_handler) cfg80211_wext_giwfrag, /* SIOCGIWFRAG */ (iw_handler) ieee80211_ioctl_siwtxpower, /* SIOCSIWTXPOW */ (iw_handler) ieee80211_ioctl_giwtxpower, /* SIOCGIWTXPOW */ - (iw_handler) ieee80211_ioctl_siwretry, /* SIOCSIWRETRY */ - (iw_handler) ieee80211_ioctl_giwretry, /* SIOCGIWRETRY */ + (iw_handler) cfg80211_wext_siwretry, /* SIOCSIWRETRY */ + (iw_handler) cfg80211_wext_giwretry, /* SIOCGIWRETRY */ (iw_handler) ieee80211_ioctl_siwencode, /* SIOCSIWENCODE */ (iw_handler) ieee80211_ioctl_giwencode, /* SIOCGIWENCODE */ (iw_handler) ieee80211_ioctl_siwpower, /* SIOCSIWPOWER */ diff --git a/net/wireless/core.c b/net/wireless/core.c index 827a5626355..f256b4f7e83 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -273,6 +273,16 @@ struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv) drv->wiphy.dev.class = &ieee80211_class; drv->wiphy.dev.platform_data = drv; + /* + * Initialize wiphy parameters to IEEE 802.11 MIB default values. + * Fragmentation and RTS threshold are disabled by default with the + * special -1 value. + */ + drv->wiphy.retry_short = 7; + drv->wiphy.retry_long = 4; + drv->wiphy.frag_threshold = (u32) -1; + drv->wiphy.rts_threshold = (u32) -1; + return &drv->wiphy; } EXPORT_SYMBOL(wiphy_new); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 16f86356ac9..5a9a5c6c71d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -61,6 +61,10 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_WIPHY_TXQ_PARAMS] = { .type = NLA_NESTED }, [NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_CHANNEL_TYPE] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_RETRY_SHORT] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_RETRY_LONG] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, @@ -204,6 +208,16 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx); NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy)); + + NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_RETRY_SHORT, + dev->wiphy.retry_short); + NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_RETRY_LONG, + dev->wiphy.retry_long); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_FRAG_THRESHOLD, + dev->wiphy.frag_threshold); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD, + dev->wiphy.rts_threshold); + NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, dev->wiphy.max_scan_ssids); NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN, @@ -416,6 +430,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev; int result = 0, rem_txq_params = 0; struct nlattr *nl_txq_params; + u32 changed; + u8 retry_short = 0, retry_long = 0; + u32 frag_threshold = 0, rts_threshold = 0; rtnl_lock(); @@ -530,6 +547,84 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) goto bad_res; } + changed = 0; + + if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) { + retry_short = nla_get_u8( + 
info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]); + if (retry_short == 0) { + result = -EINVAL; + goto bad_res; + } + changed |= WIPHY_PARAM_RETRY_SHORT; + } + + if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) { + retry_long = nla_get_u8( + info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]); + if (retry_long == 0) { + result = -EINVAL; + goto bad_res; + } + changed |= WIPHY_PARAM_RETRY_LONG; + } + + if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) { + frag_threshold = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]); + if (frag_threshold < 256) { + result = -EINVAL; + goto bad_res; + } + if (frag_threshold != (u32) -1) { + /* + * Fragments (apart from the last one) are required to + * have even length. Make the fragmentation code + * simpler by stripping LSB should someone try to use + * odd threshold value. + */ + frag_threshold &= ~0x1; + } + changed |= WIPHY_PARAM_FRAG_THRESHOLD; + } + + if (info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]) { + rts_threshold = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]); + changed |= WIPHY_PARAM_RTS_THRESHOLD; + } + + if (changed) { + u8 old_retry_short, old_retry_long; + u32 old_frag_threshold, old_rts_threshold; + + if (!rdev->ops->set_wiphy_params) { + result = -EOPNOTSUPP; + goto bad_res; + } + + old_retry_short = rdev->wiphy.retry_short; + old_retry_long = rdev->wiphy.retry_long; + old_frag_threshold = rdev->wiphy.frag_threshold; + old_rts_threshold = rdev->wiphy.rts_threshold; + + if (changed & WIPHY_PARAM_RETRY_SHORT) + rdev->wiphy.retry_short = retry_short; + if (changed & WIPHY_PARAM_RETRY_LONG) + rdev->wiphy.retry_long = retry_long; + if (changed & WIPHY_PARAM_FRAG_THRESHOLD) + rdev->wiphy.frag_threshold = frag_threshold; + if (changed & WIPHY_PARAM_RTS_THRESHOLD) + rdev->wiphy.rts_threshold = rts_threshold; + + result = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); + if (result) { + rdev->wiphy.retry_short = old_retry_short; + rdev->wiphy.retry_long = old_retry_long; + rdev->wiphy.frag_threshold = old_frag_threshold; + rdev->wiphy.rts_threshold = old_rts_threshold; + } + } bad_res: mutex_unlock(&rdev->mtx); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 4e054ea9c0a..3279e7f038d 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -314,3 +314,154 @@ struct ieee80211_channel *cfg80211_wext_freq(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_wext_freq); + +int cfg80211_wext_siwrts(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *rts, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u32 orts = wdev->wiphy->rts_threshold; + int err; + + if (rts->disabled || !rts->fixed) + wdev->wiphy->rts_threshold = (u32) -1; + else if (rts->value < 0) + return -EINVAL; + else + wdev->wiphy->rts_threshold = rts->value; + + err = rdev->ops->set_wiphy_params(wdev->wiphy, + WIPHY_PARAM_RTS_THRESHOLD); + if (err) + wdev->wiphy->rts_threshold = orts; + + return err; +} +EXPORT_SYMBOL(cfg80211_wext_siwrts); + +int cfg80211_wext_giwrts(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *rts, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + rts->value = wdev->wiphy->rts_threshold; + rts->disabled = rts->value == (u32) -1; + rts->fixed = 1; + + return 0; +} +EXPORT_SYMBOL(cfg80211_wext_giwrts); + +int cfg80211_wext_siwfrag(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *frag, char *extra) +{ + struct wireless_dev 
*wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u32 ofrag = wdev->wiphy->frag_threshold; + int err; + + if (frag->disabled || !frag->fixed) + wdev->wiphy->frag_threshold = (u32) -1; + else if (frag->value < 256) + return -EINVAL; + else { + /* Fragment length must be even, so strip LSB. */ + wdev->wiphy->frag_threshold = frag->value & ~0x1; + } + + err = rdev->ops->set_wiphy_params(wdev->wiphy, + WIPHY_PARAM_FRAG_THRESHOLD); + if (err) + wdev->wiphy->frag_threshold = ofrag; + + return err; +} +EXPORT_SYMBOL(cfg80211_wext_siwfrag); + +int cfg80211_wext_giwfrag(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *frag, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + frag->value = wdev->wiphy->frag_threshold; + frag->disabled = frag->value == (u32) -1; + frag->fixed = 1; + + return 0; +} +EXPORT_SYMBOL(cfg80211_wext_giwfrag); + +int cfg80211_wext_siwretry(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *retry, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u32 changed = 0; + u8 olong = wdev->wiphy->retry_long; + u8 oshort = wdev->wiphy->retry_short; + int err; + + if (retry->disabled || + (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) + return -EINVAL; + + if (retry->flags & IW_RETRY_LONG) { + wdev->wiphy->retry_long = retry->value; + changed |= WIPHY_PARAM_RETRY_LONG; + } else if (retry->flags & IW_RETRY_SHORT) { + wdev->wiphy->retry_short = retry->value; + changed |= WIPHY_PARAM_RETRY_SHORT; + } else { + wdev->wiphy->retry_short = retry->value; + wdev->wiphy->retry_long = retry->value; + changed |= WIPHY_PARAM_RETRY_LONG; + changed |= WIPHY_PARAM_RETRY_SHORT; + } + + if (!changed) + return 0; + + err = rdev->ops->set_wiphy_params(wdev->wiphy, changed); + if (err) { + wdev->wiphy->retry_short = oshort; + wdev->wiphy->retry_long = olong; + } + + return err; +} +EXPORT_SYMBOL(cfg80211_wext_siwretry); + +int cfg80211_wext_giwretry(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *retry, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + retry->disabled = 0; + + if (retry->flags == 0 || (retry->flags & IW_RETRY_SHORT)) { + /* + * First return short value, iwconfig will ask long value + * later if needed + */ + retry->flags |= IW_RETRY_LIMIT; + retry->value = wdev->wiphy->retry_short; + if (wdev->wiphy->retry_long != wdev->wiphy->retry_short) + retry->flags |= IW_RETRY_LONG; + + return 0; + } + + if (retry->flags & IW_RETRY_LONG) { + retry->flags = IW_RETRY_LIMIT | IW_RETRY_LONG; + retry->value = wdev->wiphy->retry_long; + } + + return 0; +} +EXPORT_SYMBOL(cfg80211_wext_giwretry); -- cgit v1.2.3-70-g09d2 From e7ec86f54e519e8e86f1cf328db13263f3ef8bd4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 18 Apr 2009 17:33:24 +0200 Subject: mac80211: validate TIM IE length (redux) The TIM IE must not be shorter than 4 bytes, so verify that when parsing it and use the proper type. To ease that adjust struct ieee80211_tim_ie to have a virtual bitmap of size at least 1. Also check that the TIM IE is actually present before trying to parse it! Because other people may need the function, make it a static inline in ieee80211.h. (The original "mac80211: validate TIM IE length" was a minimal fix for 2.6.30. This purports to be the full, correct fix. -- JWL) Signed-off-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/ieee80211.h | 32 +++++++++++++++++++++++++++++++- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mlme.c | 27 ++------------------------- net/mac80211/util.c | 6 ++++-- 4 files changed, 38 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 53563d53b5a..c52e7fba4e4 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -540,7 +540,7 @@ struct ieee80211_tim_ie { u8 dtim_period; u8 bitmap_ctrl; /* variable size: 1 - 251 bytes */ - u8 virtual_map[0]; + u8 virtual_map[1]; } __attribute__ ((packed)); #define WLAN_SA_QUERY_TR_ID_LEN 16 @@ -1392,4 +1392,34 @@ static inline unsigned long ieee80211_tu_to_usec(unsigned long tu) return 1024 * tu; } +/** + * ieee80211_check_tim - check if AID bit is set in TIM + * @tim: the TIM IE + * @tim_len: length of the TIM IE + * @aid: the AID to look for + */ +static inline bool ieee80211_check_tim(struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid) +{ + u8 mask; + u8 index, indexn1, indexn2; + + if (unlikely(!tim || tim_len < sizeof(*tim))) + return false; + + aid &= 0x3fff; + index = aid / 8; + mask = 1 << (aid & 7); + + indexn1 = tim->bitmap_ctrl & 0xfe; + indexn2 = tim_len + indexn1 - 4; + + if (index < indexn1 || index > indexn2) + return false; + + index -= indexn1; + + return !!(tim->virtual_map[index] & mask); +} + #endif /* LINUX_IEEE80211_H */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index dba78d89a10..1579bc92c88 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -832,7 +832,7 @@ struct ieee802_11_elems { u8 *fh_params; u8 *ds_params; u8 *cf_params; - u8 *tim; + struct ieee80211_tim_ie *tim; u8 *ibss_params; u8 *challenge; u8 *wpa; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 428742d7f44..1b0b7aa387e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -675,30 +675,6 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, } } -static bool ieee80211_check_tim(struct ieee802_11_elems *elems, u16 aid) -{ - u8 mask; - u8 index, indexn1, indexn2; - struct ieee80211_tim_ie *tim = (struct ieee80211_tim_ie *) elems->tim; - - if (unlikely(!tim || elems->tim_len < 4)) - return false; - - aid &= 0x3fff; - index = aid / 8; - mask = 1 << (aid & 7); - - indexn1 = tim->bitmap_ctrl & 0xfe; - indexn2 = elems->tim_len + indexn1 - 4; - - if (index < indexn1 || index > indexn2) - return false; - - index -= indexn1; - - return !!(tim->virtual_map[index] & mask); -} - static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, u16 capab, bool erp_valid, u8 erp) { @@ -1806,7 +1782,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, care_about_ies, ncrc); if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) - directed_tim = ieee80211_check_tim(&elems, ifmgd->aid); + directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len, + ifmgd->aid); ncrc = crc32_be(ncrc, (void *)&directed_tim, sizeof(directed_tim)); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 11244212f41..61876eb50b4 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -588,8 +588,10 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, elems->cf_params_len = elen; break; case WLAN_EID_TIM: - elems->tim = pos; - elems->tim_len = elen; + if (elen >= sizeof(struct ieee80211_tim_ie)) { + elems->tim = (void *)pos; + elems->tim_len = elen; + } break; case WLAN_EID_IBSS_PARAMS: elems->ibss_params = pos; -- cgit v1.2.3-70-g09d2 
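The TIM lookup above is easy to get wrong: the low bit of bitmap_ctrl is the multicast indicator, the remaining bits encode the (even) bitmap offset N1, and tim_len + N1 - 4 is the last octet the partial virtual bitmap covers. As a sanity check, here is a small self-contained userspace harness mirroring the new ieee80211_check_tim() logic; the struct copies ieee80211_tim_ie, while the harness itself (names and test values) is illustrative only and not part of the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tim_ie {			/* mirrors struct ieee80211_tim_ie */
	uint8_t dtim_count;
	uint8_t dtim_period;
	uint8_t bitmap_ctrl;
	uint8_t virtual_map[1];
};

static bool check_tim(const struct tim_ie *tim, uint8_t tim_len, uint16_t aid)
{
	uint8_t index, indexn1, indexn2;

	if (!tim || tim_len < sizeof(*tim))
		return false;

	aid &= 0x3fff;			/* the AID is 14 bits wide */
	index = aid / 8;		/* octet that would hold this AID */

	indexn1 = tim->bitmap_ctrl & 0xfe;	/* first octet transmitted */
	indexn2 = tim_len + indexn1 - 4;	/* last octet transmitted */

	if (index < indexn1 || index > indexn2)
		return false;		/* AID outside the partial bitmap */

	return tim->virtual_map[index - indexn1] & (1 << (aid & 7));
}

int main(void)
{
	/* TIM body: DTIM count 0, period 1, offset 0, one bitmap octet */
	uint8_t buf[] = { 0, 1, 0x00, 0x04 };	/* bit 2 set => AID 2 */

	printf("AID 2: %d\n", check_tim((struct tim_ie *)buf, sizeof(buf), 2));
	printf("AID 3: %d\n", check_tim((struct tim_ie *)buf, sizeof(buf), 3));
	return 0;
}

Fed the four-byte TIM above, it reports AID 2 as buffered and AID 3 as idle, matching the in-kernel behaviour, and a buffer shorter than sizeof(struct tim_ie) is now rejected instead of being read past its end.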
From 8e30bc55de98c000b0b836cb42525c82f605f191 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2009 17:45:38 +0200 Subject: nl80211: allow configuring IBSS beacon interval Make the JOIN_IBSS command look at the beacon interval attribute to see if the user requested a specific beacon interval, if not default to 100 TU (wext too). Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 4 +++- include/net/cfg80211.h | 2 ++ net/wireless/ibss.c | 3 +++ net/wireless/nl80211.c | 12 +++++++++++- 4 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index dc9d9ec5d1a..b6a48dd502c 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -230,7 +230,9 @@ * and optionally a MAC (as BSSID) and FREQ_FIXED attribute if those * should be fixed rather than automatically determined. Can only be * executed on a network interface that is UP, and fixed BSSID/FREQ - * may be rejected. + * may be rejected. Another optional parameter is the beacon interval, + * given in the %NL80211_ATTR_BEACON_INTERVAL attribute, which if not + * given defaults to 100 TU (102.4ms). * @NL80211_CMD_LEAVE_IBSS: Leave the IBSS -- no special arguments, the IBSS is * determined by the network interface. * diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 54bc69c8369..7f7b53b69cb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -733,6 +733,7 @@ struct cfg80211_disassoc_request { * IBSSs to join on other channels. * @ie: information element(s) to include in the beacon * @ie_len: length of that + * @beacon_interval: beacon interval to use */ struct cfg80211_ibss_params { u8 *ssid; @@ -740,6 +741,7 @@ struct cfg80211_ibss_params { struct ieee80211_channel *channel; u8 *ie; u8 ssid_len, ie_len; + u16 beacon_interval; bool channel_fixed; }; diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index b5c601e1b1b..3c38afaed28 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -116,6 +116,9 @@ static int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, enum ieee80211_band band; int i; + if (!wdev->wext.beacon_interval) + wdev->wext.beacon_interval = 100; + /* try to find an IBSS channel if none requested ... 
*/ if (!wdev->wext.channel) { for (band = 0; band < IEEE80211_NUM_BANDS; band++) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 97bb5c80125..3b21b3e89e9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3159,6 +3159,8 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) struct wiphy *wiphy; int err; + memset(&ibss, 0, sizeof(ibss)); + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) return -EINVAL; @@ -3167,6 +3169,15 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) !nla_len(info->attrs[NL80211_ATTR_SSID])) return -EINVAL; + ibss.beacon_interval = 100; + + if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { + ibss.beacon_interval = + nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); + if (ibss.beacon_interval < 1 || ibss.beacon_interval > 10000) + return -EINVAL; + } + rtnl_lock(); err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); @@ -3189,7 +3200,6 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) } wiphy = &drv->wiphy; - memset(&ibss, 0, sizeof(ibss)); if (info->attrs[NL80211_ATTR_MAC]) ibss.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); -- cgit v1.2.3-70-g09d2 From 1965c85331ed29dc4fd32479ff31663e3e9a518f Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 22 Apr 2009 21:38:25 +0300 Subject: nl80211: Add event for authentication/association timeout SME needs to be notified when the authentication or association attempt times out and MLME has stopped processing in order to allow the SME to decide what to do next. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/nl80211.h | 13 +++++++++++-- include/net/cfg80211.h | 22 ++++++++++++++++++++-- net/mac80211/mlme.c | 4 ++-- net/wireless/mlme.c | 27 +++++++++++++++++++++++++++ net/wireless/nl80211.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.h | 6 ++++++ 6 files changed, 115 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index b6a48dd502c..e9fd13aa79f 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -203,8 +203,12 @@ * frame, i.e., it was for the local STA and was received in correct * state. This is similar to MLME-AUTHENTICATE.confirm primitive in the * MLME SAP interface (kernel providing MLME, userspace SME). The - * included NL80211_ATTR_FRAME attribute contains the management frame - * (including both the header and frame body, but not FCS). + * included %NL80211_ATTR_FRAME attribute contains the management frame + * (including both the header and frame body, but not FCS). This event is + * also used to indicate if the authentication attempt timed out. In that + * case the %NL80211_ATTR_FRAME attribute is replaced with a + * %NL80211_ATTR_TIMED_OUT flag (and %NL80211_ATTR_MAC to indicate which + * pending authentication timed out). 
* @NL80211_CMD_ASSOCIATE: association request and notification; like * NL80211_CMD_AUTHENTICATE but for Association and Reassociation * (similar to MLME-ASSOCIATE.request, MLME-REASSOCIATE.request, @@ -487,6 +491,9 @@ enum nl80211_commands { * @NL80211_ATTR_FREQ_FIXED: a flag indicating the IBSS should not try to look * for other networks on different channels * + * @NL80211_ATTR_TIMED_OUT: a flag indicating than an operation timed out; this + * is used, e.g., with %NL80211_CMD_AUTHENTICATE event + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -587,6 +594,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_FRAG_THRESHOLD, NL80211_ATTR_WIPHY_RTS_THRESHOLD, + NL80211_ATTR_TIMED_OUT, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7f7b53b69cb..b8a76764e1c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1475,10 +1475,19 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss); * @len: length of the frame data * * This function is called whenever an authentication has been processed in - * station mode. + * station mode. The driver is required to call either this function or + * cfg80211_send_auth_timeout() to indicate the result of cfg80211_ops::auth() + * call. */ void cfg80211_send_rx_auth(struct net_device *dev, const u8 *buf, size_t len); +/** + * cfg80211_send_auth_timeout - notification of timed out authentication + * @dev: network device + * @addr: The MAC address of the device with which the authentication timed out + */ +void cfg80211_send_auth_timeout(struct net_device *dev, const u8 *addr); + /** * cfg80211_send_rx_assoc - notification of processed association * @dev: network device @@ -1486,10 +1495,19 @@ void cfg80211_send_rx_auth(struct net_device *dev, const u8 *buf, size_t len); * @len: length of the frame data * * This function is called whenever a (re)association response has been - * processed in station mode. + * processed in station mode. The driver is required to call either this + * function or cfg80211_send_assoc_timeout() to indicate the result of + * cfg80211_ops::assoc() call. 
*/ void cfg80211_send_rx_assoc(struct net_device *dev, const u8 *buf, size_t len); +/** + * cfg80211_send_assoc_timeout - notification of timed out association + * @dev: network device + * @addr: The MAC address of the device with which the association timed out + */ +void cfg80211_send_assoc_timeout(struct net_device *dev, const u8 *addr); + /** * cfg80211_send_deauth - notification of processed deauthentication * @dev: network device diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index df27c68620c..3610c11286b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -932,7 +932,7 @@ static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata) " timed out\n", sdata->dev->name, ifmgd->bssid); ifmgd->state = IEEE80211_STA_MLME_DISABLED; - ieee80211_sta_send_apinfo(sdata); + cfg80211_send_auth_timeout(sdata->dev, ifmgd->bssid); ieee80211_rx_bss_remove(sdata, ifmgd->bssid, sdata->local->hw.conf.channel->center_freq, ifmgd->ssid, ifmgd->ssid_len); @@ -1115,7 +1115,7 @@ static void ieee80211_associate(struct ieee80211_sub_if_data *sdata) " timed out\n", sdata->dev->name, ifmgd->bssid); ifmgd->state = IEEE80211_STA_MLME_DISABLED; - ieee80211_sta_send_apinfo(sdata); + cfg80211_send_assoc_timeout(sdata->dev, ifmgd->bssid); ieee80211_rx_bss_remove(sdata, ifmgd->bssid, sdata->local->hw.conf.channel->center_freq, ifmgd->ssid, ifmgd->ssid_len); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 1407244a647..42184361a10 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -44,6 +44,33 @@ void cfg80211_send_disassoc(struct net_device *dev, const u8 *buf, size_t len) } EXPORT_SYMBOL(cfg80211_send_disassoc); +static void cfg80211_wext_disconnected(struct net_device *dev) +{ +#ifdef CONFIG_WIRELESS_EXT + union iwreq_data wrqu; + memset(&wrqu, 0, sizeof(wrqu)); + wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); +#endif +} + +void cfg80211_send_auth_timeout(struct net_device *dev, const u8 *addr) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + nl80211_send_auth_timeout(rdev, dev, addr); + cfg80211_wext_disconnected(dev); +} +EXPORT_SYMBOL(cfg80211_send_auth_timeout); + +void cfg80211_send_assoc_timeout(struct net_device *dev, const u8 *addr) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + nl80211_send_assoc_timeout(rdev, dev, addr); + cfg80211_wext_disconnected(dev); +} +EXPORT_SYMBOL(cfg80211_send_assoc_timeout); + void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, enum nl80211_key_type key_type, int key_id, const u8 *tsc) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3b21b3e89e9..b1fc98225fd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -121,6 +121,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_AUTH_TYPE] = { .type = NLA_U32 }, [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 }, [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG }, + [NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG }, }; /* IE validation */ @@ -3695,6 +3696,54 @@ void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, NL80211_CMD_DISASSOCIATE); } +void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, int cmd, + const u8 *addr) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) 
{ + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT_FLAG(msg, NL80211_ATTR_TIMED_OUT); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, addr); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_mlme_mcgrp.id, GFP_ATOMIC); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *addr) +{ + nl80211_send_mlme_timeout(rdev, netdev, NL80211_CMD_AUTHENTICATE, + addr); +} + +void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *addr) +{ + nl80211_send_mlme_timeout(rdev, netdev, NL80211_CMD_ASSOCIATE, addr); +} + void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, gfp_t gfp) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 17d2d8bfaf7..5c12ad13499 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -23,6 +23,12 @@ extern void nl80211_send_deauth(struct cfg80211_registered_device *rdev, extern void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len); +extern void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *addr); +extern void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *addr); extern void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, -- cgit v1.2.3-70-g09d2 From 9d5c5d8f4105dc56ec10864b195dd1714f282c22 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Tue, 21 Apr 2009 23:26:22 +0000 Subject: af_iucv: add sockopt() to enable/disable use of IPRM_DATA msgs Provide the socket operations getsockopt() and setsockopt() to enable/disable sending of data in the parameter list of IUCV messages. The patch only sets the respective flag. Signed-off-by: Hendrik Brueckner Signed-off-by: Ursula Braun Signed-off-by: David S.
Miller --- include/linux/socket.h | 1 + include/net/iucv/af_iucv.h | 4 +++ net/iucv/af_iucv.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 42a0396f2c5..d2310cb45d2 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -303,6 +303,7 @@ struct ucred { #define SOL_BLUETOOTH 274 #define SOL_PNPIPE 275 #define SOL_RDS 276 +#define SOL_IUCV 277 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index 85f80eadfa3..78a72764aef 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -73,8 +73,12 @@ struct iucv_sock { struct sk_buff_head backlog_skb_q; struct sock_msg_q message_q; unsigned int send_tag; + u8 flags; }; +/* iucv socket options (SOL_IUCV) */ +#define SO_IPRMDATA_MSG 0x0080 /* send/recv IPRM_DATA msgs */ + struct iucv_sock_list { struct hlist_head head; rwlock_t lock; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 6cf02b41ef9..b7c40c97992 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -32,7 +32,7 @@ #define CONFIG_IUCV_SOCK_DEBUG 1 #define IPRMDATA 0x80 -#define VERSION "1.0" +#define VERSION "1.1" static char iucv_userid[80]; @@ -226,6 +226,7 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio) spin_lock_init(&iucv_sk(sk)->message_q.lock); skb_queue_head_init(&iucv_sk(sk)->backlog_skb_q); iucv_sk(sk)->send_tag = 0; + iucv_sk(sk)->flags = 0; sk->sk_destruct = iucv_sock_destruct; sk->sk_sndtimeo = IUCV_CONN_TIMEOUT; @@ -1003,6 +1004,78 @@ static int iucv_sock_release(struct socket *sock) return err; } +/* getsockopt and setsockopt */ +static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct iucv_sock *iucv = iucv_sk(sk); + int val; + int rc; + + if (level != SOL_IUCV) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *) optval)) + return -EFAULT; + + rc = 0; + + lock_sock(sk); + switch (optname) { + case SO_IPRMDATA_MSG: + if (val) + iucv->flags |= IUCV_IPRMDATA; + else + iucv->flags &= ~IUCV_IPRMDATA; + break; + default: + rc = -ENOPROTOOPT; + break; + } + release_sock(sk); + + return rc; +} + +static int iucv_sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct iucv_sock *iucv = iucv_sk(sk); + int val, len; + + if (level != SOL_IUCV) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < 0) + return -EINVAL; + + len = min_t(unsigned int, len, sizeof(int)); + + switch (optname) { + case SO_IPRMDATA_MSG: + val = (iucv->flags & IUCV_IPRMDATA) ? 
1 : 0; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + + /* Callback wrappers - called from iucv base support */ static int iucv_callback_connreq(struct iucv_path *path, u8 ipvmid[8], u8 ipuser[16]) @@ -1229,8 +1302,8 @@ static struct proto_ops iucv_sock_ops = { .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = iucv_sock_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt + .setsockopt = iucv_sock_setsockopt, + .getsockopt = iucv_sock_getsockopt, }; static struct net_proto_family iucv_sock_family_ops = { -- cgit v1.2.3-70-g09d2 From 89ec0dee9eba6275d47be0b878cf5f6d5c2fb6eb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 11:03:29 -0400 Subject: tracing: increase size of number of possible events With the new event tracing registration, we must increase the number of events that can be registered. Currently the type field is only one byte, which leaves us only 256 possible events. Since we do not save the CPU number in the tracer anymore (it is determined by the per cpu ring buffer that is used) we have an extra byte to use. This patch increases the size of type from 1 byte (256 events) to 2 bytes (65,536 events). It also adds a WARN_ON_ONCE if we exceed that limit. [ Impact: allow more than 255 events ] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 5 ++++- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_output.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2a4a4074991..07e0a6d64a2 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -16,13 +16,16 @@ struct dentry; * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { - int type; + unsigned short type; unsigned char flags; unsigned char preempt_count; int pid; int tgid; }; +#define FTRACE_MAX_EVENT \ + ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) + /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5d6e879cf87..9887131afa0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(int, type), + FIELD(unsigned short, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 83a8abb9640..06997e75114 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -537,6 +537,8 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); + WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); -- cgit v1.2.3-70-g09d2 From 334d4169a6592d3fcd863bbe822a8f6985ffa9af Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 24 Apr 2009 11:27:05 +0800 Subject: ring_buffer: compressed event header RB_MAX_SMALL_DATA = 28 bytes is too small for most tracers; it wastes a 'u32' to save the actual length of events whose data size > 28. This fix uses a compressed event header and enlarges RB_MAX_SMALL_DATA (a standalone model of the encoding follows).
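To make the compressed encoding concrete: a data event whose payload rounds up to at most 28 words stores its length directly in the 5-bit type_len field, while larger events set type_len to 0 and spill the byte count into array[0]; type_len values above 28 stay free for the padding, time-extend and time-stamp records. Below is a standalone userspace model with invented names -- the kernel's rb_update_event()/rb_event_length() are the authoritative versions.

/*
 * Standalone model of the compressed header -- hypothetical userspace
 * code for illustration only, not the kernel implementation.
 */
#include <stdint.h>
#include <stdio.h>

#define RB_ALIGNMENT	4U
#define TYPE_LEN_MAX	28	/* RINGBUF_TYPE_DATA_TYPE_LEN_MAX */
#define MAX_SMALL_DATA	(RB_ALIGNMENT * TYPE_LEN_MAX)	/* 112 bytes */

struct rb_event {
	uint32_t type_len:5, time_delta:27;
	uint32_t array[2];	/* enough for this demo */
};

static void encode_data_event(struct rb_event *e, uint32_t length)
{
	if (length > MAX_SMALL_DATA) {
		e->type_len = 0;	/* length spills into array[0] */
		e->array[0] = length;
	} else {
		/* DIV_ROUND_UP(length, 4) gives 1..28 */
		e->type_len = (length + RB_ALIGNMENT - 1) / RB_ALIGNMENT;
	}
}

static uint32_t decode_data_length(const struct rb_event *e)
{
	return e->type_len ? e->type_len * RB_ALIGNMENT : e->array[0];
}

int main(void)
{
	struct rb_event e = { 0 };

	encode_data_event(&e, 40);	/* small: fits in type_len */
	printf("40 bytes -> type_len=%u, len=%u\n",
	       (unsigned)e.type_len, (unsigned)decode_data_length(&e));

	encode_data_event(&e, 500);	/* large: type_len=0, array[0]=500 */
	printf("500 bytes -> type_len=%u, len=%u\n",
	       (unsigned)e.type_len, (unsigned)decode_data_length(&e));
	return 0;
}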
[ Impact: saves about 0%-12.5%(depends on tracer) memory in ring_buffer ] Signed-off-by: Lai Jiangshan LKML-Reference: <49F13189.3090000@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 16 +++++---- kernel/trace/ring_buffer.c | 83 ++++++++++++++++++++++----------------------- 2 files changed, 50 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index fac8f1ac6f4..1c2f80911fb 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -11,7 +11,7 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { - u32 type:2, len:3, time_delta:27; + u32 type_len:5, time_delta:27; u32 array[]; }; @@ -24,7 +24,8 @@ struct ring_buffer_event { * size is variable depending on how much * padding is needed * If time_delta is non zero: - * everything else same as RINGBUF_TYPE_DATA + * array[0] holds the actual length + * size = 4 + length (bytes) * * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta * array[0] = time delta (28 .. 59) @@ -35,22 +36,23 @@ struct ring_buffer_event { * array[1..2] = tv_sec * size = 16 bytes * - * @RINGBUF_TYPE_DATA: Data record - * If len is zero: + * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX: + * Data record + * If type_len is zero: * array[0] holds the actual length * array[1..(length+3)/4] holds data - * size = 4 + 4 + length (bytes) + * size = 4 + length (bytes) * else - * length = len << 2 + * length = type_len << 2 * array[0..(length+3)/4-1] holds data * size = 4 + length (bytes) */ enum ring_buffer_type { + RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28, RINGBUF_TYPE_PADDING, RINGBUF_TYPE_TIME_EXTEND, /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */ RINGBUF_TYPE_TIME_STAMP, - RINGBUF_TYPE_DATA, }; unsigned ring_buffer_event_length(struct ring_buffer_event *event); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 61dbdf21cd3..9692f100ec1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -28,8 +28,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) { int ret; - ret = trace_seq_printf(s, "\ttype : 2 bits\n"); - ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "# compressed entry header\n"); + ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); ret = trace_seq_printf(s, "\tarray : 32 bits\n"); ret = trace_seq_printf(s, "\n"); @@ -37,8 +37,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) RINGBUF_TYPE_PADDING); ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); - ret = trace_seq_printf(s, "\tdata : type == %d\n", - RINGBUF_TYPE_DATA); + ret = trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); return ret; } @@ -204,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA 28 +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... 
RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, @@ -213,17 +216,18 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; + return event->type_len == RINGBUF_TYPE_PADDING + && event->time_delta == 0; } static inline int rb_discarded_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } @@ -232,8 +236,8 @@ rb_event_data_length(struct ring_buffer_event *event) { unsigned length; - if (event->len) - length = event->len * RB_ALIGNMENT; + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -243,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event) static unsigned rb_event_length(struct ring_buffer_event *event) { - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; - return rb_event_data_length(event); + return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -272,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event) unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length = rb_event_length(event); - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -285,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { - BUG_ON(event->type != RINGBUF_TYPE_DATA); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ - if (event->len) + if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; @@ -988,7 +992,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->overrun++; cpu_buffer->entries--; @@ -1133,28 +1137,21 @@ static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { - event->type = type; + event->type_len = type; switch (type) { case RINGBUF_TYPE_PADDING: - break; - case RINGBUF_TYPE_TIME_EXTEND: - event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); - break; - case RINGBUF_TYPE_TIME_STAMP: - event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; - case RINGBUF_TYPE_DATA: + case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { - event->len = 0; + if (length > RB_MAX_SMALL_DATA) event->array[0] = length; - } else - event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); + else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); @@ -1562,7 +1559,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + event = 
rb_reserve_next_event(cpu_buffer, 0, length); if (!event) goto out; @@ -1634,7 +1631,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ if (!event->time_delta) event->time_delta = 1; @@ -1786,8 +1785,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, - RINGBUF_TYPE_DATA, event_length); + event = rb_reserve_next_event(cpu_buffer, 0, event_length); if (!event) goto out; @@ -2035,7 +2033,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2066,7 +2064,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2181,7 +2179,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX + || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2262,7 +2261,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_reader_event(cpu_buffer); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); @@ -2334,7 +2333,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_head_event(iter); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); @@ -2393,7 +2392,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2421,7 +2420,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2466,7 +2465,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2559,7 +2558,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2766,7 +2765,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->entries--; } -- cgit v1.2.3-70-g09d2 From 1cb81b143fa8f0e4629f10690862e2e52ca792ff Mon 
Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:51:43 +0200 Subject: x86, bts, mm: clean up buffer allocation The current mm interface is asymmetric. One function allocates a locked buffer, another function only refunds the memory. Change this to have two functions for accounting and refunding locked memory, respectively; and do the actual buffer allocation in ptrace. [ Impact: refactor BTS buffer allocation code ] Signed-off-by: Markus Metzger Acked-by: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <20090424095143.A30265@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 39 ++++++++++++++++++++++++++------------- include/linux/mm.h | 6 ++++-- mm/mlock.c | 36 +++++++++++++++++------------------- 3 files changed, 47 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d5252ae6c52..09ecbde91c1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -617,17 +617,28 @@ struct bts_context { struct work_struct work; }; -static inline void alloc_bts_buffer(struct bts_context *context, - unsigned int size) +static int alloc_bts_buffer(struct bts_context *context, unsigned int size) { - void *buffer; + void *buffer = NULL; + int err = -ENOMEM; - buffer = alloc_locked_buffer(size); - if (buffer) { - context->buffer = buffer; - context->size = size; - context->mm = get_task_mm(current); - } + err = account_locked_memory(current->mm, current->signal->rlim, size); + if (err < 0) + return err; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto out_refund; + + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + + return 0; + + out_refund: + refund_locked_memory(current->mm, size); + return err; } static inline void free_bts_buffer(struct bts_context *context) @@ -638,7 +649,7 @@ static inline void free_bts_buffer(struct bts_context *context) kfree(context->buffer); context->buffer = NULL; - refund_locked_buffer_memory(context->mm, context->size); + refund_locked_memory(context->mm, context->size); context->size = 0; mmput(context->mm); @@ -786,13 +797,15 @@ static int ptrace_bts_config(struct task_struct *child, context->tracer = NULL; if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + int err; + free_bts_buffer(context); if (!cfg.size) return 0; - alloc_bts_buffer(context, cfg.size); - if (!context->buffer) - return -ENOMEM; + err = alloc_bts_buffer(context, cfg.size); + if (err < 0) + return err; } if (cfg.flags & PTRACE_BTS_O_TRACE) diff --git a/include/linux/mm.h b/include/linux/mm.h index a3963ba23a6..009eabd3c21 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -19,6 +19,7 @@ struct anon_vma; struct file_ra_state; struct user_struct; struct writeback_control; +struct rlimit; #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -1319,7 +1320,8 @@ int vmemmap_populate_basepages(struct page *start_page, int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); -extern void *alloc_locked_buffer(size_t size); -extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); +extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size); +extern void refund_locked_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 28be15ead9c..ac130433c7d 100644 ---
a/mm/mlock.c +++ b/mm/mlock.c @@ -629,38 +629,36 @@ void user_shm_unlock(size_t size, struct user_struct *user) free_uid(user); } -void *alloc_locked_buffer(size_t size) +int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size) { - unsigned long rlim, vm, pgsz; - void *buffer = NULL; + unsigned long lim, vm, pgsz; + int error = -ENOMEM; pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - goto out; + down_write(&mm->mmap_sem); - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) + lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = mm->total_vm + pgsz; + if (lim < vm) goto out; - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) + lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = mm->locked_vm + pgsz; + if (lim < vm) goto out; - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; + mm->total_vm += pgsz; + mm->locked_vm += pgsz; + error = 0; out: - up_write(¤t->mm->mmap_sem); - return buffer; + up_write(&mm->mmap_sem); + return error; } -void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) +void refund_locked_memory(struct mm_struct *mm, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; -- cgit v1.2.3-70-g09d2 From b8e65554d80b4c560d201362d0e8fa02109d89fd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:50:39 -0400 Subject: tracing: remove deprecated TRACE_FORMAT The TRACE_FORMAT macro has been deprecated by the TRACE_EVENT macro. There are no more users. All new users must use the TRACE_EVENT macro. [ Impact: remove old functionality ] Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 5 ---- include/trace/define_trace.h | 4 --- include/trace/ftrace.h | 66 -------------------------------------------- 3 files changed, 75 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 4353f3f7e62..14df7e635d4 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -158,11 +158,6 @@ static inline void tracepoint_synchronize_unregister(void) #define PARAMS(args...) 
args -#ifndef TRACE_FORMAT -#define TRACE_FORMAT(name, proto, args, fmt) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#endif - #ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index abc611feeb8..f7a7ae1e8f9 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -26,10 +26,6 @@ #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ DEFINE_TRACE(name) -#undef TRACE_FORMAT -#define TRACE_FORMAT(name, proto, args, print) \ - DEFINE_TRACE(name) - #undef DECLARE_TRACE #define DECLARE_TRACE(name, proto, args) \ DEFINE_TRACE(name) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a77f71a46db..1e681142f1d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -18,9 +18,6 @@ #include -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - #undef __array #define __array(type, item, len) type item[len]; @@ -62,9 +59,6 @@ * */ -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - #undef __array #define __array(type, item, len) @@ -298,16 +292,6 @@ ftrace_define_fields_##call(void) \ * unregister_trace_(ftrace_event_); * } * - * For those macros defined with TRACE_FORMAT: - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, - * } - * * * For those macros defined with TRACE_EVENT: * @@ -417,56 +401,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ #define _TRACE_PROFILE_INIT(call) #endif -#define _TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, #call ": " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call event_##call; \ - \ -static int ftrace_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(NULL); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - return 0; \ -} - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_init_event_##call, \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - #undef __entry #define __entry entry -- cgit v1.2.3-70-g09d2 From a9e61e25f9d2e7e43bf17625f5cb56c9e0a89b17 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Tue, 31 Mar 2009 15:12:56 -0500 Subject: lockd: call locks_release_private to cleanup per-filesystem state For every lock request lockd creates a new file_lock object in nlmsvc_setgrantargs() by copying the passed-in file_lock with locks_copy_lock(). A filesystem can attach its own lock_operations vector to the file_lock. It has to be cleaned up at the end of the file_lock's life (a minimal sketch of that cleanup follows).
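A minimal standalone sketch of that end-of-life cleanup, in userspace C with invented names -- the kernel equivalents are struct file_lock, its fl_ops vector, and fl_ops->fl_release_private invoked from locks_release_private():

/*
 * Hypothetical model of the per-lock cleanup contract; illustration
 * only, not the kernel implementation.
 */
#include <stdio.h>
#include <stdlib.h>

struct lock;

struct lock_ops {
	void (*release_private)(struct lock *);
};

struct lock {
	void *private;			/* filesystem-owned state */
	const struct lock_ops *ops;
};

static void fs_release_private(struct lock *l)
{
	free(l->private);		/* filesystem frees its own state */
	l->private = NULL;
}

static const struct lock_ops fs_ops = {
	.release_private = fs_release_private,
};

/* Mirrors locks_release_private(): run the callback, then detach ops. */
static void lock_release_private(struct lock *l)
{
	if (l->ops && l->ops->release_private)
		l->ops->release_private(l);
	l->ops = NULL;
}

int main(void)
{
	struct lock l = { .private = malloc(16), .ops = &fs_ops };

	/* ... the lock is used ... */

	lock_release_private(&l);	/* must precede freeing the lock */
	printf("private released: %s\n", l.private ? "no" : "yes");
	return 0;
}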
However, lockd doesn't do it today, yet it asserts in nlmclnt_release_lockargs() that the per-filesystem state is clean. This patch fixes it by exporting locks_release_private() and adding it to nlmsvc_freegrantargs(), to be symmetrical to creating a file_lock in nlmsvc_setgrantargs(). Signed-off-by: Felix Blyakher Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 2 ++ fs/locks.c | 3 ++- include/linux/fs.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 83ee34203bd..e577a78d7ba 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -326,6 +326,8 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call) { if (call->a_args.lock.oh.data != call->a_owner) kfree(call->a_args.lock.oh.data); + + locks_release_private(&call->a_args.lock.fl); } /* diff --git a/fs/locks.c b/fs/locks.c index ec3deea29e3..b6440f52178 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -151,7 +151,7 @@ static struct file_lock *locks_alloc_lock(void) return kmem_cache_alloc(filelock_cache, GFP_KERNEL); } -static void locks_release_private(struct file_lock *fl) +void locks_release_private(struct file_lock *fl) { if (fl->fl_ops) { if (fl->fl_ops->fl_release_private) @@ -165,6 +165,7 @@ static void locks_release_private(struct file_lock *fl) } } +EXPORT_SYMBOL_GPL(locks_release_private); /* Free a lock which is not in use. */ static void locks_free_lock(struct file_lock *fl) diff --git a/include/linux/fs.h b/include/linux/fs.h index 5bed436f435..5ba615e8f53 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1108,6 +1108,7 @@ extern void locks_copy_lock(struct file_lock *, struct file_lock *); extern void __locks_copy_lock(struct file_lock *, const struct file_lock *); extern void locks_remove_posix(struct file *, fl_owner_t); extern void locks_remove_flock(struct file *); +extern void locks_release_private(struct file_lock *); extern void posix_test_lock(struct file *, struct file_lock *); extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); extern int posix_lock_file_wait(struct file *, struct file_lock *); -- cgit v1.2.3-70-g09d2 From 060fa5c83e67901ba47ab484cfcdb32737d630ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 12:20:52 -0400 Subject: tracing/events: reuse trace event ids after overflow With modules being able to add trace events, and with the max trace event counter being only 16 bits (65536), we can easily overflow the counter with a simple while loop that adds and removes modules containing trace events. This patch links together the registered trace events and, on overflow, searches for available trace event ids. It will still fail if over 65536 events are registered, but considering that a typical kernel only has 22000 functions, 65000 events should be sufficient (a standalone sketch of the gap search follows).
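The gap search works like the sketch below -- standalone userspace C with invented values; the kernel's trace_search_list() walks a sorted linked list of registered events rather than an array, but the logic is the same: advance while ids are consecutive, and hand out the first id that breaks the sequence.

/*
 * Hypothetical model of the id-recycling walk; illustration only.
 */
#include <stdio.h>

#define FIRST_ID 100		/* stands in for __TRACE_LAST_TYPE + 1 */
#define MAX_ID   65535		/* stands in for FTRACE_MAX_EVENT */

/* Sorted array of registered ids; invented sample data with a gap. */
static const int used[] = { 100, 101, 102, 104, 105 };
static const int nr_used = sizeof(used) / sizeof(used[0]);

/* Return the first free id, or 0 if every id up to MAX_ID is taken. */
static int find_free_id(void)
{
	int last = FIRST_ID - 1;
	int i;

	for (i = 0; i < nr_used; i++) {
		if (used[i] != last + 1)
			break;		/* found a gap after 'last' */
		last = used[i];
	}

	if (last + 1 > MAX_ID)
		return 0;		/* genuinely out of ids */
	return last + 1;
}

int main(void)
{
	printf("next id: %d\n", find_free_id());	/* prints 103 */
	return 0;
}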
Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace_output.c | 71 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 07e0a6d64a2..78a9ba24cbf 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -56,6 +56,7 @@ typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, int flags); struct trace_event { struct hlist_node node; + struct list_head list; int type; trace_print_func trace; trace_print_func raw; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 06997e75114..5fc51f0f75f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -483,6 +483,36 @@ struct trace_event *ftrace_find_event(int type) return NULL; } +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ + struct trace_event *e; + int last = __TRACE_LAST_TYPE; + + if (list_empty(&ftrace_event_list)) { + *list = &ftrace_event_list; + return last + 1; + } + + /* + * We used up all possible max events, + * lets see if somebody freed one. + */ + list_for_each_entry(e, &ftrace_event_list, list) { + if (e->type != last + 1) + break; + last++; + } + + /* Did we used up all 65 thousand events??? */ + if ((last + 1) > FTRACE_MAX_EVENT) + return 0; + + *list = &e->list; + return last + 1; +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -505,20 +535,40 @@ int register_ftrace_event(struct trace_event *event) mutex_lock(&trace_event_mutex); - if (!event) { - ret = next_event_type++; goto out; - } - if (!event->type) - event->type = next_event_type++; - else if (event->type > __TRACE_LAST_TYPE) { + if (WARN_ON(!event)) goto out; + + INIT_LIST_HEAD(&event->list); + + if (!event->type) { + struct list_head *list; + + if (next_event_type > FTRACE_MAX_EVENT) { + + event->type = trace_search_list(&list); + if (!event->type) + goto out; + + } else { + + event->type = next_event_type++; + list = &ftrace_event_list; + } + + if (WARN_ON(ftrace_find_event(event->type))) + goto out; + + list_add_tail(&event->list, list); + + } else if (event->type > __TRACE_LAST_TYPE) { printk(KERN_WARNING "Need to add type to trace.h\n"); WARN_ON(1); - } - - if (ftrace_find_event(event->type)) goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } if (event->trace == NULL) event->trace = trace_nop_print; @@ -537,8 +587,6 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); - WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); - return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); @@ -551,6 +599,7 @@ int unregister_ftrace_event(struct trace_event *event) { mutex_lock(&trace_event_mutex); hlist_del(&event->node); + list_del(&event->list); mutex_unlock(&trace_event_mutex); return 0; -- cgit v1.2.3-70-g09d2 From edf391ff17232f097d72441c9ad467bcb3b5db18 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 27 Apr 2009 02:45:02 -0700 Subject: snmp: add missing counters for RFC 4293 The IP MIB (RFC 4293) defines stats for InOctets, OutOctets, InMcastOctets and OutMcastOctets: http://tools.ietf.org/html/rfc4293 But it seems we don't track those in any way that is easy to separate from other protocols. This patch adds those missing counters to the stats file (a standalone sketch of the paired packet/octet update follows).
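The new SNMP_UPD_PO_STATS macros pair each packet counter with its octet counter by token-pasting PKTS and OCTETS onto a base field name, so one call bumps the packet count and adds the byte count together. Here is a standalone userspace model of that pattern -- the MIB names and counter storage are invented for illustration:

/*
 * Hypothetical model of the paired packet/octet update; not the
 * kernel's per-cpu SNMP machinery.
 */
#include <stdio.h>

enum {
	MIB_INPKTS, MIB_INOCTETS,
	MIB_OUTPKTS, MIB_OUTOCTETS,
	MIB_MAX
};

static unsigned long mibs[MIB_MAX];

/* Mirrors SNMP_UPD_PO_STATS: 'base' is MIB_IN or MIB_OUT. */
#define UPD_PO_STATS(base, bytes)		\
	do {					\
		mibs[base##PKTS]++;		\
		mibs[base##OCTETS] += (bytes);	\
	} while (0)

int main(void)
{
	UPD_PO_STATS(MIB_IN, 1500);	/* one 1500-byte packet received */
	UPD_PO_STATS(MIB_IN, 64);
	UPD_PO_STATS(MIB_OUT, 576);

	printf("InReceives=%lu InOctets=%lu OutRequests=%lu OutOctets=%lu\n",
	       mibs[MIB_INPKTS], mibs[MIB_INOCTETS],
	       mibs[MIB_OUTPKTS], mibs[MIB_OUTOCTETS]);
	return 0;
}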
Tested successfully by me With help from Eric Dumazet. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- include/linux/snmp.h | 10 ++++++++-- include/net/ip.h | 3 +++ include/net/ipv6.h | 15 ++++++++++++++- include/net/snmp.h | 19 ++++++++++++++++++- net/ipv4/ip_input.c | 13 ++++++++----- net/ipv4/ip_output.c | 12 ++++++------ net/ipv4/proc.c | 10 ++++++++-- net/ipv6/ip6_input.c | 7 ++++--- net/ipv6/ip6_output.c | 9 +++++---- net/ipv6/mcast.c | 19 ++++++++++++------- net/ipv6/ndisc.c | 4 ++-- net/ipv6/proc.c | 10 ++++++++-- net/ipv6/raw.c | 2 +- 13 files changed, 97 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index aee3f1e1d1c..0f953fe4041 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -18,7 +18,7 @@ enum { IPSTATS_MIB_NUM = 0, - IPSTATS_MIB_INRECEIVES, /* InReceives */ + IPSTATS_MIB_INPKTS, /* InReceives */ IPSTATS_MIB_INHDRERRORS, /* InHdrErrors */ IPSTATS_MIB_INTOOBIGERRORS, /* InTooBigErrors */ IPSTATS_MIB_INNOROUTES, /* InNoRoutes */ @@ -28,7 +28,7 @@ enum IPSTATS_MIB_INDISCARDS, /* InDiscards */ IPSTATS_MIB_INDELIVERS, /* InDelivers */ IPSTATS_MIB_OUTFORWDATAGRAMS, /* OutForwDatagrams */ - IPSTATS_MIB_OUTREQUESTS, /* OutRequests */ + IPSTATS_MIB_OUTPKTS, /* OutRequests */ IPSTATS_MIB_OUTDISCARDS, /* OutDiscards */ IPSTATS_MIB_OUTNOROUTES, /* OutNoRoutes */ IPSTATS_MIB_REASMTIMEOUT, /* ReasmTimeout */ @@ -42,6 +42,12 @@ enum IPSTATS_MIB_OUTMCASTPKTS, /* OutMcastPkts */ IPSTATS_MIB_INBCASTPKTS, /* InBcastPkts */ IPSTATS_MIB_OUTBCASTPKTS, /* OutBcastPkts */ + IPSTATS_MIB_INOCTETS, /* InOctets */ + IPSTATS_MIB_OUTOCTETS, /* OutOctets */ + IPSTATS_MIB_INMCASTOCTETS, /* InMcastOctets */ + IPSTATS_MIB_OUTMCASTOCTETS, /* OutMcastOctets */ + IPSTATS_MIB_INBCASTOCTETS, /* InBcastOctets */ + IPSTATS_MIB_OUTBCASTOCTETS, /* OutBcastOctets */ __IPSTATS_MIB_MAX }; diff --git a/include/net/ip.h b/include/net/ip.h index 4ac7577f98d..72c36926c26 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -168,7 +168,10 @@ struct ipv4_config extern struct ipv4_config ipv4_config; #define IP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.ip_statistics, field) #define IP_INC_STATS_BH(net, field) SNMP_INC_STATS_BH((net)->mib.ip_statistics, field) +#define IP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.ip_statistics, field, val) #define IP_ADD_STATS_BH(net, field, val) SNMP_ADD_STATS_BH((net)->mib.ip_statistics, field, val) +#define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS((net)->mib.ip_statistics, field, val) +#define IP_UPD_PO_STATS_BH(net, field, val) SNMP_UPD_PO_STATS_BH((net)->mib.ip_statistics, field, val) #define NET_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.net_statistics, field) #define NET_INC_STATS_BH(net, field) SNMP_INC_STATS_BH((net)->mib.net_statistics, field) #define NET_INC_STATS_USER(net, field) SNMP_INC_STATS_USER((net)->mib.net_statistics, field) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c1f16fc49ad..f27fd83d67d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -126,15 +126,28 @@ extern struct ctl_path net_ipv6_ctl_path[]; SNMP_ADD_STATS##modifier((net)->mib.statname##_statistics, (field), (val));\ }) +#define _DEVUPD(net, statname, modifier, idev, field, val) \ +({ \ + struct inet6_dev *_idev = (idev); \ + if (likely(_idev != NULL)) \ + SNMP_UPD_PO_STATS##modifier((_idev)->stats.statname, field, (val)); \ + SNMP_UPD_PO_STATS##modifier((net)->mib.statname##_statistics, field, (val));\ +}) + /* MIBs */ #define IP6_INC_STATS(net, 
idev,field) \ _DEVINC(net, ipv6, , idev, field) #define IP6_INC_STATS_BH(net, idev,field) \ _DEVINC(net, ipv6, _BH, idev, field) +#define IP6_ADD_STATS(net, idev,field,val) \ + _DEVADD(net, ipv6, , idev, field, val) #define IP6_ADD_STATS_BH(net, idev,field,val) \ _DEVADD(net, ipv6, _BH, idev, field, val) - +#define IP6_UPD_PO_STATS(net, idev,field,val) \ + _DEVUPD(net, ipv6, , idev, field, val) +#define IP6_UPD_PO_STATS_BH(net, idev,field,val) \ + _DEVUPD(net, ipv6, _BH, idev, field, val) #define ICMP6_INC_STATS(net, idev, field) \ _DEVINC(net, icmpv6, , idev, field) #define ICMP6_INC_STATS_BH(net, idev, field) \ diff --git a/include/net/snmp.h b/include/net/snmp.h index 57c93628695..8c842e06bec 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -153,6 +153,11 @@ struct linux_xfrm_mib { per_cpu_ptr(mib[!in_softirq()], get_cpu())->mibs[field]--; \ put_cpu(); \ } while (0) +#define SNMP_ADD_STATS(mib, field, addend) \ + do { \ + per_cpu_ptr(mib[!in_softirq()], get_cpu())->mibs[field] += addend; \ + put_cpu(); \ + } while (0) #define SNMP_ADD_STATS_BH(mib, field, addend) \ (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field] += addend) #define SNMP_ADD_STATS_USER(mib, field, addend) \ @@ -160,5 +165,17 @@ struct linux_xfrm_mib { per_cpu_ptr(mib[1], get_cpu())->mibs[field] += addend; \ put_cpu(); \ } while (0) - +#define SNMP_UPD_PO_STATS(mib, basefield, addend) \ + do { \ + __typeof__(mib[0]) ptr = per_cpu_ptr(mib[!in_softirq()], get_cpu());\ + ptr->mibs[basefield##PKTS]++; \ + ptr->mibs[basefield##OCTETS] += addend;\ + put_cpu(); \ + } while (0) +#define SNMP_UPD_PO_STATS_BH(mib, basefield, addend) \ + do { \ + __typeof__(mib[0]) ptr = per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id());\ + ptr->mibs[basefield##PKTS]++; \ + ptr->mibs[basefield##OCTETS] += addend;\ + } while (0) #endif diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1a58a6fa1dc..40f6206b2aa 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -358,10 +358,12 @@ static int ip_rcv_finish(struct sk_buff *skb) goto drop; rt = skb->rtable; - if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCASTPKTS); - else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCASTPKTS); + if (rt->rt_type == RTN_MULTICAST) { + IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, + skb->len); + } else if (rt->rt_type == RTN_BROADCAST) + IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, + skb->len); return dst_input(skb); @@ -384,7 +386,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (skb->pkt_type == PACKET_OTHERHOST) goto drop; - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INRECEIVES); + + IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3e7e910c7c0..ea19c37ccc0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -181,10 +181,10 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); - if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTMCASTPKTS); - else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTBCASTPKTS); + if (rt->rt_type == RTN_MULTICAST) { + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + } else if 
(rt->rt_type == RTN_BROADCAST) + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -244,7 +244,7 @@ int ip_mc_output(struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. */ - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -298,7 +298,7 @@ int ip_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index cf0cdeeb1db..f25542c48b7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -90,14 +90,14 @@ static const struct file_operations sockstat_seq_fops = { /* snmp items */ static const struct snmp_mib snmp4_ipstats_list[] = { - SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES), + SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS), - SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS), + SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS), SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), @@ -118,6 +118,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS), SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS), + SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS), + SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS), + SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), + SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), + SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), + SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 8f04bd9da27..bc1a920c34a 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -70,7 +70,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt idev = __in6_dev_get(skb->dev); - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INRECEIVES); + IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_IN, skb->len); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || !idev || unlikely(idev->cnf.disable_ipv6)) { @@ -242,8 +242,9 @@ int ip6_mc_input(struct sk_buff *skb) struct ipv6hdr *hdr; int deliver; - IP6_INC_STATS_BH(dev_net(skb->dst->dev), - ip6_dst_idev(skb->dst), IPSTATS_MIB_INMCASTPKTS); + IP6_UPD_PO_STATS_BH(dev_net(skb->dst->dev), + ip6_dst_idev(skb->dst), IPSTATS_MIB_INMCAST, + skb->len); hdr = ipv6_hdr(skb); deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9fb49c3b518..735a2bf4b5f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -159,7 +159,8 @@ static int ip6_output2(struct sk_buff *skb) } } - IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCASTPKTS); + 
IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, + skb->len); } return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev, @@ -275,8 +276,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), - IPSTATS_MIB_OUTREQUESTS); + IP6_UPD_PO_STATS(net, ip6_dst_idev(skb->dst), + IPSTATS_MIB_OUT, skb->len); return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, dst_output); } @@ -1516,7 +1517,7 @@ int ip6_push_pending_frames(struct sock *sk) skb->mark = sk->sk_mark; skb->dst = dst_clone(&rt->u.dst); - IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb->dst); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index a51fb33e686..4b48819a5b8 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1449,7 +1449,8 @@ static void mld_sendpack(struct sk_buff *skb) int err; struct flowi fl; - IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + payload_len = (skb->tail - skb->network_header) - sizeof(*pip6); mldlen = skb->tail - skb->transport_header; pip6->payload_len = htons(payload_len); @@ -1473,13 +1474,15 @@ static void mld_sendpack(struct sk_buff *skb) if (err) goto err_out; + payload_len = skb->len; + err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS_BH(net, idev, ICMPV6_MLD2_REPORT); ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTMCASTPKTS); + IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); } else IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTDISCARDS); @@ -1773,10 +1776,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPV6_TLV_PADN, 0 }; struct flowi fl; - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUTREQUESTS); - rcu_read_unlock(); if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; else @@ -1786,6 +1785,11 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; + rcu_read_lock(); + IP6_UPD_PO_STATS(net, __in6_dev_get(dev), + IPSTATS_MIB_OUT, full_len); + rcu_read_unlock(); + skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + full_len, 1, &err); if (skb == NULL) { @@ -1838,13 +1842,14 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) if (err) goto err_out; + err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); - IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTMCASTPKTS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len); } else IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 9f061d1adbc..ab65cc51b00 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -533,7 +533,7 @@ void ndisc_send_skb(struct sk_buff *skb, skb->dst = dst; idev = in6_dev_get(dst->dev); - IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, dst_output); @@ -1613,7 +1613,7 @@ void 
ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, buff->dst = dst; idev = in6_dev_get(dst->dev); - IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev, dst_output); if (!err) { diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 97c17fdd6f7..590ddefb7ff 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -61,7 +61,7 @@ static const struct file_operations sockstat6_seq_fops = { static struct snmp_mib snmp6_ipstats_list[] = { /* ipv6 mib according to RFC 2465 */ - SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INRECEIVES), + SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS), SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES), @@ -71,7 +71,7 @@ static struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), - SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS), + SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS), SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), @@ -83,6 +83,12 @@ static struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS), SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), + SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS), + SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS), + SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), + SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), + SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), + SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 61f6827e590..e99307fba0b 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -638,7 +638,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, if (err) goto error_fault; - IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); + IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); if (err > 0) -- cgit v1.2.3-70-g09d2 From 739649c53d7f78f5bf41bdfd1a858ee90c7a687a Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 25 Apr 2009 12:52:40 +0000 Subject: of: add of_parse_phandle() helper for parsing phandle properties of_parse_phandle() is a helper function to read and parse a phandle property and return a pointer to the resulting device_node. Signed-off-by: Grant Likely Acked-by: Andy Fleming Signed-off-by: David S. 
Miller --- drivers/of/base.c | 24 ++++++++++++++++++++++++ include/linux/of.h | 3 +++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index 41c5dfd8535..ddf224d456b 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -494,6 +494,30 @@ int of_modalias_node(struct device_node *node, char *modalias, int len) } EXPORT_SYMBOL_GPL(of_modalias_node); +/** + * of_parse_phandle - Resolve a phandle property to a device_node pointer + * @np: Pointer to device node holding phandle property + * @phandle_name: Name of property holding a phandle value + * @index: For properties holding a table of phandles, this is the index into + * the table + * + * Returns the device_node pointer with refcount incremented. Use + * of_node_put() on it when done. + */ +struct device_node * +of_parse_phandle(struct device_node *np, const char *phandle_name, int index) +{ + const phandle *phandle; + int size; + + phandle = of_get_property(np, phandle_name, &size); + if ((!phandle) || (size < sizeof(*phandle) * (index + 1))) + return NULL; + + return of_find_node_by_phandle(phandle[index]); +} +EXPORT_SYMBOL(of_parse_phandle); + /** * of_parse_phandles_with_args - Find a node pointed by phandle in a list * @np: pointer to a device tree node containing a list diff --git a/include/linux/of.h b/include/linux/of.h index 6a7efa242f5..7be2d1043c1 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -77,6 +77,9 @@ extern int of_n_size_cells(struct device_node *np); extern const struct of_device_id *of_match_node( const struct of_device_id *matches, const struct device_node *node); extern int of_modalias_node(struct device_node *node, char *modalias, int len); +extern struct device_node *of_parse_phandle(struct device_node *np, + const char *phandle_name, + int index); extern int of_parse_phandles_with_args(struct device_node *np, const char *list_name, const char *cells_name, int index, struct device_node **out_node, const void **out_args); -- cgit v1.2.3-70-g09d2 From 4dea547fef1ba23f9d23f5e7f5218187a7dcf1b3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 25 Apr 2009 12:52:46 +0000 Subject: phylib: rework to prepare for OF registration of PHYs This patch makes changes in preparation for supporting open firmware device tree descriptions of MDIO busses. Changes include: - Cleanup handling of phy_map[] entries; they are already NULLed when registering and so don't need to be re-cleared, and it is good practice to clear them out when unregistering. - Split phy_device registration out into a new function so that the OF helpers can do two stage registration (separate allocation and registration steps). Signed-off-by: Grant Likely Acked-by: Andy Fleming Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio_bus.c | 29 +++------------------------- drivers/net/phy/phy_device.c | 45 ++++++++++++++++++++++++++++++++++++++++---- include/linux/phy.h | 1 + 3 files changed, 45 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index b754020cbe7..bd4e8d72dc0 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -113,7 +113,6 @@ int mdiobus_register(struct mii_bus *bus) bus->reset(bus); for (i = 0; i < PHY_MAX_ADDR; i++) { - bus->phy_map[i] = NULL; if ((bus->phy_mask & (1 << i)) == 0) { struct phy_device *phydev; @@ -150,6 +149,7 @@ void mdiobus_unregister(struct mii_bus *bus) for (i = 0; i < PHY_MAX_ADDR; i++) { if (bus->phy_map[i]) device_unregister(&bus->phy_map[i]->dev); + bus->phy_map[i] = NULL; } } EXPORT_SYMBOL(mdiobus_unregister); @@ -188,35 +188,12 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr) if (IS_ERR(phydev) || phydev == NULL) return phydev; - /* There's a PHY at this address - * We need to set: - * 1) IRQ - * 2) bus_id - * 3) parent - * 4) bus - * 5) mii_bus - * And, we need to register it */ - - phydev->irq = bus->irq != NULL ? bus->irq[addr] : PHY_POLL; - - phydev->dev.parent = bus->parent; - phydev->dev.bus = &mdio_bus_type; - dev_set_name(&phydev->dev, PHY_ID_FMT, bus->id, addr); - - phydev->bus = bus; - - /* Run all of the fixups for this PHY */ - phy_scan_fixups(phydev); - - err = device_register(&phydev->dev); + err = phy_device_register(phydev); if (err) { - printk(KERN_ERR "phy %d failed to register\n", addr); phy_device_free(phydev); - phydev = NULL; + return NULL; } - bus->phy_map[addr] = phydev; - return phydev; } EXPORT_SYMBOL(mdiobus_scan); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 0a06e4fd37d..9352ca8fa2c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -39,20 +39,21 @@ MODULE_DESCRIPTION("PHY library"); MODULE_AUTHOR("Andy Fleming"); MODULE_LICENSE("GPL"); -static struct phy_driver genphy_driver; -extern int mdio_bus_init(void); -extern void mdio_bus_exit(void); - void phy_device_free(struct phy_device *phydev) { kfree(phydev); } +EXPORT_SYMBOL(phy_device_free); static void phy_device_release(struct device *dev) { phy_device_free(to_phy_device(dev)); } +static struct phy_driver genphy_driver; +extern int mdio_bus_init(void); +extern void mdio_bus_exit(void); + static LIST_HEAD(phy_fixup_list); static DEFINE_MUTEX(phy_fixup_lock); @@ -166,6 +167,10 @@ struct phy_device* phy_device_create(struct mii_bus *bus, int addr, int phy_id) dev->addr = addr; dev->phy_id = phy_id; dev->bus = bus; + dev->dev.parent = bus->parent; + dev->dev.bus = &mdio_bus_type; + dev->irq = bus->irq != NULL ? 
bus->irq[addr] : PHY_POLL; + dev_set_name(&dev->dev, PHY_ID_FMT, bus->id, addr); dev->state = PHY_DOWN; @@ -235,6 +240,38 @@ struct phy_device * get_phy_device(struct mii_bus *bus, int addr) return dev; } +EXPORT_SYMBOL(get_phy_device); + +/** + * phy_device_register - Register the phy device on the MDIO bus + * @phydev: phy_device structure to be added to the MDIO bus + */ +int phy_device_register(struct phy_device *phydev) +{ + int err; + + /* Don't register a phy if one is already registered at this + * address */ + if (phydev->bus->phy_map[phydev->addr]) + return -EINVAL; + phydev->bus->phy_map[phydev->addr] = phydev; + + /* Run all of the fixups for this PHY */ + phy_scan_fixups(phydev); + + err = device_register(&phydev->dev); + if (err) { + pr_err("phy %d failed to register\n", phydev->addr); + goto out; + } + + return 0; + + out: + phydev->bus->phy_map[phydev->addr] = NULL; + return err; +} +EXPORT_SYMBOL(phy_device_register); /** * phy_prepare_link - prepares the PHY layer to monitor link status diff --git a/include/linux/phy.h b/include/linux/phy.h index 97e40cb6b58..bf0b5f112dc 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -444,6 +444,7 @@ static inline int phy_write(struct phy_device *phydev, u16 regnum, u16 val) int get_phy_id(struct mii_bus *bus, int addr, u32 *phy_id); struct phy_device* get_phy_device(struct mii_bus *bus, int addr); +int phy_device_register(struct phy_device *phy); int phy_clear_interrupt(struct phy_device *phydev); int phy_config_interrupt(struct phy_device *phydev, u32 interrupts); struct phy_device * phy_attach(struct net_device *dev, const char *bus_id, u32 flags, phy_interface_t interface); -- cgit v1.2.3-70-g09d2 From fa94f6d93c5382810ff41f010f12ca8698fc775e Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 25 Apr 2009 12:52:51 +0000 Subject: phylib: add *_direct() variants of phy_connect and phy_attach functions Add phy_connect_direct() and phy_attach_direct() functions so that drivers can use a pointer to the phy_device instead of trying to determine the phy's bus_id string. This patch is useful for OF device tree descriptions of phy devices, where the driver doesn't need to know the bus_id value in order to get a phy_device pointer. Signed-off-by: Grant Likely Acked-by: Andy Fleming Signed-off-by: David S.
Miller --- drivers/net/phy/phy_device.c | 118 +++++++++++++++++++++++++++++++------------ include/linux/phy.h | 5 ++ 2 files changed, 90 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 9352ca8fa2c..a2ece89622d 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -291,6 +291,33 @@ void phy_prepare_link(struct phy_device *phydev, phydev->adjust_link = handler; } +/** + * phy_connect_direct - connect an ethernet device to a specific phy_device + * @dev: the network device to connect + * @phydev: the pointer to the phy device + * @handler: callback function for state change notifications + * @flags: PHY device's dev_flags + * @interface: PHY device's interface + */ +int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, + void (*handler)(struct net_device *), u32 flags, + phy_interface_t interface) +{ + int rc; + + rc = phy_attach_direct(dev, phydev, flags, interface); + if (rc) + return rc; + + phy_prepare_link(phydev, handler); + phy_start_machine(phydev, NULL); + if (phydev->irq > 0) + phy_start_interrupts(phydev); + + return 0; +} +EXPORT_SYMBOL(phy_connect_direct); + /** * phy_connect - connect an ethernet device to a PHY device * @dev: the network device to connect @@ -312,18 +339,21 @@ struct phy_device * phy_connect(struct net_device *dev, const char *bus_id, phy_interface_t interface) { struct phy_device *phydev; + struct device *d; + int rc; - phydev = phy_attach(dev, bus_id, flags, interface); - - if (IS_ERR(phydev)) - return phydev; - - phy_prepare_link(phydev, handler); - - phy_start_machine(phydev, NULL); + /* Search the list of PHY devices on the mdio bus for the + * PHY with the requested name */ + d = bus_find_device_by_name(&mdio_bus_type, NULL, bus_id); + if (!d) { + pr_err("PHY %s not found\n", bus_id); + return ERR_PTR(-ENODEV); + } + phydev = to_phy_device(d); - if (phydev->irq > 0) - phy_start_interrupts(phydev); + rc = phy_connect_direct(dev, phydev, handler, flags, interface); + if (rc) + return ERR_PTR(rc); return phydev; } @@ -347,9 +377,9 @@ void phy_disconnect(struct phy_device *phydev) EXPORT_SYMBOL(phy_disconnect); /** - * phy_attach - attach a network device to a particular PHY device + * phy_attach_direct - attach a network device to a given PHY device pointer * @dev: network device to attach - * @bus_id: PHY device to attach + * @phydev: Pointer to phy_device to attach * @flags: PHY device's dev_flags * @interface: PHY device's interface * @@ -360,22 +390,10 @@ EXPORT_SYMBOL(phy_disconnect); * the attaching device, and given a callback for link status * change. The phy_device is returned to the attaching driver. */ -struct phy_device *phy_attach(struct net_device *dev, - const char *bus_id, u32 flags, phy_interface_t interface) +int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, + u32 flags, phy_interface_t interface) { - struct bus_type *bus = &mdio_bus_type; - struct phy_device *phydev; - struct device *d; - - /* Search the list of PHY devices on the mdio bus for the - * PHY with the requested name */ - d = bus_find_device_by_name(bus, NULL, bus_id); - if (d) { - phydev = to_phy_device(d); - } else { - printk(KERN_ERR "%s not found\n", bus_id); - return ERR_PTR(-ENODEV); - } + struct device *d = &phydev->dev; /* Assume that if there is no driver, that it doesn't * exist, and we should use the genphy driver. 
*/ @@ -388,13 +406,12 @@ struct phy_device *phy_attach(struct net_device *dev, err = device_bind_driver(d); if (err) - return ERR_PTR(err); + return err; } if (phydev->attached_dev) { - printk(KERN_ERR "%s: %s already attached\n", - dev->name, bus_id); - return ERR_PTR(-EBUSY); + dev_err(&dev->dev, "PHY already attached\n"); + return -EBUSY; } phydev->attached_dev = dev; @@ -412,13 +429,48 @@ struct phy_device *phy_attach(struct net_device *dev, err = phy_scan_fixups(phydev); if (err < 0) - return ERR_PTR(err); + return err; err = phydev->drv->config_init(phydev); if (err < 0) - return ERR_PTR(err); + return err; + } + + return 0; +} +EXPORT_SYMBOL(phy_attach_direct); + +/** + * phy_attach - attach a network device to a particular PHY device + * @dev: network device to attach + * @bus_id: Bus ID of PHY device to attach + * @flags: PHY device's dev_flags + * @interface: PHY device's interface + * + * Description: Same as phy_attach_direct() except that a PHY bus_id + * string is passed instead of a pointer to a struct phy_device. + */ +struct phy_device *phy_attach(struct net_device *dev, + const char *bus_id, u32 flags, phy_interface_t interface) +{ + struct bus_type *bus = &mdio_bus_type; + struct phy_device *phydev; + struct device *d; + int rc; + + /* Search the list of PHY devices on the mdio bus for the + * PHY with the requested name */ + d = bus_find_device_by_name(bus, NULL, bus_id); + if (!d) { + pr_err("PHY %s not found\n", bus_id); + return ERR_PTR(-ENODEV); } + phydev = to_phy_device(d); + + rc = phy_attach_direct(dev, phydev, flags, interface); + if (rc) + return ERR_PTR(rc); return phydev; } diff --git a/include/linux/phy.h b/include/linux/phy.h index bf0b5f112dc..c216e4e503b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -447,8 +447,13 @@ struct phy_device* get_phy_device(struct mii_bus *bus, int addr); int phy_device_register(struct phy_device *phy); int phy_clear_interrupt(struct phy_device *phydev); int phy_config_interrupt(struct phy_device *phydev, u32 interrupts); +int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, + u32 flags, phy_interface_t interface); struct phy_device * phy_attach(struct net_device *dev, const char *bus_id, u32 flags, phy_interface_t interface); +int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, + void (*handler)(struct net_device *), u32 flags, + phy_interface_t interface); struct phy_device * phy_connect(struct net_device *dev, const char *bus_id, void (*handler)(struct net_device *), u32 flags, phy_interface_t interface); -- cgit v1.2.3-70-g09d2 From 8bc487d150b939e69830c39322df4ee486efe381 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 25 Apr 2009 12:52:56 +0000 Subject: openfirmware: Add OF phylib support code Add support for parsing the device tree for PHY devices on an MDIO bus. Currently many of the PowerPC ethernet drivers are open coding a solution for reading data out of the device tree to find the correct PHY device. This patch implements a set of common routines to: a) let MDIO bus drivers register phy_devices described in the tree, and b) let MAC drivers find the correct phy_device via the tree. Signed-off-by: Grant Likely Acked-by: Andy Fleming Signed-off-by: David S. 
Miller --- drivers/of/Kconfig | 6 +++ drivers/of/Makefile | 1 + drivers/of/of_mdio.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/of_mdio.h | 22 ++++++++ 4 files changed, 168 insertions(+) create mode 100644 drivers/of/of_mdio.c create mode 100644 include/linux/of_mdio.h (limited to 'include/linux') diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index f821dbc952a..6fe043bd377 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -19,3 +19,9 @@ config OF_SPI depends on OF && PPC_OF && SPI help OpenFirmware SPI accessors + +config OF_MDIO + def_tristate PHYLIB + depends on OF && PHYLIB + help + OpenFirmware MDIO bus (Ethernet PHY) accessors diff --git a/drivers/of/Makefile b/drivers/of/Makefile index 4c3c6f8e36f..bdfb5f5d4b0 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_OF_DEVICE) += device.o platform.o obj-$(CONFIG_OF_GPIO) += gpio.o obj-$(CONFIG_OF_I2C) += of_i2c.o obj-$(CONFIG_OF_SPI) += of_spi.o +obj-$(CONFIG_OF_MDIO) += of_mdio.o diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c new file mode 100644 index 00000000000..aee967d7f76 --- /dev/null +++ b/drivers/of/of_mdio.c @@ -0,0 +1,139 @@ +/* + * OF helpers for the MDIO (Ethernet PHY) API + * + * Copyright (c) 2009 Secret Lab Technologies, Ltd. + * + * This file is released under the GPLv2 + * + * This file provides helper functions for extracting PHY device information + * out of the OpenFirmware device tree and using it to populate an mii_bus. + */ + +#include +#include +#include +#include + +MODULE_AUTHOR("Grant Likely "); +MODULE_LICENSE("GPL"); + +/** + * of_mdiobus_register - Register mii_bus and create PHYs from the device tree + * @mdio: pointer to mii_bus structure + * @np: pointer to device_node of MDIO bus. + * + * This function registers the mii_bus structure and registers a phy_device + * for each child node of @np. + */ +int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) +{ + struct phy_device *phy; + struct device_node *child; + int rc, i; + + /* Mask out all PHYs from auto probing. 
Instead the PHYs listed in + * the device tree are populated after the bus has been registered */ + mdio->phy_mask = ~0; + + /* Clear all the IRQ properties */ + if (mdio->irq) + for (i = 0; i < PHY_MAX_ADDR; i++) + mdio->irq[i] = PHY_POLL; + + /* Register the MDIO bus */ + rc = mdiobus_register(mdio); + if (rc) + return rc; + + /* Loop over the child nodes and register a phy_device for each one */ + for_each_child_of_node(np, child) { + const u32 *addr; + int len; + + /* A PHY must have a reg property in the range [0-31] */ + addr = of_get_property(child, "reg", &len); + if (!addr || len < sizeof(*addr) || *addr >= 32 || *addr < 0) { + dev_err(&mdio->dev, "%s has invalid PHY address\n", + child->full_name); + continue; + } + + if (mdio->irq) { + mdio->irq[*addr] = irq_of_parse_and_map(child, 0); + if (!mdio->irq[*addr]) + mdio->irq[*addr] = PHY_POLL; + } + + phy = get_phy_device(mdio, *addr); + if (!phy) { + dev_err(&mdio->dev, "error probing PHY at address %i\n", + *addr); + continue; + } + phy_scan_fixups(phy); + + /* Associate the OF node with the device structure so it + * can be looked up later */ + of_node_get(child); + dev_archdata_set_node(&phy->dev.archdata, child); + + /* All data is now stored in the phy struct; register it */ + rc = phy_device_register(phy); + if (rc) { + phy_device_free(phy); + of_node_put(child); + continue; + } + + dev_dbg(&mdio->dev, "registered phy %s at address %i\n", + child->name, *addr); + } + + return 0; +} +EXPORT_SYMBOL(of_mdiobus_register); + +/** + * of_phy_find_device - Given a PHY node, find the phy_device + * @phy_np: Pointer to the phy's device tree node + * + * Returns a pointer to the phy_device. + */ +struct phy_device *of_phy_find_device(struct device_node *phy_np) +{ + struct device *d; + int match(struct device *dev, void *phy_np) + { + return dev_archdata_get_node(&dev->archdata) == phy_np; + } + + if (!phy_np) + return NULL; + + d = bus_find_device(&mdio_bus_type, NULL, phy_np, match); + return d ? to_phy_device(d) : NULL; +} +EXPORT_SYMBOL(of_phy_find_device); + +/** + * of_phy_connect - Connect to the phy described in the device tree + * @dev: pointer to net_device claiming the phy + * @phy_np: Pointer to device tree node for the PHY + * @hndlr: Link state callback for the network device + * @iface: PHY data interface type + * + * Returns a pointer to the phy_device if successful, NULL otherwise + */ +struct phy_device *of_phy_connect(struct net_device *dev, + struct device_node *phy_np, + void (*hndlr)(struct net_device *), u32 flags, + phy_interface_t iface) +{ + struct phy_device *phy = of_phy_find_device(phy_np); + + if (!phy) + return NULL; + + return phy_connect_direct(dev, phy, hndlr, flags, iface) ? NULL : phy; +} +EXPORT_SYMBOL(of_phy_connect); diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h new file mode 100644 index 00000000000..c9663c69030 --- /dev/null +++ b/include/linux/of_mdio.h @@ -0,0 +1,22 @@ +/* + * OF helpers for the MDIO (Ethernet PHY) API + * + * Copyright (c) 2009 Secret Lab Technologies, Ltd.
+ * + * This file is released under the GPLv2 + */ + +#ifndef __LINUX_OF_MDIO_H +#define __LINUX_OF_MDIO_H + +#include +#include + +extern int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np); +extern struct phy_device *of_phy_find_device(struct device_node *phy_np); +extern struct phy_device *of_phy_connect(struct net_device *dev, + struct device_node *phy_np, + void (*hndlr)(struct net_device *), + u32 flags, phy_interface_t iface); + +#endif /* __LINUX_OF_MDIO_H */ -- cgit v1.2.3-70-g09d2 From aa73832c5a80d6c52c69b18af858d88fa595dd3c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 25 Apr 2009 12:53:33 +0000 Subject: net: Rework fs_enet driver to use of_mdio infrastructure This patch simplifies the driver by making use of more common code. Signed-off-by: Grant Likely Acked-by: Andy Fleming Signed-off-by: David S. Miller --- drivers/net/fs_enet/fs_enet-main.c | 69 ++++++-------------------------------- drivers/net/fs_enet/mii-bitbang.c | 29 ++-------------- drivers/net/fs_enet/mii-fec.c | 26 +------------- include/linux/fs_enet_pd.h | 6 ++-- 4 files changed, 16 insertions(+), 114 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c index a9cbc3191a2..9604aaed61d 100644 --- a/drivers/net/fs_enet/fs_enet-main.c +++ b/drivers/net/fs_enet/fs_enet-main.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include @@ -752,9 +754,10 @@ static int fs_init_phy(struct net_device *dev) fep->oldlink = 0; fep->oldspeed = 0; fep->oldduplex = -1; - if(fep->fpi->bus_id) - phydev = phy_connect(dev, fep->fpi->bus_id, &fs_adjust_link, 0, - PHY_INTERFACE_MODE_MII); + if(fep->fpi->phy_node) + phydev = of_phy_connect(dev, fep->fpi->phy_node, + &fs_adjust_link, 0, + PHY_INTERFACE_MODE_MII); else { printk("No phy bus ID specified in BSP code\n"); return -EINVAL; @@ -962,57 +965,6 @@ static void cleanup_immap(void) /**************************************************************************************/ -static int __devinit find_phy(struct device_node *np, - struct fs_platform_info *fpi) -{ - struct device_node *phynode, *mdionode; - int ret = 0, len, bus_id; - const u32 *data; - - data = of_get_property(np, "fixed-link", NULL); - if (data) { - snprintf(fpi->bus_id, 16, "%x:%02x", 0, *data); - return 0; - } - - data = of_get_property(np, "phy-handle", &len); - if (!data || len != 4) - return -EINVAL; - - phynode = of_find_node_by_phandle(*data); - if (!phynode) - return -EINVAL; - - data = of_get_property(phynode, "reg", &len); - if (!data || len != 4) { - ret = -EINVAL; - goto out_put_phy; - } - - mdionode = of_get_parent(phynode); - if (!mdionode) { - ret = -EINVAL; - goto out_put_phy; - } - - bus_id = of_get_gpio(mdionode, 0); - if (bus_id < 0) { - struct resource res; - ret = of_address_to_resource(mdionode, 0, &res); - if (ret) - goto out_put_mdio; - bus_id = res.start; - } - - snprintf(fpi->bus_id, 16, "%x:%02x", bus_id, *data); - -out_put_mdio: - of_node_put(mdionode); -out_put_phy: - of_node_put(phynode); - return ret; -} - #ifdef CONFIG_FS_ENET_HAS_FEC #define IS_FEC(match) ((match)->data == &fs_fec_ops) #else @@ -1062,9 +1014,9 @@ static int __devinit fs_enet_probe(struct of_device *ofdev, fpi->rx_copybreak = 240; fpi->use_napi = 1; fpi->napi_weight = 17; - - ret = find_phy(ofdev->node, fpi); - if (ret) + fpi->phy_node = of_parse_phandle(ofdev->node, "phy-handle", 0); + if ((!fpi->phy_node) && (!of_get_property(ofdev->node, "fixed-link", + NULL))) goto out_free_fpi; privsize = 
sizeof(*fep) + @@ -1136,6 +1088,7 @@ out_cleanup_data: out_free_dev: free_netdev(ndev); dev_set_drvdata(&ofdev->dev, NULL); + of_node_put(fpi->phy_node); out_free_fpi: kfree(fpi); return ret; @@ -1151,7 +1104,7 @@ static int fs_enet_remove(struct of_device *ofdev) fep->ops->free_bd(ndev); fep->ops->cleanup_data(ndev); dev_set_drvdata(fep->dev, NULL); - + of_node_put(fep->fpi->phy_node); free_netdev(ndev); return 0; } diff --git a/drivers/net/fs_enet/mii-bitbang.c b/drivers/net/fs_enet/mii-bitbang.c index 49b6645d7e0..93b481b0e3c 100644 --- a/drivers/net/fs_enet/mii-bitbang.c +++ b/drivers/net/fs_enet/mii-bitbang.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "fs_enet.h" @@ -149,31 +150,12 @@ static int __devinit fs_mii_bitbang_init(struct mii_bus *bus, return 0; } -static void __devinit add_phy(struct mii_bus *bus, struct device_node *np) -{ - const u32 *data; - int len, id, irq; - - data = of_get_property(np, "reg", &len); - if (!data || len != 4) - return; - - id = *data; - bus->phy_mask &= ~(1 << id); - - irq = of_irq_to_resource(np, 0, NULL); - if (irq != NO_IRQ) - bus->irq[id] = irq; -} - static int __devinit fs_enet_mdio_probe(struct of_device *ofdev, const struct of_device_id *match) { - struct device_node *np = NULL; struct mii_bus *new_bus; struct bb_info *bitbang; int ret = -ENOMEM; - int i; bitbang = kzalloc(sizeof(struct bb_info), GFP_KERNEL); if (!bitbang) @@ -196,17 +178,10 @@ static int __devinit fs_enet_mdio_probe(struct of_device *ofdev, if (!new_bus->irq) goto out_unmap_regs; - for (i = 0; i < PHY_MAX_ADDR; i++) - new_bus->irq[i] = -1; - - while ((np = of_get_next_child(ofdev->node, np))) - if (!strcmp(np->type, "ethernet-phy")) - add_phy(new_bus, np); - new_bus->parent = &ofdev->dev; dev_set_drvdata(&ofdev->dev, new_bus); - ret = mdiobus_register(new_bus); + ret = of_mdiobus_register(new_bus, ofdev->node); if (ret) goto out_free_irqs; diff --git a/drivers/net/fs_enet/mii-fec.c b/drivers/net/fs_enet/mii-fec.c index 61aaae444b4..75a09994d66 100644 --- a/drivers/net/fs_enet/mii-fec.c +++ b/drivers/net/fs_enet/mii-fec.c @@ -100,23 +100,6 @@ static int fs_enet_fec_mii_reset(struct mii_bus *bus) return 0; } -static void __devinit add_phy(struct mii_bus *bus, struct device_node *np) -{ - const u32 *data; - int len, id, irq; - - data = of_get_property(np, "reg", &len); - if (!data || len != 4) - return; - - id = *data; - bus->phy_mask &= ~(1 << id); - - irq = of_irq_to_resource(np, 0, NULL); - if (irq != NO_IRQ) - bus->irq[id] = irq; -} - static int __devinit fs_enet_mdio_probe(struct of_device *ofdev, const struct of_device_id *match) { @@ -163,17 +146,10 @@ static int __devinit fs_enet_mdio_probe(struct of_device *ofdev, if (!new_bus->irq) goto out_unmap_regs; - for (i = 0; i < PHY_MAX_ADDR; i++) - new_bus->irq[i] = -1; - - while ((np = of_get_next_child(ofdev->node, np))) - if (!strcmp(np->type, "ethernet-phy")) - add_phy(new_bus, np); - new_bus->parent = &ofdev->dev; dev_set_drvdata(&ofdev->dev, new_bus); - ret = mdiobus_register(new_bus); + ret = of_mdiobus_register(new_bus, ofdev->node); if (ret) goto out_free_irqs; diff --git a/include/linux/fs_enet_pd.h b/include/linux/fs_enet_pd.h index 8300cab30f9..51b793466ff 100644 --- a/include/linux/fs_enet_pd.h +++ b/include/linux/fs_enet_pd.h @@ -17,6 +17,7 @@ #define FS_ENET_PD_H #include +#include #include #define FS_ENET_NAME "fs_enet" @@ -130,10 +131,7 @@ struct fs_platform_info { u32 device_flags; - int phy_addr; /* the phy address (-1 no phy) */ - char bus_id[16]; - int phy_irq; /* the phy irq 
(if it exists) */ - + struct device_node *phy_node; const struct fs_mii_bus_info *bus_info; int rx_ring, tx_ring; /* number of buffers on rx */ -- cgit v1.2.3-70-g09d2 From f85ba78068ac137fe9c1f50d25405d2783d75c77 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 27 Apr 2009 03:23:54 -0700 Subject: tun: add IFF_TUN_EXCL flag to avoid opening a persistent device. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When creating certain types of VPN, NetworkManager will first attempt to find an available tun device by iterating through 'vpn%d' until it finds one that isn't already busy. Then it'll set that to be persistent and owned by the otherwise unprivileged user that the VPN dæmon itself runs as. There's a race condition here -- during the period where the vpn%d device is created and we're waiting for the VPN dæmon to actually connect and use it, if we try to create _another_ device we could end up re-using the same one -- because trying to open it again doesn't get -EBUSY as it would while it's _actually_ busy. To solve this, we add an IFF_TUN_EXCL flag which causes tun_set_iff() to fail if it would be opening an existing persistent tun device -- so that we can make sure we're getting an entirely _new_ device. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 ++ include/linux/if_tun.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 589f0ca668d..94622e5fb93 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -874,6 +874,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { + if (ifr->ifr_flags & IFF_TUN_EXCL) + return -EBUSY; if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) tun = netdev_priv(dev); else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 049d6c9428d..915ba5789f0 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -55,6 +55,7 @@ #define IFF_NO_PI 0x1000 #define IFF_ONE_QUEUE 0x2000 #define IFF_VNET_HDR 0x4000 +#define IFF_TUN_EXCL 0x8000 /* Features for GSO (TUNSETOFFLOAD). */ #define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ -- cgit v1.2.3-70-g09d2 From edbd9e30306067c3a45c035eb95a6f49daaa2337 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 27 Apr 2009 05:44:29 -0700 Subject: gro: Fix handling of headers that extend over the tail The skb_gro_* code fails to handle the case where a header starts in the linear area but ends in the frags area. Since the goal of skb_gro_* is to optimise the case of completely non-linear packets, we can simply bail out if we have anything in the linear area. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/dev.c | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 31167451d08..c9ef4191607 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1137,7 +1137,7 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb) static inline void *skb_gro_mac_header(struct sk_buff *skb) { - return skb_mac_header(skb) < skb->data ? skb_mac_header(skb) : + return skb_headlen(skb) ?
skb_mac_header(skb) : page_address(skb_shinfo(skb)->frags[0].page) + skb_shinfo(skb)->frags[0].page_offset; } diff --git a/net/core/dev.c b/net/core/dev.c index e48c08af76a..6785b067ad5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2378,18 +2378,13 @@ void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) unsigned int offset = skb_gro_offset(skb); hlen += offset; - if (hlen <= skb_headlen(skb)) - return skb->data + offset; - - if (unlikely(!skb_shinfo(skb)->nr_frags || - skb_shinfo(skb)->frags[0].size <= - hlen - skb_headlen(skb) || + if (unlikely(skb_headlen(skb) || + skb_shinfo(skb)->frags[0].size < hlen || PageHighMem(skb_shinfo(skb)->frags[0].page))) return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; return page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + - offset - skb_headlen(skb); + skb_shinfo(skb)->frags[0].page_offset + offset; } EXPORT_SYMBOL(skb_gro_header); -- cgit v1.2.3-70-g09d2 From 36e7b1b8dac1a785abca3a121b6b0b79f1a8d7df Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 27 Apr 2009 05:44:45 -0700 Subject: gro: Fix COMPLETE checksum handling On a brand new GRO skb, we cannot call ip_hdr since the header may lie in the non-linear area. This patch adds the helper skb_gro_network_header to handle this. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c9ef4191607..fe20d171acf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1142,6 +1142,13 @@ static inline void *skb_gro_mac_header(struct sk_buff *skb) skb_shinfo(skb)->frags[0].page_offset; } +static inline void *skb_gro_network_header(struct sk_buff *skb) +{ + return skb_headlen(skb) ? 
skb_network_header(skb) : + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset + skb_network_offset(skb); +} + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5d427f86b41..bda74e8aed7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2343,7 +2343,7 @@ void tcp4_proc_exit(void) struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); + struct iphdr *iph = skb_gro_network_header(skb); switch (skb->ip_summed) { case CHECKSUM_COMPLETE: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4b5aa185426..d9dd94b6bf6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -943,7 +943,7 @@ static int tcp_v6_gso_send_check(struct sk_buff *skb) struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr *iph = skb_gro_network_header(skb); switch (skb->ip_summed) { case CHECKSUM_COMPLETE: -- cgit v1.2.3-70-g09d2 From 879c5e6b7cb4c689d08ca9b2e353d8ab3dc425d5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 17 Jun 2009 11:47:48 -0400 Subject: jbd2: convert instrumentation from markers to tracepoints Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 5 +- fs/jbd2/commit.c | 13 ++-- fs/jbd2/journal.c | 69 ++++++++++++++++++ include/linux/jbd2.h | 6 ++ include/trace/events/jbd2.h | 168 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 252 insertions(+), 9 deletions(-) create mode 100644 include/trace/events/jbd2.h (limited to 'include/linux') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 17159cacbd9..5d70b3e6d49 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -20,9 +20,9 @@ #include #include #include -#include #include #include +#include /* * Unlink a buffer from a transaction checkpoint list. @@ -358,8 +358,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) * journal straight away. */ result = jbd2_cleanup_journal_tail(journal); - trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d", - journal->j_devname, result); + trace_jbd2_checkpoint(journal, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0b7d3b8226f..7b4088b2364 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -26,6 +25,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -253,6 +253,7 @@ static int journal_submit_data_buffers(journal_t *journal, * block allocation with delalloc. We need to write * only allocated blocks here. 
*/ + trace_jbd2_submit_inode_data(jinode->i_vfs_inode); err = journal_submit_inode_data_buffers(mapping); if (!ret) ret = err; @@ -394,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); - trace_mark(jbd2_start_commit, "dev %s transaction %d", - journal->j_devname, commit_transaction->t_tid); + trace_jbd2_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); @@ -409,6 +409,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ if (commit_transaction->t_synchronous_commit) write_op = WRITE_SYNC_PLUG; + trace_jbd2_commit_locking(journal, commit_transaction); stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, @@ -484,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_switch_revoke_table(journal); + trace_jbd2_commit_flushing(journal, commit_transaction); stats.u.run.rs_flushing = jiffies; stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, stats.u.run.rs_flushing); @@ -520,6 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); + trace_jbd2_commit_logging(journal, commit_transaction); stats.u.run.rs_logging = jiffies; stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, stats.u.run.rs_logging); @@ -1054,9 +1057,7 @@ restart_loop: if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); - trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", - journal->j_devname, commit_transaction->t_tid, - journal->j_tail_sequence); + trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 62be7d294ec..18bfd5dab64 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -38,6 +38,10 @@ #include #include #include +#include + +#define CREATE_TRACE_POINTS +#include #include #include @@ -2377,6 +2381,71 @@ static void __exit journal_exit(void) jbd2_journal_destroy_caches(); } +/* + * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 + * tracing infrastructure to map a dev_t to a device name. + * + * The caller should use rcu_read_lock() in order to make sure the + * device name stays valid until its done with it. We use + * rcu_read_lock() as well to make sure we're safe in case the caller + * gets sloppy, and because rcu_read_lock() is cheap and can be safely + * nested. 
+ */ +struct devname_cache { + struct rcu_head rcu; + dev_t device; + char devname[BDEVNAME_SIZE]; +}; +#define CACHE_SIZE_BITS 6 +static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; +static DEFINE_SPINLOCK(devname_cache_lock); + +static void free_devcache(struct rcu_head *rcu) +{ + kfree(rcu); +} + +const char *jbd2_dev_to_name(dev_t device) +{ + int i = hash_32(device, CACHE_SIZE_BITS); + char *ret; + struct block_device *bd; + + rcu_read_lock(); + if (devcache[i] && devcache[i]->device == device) { + ret = devcache[i]->devname; + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); + + spin_lock(&devname_cache_lock); + if (devcache[i]) { + if (devcache[i]->device == device) { + ret = devcache[i]->devname; + spin_unlock(&devname_cache_lock); + return ret; + } + call_rcu(&devcache[i]->rcu, free_devcache); + } + devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); + if (!devcache[i]) { + spin_unlock(&devname_cache_lock); + return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ + } + devcache[i]->device = device; + bd = bdget(device); + if (bd) { + bdevname(bd, devcache[i]->devname); + bdput(bd); + } else + __bdevname(device, devcache[i]->devname); + ret = devcache[i]->devname; + spin_unlock(&devname_cache_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_dev_to_name); + MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index cc02393bfce..d97eb652d6c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1315,6 +1315,12 @@ extern int jbd_blocks_per_page(struct inode *inode); #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) +/* + * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 + * tracing infrastructure to map a dev_t to a device name. 
+ */ +extern const char *jbd2_dev_to_name(dev_t device); + #endif /* __KERNEL__ */ #endif /* _LINUX_JBD2_H */ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h new file mode 100644 index 00000000000..845b0b4b48f --- /dev/null +++ b/include/trace/events/jbd2.h @@ -0,0 +1,168 @@ +#if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_JBD2_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM jbd2 + +TRACE_EVENT(jbd2_checkpoint, + + TP_PROTO(journal_t *journal, int result), + + TP_ARGS(journal, result), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, result ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->result = result; + ), + + TP_printk("dev %s result %d", + jbd2_dev_to_name(__entry->dev), __entry->result) +); + +TRACE_EVENT(jbd2_start_commit, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %s transaction %d sync %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit) +); + +TRACE_EVENT(jbd2_commit_locking, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %s transaction %d sync %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit) +); + +TRACE_EVENT(jbd2_commit_flushing, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %s transaction %d sync %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit) +); + +TRACE_EVENT(jbd2_commit_logging, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %s transaction %d sync %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit) +); + +TRACE_EVENT(jbd2_end_commit, + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + __field( int, head ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + 
__entry->transaction = commit_transaction->t_tid; + __entry->head = journal->j_tail_sequence; + ), + + TP_printk("dev %s transaction %d sync %d head %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit, __entry->head) +); + +TRACE_EVENT(jbd2_submit_inode_data, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + ), + + TP_printk("dev %s ino %lu", + jbd2_dev_to_name(__entry->dev), __entry->ino) +); + +#endif /* _TRACE_JBD2_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3-70-g09d2 From e686307fdc84f249490e6c9da92fcb2424491f14 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 17 Apr 2009 08:41:21 +0200 Subject: loop: use BIO list management functions Now that the bio list management stuff is generic, convert loop to use bio lists instead of its own private bio list implementation. Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Akinobu Mita Signed-off-by: Jens Axboe --- drivers/block/loop.c | 26 +++++++------------------- include/linux/bio.h | 2 +- include/linux/loop.h | 3 +-- 3 files changed, 9 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ddae8082589..9ca4bb01465 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -511,11 +511,7 @@ out: */ static void loop_add_bio(struct loop_device *lo, struct bio *bio) { - if (lo->lo_biotail) { - lo->lo_biotail->bi_next = bio; - lo->lo_biotail = bio; - } else - lo->lo_bio = lo->lo_biotail = bio; + bio_list_add(&lo->lo_bio_list, bio); } /* @@ -523,16 +519,7 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio) */ static struct bio *loop_get_bio(struct loop_device *lo) { - struct bio *bio; - - if ((bio = lo->lo_bio)) { - if (bio == lo->lo_biotail) - lo->lo_biotail = NULL; - lo->lo_bio = bio->bi_next; - bio->bi_next = NULL; - } - - return bio; + return bio_list_pop(&lo->lo_bio_list); } static int loop_make_request(struct request_queue *q, struct bio *old_bio) @@ -609,12 +596,13 @@ static int loop_thread(void *data) set_user_nice(current, -20); - while (!kthread_should_stop() || lo->lo_bio) { + while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { wait_event_interruptible(lo->lo_event, - lo->lo_bio || kthread_should_stop()); + !bio_list_empty(&lo->lo_bio_list) || + kthread_should_stop()); - if (!lo->lo_bio) + if (bio_list_empty(&lo->lo_bio_list)) continue; spin_lock_irq(&lo->lo_lock); bio = loop_get_bio(lo); @@ -841,7 +829,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - lo->lo_bio = lo->lo_biotail = NULL; + bio_list_init(&lo->lo_bio_list); /* * set queue make_request_fn, and add limits based on lower level diff --git a/include/linux/bio.h b/include/linux/bio.h index 7b214fd672a..f37ca8c726b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -506,7 +506,7 @@ static inline int bio_has_data(struct bio *bio) } /* - * BIO list managment for use by remapping drivers (e.g. DM or MD). + * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * * A bio_list anchors a singly-linked list of bios chained through the bi_next * member of the bio. 
The bio_list also caches the last list member to allow diff --git a/include/linux/loop.h b/include/linux/loop.h index 40725447f5e..66c194e2d9b 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -56,8 +56,7 @@ struct loop_device { gfp_t old_gfp_mask; spinlock_t lo_lock; - struct bio *lo_bio; - struct bio *lo_biotail; + struct bio_list lo_bio_list; int lo_state; struct mutex lo_ctl_mutex; struct task_struct *lo_thread; -- cgit v1.2.3-70-g09d2 From 214ae19104141404f18ad6651752eb38e3dd584e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: ide: kill unused ide_cmd->special Impact: removal of unused field No one uses ide_cmd->special anymore. Kill it. Signed-off-by: Tejun Heo --- include/linux/ide.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index ff65fffb078..846a1e13240 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -324,7 +324,6 @@ struct ide_cmd { unsigned int cursg_ofs; struct request *rq; /* copy of request */ - void *special; /* valid_t generally */ }; /* ATAPI packet command flags */ -- cgit v1.2.3-70-g09d2 From e69d800f7e8797a8e3423380ee9d8ca1cb90c388 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: ide: add helpers for preparing sense requests This is in preparation for removing the queueing of a sense request out of the IRQ handler path. Use struct request_sense as a general sense buffer for all ATAPI devices ide-{floppy,tape,cd}. tj: * blk_get_request(__GFP_WAIT) can't be called from do_request() as it can cause deadlock. Converted to use inline struct request and blk_rq_init(). * Added xfer / cdb len selection depending on device type. * All sense prep logic folded into ide_prep_sense() which never fails. * hwif->rq clearing and sense_rq used handling moved into ide_queue_sense_rq(). * blk_rq_map_kern() conversion is moved to a later patch.
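For a sense of how the two new helpers are meant to pair up, here is a minimal sketch; the do_request function and its issue step are hypothetical stand-ins, and only ide_prep_sense() and ide_queue_sense_rq() come from this patch:

	/* Hypothetical ->do_request fragment (sketch, not part of the patch):
	 * arm the preallocated sense request before issuing each command,
	 * and queue it at the head of the queue only if the command fails. */
	static ide_startstop_t sketch_do_request(ide_drive_t *drive,
						 struct request *rq)
	{
		ide_prep_sense(drive, rq);	/* never fails */

		if (sketch_issue_command(drive, rq)) {	/* hypothetical */
			ide_queue_sense_rq(drive, rq->special);
			return ide_stopped;	/* sense rq queued, not yet started */
		}
		return ide_started;
	}
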
CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/ide.h | 11 +++++++++ 2 files changed, 72 insertions(+) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2894577237b..c6e03485a63 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -191,6 +191,67 @@ void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc) } EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd); +void ide_prep_sense(ide_drive_t *drive, struct request *rq) +{ + struct request_sense *sense = &drive->sense_data; + struct request *sense_rq = &drive->sense_rq; + unsigned int cmd_len, sense_len; + + debug_log("%s: enter\n", __func__); + + switch (drive->media) { + case ide_floppy: + cmd_len = 255; + sense_len = 18; + break; + case ide_tape: + cmd_len = 20; + sense_len = 20; + break; + default: + cmd_len = 18; + sense_len = 18; + } + + BUG_ON(sense_len > sizeof(*sense)); + + if (blk_sense_request(rq) || drive->sense_rq_armed) + return; + + memset(sense, 0, sizeof(*sense)); + + blk_rq_init(rq->q, sense_rq); + sense_rq->rq_disk = rq->rq_disk; + + sense_rq->data = sense; + sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; + sense_rq->cmd[4] = cmd_len; + sense_rq->data_len = sense_len; + + sense_rq->cmd_type = REQ_TYPE_SENSE; + sense_rq->cmd_flags |= REQ_PREEMPT; + + if (drive->media == ide_tape) + sense_rq->cmd[13] = REQ_IDETAPE_PC1; + + drive->sense_rq_armed = true; +} +EXPORT_SYMBOL_GPL(ide_prep_sense); + +void ide_queue_sense_rq(ide_drive_t *drive, void *special) +{ + BUG_ON(!drive->sense_rq_armed); + + drive->sense_rq.special = special; + drive->sense_rq_armed = false; + + drive->hwif->rq = NULL; + + elv_add_request(drive->queue, &drive->sense_rq, + ELEVATOR_INSERT_FRONT, 0); +} +EXPORT_SYMBOL_GPL(ide_queue_sense_rq); + /* * Called when an error was detected during the last packet command. * We queue a request sense packet command in the head of the request list. diff --git a/include/linux/ide.h b/include/linux/ide.h index 846a1e13240..a69ccac5641 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -26,6 +26,9 @@ #include #include +/* for request_sense */ +#include + #if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300) # define SUPPORT_VLB_SYNC 0 #else @@ -602,6 +605,11 @@ struct ide_drive_s { struct ide_atapi_pc request_sense_pc; struct request request_sense_rq; + + /* current sense rq and buffer */ + bool sense_rq_armed; + struct request sense_rq; + struct request_sense sense_data; }; typedef struct ide_drive_s ide_drive_t; @@ -1175,6 +1183,9 @@ int ide_set_media_lock(ide_drive_t *, struct gendisk *, int); void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); void ide_retry_pc(ide_drive_t *, struct gendisk *); +void ide_prep_sense(ide_drive_t *drive, struct request *rq); +void ide_queue_sense_rq(ide_drive_t *drive, void *special); + int ide_cd_expiry(ide_drive_t *); int ide_cd_get_xferlen(struct request *); -- cgit v1.2.3-70-g09d2 From 068753203e6cd085664a62e0fc0636e19b148a12 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: ide-atapi: convert ide-{floppy,tape} to using preallocated sense buffer Since we're issuing REQ_TYPE_SENSE now we need to allow those types of rqs in the ->do_request callbacks. As a future improvement, sense_len assignment might be unified across all ATAPI devices. 
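To make the ->do_request change concrete before the hunks, the pattern both drivers adopt looks like this (a sketch distilled from the diffs below):

	/* REQ_TYPE_SENSE requests carry their ide_atapi_pc in rq->special,
	 * just like driver-private special requests, so accept both: */
	if (blk_special_request(rq) || blk_sense_request(rq))
		pc = (struct ide_atapi_pc *)rq->special;
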
Borislav to check with specs and test. As a result, get rid of ide_queue_pc_head() and drive->request_sense_rq. tj: * Init request sense ide_atapi_pc from sense request. In the longer timer, it would probably better to fold ide_create_request_sense_cmd() into its only current user - ide_floppy_get_format_progress(). * ide_retry_pc() no longer takes @disk. CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 48 ++++++++++++++---------------------------------- drivers/ide/ide-floppy.c | 4 +++- drivers/ide/ide-tape.c | 7 +++++-- include/linux/ide.h | 3 +-- 4 files changed, 23 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index c6e03485a63..972c522516f 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -79,34 +79,6 @@ void ide_init_pc(struct ide_atapi_pc *pc) } EXPORT_SYMBOL_GPL(ide_init_pc); -/* - * Generate a new packet command request in front of the request queue, before - * the current request, so that it will be processed immediately, on the next - * pass through the driver. - */ -static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk, - struct ide_atapi_pc *pc, struct request *rq) -{ - blk_rq_init(NULL, rq); - rq->cmd_type = REQ_TYPE_SPECIAL; - rq->cmd_flags |= REQ_PREEMPT; - rq->special = (char *)pc; - rq->rq_disk = disk; - - if (pc->req_xfer) { - rq->data = pc->buf; - rq->data_len = pc->req_xfer; - } - - memcpy(rq->cmd, pc->c, 12); - if (drive->media == ide_tape) - rq->cmd[13] = REQ_IDETAPE_PC1; - - drive->hwif->rq = NULL; - - elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); -} - /* * Add a special packet command request to the tail of the request queue, * and wait for it to be serviced. @@ -254,18 +226,26 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq); /* * Called when an error was detected during the last packet command. - * We queue a request sense packet command in the head of the request list. + * We queue a request sense packet command at the head of the request + * queue. 
*/ -void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk) +void ide_retry_pc(ide_drive_t *drive) { - struct request *rq = &drive->request_sense_rq; + struct request *sense_rq = &drive->sense_rq; struct ide_atapi_pc *pc = &drive->request_sense_pc; (void)ide_read_error(drive); - ide_create_request_sense_cmd(drive, pc); + + /* init pc from sense_rq */ + ide_init_pc(pc); + memcpy(pc->c, sense_rq->cmd, 12); + pc->buf = sense_rq->data; + pc->req_xfer = sense_rq->data_len; + if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); - ide_queue_pc_head(drive, disk, pc, rq); + + ide_queue_sense_rq(drive, pc); } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -404,7 +384,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) debug_log("[cmd %x]: check condition\n", rq->cmd[0]); /* Retry operation */ - ide_retry_pc(drive, rq->rq_disk); + ide_retry_pc(drive); /* queued, but not started */ return ide_stopped; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 94600331a27..d3302cc891e 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -263,7 +263,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, } pc = &floppy->queued_pc; idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); - } else if (blk_special_request(rq)) { + } else if (blk_special_request(rq) || blk_sense_request(rq)) { pc = (struct ide_atapi_pc *)rq->special; } else if (blk_pc_request(rq)) { pc = &floppy->queued_pc; @@ -273,6 +273,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, goto out_end; } + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index aadf53cfac6..8324dfa78a3 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -695,7 +695,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) printk(KERN_ERR "ide-tape: %s: I/O error, ", tape->name); /* Retry operation */ - ide_retry_pc(drive, tape->disk); + ide_retry_pc(drive); return ide_stopped; } pc->error = 0; @@ -752,7 +752,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, (unsigned long long)rq->sector, rq->nr_sectors, rq->current_nr_sectors); - if (!blk_special_request(rq)) { + if (!(blk_special_request(rq) || blk_sense_request(rq))) { /* We do not support buffer cache originated requests. 
*/ printk(KERN_NOTICE "ide-tape: %s: Unsupported request in " "request queue (%d)\n", drive->name, rq->cmd_type); @@ -840,6 +840,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, BUG(); out: + /* prepare sense request for this command */ + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/include/linux/ide.h b/include/linux/ide.h index a69ccac5641..9e67ccac3c1 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -604,7 +604,6 @@ struct ide_drive_s { unsigned long atapi_flags; struct ide_atapi_pc request_sense_pc; - struct request request_sense_rq; /* current sense rq and buffer */ bool sense_rq_armed; @@ -1181,7 +1180,7 @@ int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *); int ide_do_start_stop(ide_drive_t *, struct gendisk *, int); int ide_set_media_lock(ide_drive_t *, struct gendisk *, int); void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); -void ide_retry_pc(ide_drive_t *, struct gendisk *); +void ide_retry_pc(ide_drive_t *drive); void ide_prep_sense(ide_drive_t *drive, struct request *rq); void ide_queue_sense_rq(ide_drive_t *drive, void *special); -- cgit v1.2.3-70-g09d2 From 02e7cf8f848841ca21864ccd019e480b73c323b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: ide-cd,atapi: use bio for internal commands Impact: unify request data buffer handling rq->data is used mostly to pass a kernel buffer through the request queue without using bio. There are only a couple of places which still do this in the kernel and converting them to bio isn't difficult. This patch converts ide-cd and atapi to use bio instead of rq->data for request sense and internal pc commands. With the previous change unifying sense request handling, this is relatively easy to achieve by adding blk_rq_map_kern() during sense_rq prep and PC issue. If blk_rq_map_kern() fails for sense, the error is deferred until sense issue time and aborts the failed command which triggered the sense. Note that this is a slim possibility: sense prep is done on each command issue, so for the above condition to trigger, every prep between the last sense issue and the request that needs a sense would have to fail. * do_request functions might sleep now. This should be okay as the ide request_fn - do_ide_request() - is invoked only from make_request and plug work. Make sure this is the case by adding might_sleep() to do_ide_request(). * Functions which access the sense data before the sense request completes must now read it via bio_data(sense_rq->bio), as the sense buffer might have been copied during blk_rq_map_kern(). * ide-tape updated to map sg. * cdrom_do_block_pc() now doesn't have to deal with the REQ_TYPE_ATA_PC special case. Simplified. * tp_ops->output/input_data path dropped from ide_pc_intr().
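For reference, a minimal sketch of the mapping pattern this patch introduces; q, disk, buf and buf_len are hypothetical placeholders, not code from the patch:

	struct request *rq = blk_get_request(q, READ, GFP_NOIO);
	int error = blk_rq_map_kern(q, rq, buf, buf_len, GFP_NOIO);

	if (error) {
		/* mapping failed, nothing was queued */
		blk_put_request(rq);
		return error;
	}
	/*
	 * The buffer now hangs off rq->bio and may have been bounce-copied,
	 * so anything peeking at the data before the request completes must
	 * go through bio_data(rq->bio) instead of the original buf.
	 */
	error = blk_execute_rq(q, disk, rq, 0);
	blk_put_request(rq);
	return error;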
Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 52 +++++++++++++++++++++++++++++-------------------- drivers/ide/ide-cd.c | 28 +++++++++++++------------- drivers/ide/ide-io.c | 3 +++ drivers/ide/ide-tape.c | 3 +++ include/linux/ide.h | 2 +- 5 files changed, 52 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 972c522516f..5cefe12f562 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -94,16 +94,18 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq->special = (char *)pc; if (pc->req_xfer) { - rq->data = pc->buf; - rq->data_len = pc->req_xfer; + error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer, + GFP_NOIO); + if (error) + goto put_req; } memcpy(rq->cmd, pc->c, 12); if (drive->media == ide_tape) rq->cmd[13] = REQ_IDETAPE_PC1; error = blk_execute_rq(drive->queue, disk, rq, 0); +put_req: blk_put_request(rq); - return error; } EXPORT_SYMBOL_GPL(ide_queue_pc_tail); @@ -168,6 +170,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) struct request_sense *sense = &drive->sense_data; struct request *sense_rq = &drive->sense_rq; unsigned int cmd_len, sense_len; + int err; debug_log("%s: enter\n", __func__); @@ -193,13 +196,19 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) memset(sense, 0, sizeof(*sense)); blk_rq_init(rq->q, sense_rq); - sense_rq->rq_disk = rq->rq_disk; - sense_rq->data = sense; + err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len, + GFP_NOIO); + if (unlikely(err)) { + if (printk_ratelimit()) + printk(KERN_WARNING "%s: failed to map sense buffer\n", + drive->name); + return; + } + + sense_rq->rq_disk = rq->rq_disk; sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; sense_rq->cmd[4] = cmd_len; - sense_rq->data_len = sense_len; - sense_rq->cmd_type = REQ_TYPE_SENSE; sense_rq->cmd_flags |= REQ_PREEMPT; @@ -210,9 +219,14 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) } EXPORT_SYMBOL_GPL(ide_prep_sense); -void ide_queue_sense_rq(ide_drive_t *drive, void *special) +int ide_queue_sense_rq(ide_drive_t *drive, void *special) { - BUG_ON(!drive->sense_rq_armed); + /* deferred failure from ide_prep_sense() */ + if (!drive->sense_rq_armed) { + printk(KERN_WARNING "%s: failed queue sense request\n", + drive->name); + return -ENOMEM; + } drive->sense_rq.special = special; drive->sense_rq_armed = false; @@ -221,6 +235,7 @@ void ide_queue_sense_rq(ide_drive_t *drive, void *special) elv_add_request(drive->queue, &drive->sense_rq, ELEVATOR_INSERT_FRONT, 0); + return 0; } EXPORT_SYMBOL_GPL(ide_queue_sense_rq); @@ -239,13 +254,14 @@ void ide_retry_pc(ide_drive_t *drive) /* init pc from sense_rq */ ide_init_pc(pc); memcpy(pc->c, sense_rq->cmd, 12); - pc->buf = sense_rq->data; + pc->buf = bio_data(sense_rq->bio); /* pointer to mapped address */ pc->req_xfer = sense_rq->data_len; if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); - ide_queue_sense_rq(drive, pc); + if (ide_queue_sense_rq(drive, pc)) + ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -317,7 +333,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; const struct ide_tp_ops *tp_ops = hwif->tp_ops; - xfer_func_t *xferfunc; unsigned int timeout, done; u16 bcount; u8 stat, ireason, dsc = 0; @@ -411,7 +426,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->errors = -EIO; } - if (drive->media == ide_tape) + 
if (drive->media == ide_tape && !rq->bio) done = ide_rq_bytes(rq); /* FIXME */ else done = blk_rq_bytes(rq); @@ -448,16 +463,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - xferfunc = write ? tp_ops->output_data : tp_ops->input_data; - - if (drive->media == ide_floppy && pc->buf == NULL) { + if (drive->media == ide_tape && pc->bh) + done = drive->pc_io_buffers(drive, pc, bcount, write); + else { done = min_t(unsigned int, bcount, cmd->nleft); ide_pio_bytes(drive, cmd, write, done); - } else if (drive->media == ide_tape && pc->bh) { - done = drive->pc_io_buffers(drive, pc, bcount, write); - } else { - done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred); - xferfunc(drive, NULL, pc->cur_pos, done); } /* Update the current position */ diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 061d7bbcd34..673628790f1 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -210,10 +210,12 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* * For REQ_TYPE_SENSE, "rq->special" points to the original - * failed request + * failed request. Also, the sense data should be read + * directly from rq which might be different from the original + * sense buffer if it got copied during mapping. */ struct request *failed = (struct request *)rq->special; - struct request_sense *sense = &drive->sense_data; + void *sense = bio_data(rq->bio); if (failed) { if (failed->sense) { @@ -398,7 +400,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) /* if we got a CHECK_CONDITION status, queue a request sense command */ if (stat & ATA_ERR) - ide_queue_sense_rq(drive, NULL); + return ide_queue_sense_rq(drive, NULL) ? 2 : 1; return 1; end_request: @@ -412,8 +414,7 @@ end_request: hwif->rq = NULL; - ide_queue_sense_rq(drive, rq); - return 1; + return ide_queue_sense_rq(drive, rq) ? 
2 : 1; } else return 2; } @@ -507,8 +508,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, rq->cmd_flags |= cmd_flags; rq->timeout = timeout; if (buffer) { - rq->data = buffer; - rq->data_len = *bufflen; + error = blk_rq_map_kern(drive->queue, rq, buffer, + *bufflen, GFP_NOIO); + if (error) { + blk_put_request(rq); + return error; + } } error = blk_execute_rq(drive->queue, info->disk, rq, 0); @@ -802,15 +807,10 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) drive->dma = 0; /* sg request */ - if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) { + if (rq->bio) { struct request_queue *q = drive->queue; + char *buf = bio_data(rq->bio); unsigned int alignment; - char *buf; - - if (rq->bio) - buf = bio_data(rq->bio); - else - buf = rq->data; drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9b9e8b1aae5..3245c2dbda3 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -481,6 +481,9 @@ void do_ide_request(struct request_queue *q) spin_unlock_irq(q->queue_lock); + /* HLD do_request() callback might sleep, make sure it's okay */ + might_sleep(); + if (ide_lock_host(host, hwif)) goto plug_device_2; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 8324dfa78a3..9b762a2d5d9 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -850,6 +850,9 @@ out: cmd.rq = rq; + ide_init_sg_cmd(&cmd, pc->req_xfer); + ide_map_sg(drive, &cmd); + return ide_tape_issue_pc(drive, &cmd, pc); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 9e67ccac3c1..1957461ac76 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1183,7 +1183,7 @@ void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); void ide_retry_pc(ide_drive_t *drive); void ide_prep_sense(ide_drive_t *drive, struct request *rq); -void ide_queue_sense_rq(ide_drive_t *drive, void *special); +int ide_queue_sense_rq(ide_drive_t *drive, void *special); int ide_cd_expiry(ide_drive_t *); -- cgit v1.2.3-70-g09d2 From 29d1a4371035e01b0d079bc5aa88b50f5af7a566 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: ide-atapi: kill unused fields and callbacks Impact: remove fields and code paths which are no longer necessary Now that ide-tape uses standard mechanisms to transfer data, the special-case bh handling can be dropped from ide-atapi. Drop the following: * pc->cur_pos, b_count, bh and b_data * drive->pc_update_buffers() and pc_io_buffers(). Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 17 ++++------------- drivers/ide/ide-tape.c | 1 - include/linux/ide.h | 12 ------------ 3 files changed, 4 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index b9dd4503cbc..afe5a432387 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -359,11 +359,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) drive->name, rq_data_dir(pc->rq) ?
"write" : "read"); pc->flags |= PC_FLAG_DMA_ERROR; - } else { + } else pc->xferred = pc->req_xfer; - if (drive->pc_update_buffers) - drive->pc_update_buffers(drive, pc); - } debug_log("%s: DMA finished\n", drive->name); } @@ -463,16 +460,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - if (drive->media == ide_tape && pc->bh) - done = drive->pc_io_buffers(drive, pc, bcount, write); - else { - done = min_t(unsigned int, bcount, cmd->nleft); - ide_pio_bytes(drive, cmd, write, done); - } + done = min_t(unsigned int, bcount, cmd->nleft); + ide_pio_bytes(drive, cmd, write, done); - /* Update the current position */ + /* Update transferred byte count */ pc->xferred += done; - pc->cur_pos += done; bcount -= done; @@ -650,7 +642,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) /* We haven't transferred any data yet */ pc->xferred = 0; - pc->cur_pos = pc->buf; valid_tf = IDE_VALID_DEVICE; bcount = ((drive->media == ide_tape) ? diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2599579e417..8dfc68892d6 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -591,7 +591,6 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); pc->c[1] = 1; - pc->bh = NULL; pc->buf = NULL; pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; diff --git a/include/linux/ide.h b/include/linux/ide.h index 1957461ac76..34c128f0a33 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -362,11 +362,7 @@ struct ide_atapi_pc { /* data buffer */ u8 *buf; - /* current buffer position */ - u8 *cur_pos; int buf_size; - /* missing/available data on the current buffer */ - int b_count; /* the corresponding request */ struct request *rq; @@ -379,10 +375,6 @@ struct ide_atapi_pc { */ u8 pc_buf[IDE_PC_BUFFER_SIZE]; - /* idetape only */ - struct idetape_bh *bh; - char *b_data; - unsigned long timeout; }; @@ -595,10 +587,6 @@ struct ide_drive_s { /* callback for packet commands */ int (*pc_callback)(struct ide_drive_s *, int); - void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *); - int (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *, - unsigned int, int); - ide_startstop_t (*irq_handler)(struct ide_drive_s *); unsigned long atapi_flags; -- cgit v1.2.3-70-g09d2 From a7f557923441186a3cdbabc54f1bcacf42b63bf5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:17 +0900 Subject: block: kill blk_start_queueing() blk_start_queueing() is identical to __blk_run_queue() except that it doesn't check for recursion. None of the current users depends on blk_start_queueing() running request_fn directly. Replace usages of blk_start_queueing() with [__]blk_run_queue() and kill it. 
[ Impact: removal of mostly duplicate interface function ] Signed-off-by: Tejun Heo --- block/as-iosched.c | 6 +----- block/blk-core.c | 28 ++-------------------------- block/cfq-iosched.c | 6 +++--- block/elevator.c | 7 +++---- include/linux/blkdev.h | 1 - 5 files changed, 9 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/block/as-iosched.c b/block/as-iosched.c index c48fa670d22..45bd07059c2 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -1312,12 +1312,8 @@ static void as_merged_requests(struct request_queue *q, struct request *req, static void as_work_handler(struct work_struct *work) { struct as_data *ad = container_of(work, struct as_data, antic_work); - struct request_queue *q = ad->q; - unsigned long flags; - spin_lock_irqsave(q->queue_lock, flags); - blk_start_queueing(q); - spin_unlock_irqrestore(q->queue_lock, flags); + blk_run_queue(ad->q); } static int as_may_queue(struct request_queue *q, int rw) diff --git a/block/blk-core.c b/block/blk-core.c index 02f53bc00e4..8b4a0af7d69 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -433,9 +433,7 @@ EXPORT_SYMBOL(__blk_run_queue); * * Description: * Invoke request handling on this queue, if it has pending work to do. - * May be used to restart queueing when a request has completed. Also - * See @blk_start_queueing. - * + * May be used to restart queueing when a request has completed. */ void blk_run_queue(struct request_queue *q) { @@ -894,28 +892,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) } EXPORT_SYMBOL(blk_get_request); -/** - * blk_start_queueing - initiate dispatch of requests to device - * @q: request queue to kick into gear - * - * This is basically a helper to remove the need to know whether a queue - * is plugged or not if someone just wants to initiate dispatch of requests - * for this queue. Should be used to start queueing on a device outside - * of ->request_fn() context. Also see @blk_run_queue. - * - * The queue lock must be held with interrupts disabled. 
- */ -void blk_start_queueing(struct request_queue *q) -{ - if (!blk_queue_plugged(q)) { - if (unlikely(blk_queue_stopped(q))) - return; - q->request_fn(q); - } else - __generic_unplug_device(q); -} -EXPORT_SYMBOL(blk_start_queueing); - /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted @@ -984,7 +960,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq, drive_stat_acct(rq, 1); __elv_add_request(q, rq, where, 0); - blk_start_queueing(q); + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_insert_request); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a55a9bd75bd..def0c698a4b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2088,7 +2088,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { del_timer(&cfqd->idle_slice_timer); - blk_start_queueing(cfqd->queue); + __blk_run_queue(cfqd->queue); } cfq_mark_cfqq_must_dispatch(cfqq); } @@ -2100,7 +2100,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, * this new queue is RT and the current one is BE */ cfq_preempt_queue(cfqd, cfqq); - blk_start_queueing(cfqd->queue); + __blk_run_queue(cfqd->queue); } } @@ -2345,7 +2345,7 @@ static void cfq_kick_queue(struct work_struct *work) struct request_queue *q = cfqd->queue; spin_lock_irq(q->queue_lock); - blk_start_queueing(q); + __blk_run_queue(cfqd->queue); spin_unlock_irq(q->queue_lock); } diff --git a/block/elevator.c b/block/elevator.c index 7073a907257..2e0fb21485b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -599,7 +599,7 @@ void elv_quiesce_start(struct request_queue *q) */ elv_drain_elevator(q); while (q->rq.elvpriv) { - blk_start_queueing(q); + __blk_run_queue(q); spin_unlock_irq(q->queue_lock); msleep(10); spin_lock_irq(q->queue_lock); @@ -643,8 +643,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) * with anything. There's no point in delaying queue * processing. */ - blk_remove_plug(q); - blk_start_queueing(q); + __blk_run_queue(q); break; case ELEVATOR_INSERT_SORT: @@ -971,7 +970,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq) blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); - blk_start_queueing(q); + __blk_run_queue(q); } } } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2755d5c6da2..12e20de44b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -797,7 +797,6 @@ extern void blk_sync_queue(struct request_queue *q); extern void __blk_stop_queue(struct request_queue *q); extern void __blk_run_queue(struct request_queue *); extern void blk_run_queue(struct request_queue *); -extern void blk_start_queueing(struct request_queue *); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); -- cgit v1.2.3-70-g09d2 From 5efccd17ceb0fc43837a331297c2c407969d7201 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: block: reorder request completion functions Reorder request completion functions such that * All request completion functions are located together. * Functions which are used by only one caller are put right above the caller. * end_request() is put after other completion functions but before blk_update_request().
This change is for completion function cleanup which will follow. [ Impact: cleanup, code reorganization ] Signed-off-by: Tejun Heo --- block/blk-core.c | 144 ++++++++++++++++++++++++------------------------- include/linux/blkdev.h | 16 +++--- 2 files changed, 80 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index cf10dfcda99..406a93e526b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1683,6 +1683,35 @@ static void blk_account_io_done(struct request *req) } } +/** + * blk_rq_bytes - Returns bytes left to complete in the entire request + * @rq: the request being processed + **/ +unsigned int blk_rq_bytes(struct request *rq) +{ + if (blk_fs_request(rq)) + return rq->hard_nr_sectors << 9; + + return rq->data_len; +} +EXPORT_SYMBOL_GPL(blk_rq_bytes); + +/** + * blk_rq_cur_bytes - Returns bytes left to complete in the current segment + * @rq: the request being processed + **/ +unsigned int blk_rq_cur_bytes(struct request *rq) +{ + if (blk_fs_request(rq)) + return rq->current_nr_sectors << 9; + + if (rq->bio) + return rq->bio->bi_size; + + return rq->data_len; +} +EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); + /** * __end_that_request_first - end I/O on a request * @req: the request being processed @@ -1797,6 +1826,22 @@ static int __end_that_request_first(struct request *req, int error, return 1; } +static int end_that_request_data(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) +{ + if (rq->bio) { + if (__end_that_request_first(rq, error, nr_bytes)) + return 1; + + /* Bidi request must be completed as a whole */ + if (blk_bidi_rq(rq) && + __end_that_request_first(rq->next_rq, error, bidi_bytes)) + return 1; + } + + return 0; +} + /* * queue lock must be held */ @@ -1825,78 +1870,6 @@ static void end_that_request_last(struct request *req, int error) } } -/** - * blk_rq_bytes - Returns bytes left to complete in the entire request - * @rq: the request being processed - **/ -unsigned int blk_rq_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return rq->hard_nr_sectors << 9; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_bytes); - -/** - * blk_rq_cur_bytes - Returns bytes left to complete in the current segment - * @rq: the request being processed - **/ -unsigned int blk_rq_cur_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return rq->current_nr_sectors << 9; - - if (rq->bio) - return rq->bio->bi_size; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); - -/** - * end_request - end I/O on the current segment of the request - * @req: the request being processed - * @uptodate: error value or %0/%1 uptodate flag - * - * Description: - * Ends I/O on the current segment of a request. If that is the only - * remaining segment, the request is also completed and freed. - * - * This is a remnant of how older block drivers handled I/O completions. - * Modern drivers typically end I/O on the full request in one go, unless - * they have a residual value to account for. For that case this function - * isn't really useful, unless the residual just happens to be the - * full current segment. In other words, don't use this function in new - * code. Use blk_end_request() or __blk_end_request() to end a request. - **/ -void end_request(struct request *req, int uptodate) -{ - int error = 0; - - if (uptodate <= 0) - error = uptodate ? 
uptodate : -EIO; - - __blk_end_request(req, error, req->hard_cur_sectors << 9); -} -EXPORT_SYMBOL(end_request); - -static int end_that_request_data(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) -{ - if (rq->bio) { - if (__end_that_request_first(rq, error, nr_bytes)) - return 1; - - /* Bidi request must be completed as a whole */ - if (blk_bidi_rq(rq) && - __end_that_request_first(rq->next_rq, error, bidi_bytes)) - return 1; - } - - return 0; -} - /** * blk_end_io - Generic end_io function to complete a request. * @rq: the request being processed @@ -2006,6 +1979,33 @@ int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, } EXPORT_SYMBOL_GPL(blk_end_bidi_request); +/** + * end_request - end I/O on the current segment of the request + * @req: the request being processed + * @uptodate: error value or %0/%1 uptodate flag + * + * Description: + * Ends I/O on the current segment of a request. If that is the only + * remaining segment, the request is also completed and freed. + * + * This is a remnant of how older block drivers handled I/O completions. + * Modern drivers typically end I/O on the full request in one go, unless + * they have a residual value to account for. For that case this function + * isn't really useful, unless the residual just happens to be the + * full current segment. In other words, don't use this function in new + * code. Use blk_end_request() or __blk_end_request() to end a request. + **/ +void end_request(struct request *req, int uptodate) +{ + int error = 0; + + if (uptodate <= 0) + error = uptodate ? uptodate : -EIO; + + __blk_end_request(req, error, req->hard_cur_sectors << 9); +} +EXPORT_SYMBOL(end_request); + /** * blk_update_request - Special helper function for request stacking drivers * @rq: the request being processed diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12e20de44b6..156ffd9de96 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -831,6 +831,14 @@ static inline void blk_run_address_space(struct address_space *mapping) extern void blkdev_dequeue_request(struct request *req); +/* + * blk_end_request() takes bytes instead of sectors as a complete size. + * blk_rq_bytes() returns bytes left to complete in the entire request. + * blk_rq_cur_bytes() returns bytes left to complete in the current segment. + */ +extern unsigned int blk_rq_bytes(struct request *rq); +extern unsigned int blk_rq_cur_bytes(struct request *rq); + /* * blk_end_request() and friends. * __blk_end_request() and end_request() must be called with @@ -857,14 +865,6 @@ extern void blk_abort_queue(struct request_queue *); extern void blk_update_request(struct request *rq, int error, unsigned int nr_bytes); -/* - * blk_end_request() takes bytes instead of sectors as a complete size. - * blk_rq_bytes() returns bytes left to complete in the entire request. - * blk_rq_cur_bytes() returns bytes left to complete in the current segment. - */ -extern unsigned int blk_rq_bytes(struct request *rq); -extern unsigned int blk_rq_cur_bytes(struct request *rq); - /* * Access functions for manipulating queue properties */ -- cgit v1.2.3-70-g09d2 From 0b302d5aa7975006fa2ec3d66386610b9b36c669 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: block: kill blk_end_request_callback() With recent IDE updates, blk_end_request_callback() doesn't have any user now. Kill it. 
[ Impact: removal of unused convoluted interface ] Signed-off-by: Tejun Heo --- block/blk-core.c | 48 +++--------------------------------------------- include/linux/blkdev.h | 3 --- 2 files changed, 3 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 678ede23ed0..2f277ea0e59 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1971,10 +1971,6 @@ static void end_that_request_last(struct request *req, int error) * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq - * @drv_callback: function called between completion of bios in the request - * and completion of the request. - * If the callback returns non %0, this helper returns without - * completion of the request. * * Description: * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. @@ -1985,8 +1981,7 @@ static void end_that_request_last(struct request *req, int error) * %1 - this request is not freed yet, it still has pending buffers. **/ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes, - int (drv_callback)(struct request *)) + unsigned int bidi_bytes) { struct request_queue *q = rq->q; unsigned long flags = 0UL; @@ -1994,10 +1989,6 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) return 1; - /* Special feature for tricky drivers */ - if (drv_callback && drv_callback(rq)) - return 1; - add_disk_randomness(rq->rq_disk); spin_lock_irqsave(q->queue_lock, flags); @@ -2023,7 +2014,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, **/ int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { - return blk_end_io(rq, error, nr_bytes, 0, NULL); + return blk_end_io(rq, error, nr_bytes, 0); } EXPORT_SYMBOL_GPL(blk_end_request); @@ -2070,7 +2061,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request); int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { - return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL); + return blk_end_io(rq, error, nr_bytes, bidi_bytes); } EXPORT_SYMBOL_GPL(blk_end_bidi_request); @@ -2131,39 +2122,6 @@ void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) } EXPORT_SYMBOL_GPL(blk_update_request); -/** - * blk_end_request_callback - Special helper function for tricky drivers - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * @drv_callback: function called between completion of bios in the request - * and completion of the request. - * If the callback returns non %0, this helper returns without - * completion of the request. - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * This special helper function is used only for existing tricky drivers. - * (e.g. cdrom_newpc_intr() of ide-cd) - * This interface will be removed when such drivers are rewritten. - * Don't use this interface in other places anymore. - * - * Return: - * %0 - we are done with this request - * %1 - this request is not freed yet. - * this request still has pending buffers or - * the driver doesn't want to finish this request yet. 
- **/ -int blk_end_request_callback(struct request *rq, int error, - unsigned int nr_bytes, - int (drv_callback)(struct request *)) -{ - return blk_end_io(rq, error, nr_bytes, 0, drv_callback); -} -EXPORT_SYMBOL_GPL(blk_end_request_callback); - void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 156ffd9de96..1fa9dcf9aa6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -855,9 +855,6 @@ extern int __blk_end_request(struct request *rq, int error, extern int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); extern void end_request(struct request *, int); -extern int blk_end_request_callback(struct request *rq, int error, - unsigned int nr_bytes, - int (drv_callback)(struct request *)); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); extern void blk_abort_queue(struct request_queue *); -- cgit v1.2.3-70-g09d2 From 2e60e02297cf54e367567f2d85b2ca56b1c4a906 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: block: clean up request completion API Request completion has gone through several changes and became a bit messy over time. Clean it up. 1. end_that_request_data() is a thin wrapper around __end_that_request_first() which checks whether bio is NULL before doing anything and handles bidi completion. blk_update_request() is a thin wrapper around end_that_request_data() which clears nr_sectors on the last iteration but doesn't use the bidi completion. Clean it up by moving the initial bio NULL check and nr_sectors clearing on the last iteration into __end_that_request_first() and renaming it to blk_update_request(), which makes blk_end_io() the only user of end_that_request_data(). Collapse end_that_request_data() into blk_end_io(). 2. There are four visible completion variants - blk_end_request(), __blk_end_request(), blk_end_bidi_request() and end_request(). blk_end_request() and blk_end_bidi_request() use blk_end_io() as the backend but __blk_end_request() and end_request() use a separate implementation in __blk_end_request() due to different locking rules. blk_end_bidi_request() is identical to blk_end_io(). Collapse blk_end_io() into blk_end_bidi_request(), separate out request update into the internal helper blk_update_bidi_request() and add __blk_end_bidi_request(). Redefine [__]blk_end_request() as thin inline wrappers around [__]blk_end_bidi_request(). 3. As the whole request issue/completion usages are about to be modified and audited, it's a good chance to convert the completion functions to return bool, which better indicates the intended meaning of the return values. 4. The function name end_that_request_last() is from the days when it was a public interface and slightly confusing. Give it a proper internal name - blk_finish_request(). 5. Add a description explaining that blk_end_bidi_request() can be safely used for uni requests as suggested by Boaz Harrosh. The only visible behavior change is from #1. nr_sectors counts are cleared after the final iteration no matter which function is used to complete the request. I couldn't find any place where the code assumes those nr_sectors counters contain the values for the last segment, and this change is good as it makes the API much more consistent: the end result is now the same whether a request is completed using [__]blk_end_request() alone or in combination with blk_update_request().
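In caller terms, the bool conversion in #3 reads as follows (sketch, with a hypothetical rq and error):

	bool pending = blk_end_request(rq, error, blk_rq_bytes(rq));

	/*
	 * pending == false: the request was fully completed and freed;
	 * pending == true: buffers are still outstanding.
	 */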
API further cleaned up per Christoph's suggestion. [ Impact: cleanup, rq->*nr_sectors always updated after req completion ] Signed-off-by: Tejun Heo Reviewed-by: Boaz Harrosh Cc: Christoph Hellwig --- block/blk-core.c | 226 ++++++++++++++++--------------------------------- include/linux/blkdev.h | 94 +++++++++++++++++--- 2 files changed, 157 insertions(+), 163 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 2f277ea0e59..89cc05d9a7a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1808,25 +1808,35 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq) } /** - * __end_that_request_first - end I/O on a request - * @req: the request being processed + * blk_update_request - Special helper function for request stacking drivers + * @rq: the request being processed * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete + * @nr_bytes: number of bytes to complete @rq * * Description: - * Ends I/O on a number of bytes attached to @req, and sets it up - * for the next range of segments (if any) in the cluster. + * Ends I/O on a number of bytes attached to @rq, but doesn't complete + * the request structure even if @rq doesn't have leftover. + * If @rq has leftover, sets it up for the next range of segments. + * + * This special helper function is only for request stacking drivers + * (e.g. request-based dm) so that they can handle partial completion. + * Actual device drivers should use blk_end_request instead. + * + * Passing the result of blk_rq_bytes() as @nr_bytes guarantees + * %false return from this function. * * Return: - * %0 - we are done with this request, call end_that_request_last() - * %1 - still buffers pending for this request + * %false - this request doesn't have any more data + * %true - this request has more data **/ -static int __end_that_request_first(struct request *req, int error, - int nr_bytes) +bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { int total_bytes, bio_nbytes, next_idx = 0; struct bio *bio; + if (!req->bio) + return false; + trace_block_rq_complete(req->q, req); /* @@ -1903,8 +1913,16 @@ static int __end_that_request_first(struct request *req, int error, /* * completely done */ - if (!req->bio) - return 0; + if (!req->bio) { + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. 
+ */ + req->nr_sectors = req->hard_nr_sectors = 0; + req->current_nr_sectors = req->hard_cur_sectors = 0; + return false; + } /* * if the request wasn't completed, update state @@ -1918,29 +1936,31 @@ static int __end_that_request_first(struct request *req, int error, blk_recalc_rq_sectors(req, total_bytes >> 9); blk_recalc_rq_segments(req); - return 1; + return true; } +EXPORT_SYMBOL_GPL(blk_update_request); -static int end_that_request_data(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) +static bool blk_update_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, + unsigned int bidi_bytes) { - if (rq->bio) { - if (__end_that_request_first(rq, error, nr_bytes)) - return 1; + if (blk_update_request(rq, error, nr_bytes)) + return true; - /* Bidi request must be completed as a whole */ - if (blk_bidi_rq(rq) && - __end_that_request_first(rq->next_rq, error, bidi_bytes)) - return 1; - } + /* Bidi request must be completed as a whole */ + if (unlikely(blk_bidi_rq(rq)) && + blk_update_request(rq->next_rq, error, bidi_bytes)) + return true; - return 0; + add_disk_randomness(rq->rq_disk); + + return false; } /* * queue lock must be held */ -static void end_that_request_last(struct request *req, int error) +static void blk_finish_request(struct request *req, int error) { if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); @@ -1966,161 +1986,65 @@ static void end_that_request_last(struct request *req, int error) } /** - * blk_end_io - Generic end_io function to complete a request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete @rq - * @bidi_bytes: number of bytes to complete @rq->next_rq + * blk_end_bidi_request - Complete a bidi request + * @rq: the request to complete + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete @rq + * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. - * If @rq has leftover, sets it up for the next range of segments. + * Drivers that supports bidi can safely call this member for any + * type of request, bidi or uni. In the later case @bidi_bytes is + * just ignored. * * Return: - * %0 - we are done with this request - * %1 - this request is not freed yet, it still has pending buffers. + * %false - we are done with this request + * %true - still buffers pending for this request **/ -static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes) +bool blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) { struct request_queue *q = rq->q; - unsigned long flags = 0UL; - - if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) - return 1; + unsigned long flags; - add_disk_randomness(rq->rq_disk); + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + return true; spin_lock_irqsave(q->queue_lock, flags); - end_that_request_last(rq, error); + blk_finish_request(rq, error); spin_unlock_irqrestore(q->queue_lock, flags); - return 0; -} - -/** - * blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. 
- * - * Return: - * %0 - we are done with this request - * %1 - still buffers pending for this request - **/ -int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) -{ - return blk_end_io(rq, error, nr_bytes, 0); -} -EXPORT_SYMBOL_GPL(blk_end_request); - -/** - * __blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Must be called with queue lock held unlike blk_end_request(). - * - * Return: - * %0 - we are done with this request - * %1 - still buffers pending for this request - **/ -int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) -{ - if (rq->bio && __end_that_request_first(rq, error, nr_bytes)) - return 1; - - add_disk_randomness(rq->rq_disk); - - end_that_request_last(rq, error); - - return 0; + return false; } -EXPORT_SYMBOL_GPL(__blk_end_request); +EXPORT_SYMBOL_GPL(blk_end_bidi_request); /** - * blk_end_bidi_request - Helper function for drivers to complete bidi request. - * @rq: the bidi request being processed + * __blk_end_bidi_request - Complete a bidi request with queue lock held + * @rq: the request to complete * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: - * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. + * Identical to blk_end_bidi_request() except that queue lock is + * assumed to be locked on entry and remains so on return. * * Return: - * %0 - we are done with this request - * %1 - still buffers pending for this request - **/ -int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes) -{ - return blk_end_io(rq, error, nr_bytes, bidi_bytes); -} -EXPORT_SYMBOL_GPL(blk_end_bidi_request); - -/** - * end_request - end I/O on the current segment of the request - * @req: the request being processed - * @uptodate: error value or %0/%1 uptodate flag - * - * Description: - * Ends I/O on the current segment of a request. If that is the only - * remaining segment, the request is also completed and freed. - * - * This is a remnant of how older block drivers handled I/O completions. - * Modern drivers typically end I/O on the full request in one go, unless - * they have a residual value to account for. For that case this function - * isn't really useful, unless the residual just happens to be the - * full current segment. In other words, don't use this function in new - * code. Use blk_end_request() or __blk_end_request() to end a request. + * %false - we are done with this request + * %true - still buffers pending for this request **/ -void end_request(struct request *req, int uptodate) +bool __blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) { - int error = 0; + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + return true; - if (uptodate <= 0) - error = uptodate ? 
uptodate : -EIO; + blk_finish_request(rq, error); - __blk_end_request(req, error, req->hard_cur_sectors << 9); -} -EXPORT_SYMBOL(end_request); - -/** - * blk_update_request - Special helper function for request stacking drivers - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete @rq - * - * Description: - * Ends I/O on a number of bytes attached to @rq, but doesn't complete - * the request structure even if @rq doesn't have leftover. - * If @rq has leftover, sets it up for the next range of segments. - * - * This special helper function is only for request stacking drivers - * (e.g. request-based dm) so that they can handle partial completion. - * Actual device drivers should use blk_end_request instead. - */ -void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) -{ - if (!end_that_request_data(rq, error, nr_bytes, 0)) { - /* - * These members are not updated in end_that_request_data() - * when all bios are completed. - * Update them so that the request stacking driver can find - * how many bytes remain in the request later. - */ - rq->nr_sectors = rq->hard_nr_sectors = 0; - rq->current_nr_sectors = rq->hard_cur_sectors = 0; - } + return false; } -EXPORT_SYMBOL_GPL(blk_update_request); +EXPORT_SYMBOL_GPL(__blk_end_bidi_request); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1fa9dcf9aa6..501f6845cc7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -840,27 +840,97 @@ extern unsigned int blk_rq_bytes(struct request *rq); extern unsigned int blk_rq_cur_bytes(struct request *rq); /* - * blk_end_request() and friends. - * __blk_end_request() and end_request() must be called with - * the request queue spinlock acquired. + * Request completion related functions. + * + * blk_update_request() completes given number of bytes and updates + * the request without completing it. + * + * blk_end_request() and friends. __blk_end_request() and + * end_request() must be called with the request queue spinlock + * acquired. * * Several drivers define their own end_request and call * blk_end_request() for parts of the original function. * This prevents code duplication in drivers. */ -extern int blk_end_request(struct request *rq, int error, - unsigned int nr_bytes); -extern int __blk_end_request(struct request *rq, int error, - unsigned int nr_bytes); -extern int blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes); -extern void end_request(struct request *, int); +extern bool blk_update_request(struct request *rq, int error, + unsigned int nr_bytes); +extern bool blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, + unsigned int bidi_bytes); +extern bool __blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, + unsigned int bidi_bytes); + +/** + * blk_end_request - Helper function for drivers to complete the request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Ends I/O on a number of bytes attached to @rq. + * If @rq has leftover, sets it up for the next range of segments. 
+ * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + **/ +static inline bool blk_end_request(struct request *rq, int error, + unsigned int nr_bytes) +{ + return blk_end_bidi_request(rq, error, nr_bytes, 0); +} + +/** + * __blk_end_request - Helper function for drivers to complete the request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Must be called with queue lock held unlike blk_end_request(). + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + **/ +static inline bool __blk_end_request(struct request *rq, int error, + unsigned int nr_bytes) +{ + return __blk_end_bidi_request(rq, error, nr_bytes, 0); +} + +/** + * end_request - end I/O on the current segment of the request + * @rq: the request being processed + * @uptodate: error value or %0/%1 uptodate flag + * + * Description: + * Ends I/O on the current segment of a request. If that is the only + * remaining segment, the request is also completed and freed. + * + * This is a remnant of how older block drivers handled I/O completions. + * Modern drivers typically end I/O on the full request in one go, unless + * they have a residual value to account for. For that case this function + * isn't really useful, unless the residual just happens to be the + * full current segment. In other words, don't use this function in new + * code. Use blk_end_request() or __blk_end_request() to end a request. + **/ +static inline void end_request(struct request *rq, int uptodate) +{ + int error = 0; + + if (uptodate <= 0) + error = uptodate ? uptodate : -EIO; + + __blk_end_bidi_request(rq, error, rq->hard_cur_sectors << 9, 0); +} + extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); extern void blk_abort_queue(struct request_queue *); -extern void blk_update_request(struct request *rq, int error, - unsigned int nr_bytes); /* * Access functions for manipulating queue properties -- cgit v1.2.3-70-g09d2 From 40cbbb781d3eba5d6ac0860db078af490e5c7c6b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:19 +0900 Subject: block: implement and use [__]blk_end_request_all() There are many [__]blk_end_request() call sites which call it with the full request length and expect full completion. Many of them ensure that the request actually completes by doing BUG_ON() on the return value, which is awkward and error-prone. This patch adds [__]blk_end_request_all() which takes @rq and @error and fully completes the request. A BUG_ON() is added to ensure that this actually happens. Most conversions are simple but there are a few noteworthy ones. * cdrom/viocd: viocd_end_request() replaced with direct calls to __blk_end_request_all(). * s390/block/dasd: dasd_end_request() replaced with direct calls to __blk_end_request_all(). * s390/char/tape_block: tapeblock_end_request() replaced with direct calls to blk_end_request_all().
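The conversion pattern, as repeated throughout the diff below:

	/* before: full completion with the awkward BUG_ON() idiom */
	if (__blk_end_request(rq, error, blk_rq_bytes(rq)))
		BUG();

	/* after */
	__blk_end_request_all(rq, error);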
[ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: Russell King Cc: Stephen Rothwell Cc: Mike Miller Cc: Martin Schwidefsky Cc: Jeff Garzik Cc: Rusty Russell Cc: Jeremy Fitzhardinge Cc: Alex Dubov Cc: James Bottomley --- arch/arm/plat-omap/mailbox.c | 11 +++-------- block/blk-barrier.c | 9 ++------- block/blk-core.c | 2 +- block/elevator.c | 2 +- drivers/block/cpqarray.c | 3 +-- drivers/block/sx8.c | 3 +-- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 4 +--- drivers/cdrom/gdrom.c | 2 +- drivers/cdrom/viocd.c | 25 ++++--------------------- drivers/memstick/core/mspro_block.c | 2 +- drivers/s390/block/dasd.c | 17 ++++------------- drivers/s390/char/tape_block.c | 15 ++++----------- drivers/scsi/scsi_lib.c | 2 +- include/linux/blkdev.h | 32 ++++++++++++++++++++++++++++++++ 15 files changed, 58 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c index 0abfbaa5987..cf81bad8aec 100644 --- a/arch/arm/plat-omap/mailbox.c +++ b/arch/arm/plat-omap/mailbox.c @@ -192,8 +192,7 @@ static void mbox_tx_work(struct work_struct *work) } spin_lock(q->queue_lock); - if (__blk_end_request(rq, 0, 0)) - BUG(); + __blk_end_request_all(rq, 0); spin_unlock(q->queue_lock); } } @@ -224,10 +223,7 @@ static void mbox_rx_work(struct work_struct *work) break; msg = (mbox_msg_t) rq->data; - - if (blk_end_request(rq, 0, 0)) - BUG(); - + blk_end_request_all(rq, 0); mbox->rxq->callback((void *)msg); } } @@ -337,8 +333,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf) *p = (mbox_msg_t) rq->data; - if (blk_end_request(rq, 0, 0)) - BUG(); + blk_end_request_all(rq, 0); if (unlikely(mbox_seq_test(mbox, *p))) { pr_info("mbox: Illegal seq bit!(%08x) ignored\n", *p); diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 20b4111fa05..c8d087655ef 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -106,10 +106,7 @@ bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) */ q->ordseq = 0; rq = q->orig_bar_rq; - - if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq))) - BUG(); - + __blk_end_request_all(rq, q->orderr); return true; } @@ -252,9 +249,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) * with prejudice. 
*/ elv_dequeue_request(q, rq); - if (__blk_end_request(rq, -EOPNOTSUPP, - blk_rq_bytes(rq))) - BUG(); + __blk_end_request_all(rq, -EOPNOTSUPP); *rqp = NULL; return false; } diff --git a/block/blk-core.c b/block/blk-core.c index b84250d3019..0520cc70458 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1780,7 +1780,7 @@ struct request *elv_next_request(struct request_queue *q) break; } else if (ret == BLKPREP_KILL) { rq->cmd_flags |= REQ_QUIET; - __blk_end_request(rq, -EIO, blk_rq_bytes(rq)); + __blk_end_request_all(rq, -EIO); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; diff --git a/block/elevator.c b/block/elevator.c index b03b8752e18..1af5d9f04af 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -810,7 +810,7 @@ void elv_abort_queue(struct request_queue *q) rq = list_entry_rq(q->queue_head.next); rq->cmd_flags |= REQ_QUIET; trace_block_rq_abort(q, rq); - __blk_end_request(rq, -EIO, blk_rq_bytes(rq)); + __blk_end_request_all(rq, -EIO); } } EXPORT_SYMBOL(elv_abort_queue); diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index ca268ca1115..488a8f4a60a 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -1024,8 +1024,7 @@ static inline void complete_command(cmdlist_t *cmd, int timeout) cmd->req.sg[i].size, ddir); DBGPX(printk("Done with %p\n", rq);); - if (__blk_end_request(rq, error, blk_rq_bytes(rq))) - BUG(); + __blk_end_request_all(rq, error); } /* diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index ff0448e4bf0..60e85bb6f79 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -749,8 +749,7 @@ static inline void carm_end_request_queued(struct carm_host *host, struct request *req = crq->rq; int rc; - rc = __blk_end_request(req, error, blk_rq_bytes(req)); - assert(rc == 0); + __blk_end_request_all(req, error); rc = carm_put_request(host, crq); assert(rc == 0); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5d34764c8a8..50745e64414 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -62,7 +62,7 @@ static void blk_done(struct virtqueue *vq) break; } - __blk_end_request(vbr->req, error, blk_rq_bytes(vbr->req)); + __blk_end_request_all(vbr->req, error); list_del(&vbr->list); mempool_free(vbr, vblk->pool); } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8f905089b72..cd6cfe3b51e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -551,7 +551,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) for (i = info->ring.rsp_cons; i != rp; i++) { unsigned long id; - int ret; bret = RING_GET_RESPONSE(&info->ring, i); id = bret->id; @@ -578,8 +577,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " "request: %x\n", bret->status); - ret = __blk_end_request(req, error, blk_rq_bytes(req)); - BUG_ON(ret); + __blk_end_request_all(req, error); break; default: BUG(); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 2eecb779437..fee9a9e83fc 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -632,7 +632,7 @@ static void gdrom_readdisk_dma(struct work_struct *work) * before handling ending the request */ spin_lock(&gdrom_lock); list_del_init(&req->queuelist); - __blk_end_request(req, err, blk_rq_bytes(req)); + __blk_end_request_all(req, err); } spin_unlock(&gdrom_lock); kfree(read_command); diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index 13929356135..cc3efa096e1 100644 --- 
a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -291,23 +291,6 @@ static int send_request(struct request *req) return 0; } -static void viocd_end_request(struct request *req, int error) -{ - int nsectors = req->hard_nr_sectors; - - /* - * Make sure it's fully ended, and ensure that we process - * at least one sector. - */ - if (blk_pc_request(req)) - nsectors = (req->data_len + 511) >> 9; - if (!nsectors) - nsectors = 1; - - if (__blk_end_request(req, error, nsectors << 9)) - BUG(); -} - static int rwreq; static void do_viocd_request(struct request_queue *q) @@ -316,11 +299,11 @@ static void do_viocd_request(struct request_queue *q) while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) { if (!blk_fs_request(req)) - viocd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); else if (send_request(req) < 0) { printk(VIOCD_KERN_WARNING "unable to send message to OS/400!"); - viocd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); } else rwreq++; } @@ -531,9 +514,9 @@ return_complete: "with rc %d:0x%04X: %s\n", req, event->xRc, bevent->sub_result, err->msg); - viocd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); } else - viocd_end_request(req, 0); + __blk_end_request_all(req, 0); /* restart handling of incoming requests */ spin_unlock_irqrestore(&viocd_reqlock, flags); diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index de143deb06f..a41634699f8 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -826,7 +826,7 @@ static void mspro_block_submit_req(struct request_queue *q) if (msb->eject) { while ((req = elv_next_request(q)) != NULL) - __blk_end_request(req, -ENODEV, blk_rq_bytes(req)); + __blk_end_request_all(req, -ENODEV); return; } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index d1815272c43..fabec95686b 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1613,15 +1613,6 @@ void dasd_block_clear_timer(struct dasd_block *block) del_timer(&block->timer); } -/* - * posts the buffer_cache about a finalized request - */ -static inline void dasd_end_request(struct request *req, int error) -{ - if (__blk_end_request(req, error, blk_rq_bytes(req))) - BUG(); -} - /* * Process finished error recovery ccw. */ @@ -1676,7 +1667,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "Rejecting write request %p", req); blkdev_dequeue_request(req); - dasd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); continue; } cqr = basedev->discipline->build_cp(basedev, block, req); @@ -1705,7 +1696,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "on request %p", PTR_ERR(cqr), req); blkdev_dequeue_request(req); - dasd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); continue; } /* @@ -1731,7 +1722,7 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) status = cqr->block->base->discipline->free_cp(cqr, req); if (status <= 0) error = status ? 
status : -EIO; - dasd_end_request(req, error); + __blk_end_request_all(req, error); } /* @@ -2040,7 +2031,7 @@ static void dasd_flush_request_queue(struct dasd_block *block) spin_lock_irq(&block->request_queue_lock); while ((req = elv_next_request(block->request_queue))) { blkdev_dequeue_request(req); - dasd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); } spin_unlock_irq(&block->request_queue_lock); } diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index f32e89e7c4f..86596d3813b 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -73,13 +73,6 @@ tapeblock_trigger_requeue(struct tape_device *device) /* * Post finished request. */ -static void -tapeblock_end_request(struct request *req, int error) -{ - if (blk_end_request(req, error, blk_rq_bytes(req))) - BUG(); -} - static void __tapeblock_end_request(struct tape_request *ccw_req, void *data) { @@ -90,7 +83,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data) device = ccw_req->device; req = (struct request *) data; - tapeblock_end_request(req, (ccw_req->rc == 0) ? 0 : -EIO); + blk_end_request_all(req, (ccw_req->rc == 0) ? 0 : -EIO); if (ccw_req->rc == 0) /* Update position. */ device->blk_data.block_position = @@ -118,7 +111,7 @@ tapeblock_start_request(struct tape_device *device, struct request *req) ccw_req = device->discipline->bread(device, req); if (IS_ERR(ccw_req)) { DBF_EVENT(1, "TBLOCK: bread failed\n"); - tapeblock_end_request(req, -EIO); + blk_end_request_all(req, -EIO); return PTR_ERR(ccw_req); } ccw_req->callback = __tapeblock_end_request; @@ -131,7 +124,7 @@ tapeblock_start_request(struct tape_device *device, struct request *req) * Start/enqueueing failed. No retries in * this case. */ - tapeblock_end_request(req, -EIO); + blk_end_request_all(req, -EIO); device->discipline->free_bread(ccw_req); } @@ -177,7 +170,7 @@ tapeblock_requeue(struct work_struct *work) { DBF_EVENT(1, "TBLOCK: Rejecting write request\n"); blkdev_dequeue_request(req); spin_unlock_irq(&device->blk_data.request_queue_lock); - tapeblock_end_request(req, -EIO); + blk_end_request_all(req, -EIO); spin_lock_irq(&device->blk_data.request_queue_lock); continue; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d1cb64ad1a3..756ac7c93de 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -922,7 +922,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) if (driver_byte(result) & DRIVER_SENSE) scsi_print_sense("", cmd); } - blk_end_request(req, -EIO, blk_rq_bytes(req)); + blk_end_request_all(req, -EIO); scsi_next_command(cmd); break; case ACTION_REPREP: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 501f6845cc7..e33c8356b3d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -882,6 +882,22 @@ static inline bool blk_end_request(struct request *rq, int error, return blk_end_bidi_request(rq, error, nr_bytes, 0); } +/** + * blk_end_request_all - Helper function for drivers to finish the request. + * @rq: the request to finish + * @error: %0 for success, < %0 for error + * + * Description: + * Completely finish @rq. + */ +static inline void blk_end_request_all(struct request *rq, int error) +{ + bool pending; + + pending = blk_end_request(rq, error, blk_rq_bytes(rq)); + BUG_ON(pending); +} + /** * __blk_end_request - Helper function for drivers to complete the request.
* @rq: the request being processed @@ -901,6 +917,22 @@ static inline bool __blk_end_request(struct request *rq, int error, return __blk_end_bidi_request(rq, error, nr_bytes, 0); } +/** + * __blk_end_request_all - Helper function for drivers to finish the request. + * @rq: the request to finish + * @error: %0 for success, < %0 for error + * + * Description: + * Completely finish @rq. Must be called with queue lock held. + */ +static inline void __blk_end_request_all(struct request *rq, int error) +{ + bool pending; + + pending = __blk_end_request(rq, error, blk_rq_bytes(rq)); + BUG_ON(pending); +} + /** * end_request - end I/O on the current segment of the request * @rq: the request being processed -- cgit v1.2.3-70-g09d2 From f06d9a2b52e246a66b606130cea3f0d7b7be17a7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:19 +0900 Subject: block: replace end_request() with [__]blk_end_request_cur() end_request() has been kept around for backward compatibility; however, it's about time for it to go away. * There aren't too many users left. * Its use of @uptodate is pretty confusing. * In some cases, newer code ends up using mixture of end_request() and [__]blk_end_request[_all](), which is way too confusing. So, add [__]blk_end_request_cur() and replace end_request() with it. Most conversions are straightforward. Noteworthy ones are... * paride/pcd: next_request() updated to take 0/-errno instead of 1/0. * paride/pf: pf_end_request() and next_request() updated to take 0/-errno instead of 1/0. * xd: xd_readwrite() updated to return 0/-errno instead of 1/0. * mtd/mtd_blkdevs: blktrans_discard_request() updated to return 0/-errno instead of 1/0. Unnecessary local variable res initialization removed from mtd_blktrans_thread(). [ Impact: cleanup ] Signed-off-by: Tejun Heo Acked-by: Joerg Dorchain Acked-by: Geert Uytterhoeven Acked-by: Grant Likely Acked-by: Laurent Vivier Cc: Tim Waugh Cc: Stephen Rothwell Cc: Paul Mackerras Cc: Jeremy Fitzhardinge Cc: Markus Lidel Cc: David Woodhouse Cc: Pete Zaitcev Cc: unsik Kim --- drivers/block/amiflop.c | 10 ++++----- drivers/block/ataflop.c | 14 ++++++------- drivers/block/hd.c | 14 ++++++------- drivers/block/mg_disk.c | 16 +++++++------- drivers/block/paride/pcd.c | 12 +++++------ drivers/block/paride/pd.c | 5 +++-- drivers/block/paride/pf.c | 28 ++++++++++++------------- drivers/block/ps3disk.c | 6 +++--- drivers/block/swim.c | 14 ++++++------- drivers/block/swim3.c | 26 +++++++++++------------ drivers/block/xd.c | 15 +++++++------- drivers/block/xen-blkfront.c | 2 +- drivers/block/xsysace.c | 4 ++-- drivers/block/z2ram.c | 4 ++-- drivers/cdrom/gdrom.c | 6 +++--- drivers/message/i2o/i2o_block.c | 2 +- drivers/mtd/mtd_blkdevs.c | 22 ++++++++++---------- drivers/sbus/char/jsflash.c | 8 +++---- include/linux/blkdev.h | 46 ++++++++++++++++++++--------------------- 19 files changed, 128 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 8df436ff706..b99a2a606d0 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1359,7 +1359,7 @@ static void redo_fd_request(void) #endif block = CURRENT->sector + cnt; if ((int)block > floppy->blocks) { - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } @@ -1373,11 +1373,11 @@ static void redo_fd_request(void) if ((rq_data_dir(CURRENT) != READ) && (rq_data_dir(CURRENT) != WRITE)) { printk(KERN_WARNING "do_fd_request: unknown command\n"); - end_request(CURRENT, 0); +
__blk_end_request_cur(CURRENT, -EIO); goto repeat; } if (get_track(drive, track) == -1) { - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } @@ -1391,7 +1391,7 @@ static void redo_fd_request(void) /* keep the drive spinning while writes are scheduled */ if (!fd_motor_on(drive)) { - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } /* @@ -1410,7 +1410,7 @@ static void redo_fd_request(void) CURRENT->nr_sectors -= CURRENT->current_nr_sectors; CURRENT->sector += CURRENT->current_nr_sectors; - end_request(CURRENT, 1); + __blk_end_request_cur(CURRENT, 0); goto repeat; } diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 4234c11c1e4..44a8702136a 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -612,7 +612,7 @@ static void fd_error( void ) CURRENT->errors++; if (CURRENT->errors >= MAX_ERRORS) { printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); } else if (CURRENT->errors == RECALIBRATE_ERRORS) { printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive ); @@ -734,7 +734,7 @@ static void do_fd_action( int drive ) /* all sectors finished */ CURRENT->nr_sectors -= CURRENT->current_nr_sectors; CURRENT->sector += CURRENT->current_nr_sectors; - end_request(CURRENT, 1); + __blk_end_request_cur(CURRENT, 0); redo_fd_request(); return; } @@ -1141,7 +1141,7 @@ static void fd_rwsec_done1(int status) /* all sectors finished */ CURRENT->nr_sectors -= CURRENT->current_nr_sectors; CURRENT->sector += CURRENT->current_nr_sectors; - end_request(CURRENT, 1); + __blk_end_request_cur(CURRENT, 0); redo_fd_request(); } return; @@ -1414,7 +1414,7 @@ repeat: if (!UD.connected) { /* drive not connected */ printk(KERN_ERR "Unknown Device: fd%d\n", drive ); - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } @@ -1430,12 +1430,12 @@ repeat: /* user supplied disk type */ if (--type >= NUM_DISK_MINORS) { printk(KERN_WARNING "fd%d: invalid disk format", drive ); - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } if (minor2disktype[type].drive_types > DriveType) { printk(KERN_WARNING "fd%d: unsupported disk format", drive ); - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } type = minor2disktype[type].index; @@ -1445,7 +1445,7 @@ repeat: } if (CURRENT->sector + 1 > UDT->blocks) { - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } diff --git a/drivers/block/hd.c b/drivers/block/hd.c index baaa9e486e5..5cb300b81c6 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -410,7 +410,7 @@ static void bad_rw_intr(void) if (req != NULL) { struct hd_i_struct *disk = req->rq_disk->private_data; if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); disk->special_op = disk->recalibrate = 1; } else if (req->errors % RESET_FREQ == 0) reset = 1; @@ -466,7 +466,7 @@ ok_to_read: req->buffer+512); #endif if (req->current_nr_sectors <= 0) - end_request(req, 1); + __blk_end_request_cur(req, 0); if (i > 0) { SET_HANDLER(&read_intr); return; @@ -505,7 +505,7 @@ ok_to_write: --req->current_nr_sectors; req->buffer += 512; if (!i || (req->bio && req->current_nr_sectors <= 0)) - end_request(req, 1); + __blk_end_request_cur(req, 0); if (i > 0) { SET_HANDLER(&write_intr); outsw(HD_DATA, req->buffer, 256); @@ -548,7 +548,7 @@ static void hd_times_out(unsigned long dummy) #ifdef DEBUG printk("%s: 
too many errors\n", name); #endif - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); } hd_request(); spin_unlock_irq(hd_queue->queue_lock); @@ -563,7 +563,7 @@ static int do_special_op(struct hd_i_struct *disk, struct request *req) } if (disk->head > 16) { printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } disk->special_op = 0; return 1; @@ -607,7 +607,7 @@ repeat: ((block+nsect) > get_capacity(req->rq_disk))) { printk("%s: bad access: block=%d, count=%d\n", req->rq_disk->disk_name, block, nsect); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); goto repeat; } @@ -647,7 +647,7 @@ repeat: break; default: printk("unknown hd-command\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); break; } } diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index f3898353d0a..408c2bd8a43 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -285,7 +285,7 @@ static void mg_bad_rw_intr(struct mg_host *host) if (req != NULL) if (++req->errors >= MG_MAX_ERRORS || host->error == MG_ERR_TIMEOUT) - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } static unsigned int mg_out(struct mg_host *host, @@ -351,7 +351,7 @@ static void mg_read(struct request *req) if (req->current_nr_sectors <= 0) { MG_DBG("remain : %d sects\n", remains); - end_request(req, 1); + __blk_end_request_cur(req, 0); if (remains > 0) req = elv_next_request(host->breq); } @@ -395,7 +395,7 @@ static void mg_write(struct request *req) if (req->current_nr_sectors <= 0) { MG_DBG("remain : %d sects\n", remains); - end_request(req, 1); + __blk_end_request_cur(req, 0); if (remains > 0) req = elv_next_request(host->breq); } @@ -448,7 +448,7 @@ ok_to_read: /* let know if current segment done */ if (req->current_nr_sectors <= 0) - end_request(req, 1); + __blk_end_request_cur(req, 0); /* set handler if read remains */ if (i > 0) { @@ -497,7 +497,7 @@ ok_to_write: /* let know if current segment or all done */ if (!i || (req->bio && req->current_nr_sectors <= 0)) - end_request(req, 1); + __blk_end_request_cur(req, 0); /* write 1 sector and set handler if remains */ if (i > 0) { @@ -563,7 +563,7 @@ static void mg_request_poll(struct request_queue *q) default: printk(KERN_WARNING "%s:%d unknown command\n", __func__, __LINE__); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); break; } } @@ -617,7 +617,7 @@ static unsigned int mg_issue_req(struct request *req, default: printk(KERN_WARNING "%s:%d unknown command\n", __func__, __LINE__); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); break; } return MG_ERR_NONE; @@ -655,7 +655,7 @@ static void mg_request(struct request_queue *q) "%s: bad access: sector=%d, count=%d\n", req->rq_disk->disk_name, sect_num, sect_cnt); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index e91d4b4b014..9fd57c2aa46 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -735,16 +735,16 @@ static void do_pcd_request(struct request_queue * q) ps_set_intr(do_pcd_read, NULL, 0, nice); return; } else - end_request(pcd_req, 0); + __blk_end_request_cur(pcd_req, -EIO); } } -static inline void next_request(int success) +static inline void next_request(int err) { unsigned long saved_flags; spin_lock_irqsave(&pcd_lock, saved_flags); - end_request(pcd_req, success); + __blk_end_request_cur(pcd_req, err); pcd_busy = 0; 
do_pcd_request(pcd_queue); spin_unlock_irqrestore(&pcd_lock, saved_flags); @@ -781,7 +781,7 @@ static void pcd_start(void) if (pcd_command(pcd_current, rd_cmd, 2048, "read block")) { pcd_bufblk = -1; - next_request(0); + next_request(-EIO); return; } @@ -796,7 +796,7 @@ static void do_pcd_read(void) pcd_retries = 0; pcd_transfer(); if (!pcd_count) { - next_request(1); + next_request(0); return; } @@ -815,7 +815,7 @@ static void do_pcd_read_drq(void) return; } pcd_bufblk = -1; - next_request(0); + next_request(-EIO); return; } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 9299455b0af..0732df4e901 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -410,7 +410,8 @@ static void run_fsm(void) pd_claimed = 0; phase = NULL; spin_lock_irqsave(&pd_lock, saved_flags); - end_request(pd_req, res); + __blk_end_request_cur(pd_req, + res == Ok ? 0 : -EIO); pd_req = elv_next_request(pd_queue); if (!pd_req) stop = 1; @@ -477,7 +478,7 @@ static int pd_next_buf(void) if (pd_count) return 0; spin_lock_irqsave(&pd_lock, saved_flags); - end_request(pd_req, 1); + __blk_end_request_cur(pd_req, 0); pd_count = pd_req->current_nr_sectors; pd_buf = pd_req->buffer; spin_unlock_irqrestore(&pd_lock, saved_flags); diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index bef3b997ba3..3871e3586d6 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -750,10 +750,10 @@ static int pf_ready(void) static struct request_queue *pf_queue; -static void pf_end_request(int uptodate) +static void pf_end_request(int err) { if (pf_req) { - end_request(pf_req, uptodate); + __blk_end_request_cur(pf_req, err); pf_req = NULL; } } @@ -773,7 +773,7 @@ repeat: pf_count = pf_req->current_nr_sectors; if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { - pf_end_request(0); + pf_end_request(-EIO); goto repeat; } @@ -788,7 +788,7 @@ repeat: pi_do_claimed(pf_current->pi, do_pf_write); else { pf_busy = 0; - pf_end_request(0); + pf_end_request(-EIO); goto repeat; } } @@ -805,7 +805,7 @@ static int pf_next_buf(void) return 1; if (!pf_count) { spin_lock_irqsave(&pf_spin_lock, saved_flags); - pf_end_request(1); + pf_end_request(0); pf_req = elv_next_request(pf_queue); spin_unlock_irqrestore(&pf_spin_lock, saved_flags); if (!pf_req) @@ -816,12 +816,12 @@ static int pf_next_buf(void) return 0; } -static inline void next_request(int success) +static inline void next_request(int err) { unsigned long saved_flags; spin_lock_irqsave(&pf_spin_lock, saved_flags); - pf_end_request(success); + pf_end_request(err); pf_busy = 0; do_pf_request(pf_queue); spin_unlock_irqrestore(&pf_spin_lock, saved_flags); @@ -844,7 +844,7 @@ static void do_pf_read_start(void) pi_do_claimed(pf_current->pi, do_pf_read_start); return; } - next_request(0); + next_request(-EIO); return; } pf_mask = STAT_DRQ; @@ -863,7 +863,7 @@ static void do_pf_read_drq(void) pi_do_claimed(pf_current->pi, do_pf_read_start); return; } - next_request(0); + next_request(-EIO); return; } pi_read_block(pf_current->pi, pf_buf, 512); @@ -871,7 +871,7 @@ static void do_pf_read_drq(void) break; } pi_disconnect(pf_current->pi); - next_request(1); + next_request(0); } static void do_pf_write(void) @@ -890,7 +890,7 @@ static void do_pf_write_start(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(0); + next_request(-EIO); return; } @@ -903,7 +903,7 @@ static void do_pf_write_start(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(0); + next_request(-EIO); return; 
} pi_write_block(pf_current->pi, pf_buf, 512); @@ -923,11 +923,11 @@ static void do_pf_write_done(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(0); + next_request(-EIO); return; } pi_disconnect(pf_current->pi); - next_request(1); + next_request(0); } static int __init pf_init(void) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index bccc42bb921..d23b54bc2f5 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -158,7 +158,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__, __LINE__, op, res); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); return 0; } @@ -180,7 +180,7 @@ static int ps3disk_submit_flush_request(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n", __func__, __LINE__, res); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); return 0; } @@ -205,7 +205,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, break; } else { blk_dump_rq_flags(req, DEVICE_NAME " bad request"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } } diff --git a/drivers/block/swim.c b/drivers/block/swim.c index d22cc385693..6544a7b06bf 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -532,39 +532,39 @@ static void redo_fd_request(struct request_queue *q) fs = req->rq_disk->private_data; if (req->sector < 0 || req->sector >= fs->total_secs) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (req->current_nr_sectors == 0) { - end_request(req, 1); + __blk_end_request_cur(req, 0); continue; } if (!fs->disk_in) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (rq_data_dir(req) == WRITE) { if (fs->write_protected) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } } switch (rq_data_dir(req)) { case WRITE: /* NOT IMPLEMENTED */ - end_request(req, 0); + __blk_end_request_cur(req, -EIO); break; case READ: if (floppy_read_sectors(fs, req->sector, req->current_nr_sectors, req->buffer)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } req->nr_sectors -= req->current_nr_sectors; req->sector += req->current_nr_sectors; req->buffer += req->current_nr_sectors * 512; - end_request(req, 1); + __blk_end_request_cur(req, 0); break; } } diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 612965307ba..5904f7b73c6 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -320,15 +320,15 @@ static void start_request(struct floppy_state *fs) #endif if (req->sector < 0 || req->sector >= fs->total_secs) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (req->current_nr_sectors == 0) { - end_request(req, 1); + __blk_end_request_cur(req, 0); continue; } if (fs->ejected) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } @@ -336,7 +336,7 @@ static void start_request(struct floppy_state *fs) if (fs->write_prot < 0) fs->write_prot = swim3_readbit(fs, WRITE_PROT); if (fs->write_prot) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } } @@ -508,7 +508,7 @@ static void act(struct floppy_state *fs) case do_transfer: if (fs->cur_cyl != fs->req_cyl) { if (fs->retries > 5) { - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; return; } @@ -540,7 +540,7 @@ static void scan_timeout(unsigned long data) out_8(&sw->intr_enable, 0); fs->cur_cyl = -1; if 
(fs->retries > 5) { - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } else { @@ -559,7 +559,7 @@ static void seek_timeout(unsigned long data) out_8(&sw->select, RELAX); out_8(&sw->intr_enable, 0); printk(KERN_ERR "swim3: seek timeout\n"); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } @@ -583,7 +583,7 @@ static void settle_timeout(unsigned long data) return; } printk(KERN_ERR "swim3: seek settle timeout\n"); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } @@ -615,7 +615,7 @@ static void xfer_timeout(unsigned long data) fd_req->current_nr_sectors -= s; printk(KERN_ERR "swim3: timeout %sing sector %ld\n", (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } @@ -646,7 +646,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) printk(KERN_ERR "swim3: seen sector but cyl=ff?\n"); fs->cur_cyl = -1; if (fs->retries > 5) { - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } else { @@ -731,7 +731,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) printk("swim3: error %sing block %ld (err=%x)\n", rq_data_dir(fd_req) == WRITE? "writ": "read", (long)fd_req->sector, err); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; } } else { @@ -740,7 +740,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid); printk(KERN_ERR " state=%d, dir=%x, intr=%x, err=%x\n", fs->state, rq_data_dir(fd_req), intr, err); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); break; @@ -749,7 +749,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) fd_req->current_nr_sectors -= fs->scount; fd_req->buffer += fs->scount * 512; if (fd_req->current_nr_sectors <= 0) { - end_request(fd_req, 1); + __blk_end_request_cur(fd_req, 0); fs->state = idle; } else { fs->req_sector += fs->scount; diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 64b496fce98..6f6ad82ec0c 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -314,21 +314,22 @@ static void do_xd_request (struct request_queue * q) int retry; if (!blk_fs_request(req)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (block + count > get_capacity(req->rq_disk)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (rw != READ && rw != WRITE) { printk("do_xd_request: unknown request\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } for (retry = 0; (retry < XD_RETRIES) && !res; retry++) res = xd_readwrite(rw, disk, req->buffer, block, count); - end_request(req, res); /* wrap up, 0 = fail, 1 = success */ + /* wrap up, 0 = success, -errno = fail */ + __blk_end_request_cur(req, res); } } @@ -418,7 +419,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_ printk("xd%c: %s timeout, recalibrating drive\n",'a'+drive,(operation == READ ? "read" : "write")); xd_recalibrate(drive); spin_lock_irq(&xd_lock); - return (0); + return -EIO; case 2: if (sense[0] & 0x30) { printk("xd%c: %s - ",'a'+drive,(operation == READ ? 
"reading" : "writing")); @@ -439,7 +440,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_ else printk(" - no valid disk address\n"); spin_lock_irq(&xd_lock); - return (0); + return -EIO; } if (xd_dma_buffer) for (i=0; i < (temp * 0x200); i++) @@ -448,7 +449,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_ count -= temp, buffer += temp * 0x200, block += temp; } spin_lock_irq(&xd_lock); - return (1); + return 0; } /* xd_recalibrate: recalibrate a given drive and reset controller if necessary */ diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index cd6cfe3b51e..b4564479f64 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -302,7 +302,7 @@ static void do_blkif_request(struct request_queue *rq) while ((req = elv_next_request(rq)) != NULL) { info = req->rq_disk->private_data; if (!blk_fs_request(req)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 4aecf5dc6a9..b1e1d7e5ab1 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -466,7 +466,7 @@ struct request *ace_get_next_request(struct request_queue * q) while ((req = elv_next_request(q)) != NULL) { if (blk_fs_request(req)) break; - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } return req; } @@ -494,7 +494,7 @@ static void ace_fsm_dostate(struct ace_device *ace) /* Drop all pending requests */ while ((req = elv_next_request(ace->queue)) != NULL) - end_request(req, 0); + __blk_end_request_cur(req, -EIO); /* Drop back to IDLE state and notify waiters */ ace->fsm_state = ACE_FSM_STATE_IDLE; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 80754cdd311..b66ad58a3c3 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -77,7 +77,7 @@ static void do_z2_request(struct request_queue *q) if (start + len > z2ram_size) { printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n", req->sector, req->current_nr_sectors); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } while (len) { @@ -93,7 +93,7 @@ static void do_z2_request(struct request_queue *q) start += size; len -= size; } - end_request(req, 1); + __blk_end_request_cur(req, 0); } } diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index fee9a9e83fc..cab2b1fb2fe 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -654,17 +654,17 @@ static void gdrom_request(struct request_queue *rq) while ((req = elv_next_request(rq)) != NULL) { if (!blk_fs_request(req)) { printk(KERN_DEBUG "GDROM: Non-fs request ignored\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } if (rq_data_dir(req) != READ) { printk(KERN_NOTICE "GDROM: Read only device -"); printk(" write request ignored\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } if (req->nr_sectors) gdrom_request_handler_dma(req); else - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } } diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index a443e136dc4..221317e6a00 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -923,7 +923,7 @@ static void i2o_block_request_fn(struct request_queue *q) break; } } else - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } }; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index a49a9c8f2cb..76c4c8d1307 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ 
-54,33 +54,33 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, if (req->cmd_type == REQ_TYPE_LINUX_BLOCK && req->cmd[0] == REQ_LB_OP_DISCARD) - return !tr->discard(dev, block, nsect); + return tr->discard(dev, block, nsect); if (!blk_fs_request(req)) - return 0; + return -EIO; if (req->sector + req->current_nr_sectors > get_capacity(req->rq_disk)) - return 0; + return -EIO; switch(rq_data_dir(req)) { case READ: for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->readsect(dev, block, buf)) - return 0; - return 1; + return -EIO; + return 0; case WRITE: if (!tr->writesect) - return 0; + return -EIO; for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->writesect(dev, block, buf)) - return 0; - return 1; + return -EIO; + return 0; default: printk(KERN_NOTICE "Unknown request %u\n", rq_data_dir(req)); - return 0; + return -EIO; } } @@ -96,7 +96,7 @@ static int mtd_blktrans_thread(void *arg) while (!kthread_should_stop()) { struct request *req; struct mtd_blktrans_dev *dev; - int res = 0; + int res; req = elv_next_request(rq); @@ -119,7 +119,7 @@ static int mtd_blktrans_thread(void *arg) spin_lock_irq(rq->queue_lock); - end_request(req, res); + __blk_end_request_cur(req, res); } spin_unlock_irq(rq->queue_lock); diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index a85ad05e854..09617884a50 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -192,25 +192,25 @@ static void jsfd_do_request(struct request_queue *q) size_t len = req->current_nr_sectors << 9; if ((offset + len) > jdp->dsize) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (rq_data_dir(req) != READ) { printk(KERN_ERR "jsfd: write\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if ((jdp->dbase & 0xff000000) != 0x20000000) { printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } jsfd_read(req->buffer, jdp->dbase + offset, len); - end_request(req, 1); + __blk_end_request_cur(req, 0); } } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e33c8356b3d..cfeb3c2feb2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -845,9 +845,8 @@ extern unsigned int blk_rq_cur_bytes(struct request *rq); * blk_update_request() completes given number of bytes and updates * the request without completing it. * - * blk_end_request() and friends. __blk_end_request() and - * end_request() must be called with the request queue spinlock - * acquired. + * blk_end_request() and friends. __blk_end_request() must be called + * with the request queue spinlock acquired. * * Several drivers define their own end_request and call * blk_end_request() for parts of the original function. @@ -898,6 +897,19 @@ static inline void blk_end_request_all(struct request *rq, int error) BUG_ON(pending); } +/** + * blk_end_request_cur - Helper function to finish the current request chunk. + * @rq: the request to finish the current chunk for + * @error: %0 for success, < %0 for error + * + * Description: + * Complete the current consecutively mapped chunk from @rq. + */ +static inline void blk_end_request_cur(struct request *rq, int error) +{ + blk_end_request(rq, error, rq->hard_cur_sectors << 9); +} + /** * __blk_end_request - Helper function for drivers to complete the request.
* @rq: the request being processed @@ -934,29 +946,17 @@ static inline void __blk_end_request_all(struct request *rq, int error) } /** - * end_request - end I/O on the current segment of the request - * @rq: the request being processed - * @uptodate: error value or %0/%1 uptodate flag + * __blk_end_request_cur - Helper function to finish the current request chunk. + * @rq: the request to finish the current chunk for + * @error: %0 for success, < %0 for error * * Description: - * Ends I/O on the current segment of a request. If that is the only - * remaining segment, the request is also completed and freed. - * - * This is a remnant of how older block drivers handled I/O completions. - * Modern drivers typically end I/O on the full request in one go, unless - * they have a residual value to account for. For that case this function - * isn't really useful, unless the residual just happens to be the - * full current segment. In other words, don't use this function in new - * code. Use blk_end_request() or __blk_end_request() to end a request. - **/ -static inline void end_request(struct request *rq, int uptodate) + * Complete the current consecutively mapped chunk from @rq. Must + * be called with queue lock held. + */ +static inline void __blk_end_request_cur(struct request *rq, int error) { - int error = 0; - - if (uptodate <= 0) - error = uptodate ? uptodate : -EIO; - - __blk_end_bidi_request(rq, error, rq->hard_cur_sectors << 9, 0); + __blk_end_request(rq, error, rq->hard_cur_sectors << 9); } extern void blk_complete_request(struct request *); -- cgit v1.2.3-70-g09d2 From 731ec497e5888c6792ad62613ae9be97eebcd7ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:20 +0900 Subject: block: kill rq->data Now that all block request data transfer is done via bio, rq->data isn't used. Kill it. While at it, make the roles of rq->special and buffer clear.
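To make the two surviving fields concrete, here is a minimal sketch of typical LLD usage after this patch; struct my_cmd, my_prep_fn(), my_copy_cur_sector() and my_hw_buf are invented for the example and appear nowhere in the tree:

    #include <linux/blkdev.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    /* hypothetical per-request driver state, stashed in rq->special */
    struct my_cmd {
            int retries;
    };

    static char my_hw_buf[512];     /* stand-in for a device data port */

    static int my_prep_fn(struct request_queue *q, struct request *rq)
    {
            /* rq->special: opaque pointer reserved for the LLD */
            rq->special = kzalloc(sizeof(struct my_cmd), GFP_ATOMIC);
            return rq->special ? BLKPREP_OK : BLKPREP_DEFER;
    }

    static void my_copy_cur_sector(struct request *rq)
    {
            struct my_cmd *cmd = rq->special;

            /* rq->buffer: kernel address of the current segment */
            memcpy(my_hw_buf, rq->buffer, 512);
            cmd->retries = 0;
    }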
[ Impact: drop now unnecessary field from struct request ] Signed-off-by: Tejun Heo Cc: Boaz Harrosh --- block/blk-core.c | 5 ++--- block/blk-map.c | 6 +++--- block/scsi_ioctl.c | 1 - drivers/scsi/scsi_lib.c | 1 - include/linux/blkdev.h | 5 ++--- 5 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 0520cc70458..6dd180cf15d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -189,10 +189,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) (unsigned long long)rq->sector, rq->nr_sectors, rq->current_nr_sectors); - printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n", + printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", rq->bio, rq->biotail, - rq->buffer, rq->data, - rq->data_len); + rq->buffer, rq->data_len); if (blk_pc_request(rq)) { printk(KERN_INFO " cdb: "); diff --git a/block/blk-map.c b/block/blk-map.c index f103729b462..694fefad34e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -156,7 +156,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - rq->buffer = rq->data = NULL; + rq->buffer = NULL; return 0; unmap_rq: blk_rq_unmap_user(bio); @@ -235,7 +235,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, blk_queue_bounce(q, &bio); bio_get(bio); blk_rq_bio_prep(q, rq, bio); - rq->buffer = rq->data = NULL; + rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_user_iov); @@ -313,7 +313,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, blk_rq_bio_prep(q, rq, bio); blk_queue_bounce(q, &rq->bio); - rq->buffer = rq->data = NULL; + rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 82a0ca2f672..bb9aa43551b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -500,7 +500,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, WRITE, __GFP_WAIT); rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->data = NULL; rq->data_len = 0; rq->extra_len = 0; rq->timeout = BLK_DEFAULT_SG_TIMEOUT; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 756ac7c93de..aa9fc572e45 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1088,7 +1088,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req) return ret; } else { BUG_ON(req->data_len); - BUG_ON(req->data); memset(&cmd->sdb, 0, sizeof(cmd->sdb)); req->buffer = NULL; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cfeb3c2feb2..12c545e2737 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -211,8 +211,8 @@ struct request { unsigned short ioprio; - void *special; - char *buffer; + void *special; /* opaque pointer available for LLD use */ + char *buffer; /* kaddr of the current segment if available */ int tag; int errors; @@ -229,7 +229,6 @@ struct request { unsigned int data_len; unsigned int extra_len; /* length of alignment and padding */ unsigned int sense_len; - void *data; void *sense; unsigned long deadline; -- cgit v1.2.3-70-g09d2 From 9fd8d0e1bcb848257968d9a7d73ca4d890ea8bd1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:04 +0900 Subject: block: make blk_end_request_cur() return bool In the process of mindlessly copying [__]blk_end_request_all(), [__]blk_end_request_cur() ended up returning void even though they're partial completion functions. Fix it.
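As an aside, a minimal sketch of what the fixed signature allows; my_complete_chunk() and my_start_next() are hypothetical driver helpers, not code from this patch:

    #include <linux/blkdev.h>

    static void my_start_next(void)
    {
            /* assumed: kick the queue and dispatch the next request */
    }

    /* caller holds the queue lock */
    static void my_complete_chunk(struct request *rq, int error)
    {
            if (!__blk_end_request_cur(rq, error)) {
                    /* %false: @rq is fully completed, move on */
                    my_start_next();
                    return;
            }
            /* %true: buffers still pending, wait for the next interrupt */
    }

Unlike [__]blk_end_request_all(), which BUG()s if anything is left pending, the _cur variants complete one chunk at a time, so callers genuinely need the return value.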
[ Impact: fix braindead API ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12c545e2737..3a5b1bd6582 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -903,10 +903,14 @@ static inline void blk_end_request_all(struct request *rq, int error) * * Description: * Complete the current consecutively mapped chunk from @rq. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request */ -static inline void blk_end_request_cur(struct request *rq, int error) +static inline bool blk_end_request_cur(struct request *rq, int error) { - blk_end_request(rq, error, rq->hard_cur_sectors << 9); + return blk_end_request(rq, error, rq->hard_cur_sectors << 9); } /** @@ -952,10 +956,14 @@ static inline void __blk_end_request_all(struct request *rq, int error) * Description: * Complete the current consecutively mapped chunk from @rq. Must * be called with queue lock held. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request */ -static inline void __blk_end_request_cur(struct request *rq, int error) +static inline bool __blk_end_request_cur(struct request *rq, int error) { - __blk_end_request(rq, error, rq->hard_cur_sectors << 9); + return __blk_end_request(rq, error, rq->hard_cur_sectors << 9); } extern void blk_complete_request(struct request *); -- cgit v1.2.3-70-g09d2 From eec9462088a26c046d4db3100796a340a50890b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:14 +0900 Subject: mg_disk: fold mg_disk.h into mg_disk.c include/linux/mg_disk.h is used only by drivers/block/mg_disk.c. No reason to put it in a separate header. Fold it into mg_disk.c. 
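Schematically, the fold is the usual treatment for a single-user header; the foo names below are invented to show the pattern and are not from the patch:

    /* was include/linux/foo.h, included only by drivers/block/foo.c; */
    /* now the same definitions sit at the top of drivers/block/foo.c */
    #include <linux/blkdev.h>
    #include <linux/spinlock.h>

    #define FOO_MAX_SECTS 256       /* private constant, no external users */

    struct foo_host {               /* private type, so no shared header */
            struct request_queue *breq;
            spinlock_t lock;
    };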
[ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: unsik Kim Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 186 ++++++++++++++++++++++++++++++++++++++++++- include/linux/mg_disk.h | 206 ------------------------------------------------ 2 files changed, 185 insertions(+), 207 deletions(-) delete mode 100644 include/linux/mg_disk.h (limited to 'include/linux') diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 2c00ad90cd6..2c6127ee834 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -22,10 +22,194 @@ #include #include #include -#include #define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) +/* name for block device */ +#define MG_DISK_NAME "mgd" +/* name for platform device */ +#define MG_DEV_NAME "mg_disk" + +#define MG_DISK_MAJ 0 +#define MG_DISK_MAX_PART 16 +#define MG_SECTOR_SIZE 512 +#define MG_MAX_SECTS 256 + +/* Register offsets */ +#define MG_BUFF_OFFSET 0x8000 +#define MG_STORAGE_BUFFER_SIZE 0x200 +#define MG_REG_OFFSET 0xC000 +#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ +#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ +#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4) +#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6) +#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8) +#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA) +#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC) +#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */ +#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */ +#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10) +#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12) + +/* "Drive Select/Head Register" bit values */ +#define MG_REG_HEAD_MUST_BE_ON 0xA0 /* These 2 bits are always on */ +#define MG_REG_HEAD_DRIVE_MASTER (0x00 | MG_REG_HEAD_MUST_BE_ON) +#define MG_REG_HEAD_DRIVE_SLAVE (0x10 | MG_REG_HEAD_MUST_BE_ON) +#define MG_REG_HEAD_LBA_MODE (0x40 | MG_REG_HEAD_MUST_BE_ON) + + +/* "Device Control Register" bit values */ +#define MG_REG_CTRL_INTR_ENABLE 0x0 +#define MG_REG_CTRL_INTR_DISABLE (0x1<<1) +#define MG_REG_CTRL_RESET (0x1<<2) +#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH 0x0 +#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW (0x1<<4) +#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW 0x0 +#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH (0x1<<5) +#define MG_REG_CTRL_DPD_DISABLE 0x0 +#define MG_REG_CTRL_DPD_ENABLE (0x1<<6) + +/* Status register bit */ +/* error bit in status register */ +#define MG_REG_STATUS_BIT_ERROR 0x01 +/* corrected error in status register */ +#define MG_REG_STATUS_BIT_CORRECTED_ERROR 0x04 +/* data request bit in status register */ +#define MG_REG_STATUS_BIT_DATA_REQ 0x08 +/* DSC - Drive Seek Complete */ +#define MG_REG_STATUS_BIT_SEEK_DONE 0x10 +/* DWF - Drive Write Fault */ +#define MG_REG_STATUS_BIT_WRITE_FAULT 0x20 +#define MG_REG_STATUS_BIT_READY 0x40 +#define MG_REG_STATUS_BIT_BUSY 0x80 + +/* handy status */ +#define MG_STAT_READY (MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE) +#define MG_READY_OK(s) (((s) & (MG_STAT_READY | \ + (MG_REG_STATUS_BIT_BUSY | \ + MG_REG_STATUS_BIT_WRITE_FAULT | \ + MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY) + +/* Error register */ +#define MG_REG_ERR_AMNF 0x01 +#define MG_REG_ERR_ABRT 0x04 +#define MG_REG_ERR_IDNF 0x10 +#define MG_REG_ERR_UNC 0x40 +#define MG_REG_ERR_BBK 0x80 + +/* error code for others */ +#define MG_ERR_NONE 0 +#define MG_ERR_TIMEOUT 0x100 +#define MG_ERR_INIT_STAT 0x101 +#define MG_ERR_TRANSLATION 0x102 +#define MG_ERR_CTRL_RST 0x103 +#define MG_ERR_INV_STAT 0x104 +#define MG_ERR_RSTOUT 0x105 + +#define MG_MAX_ERRORS 6 /* Max read/write errors */ + +/* command */ 
+#define MG_CMD_RD 0x20 +#define MG_CMD_WR 0x30 +#define MG_CMD_SLEEP 0x99 +#define MG_CMD_WAKEUP 0xC3 +#define MG_CMD_ID 0xEC +#define MG_CMD_WR_CONF 0x3C +#define MG_CMD_RD_CONF 0x40 + +/* operation mode */ +#define MG_OP_CASCADE (1 << 0) +#define MG_OP_CASCADE_SYNC_RD (1 << 1) +#define MG_OP_CASCADE_SYNC_WR (1 << 2) +#define MG_OP_INTERLEAVE (1 << 3) + +/* synchronous */ +#define MG_BURST_LAT_4 (3 << 4) +#define MG_BURST_LAT_5 (4 << 4) +#define MG_BURST_LAT_6 (5 << 4) +#define MG_BURST_LAT_7 (6 << 4) +#define MG_BURST_LAT_8 (7 << 4) +#define MG_BURST_LEN_4 (1 << 1) +#define MG_BURST_LEN_8 (2 << 1) +#define MG_BURST_LEN_16 (3 << 1) +#define MG_BURST_LEN_32 (4 << 1) +#define MG_BURST_LEN_CONT (0 << 1) + +/* timeout value (unit: ms) */ +#define MG_TMAX_CONF_TO_CMD 1 +#define MG_TMAX_WAIT_RD_DRQ 10 +#define MG_TMAX_WAIT_WR_DRQ 500 +#define MG_TMAX_RST_TO_BUSY 10 +#define MG_TMAX_HDRST_TO_RDY 500 +#define MG_TMAX_SWRST_TO_RDY 500 +#define MG_TMAX_RSTOUT 3000 + +/* device attribution */ +/* use mflash as boot device */ +#define MG_BOOT_DEV (1 << 0) +/* use mflash as storage device */ +#define MG_STORAGE_DEV (1 << 1) +/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ +#define MG_STORAGE_DEV_SKIP_RST (1 << 2) + +#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST) + +/* names of GPIO resource */ +#define MG_RST_PIN "mg_rst" +/* except MG_BOOT_DEV, reset-out pin should be assigned */ +#define MG_RSTOUT_PIN "mg_rstout" + +/* private driver data */ +struct mg_drv_data { + /* disk resource */ + u32 use_polling; + + /* device attribution */ + u32 dev_attr; + + /* internally used */ + struct mg_host *host; +}; + +/* main structure for mflash driver */ +struct mg_host { + struct device *dev; + + struct request_queue *breq; + spinlock_t lock; + struct gendisk *gd; + + struct timer_list timer; + void (*mg_do_intr) (struct mg_host *); + + u16 id[ATA_ID_WORDS]; + + u16 cyls; + u16 heads; + u16 sectors; + u32 n_sectors; + u32 nres_sectors; + + void __iomem *dev_base; + unsigned int irq; + unsigned int rst; + unsigned int rstout; + + u32 major; + u32 error; +}; + +/* + * Debugging macro and defines + */ +#undef DO_MG_DEBUG +#ifdef DO_MG_DEBUG +# define MG_DBG(fmt, args...) \ + printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args) +#else /* CONFIG_MG_DEBUG */ +# define MG_DBG(fmt, args...) do { } while (0) +#endif /* CONFIG_MG_DEBUG */ + static void mg_request(struct request_queue *); static void mg_dump_status(const char *msg, unsigned int stat, diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h deleted file mode 100644 index 1f76b1ebf62..00000000000 --- a/include/linux/mg_disk.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * include/linux/mg_disk.c - * - * Support for the mGine m[g]flash IO mode. - * Based on legacy hd.c - * - * (c) 2008 mGine Co.,LTD - * (c) 2008 unsik Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#ifndef __MG_DISK_H__ -#define __MG_DISK_H__ - -#include -#include - -/* name for block device */ -#define MG_DISK_NAME "mgd" -/* name for platform device */ -#define MG_DEV_NAME "mg_disk" - -#define MG_DISK_MAJ 0 -#define MG_DISK_MAX_PART 16 -#define MG_SECTOR_SIZE 512 -#define MG_MAX_SECTS 256 - -/* Register offsets */ -#define MG_BUFF_OFFSET 0x8000 -#define MG_STORAGE_BUFFER_SIZE 0x200 -#define MG_REG_OFFSET 0xC000 -#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ -#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ -#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4) -#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6) -#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8) -#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA) -#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC) -#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */ -#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */ -#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10) -#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12) - -/* "Drive Select/Head Register" bit values */ -#define MG_REG_HEAD_MUST_BE_ON 0xA0 /* These 2 bits are always on */ -#define MG_REG_HEAD_DRIVE_MASTER (0x00 | MG_REG_HEAD_MUST_BE_ON) -#define MG_REG_HEAD_DRIVE_SLAVE (0x10 | MG_REG_HEAD_MUST_BE_ON) -#define MG_REG_HEAD_LBA_MODE (0x40 | MG_REG_HEAD_MUST_BE_ON) - - -/* "Device Control Register" bit values */ -#define MG_REG_CTRL_INTR_ENABLE 0x0 -#define MG_REG_CTRL_INTR_DISABLE (0x1<<1) -#define MG_REG_CTRL_RESET (0x1<<2) -#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH 0x0 -#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW (0x1<<4) -#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW 0x0 -#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH (0x1<<5) -#define MG_REG_CTRL_DPD_DISABLE 0x0 -#define MG_REG_CTRL_DPD_ENABLE (0x1<<6) - -/* Status register bit */ -/* error bit in status register */ -#define MG_REG_STATUS_BIT_ERROR 0x01 -/* corrected error in status register */ -#define MG_REG_STATUS_BIT_CORRECTED_ERROR 0x04 -/* data request bit in status register */ -#define MG_REG_STATUS_BIT_DATA_REQ 0x08 -/* DSC - Drive Seek Complete */ -#define MG_REG_STATUS_BIT_SEEK_DONE 0x10 -/* DWF - Drive Write Fault */ -#define MG_REG_STATUS_BIT_WRITE_FAULT 0x20 -#define MG_REG_STATUS_BIT_READY 0x40 -#define MG_REG_STATUS_BIT_BUSY 0x80 - -/* handy status */ -#define MG_STAT_READY (MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE) -#define MG_READY_OK(s) (((s) & (MG_STAT_READY | \ - (MG_REG_STATUS_BIT_BUSY | \ - MG_REG_STATUS_BIT_WRITE_FAULT | \ - MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY) - -/* Error register */ -#define MG_REG_ERR_AMNF 0x01 -#define MG_REG_ERR_ABRT 0x04 -#define MG_REG_ERR_IDNF 0x10 -#define MG_REG_ERR_UNC 0x40 -#define MG_REG_ERR_BBK 0x80 - -/* error code for others */ -#define MG_ERR_NONE 0 -#define MG_ERR_TIMEOUT 0x100 -#define MG_ERR_INIT_STAT 0x101 -#define MG_ERR_TRANSLATION 0x102 -#define MG_ERR_CTRL_RST 0x103 -#define MG_ERR_INV_STAT 0x104 -#define MG_ERR_RSTOUT 0x105 - -#define MG_MAX_ERRORS 6 /* Max read/write errors */ - -/* command */ -#define MG_CMD_RD 0x20 -#define MG_CMD_WR 0x30 -#define MG_CMD_SLEEP 0x99 -#define MG_CMD_WAKEUP 0xC3 -#define MG_CMD_ID 0xEC -#define MG_CMD_WR_CONF 0x3C -#define MG_CMD_RD_CONF 0x40 - -/* operation mode */ -#define MG_OP_CASCADE (1 << 0) -#define MG_OP_CASCADE_SYNC_RD (1 << 1) -#define MG_OP_CASCADE_SYNC_WR (1 << 2) -#define MG_OP_INTERLEAVE (1 << 3) - -/* synchronous */ -#define MG_BURST_LAT_4 (3 << 4) -#define MG_BURST_LAT_5 (4 << 4) -#define MG_BURST_LAT_6 (5 << 4) -#define MG_BURST_LAT_7 (6 << 4) -#define MG_BURST_LAT_8 (7 << 4) -#define 
MG_BURST_LEN_4 (1 << 1) -#define MG_BURST_LEN_8 (2 << 1) -#define MG_BURST_LEN_16 (3 << 1) -#define MG_BURST_LEN_32 (4 << 1) -#define MG_BURST_LEN_CONT (0 << 1) - -/* timeout value (unit: ms) */ -#define MG_TMAX_CONF_TO_CMD 1 -#define MG_TMAX_WAIT_RD_DRQ 10 -#define MG_TMAX_WAIT_WR_DRQ 500 -#define MG_TMAX_RST_TO_BUSY 10 -#define MG_TMAX_HDRST_TO_RDY 500 -#define MG_TMAX_SWRST_TO_RDY 500 -#define MG_TMAX_RSTOUT 3000 - -/* device attribution */ -/* use mflash as boot device */ -#define MG_BOOT_DEV (1 << 0) -/* use mflash as storage device */ -#define MG_STORAGE_DEV (1 << 1) -/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ -#define MG_STORAGE_DEV_SKIP_RST (1 << 2) - -#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST) - -/* names of GPIO resource */ -#define MG_RST_PIN "mg_rst" -/* except MG_BOOT_DEV, reset-out pin should be assigned */ -#define MG_RSTOUT_PIN "mg_rstout" - -/* private driver data */ -struct mg_drv_data { - /* disk resource */ - u32 use_polling; - - /* device attribution */ - u32 dev_attr; - - /* internally used */ - struct mg_host *host; -}; - -/* main structure for mflash driver */ -struct mg_host { - struct device *dev; - - struct request_queue *breq; - spinlock_t lock; - struct gendisk *gd; - - struct timer_list timer; - void (*mg_do_intr) (struct mg_host *); - - u16 id[ATA_ID_WORDS]; - - u16 cyls; - u16 heads; - u16 sectors; - u32 n_sectors; - u32 nres_sectors; - - void __iomem *dev_base; - unsigned int irq; - unsigned int rst; - unsigned int rstout; - - u32 major; - u32 error; -}; - -/* - * Debugging macro and defines - */ -#undef DO_MG_DEBUG -#ifdef DO_MG_DEBUG -# define MG_DBG(fmt, args...) \ - printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args) -#else /* CONFIG_MG_DEBUG */ -# define MG_DBG(fmt, args...) do { } while (0) -#endif /* CONFIG_MG_DEBUG */ - -#endif -- cgit v1.2.3-70-g09d2 From 833cc67c7722e35863c6aaee9df56b442ef957ae Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 27 Apr 2009 21:32:16 +0000 Subject: smsc911x: add fifo byteswap support V2 This is V2 of the smsc911x fifo byteswap patch. The smsc911x hardware supports both big and little endian hardware configurations, and the Linux smsc911x driver currently detects word order. For correct operation on big endian platforms lacking swapped byte lanes the following patch is needed. Only fifo data is swapped, register data does not require any swapping. Signed-off-by: Magnus Damm Acked-by: Steve Glendinning Signed-off-by: David S.
Miller --- drivers/net/smsc911x.c | 13 +++++++++++++ include/linux/smsc911x.h | 10 ++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c index eb7db032a78..5113b26fc2d 100644 --- a/drivers/net/smsc911x.c +++ b/drivers/net/smsc911x.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include "smsc911x.h" @@ -175,6 +176,12 @@ static inline void smsc911x_tx_writefifo(struct smsc911x_data *pdata, unsigned int *buf, unsigned int wordcount) { + if (pdata->config.flags & SMSC911X_SWAP_FIFO) { + while (wordcount--) + smsc911x_reg_write(pdata, TX_DATA_FIFO, swab32(*buf++)); + return; + } + if (pdata->config.flags & SMSC911X_USE_32BIT) { writesl(pdata->ioaddr + TX_DATA_FIFO, buf, wordcount); return; @@ -194,6 +201,12 @@ static inline void smsc911x_rx_readfifo(struct smsc911x_data *pdata, unsigned int *buf, unsigned int wordcount) { + if (pdata->config.flags & SMSC911X_SWAP_FIFO) { + while (wordcount--) + *buf++ = swab32(smsc911x_reg_read(pdata, RX_DATA_FIFO)); + return; + } + if (pdata->config.flags & SMSC911X_USE_32BIT) { readsl(pdata->ioaddr + RX_DATA_FIFO, buf, wordcount); return; diff --git a/include/linux/smsc911x.h b/include/linux/smsc911x.h index b32725075d7..5241e4fb4ec 100644 --- a/include/linux/smsc911x.h +++ b/include/linux/smsc911x.h @@ -47,4 +47,14 @@ struct smsc911x_platform_config { #define SMSC911X_FORCE_EXTERNAL_PHY (BIT(3)) #define SMSC911X_SAVE_MAC_ADDRESS (BIT(4)) +/* + * SMSC911X_SWAP_FIFO: + * Enables software byte swap for fifo data. Should only be used as a + * "last resort" in the case of big endian mode on boards with incorrectly + * routed data bus to older devices such as LAN9118. Newer devices such as + * LAN9221 can handle this in hardware, there are registers to control + * this swapping but the driver doesn't currently use them. + */ +#define SMSC911X_SWAP_FIFO (BIT(5)) + #endif /* __LINUX_SMSC911X_H__ */ -- cgit v1.2.3-70-g09d2 From 8dc92f7e2ecfd93f5c57da78594a7a5482e2c15e Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Mon, 27 Apr 2009 22:35:52 +0000 Subject: sctp: add feature bit for SCTP offload in hardware this is the sctp code to enable hardware crc32c offload for adapters that support it. Originally by: Vlad Yasevich modified by Jesse Brandeburg Signed-off-by: Jesse Brandeburg Signed-off-by: Vlad Yasevich Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/sctp/output.c | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fe20d171acf..f8c3619d551 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -670,7 +670,9 @@ struct net_device #define NETIF_F_GRO 16384 /* Generic receive offload */ #define NETIF_F_LRO 32768 /* large receive offload */ +/* the GSO_MASK reserves bits 16 through 23 */ #define NETIF_F_FCOE_CRC (1 << 24) /* FCoE CRC32 */ +#define NETIF_F_SCTP_CSUM (1 << 25) /* SCTP checksum offload */ /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 diff --git a/net/sctp/output.c b/net/sctp/output.c index 7d08f522ec8..f0c91df59d4 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -412,6 +412,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) /* Build the SCTP header. 
diff --git a/net/sctp/output.c b/net/sctp/output.c index 7d08f522ec8..f0c91df59d4 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -412,6 +412,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) /* Build the SCTP header. */ sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); + skb_reset_transport_header(nskb); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); @@ -527,15 +528,25 @@ int sctp_packet_transmit(struct sctp_packet *packet) * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in . */ - if (!sctp_checksum_disable && !(dst->dev->features & NETIF_F_NO_CSUM)) { + if (!sctp_checksum_disable && + !(dst->dev->features & (NETIF_F_NO_CSUM | NETIF_F_SCTP_CSUM))) { __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); /* 3) Put the resultant value into the checksum field in the * common header, and leave the rest of the bits unchanged. */ sh->checksum = sctp_end_cksum(crc32); - } else - nskb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + if (dst->dev->features & NETIF_F_SCTP_CSUM) { + /* no need to seed pseudo checksum for SCTP */ + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (skb_transport_header(nskb) - + nskb->head); + nskb->csum_offset = offsetof(struct sctphdr, checksum); + } else { + nskb->ip_summed = CHECKSUM_UNNECESSARY; + } + } /* IP layer ECN support * From RFC 2481 -- cgit v1.2.3-70-g09d2 From 9ec4fa271faf2db3b8e1419c998da1ca6b094eb6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:57:18 -0700 Subject: irq, cpumask: correct CPUMASKS_OFFSTACK typo and fix fallout CPUMASKS_OFFSTACK is not defined anywhere; the real option is CPUMASK_OFFSTACK. It is a typo; moreover, init_alloc_desc_masks() set affinity to all cpus as a side effect of allocation. Split init_alloc_desc_masks() into alloc_desc_masks() and init_desc_masks(). Also use CPUMASK_OFFSTACK in alloc_desc_masks(). [ Impact: fix smp_affinity copying/setup when moving irq_desc between CPUs ] Signed-off-by: Yinghai Lu Acked-by: Rusty Russell Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" LKML-Reference: <49F6546E.3040406@kernel.org> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 27 ++++++++++++++++++--------- kernel/irq/handle.c | 9 ++++++--- kernel/irq/numa_migrate.c | 2 +- 3 files changed, 25 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index b7cbeed972e..c4953cf27e5 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -424,27 +424,25 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); #ifdef CONFIG_SMP /** - * init_alloc_desc_masks - allocate cpumasks for irq_desc + * alloc_desc_masks - allocate cpumasks for irq_desc * @desc: pointer to irq_desc struct * @cpu: cpu which will be handling the cpumasks * @boot: true if need bootmem * * Allocates affinity and pending_mask cpumask if required. * Returns true if successful (or not required). - * Side effect: affinity has all bits set, pending_mask has all bits clear.
*/ -static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, bool boot) { +#ifdef CONFIG_CPUMASK_OFFSTACK int node; if (boot) { alloc_bootmem_cpumask_var(&desc->affinity); - cpumask_setall(desc->affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ alloc_bootmem_cpumask_var(&desc->pending_mask); - cpumask_clear(desc->pending_mask); #endif return true; } @@ -453,18 +451,25 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) return false; - cpumask_setall(desc->affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) { free_cpumask_var(desc->affinity); return false; } - cpumask_clear(desc->pending_mask); +#endif #endif return true; } +static inline void init_desc_masks(struct irq_desc *desc) +{ + cpumask_setall(desc->affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif +} + /** * init_copy_desc_masks - copy cpumasks for irq_desc * @old_desc: pointer to old irq_desc struct @@ -478,7 +483,7 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, static inline void init_copy_desc_masks(struct irq_desc *old_desc, struct irq_desc *new_desc) { -#ifdef CONFIG_CPUMASKS_OFFSTACK +#ifdef CONFIG_CPUMASK_OFFSTACK cpumask_copy(new_desc->affinity, old_desc->affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -499,12 +504,16 @@ static inline void free_desc_masks(struct irq_desc *old_desc, #else /* !CONFIG_SMP */ -static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, bool boot) { return true; } +static inline void init_desc_masks(struct irq_desc *desc) +{ +} + static inline void init_copy_desc_masks(struct irq_desc *old_desc, struct irq_desc *new_desc) { diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142be8dd..882c7980010 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -115,10 +115,11 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, cpu, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } + init_desc_masks(desc); arch_init_chip_data(desc, cpu); } @@ -169,7 +170,8 @@ int __init early_irq_init(void) desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); irq_desc_ptrs[i] = desc + i; } @@ -256,7 +258,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq = i; - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; } return arch_early_irq_init(); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 44bbdcbaf8d..5760d725162 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -37,7 +37,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, struct irq_desc *desc, int cpu) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, cpu, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for 
migration.\n", irq); return false; -- cgit v1.2.3-70-g09d2 From fcef5911c7ea89b80d5bfc727f402f37c9eefd57 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:58:23 -0700 Subject: x86/irq: remove leftover code from NUMA_MIGRATE_IRQ_DESC The original feature of migrating irq_desc dynamically was too fragile and was causing problems: it caused crashes on systems with lots of cards with MSI-X when user-space irq-balancer was enabled. We now have new patches that create irq_desc according to the device's NUMA node. This patch removes the leftover bits of the dynamic balancer. [ Impact: remove dead code ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F654AF.8000808@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 ------- arch/x86/configs/x86_64_defconfig | 1 - arch/x86/kernel/apic/io_apic.c | 56 +++------------------------------------ include/linux/irq.h | 10 ------- kernel/irq/Makefile | 2 +- kernel/irq/chip.c | 12 ++------- kernel/irq/handle.c | 9 ++----- kernel/irq/numa_migrate.c | 2 -- 8 files changed, 9 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index df9e885eee1..e1b2543f8ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -274,16 +274,6 @@ config SPARSE_IRQ If you don't know what to do here, say N. -config NUMA_MIGRATE_IRQ_DESC - bool "Move irq desc when changing irq smp_affinity" - depends on SPARSE_IRQ && NUMA - depends on BROKEN - default n - ---help--- - This enables moving irq_desc to cpu/node that irq will use handled. - - If you don't know what to do here, say N. - config X86_MPPARSE bool "Enable MPS table" if ACPI default y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9fe5d212ab4..27b8ce0f590 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -195,7 +195,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y -# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y # CONFIG_X86_ELAN is not set diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30da617d18e..9fbf0f7ec7e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -148,9 +148,6 @@ struct irq_cfg { unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - u8 move_desc_pending : 1; -#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs.
*/ @@ -254,8 +251,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) return 0; } -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - +/* for move_irq_desc */ static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) { @@ -356,19 +352,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->chip_data = NULL; } } - -static void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg = desc->chip_data; - - if (!cfg->move_in_progress) { - /* it means that domain is not changed */ - if (!cpumask_intersects(desc->affinity, mask)) - cfg->move_desc_pending = 1; - } -} -#endif +/* end for move_irq_desc */ #else static struct irq_cfg *irq_cfg(unsigned int irq) @@ -378,13 +362,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq) #endif -#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -} -#endif - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -592,9 +569,6 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return BAD_APICID; - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - cpumask_copy(desc->affinity, mask); return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); @@ -2393,8 +2367,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return; - set_extra_move_desc(desc, mask); - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); irte.vector = cfg->vector; @@ -2491,34 +2463,14 @@ static void irq_complete_move(struct irq_desc **descp) struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - if (likely(!cfg->move_desc_pending)) - return; - - /* domain has not changed, but affinity did */ - me = smp_processor_id(); - if (cpumask_test_cpu(me, desc->affinity)) { - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; - cfg->move_desc_pending = 0; - } -#endif + if (likely(!cfg->move_in_progress)) return; - } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; -#endif + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); - } } #else static inline void irq_complete_move(struct irq_desc **descp) {} diff --git a/include/linux/irq.h b/include/linux/irq.h index c4953cf27e5..2a34cd6281d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -212,16 +212,6 @@ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); -static inline struct irq_desc * -irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) -{ -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - return irq_to_desc(irq); -#else - return desc; -#endif -} - /* * Migration helpers for obsolete names, they will go away: */ diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 3394f8f5296..2f065277f8e 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o 
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c687ba4363f..13c68e71b72 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); spin_unlock(&desc->lock); } @@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); @@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) { + if (desc->chip->eoi) desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); - } } void @@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { + if (desc->chip != &no_irq_chip) mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); - } desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 882c7980010..3e0cbc44bd7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -458,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - /* get new one */ - desc = irq_remap_to_desc(irq, desc); - } if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -473,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); - } /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 5760d725162..ce72bc3f4ce 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -97,9 +97,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, /* free the old one */ free_one_irq_desc(old_desc, desc); - spin_unlock(&old_desc->lock); kfree(old_desc); - spin_lock(&desc->lock); return desc; -- cgit v1.2.3-70-g09d2 From d5dedd4507d307eb3f35f21b6e16f336fdc0d82a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:59:21 -0700 Subject: irq: change ->set_affinity() to return status According to Ingo, set_affinity() in irq_chip should return int: that way failure cases can be handled in a much cleaner way, in the genirq layer.
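For illustration only, not part of the patch: under the new contract a chip method reports whether the retarget took effect, so the genirq layer can decide whether to commit the new mask. A minimal sketch; my_chip_set_affinity() and my_hw_route_irq() are made-up names:

	static int my_chip_set_affinity(unsigned int irq, const struct cpumask *dest)
	{
		if (!cpumask_intersects(dest, cpu_online_mask))
			return -1;	/* no online CPU in the mask: report failure */

		my_hw_route_irq(irq, cpumask_first(dest));	/* hypothetical hw hook */
		return 0;	/* success */
	}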
v2: fix two typos [ Impact: extend API ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: linux-arch@vger.kernel.org LKML-Reference: <49F654E9.4070809@kernel.org> Signed-off-by: Ingo Molnar --- arch/alpha/kernel/sys_dp264.c | 8 +++-- arch/alpha/kernel/sys_titan.c | 4 ++- arch/arm/common/gic.c | 4 ++- arch/cris/arch-v32/kernel/irq.c | 4 ++- arch/ia64/hp/sim/hpsim_irq.c | 3 +- arch/ia64/kernel/iosapic.c | 10 +++--- arch/ia64/kernel/msi_ia64.c | 16 +++++---- arch/ia64/sn/kernel/irq.c | 4 ++- arch/ia64/sn/kernel/msi_sn.c | 8 +++-- arch/mips/cavium-octeon/octeon-irq.c | 8 +++-- arch/mips/include/asm/irq.h | 2 +- arch/mips/kernel/irq-gic.c | 5 +-- arch/mips/mti-malta/malta-smtc.c | 4 ++- arch/mips/sibyte/bcm1480/irq.c | 8 +++-- arch/mips/sibyte/sb1250/irq.c | 8 +++-- arch/parisc/kernel/irq.c | 6 ++-- arch/powerpc/platforms/pseries/xics.c | 12 ++++--- arch/powerpc/sysdev/mpic.c | 4 ++- arch/sparc/kernel/irq_64.c | 12 +++++-- arch/x86/kernel/apic/io_apic.c | 64 ++++++++++++++++++++++------------- drivers/parisc/iosapic.c | 6 ++-- drivers/xen/events.c | 12 ++++--- include/linux/irq.h | 2 +- 23 files changed, 140 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index 9c9d1fd4155..5bd5259324b 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -176,22 +176,26 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) } } -static void +static int dp264_set_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&dp264_irq_lock); cpu_set_irq_affinity(irq, *affinity); tsunami_update_irq_hw(cached_irq_mask); spin_unlock(&dp264_irq_lock); + + return 0; } -static void +static int clipper_set_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&dp264_irq_lock); cpu_set_irq_affinity(irq - 16, *affinity); tsunami_update_irq_hw(cached_irq_mask); spin_unlock(&dp264_irq_lock); + + return 0; } static struct hw_interrupt_type dp264_irq_type = { diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index 27f840a4ad3..8dd239ebdb9 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -157,13 +157,15 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) } -static void +static int titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&titan_irq_lock); titan_cpu_set_irq_affinity(irq - 16, *affinity); titan_update_irq_hw(titan_cached_irq_mask); spin_unlock(&titan_irq_lock); + + return 0; } static void diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c index c6884ba1d5e..90f6b7f52d4 100644 --- a/arch/arm/common/gic.c +++ b/arch/arm/common/gic.c @@ -109,7 +109,7 @@ static void gic_unmask_irq(unsigned int irq) } #ifdef CONFIG_SMP -static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) +static int gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) { void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3); unsigned int shift = (irq % 4) * 8; @@ -122,6 +122,8 @@ static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) val |= 1 << (cpu + shift); writel(val, reg); spin_unlock(&irq_controller_lock); + + return 0; } #endif diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c index df3925cb1c7..d70b445f4a8 100644 --- a/arch/cris/arch-v32/kernel/irq.c +++ b/arch/cris/arch-v32/kernel/irq.c @@ -325,12 +325,14 @@ static void 
end_crisv32_irq(unsigned int irq) { } -void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest) +int set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest) { unsigned long flags; spin_lock_irqsave(&irq_lock, flags); irq_allocations[irq - FIRST_IRQ].mask = *dest; spin_unlock_irqrestore(&irq_lock, flags); + + return 0; } static struct irq_chip crisv32_irq_type = { diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c index cc0a3182db3..acb5047ab57 100644 --- a/arch/ia64/hp/sim/hpsim_irq.c +++ b/arch/ia64/hp/sim/hpsim_irq.c @@ -21,9 +21,10 @@ hpsim_irq_noop (unsigned int irq) { } -static void +static int hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b) { + return 0; } static struct hw_interrupt_type irq_type_hp_sim = { diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 166e0d839fa..f92cef47bf8 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -329,7 +329,7 @@ unmask_irq (unsigned int irq) } -static void +static int iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) { #ifdef CONFIG_SMP @@ -343,15 +343,15 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) cpu = cpumask_first_and(cpu_online_mask, mask); if (cpu >= nr_cpu_ids) - return; + return -1; if (irq_prepare_move(irq, cpu)) - return; + return -1; dest = cpu_physical_id(cpu); if (!iosapic_intr_info[irq].count) - return; /* not an IOSAPIC interrupt */ + return -1; /* not an IOSAPIC interrupt */ set_irq_affinity_info(irq, dest, redir); @@ -376,7 +376,9 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32); iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32); } + #endif + return 0; } /* diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 2b15e233f7f..0f8ade9331b 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -12,7 +12,7 @@ static struct irq_chip ia64_msi_chip; #ifdef CONFIG_SMP -static void ia64_set_msi_irq_affinity(unsigned int irq, +static int ia64_set_msi_irq_affinity(unsigned int irq, const cpumask_t *cpu_mask) { struct msi_msg msg; @@ -20,10 +20,10 @@ static void ia64_set_msi_irq_affinity(unsigned int irq, int cpu = first_cpu(*cpu_mask); if (!cpu_online(cpu)) - return; + return -1; if (irq_prepare_move(irq, cpu)) - return; + return -1; read_msi_msg(irq, &msg); @@ -39,6 +39,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq, write_msi_msg(irq, &msg); cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu)); + + return 0; } #endif /* CONFIG_SMP */ @@ -130,17 +132,17 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_DMAR #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_cfg *cfg = irq_cfg + irq; struct msi_msg msg; int cpu = cpumask_first(mask); if (!cpu_online(cpu)) - return; + return -1; if (irq_prepare_move(irq, cpu)) - return; + return -1; dmar_msi_read(irq, &msg); @@ -151,6 +153,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dmar_msi_write(irq, &msg); cpumask_copy(irq_desc[irq].affinity, mask); + + return 0; } #endif /* CONFIG_SMP */ diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c index 66fd705e82c..764f26abac0 100644 --- a/arch/ia64/sn/kernel/irq.c +++ b/arch/ia64/sn/kernel/irq.c @@ -227,7 +227,7 @@ finish_up: return new_irq_info; } 
-static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) +static int sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct sn_irq_info *sn_irq_info, *sn_irq_info_safe; nasid_t nasid; @@ -239,6 +239,8 @@ static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe, sn_irq_lh[irq], list) (void)sn_retarget_vector(sn_irq_info, nasid, slice); + + return 0; } #ifdef CONFIG_SMP diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c index 81e428943d7..fbbfb970120 100644 --- a/arch/ia64/sn/kernel/msi_sn.c +++ b/arch/ia64/sn/kernel/msi_sn.c @@ -151,7 +151,7 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry) } #ifdef CONFIG_SMP -static void sn_set_msi_irq_affinity(unsigned int irq, +static int sn_set_msi_irq_affinity(unsigned int irq, const struct cpumask *cpu_mask) { struct msi_msg msg; @@ -168,7 +168,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpu = cpumask_first(cpu_mask); sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) - return; + return -1; /* * Release XIO resources for the old MSI PCI address @@ -189,7 +189,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice); sn_msi_info[irq].sn_irq_info = new_irq_info; if (new_irq_info == NULL) - return; + return -1; /* * Map the xio address into bus space @@ -206,6 +206,8 @@ static void sn_set_msi_irq_affinity(unsigned int irq, write_msi_msg(irq, &msg); cpumask_copy(irq_desc[irq].affinity, cpu_mask); + + return 0; } #endif /* CONFIG_SMP */ diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c index 1c19af8daa6..d3a0c8154be 100644 --- a/arch/mips/cavium-octeon/octeon-irq.c +++ b/arch/mips/cavium-octeon/octeon-irq.c @@ -177,7 +177,7 @@ static void octeon_irq_ciu0_disable(unsigned int irq) } #ifdef CONFIG_SMP -static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest) +static int octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest) { int cpu; int bit = irq - OCTEON_IRQ_WORKQ0; /* Bit 0-63 of EN0 */ @@ -199,6 +199,8 @@ static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask */ cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2)); write_unlock(&octeon_irq_ciu0_rwlock); + + return 0; } #endif @@ -292,7 +294,7 @@ static void octeon_irq_ciu1_disable(unsigned int irq) } #ifdef CONFIG_SMP -static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest) +static int octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest) { int cpu; int bit = irq - OCTEON_IRQ_WDOG0; /* Bit 0-63 of EN1 */ @@ -315,6 +317,8 @@ static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask */ cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1)); write_unlock(&octeon_irq_ciu1_rwlock); + + return 0; } #endif diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index 3214ade02d1..4f1eed107b0 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -49,7 +49,7 @@ static inline void smtc_im_ack_irq(unsigned int irq) #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF #include -extern void plat_set_irq_affinity(unsigned int irq, +extern int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity); extern void smtc_forward_irq(unsigned int irq); diff --git a/arch/mips/kernel/irq-gic.c 
b/arch/mips/kernel/irq-gic.c index 87deb8f6c45..3f43c2e3aa5 100644 --- a/arch/mips/kernel/irq-gic.c +++ b/arch/mips/kernel/irq-gic.c @@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq) static DEFINE_SPINLOCK(gic_lock); -static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) +static int gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { cpumask_t tmp = CPU_MASK_NONE; unsigned long flags; @@ -166,7 +166,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_and(&tmp, cpumask, cpu_online_mask); if (cpus_empty(tmp)) - return; + return -1; /* Assumption : cpumask refers to a single CPU */ spin_lock_irqsave(&gic_lock, flags); @@ -190,6 +190,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(irq_desc[irq].affinity, cpumask); spin_unlock_irqrestore(&gic_lock, flags); + return 0; } #endif diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c index 5ba31888fef..499ffe5475d 100644 --- a/arch/mips/mti-malta/malta-smtc.c +++ b/arch/mips/mti-malta/malta-smtc.c @@ -114,7 +114,7 @@ struct plat_smp_ops msmtc_smp_ops = { */ -void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) +int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) { cpumask_t tmask; int cpu = 0; @@ -156,5 +156,7 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) /* Do any generic SMTC IRQ affinity setup */ smtc_set_irq_affinity(irq, tmask); + + return 0; } #endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */ diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c index 352352b3cb2..4f256a131bf 100644 --- a/arch/mips/sibyte/bcm1480/irq.c +++ b/arch/mips/sibyte/bcm1480/irq.c @@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq); static void disable_bcm1480_irq(unsigned int irq); static void ack_bcm1480_irq(unsigned int irq); #ifdef CONFIG_SMP -static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask); +static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask); #endif #ifdef CONFIG_PCI @@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq) } #ifdef CONFIG_SMP -static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) +static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) { int i = 0, old_cpu, cpu, int_on, k; u64 cur_ints; @@ -119,7 +119,7 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) if (cpumask_weight(mask) != 1) { printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); - return; + return -1; } i = cpumask_first(mask); @@ -155,6 +155,8 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) } spin_unlock(&bcm1480_imr_lock); spin_unlock_irqrestore(&desc->lock, flags); + + return 0; } #endif diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c index c08ff582da6..e389507f1f9 100644 --- a/arch/mips/sibyte/sb1250/irq.c +++ b/arch/mips/sibyte/sb1250/irq.c @@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq); static void disable_sb1250_irq(unsigned int irq); static void ack_sb1250_irq(unsigned int irq); #ifdef CONFIG_SMP -static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask); +static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask); #endif #ifdef CONFIG_SIBYTE_HAS_LDT @@ -103,7 +103,7 @@ void sb1250_unmask_irq(int cpu, int irq) } #ifdef CONFIG_SMP 
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) +static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) { int i = 0, old_cpu, cpu, int_on; u64 cur_ints; @@ -114,7 +114,7 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) if (cpumask_weight(mask) > 1) { printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); - return; + return -1; } /* Convert logical CPU to physical CPU */ @@ -146,6 +146,8 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) } spin_unlock(&sb1250_imr_lock); spin_unlock_irqrestore(&desc->lock, flags); + + return 0; } #endif diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 4ea4229d765..8007f1e6572 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -130,15 +130,17 @@ int cpu_check_affinity(unsigned int irq, const struct cpumask *dest) return cpu_dest; } -static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest) +static int cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest) { int cpu_dest; cpu_dest = cpu_check_affinity(irq, dest); if (cpu_dest < 0) - return; + return -1; cpumask_copy(&irq_desc[irq].affinity, dest); + + return 0; } #endif diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index 80b513449f4..be3581a8c29 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -333,7 +333,7 @@ static void xics_eoi_lpar(unsigned int virq) lpar_xirr_info_set((0xff << 24) | irq); } -static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) +static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) { unsigned int irq; int status; @@ -342,14 +342,14 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) irq = (unsigned int)irq_map[virq].hwirq; if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) - return; + return -1; status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); if (status) { printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n", __func__, irq, status); - return; + return -1; } /* @@ -363,7 +363,7 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) printk(KERN_WARNING "%s: No online cpus in the mask %s for irq %d\n", __func__, cpulist, virq); - return; + return -1; } status = rtas_call(ibm_set_xive, 3, 1, NULL, @@ -372,8 +372,10 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) if (status) { printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n", __func__, irq, status); - return; + return -1; } + + return 0; } static struct irq_chip xics_pic_direct = { diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 21b95670159..f4cbd15cf22 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -807,7 +807,7 @@ static void mpic_end_ipi(unsigned int irq) #endif /* CONFIG_SMP */ -void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) +int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct mpic *mpic = mpic_from_irq(irq); unsigned int src = mpic_irq_to_hw(irq); @@ -824,6 +824,8 @@ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION), mpic_physmask(cpus_addr(tmp)[0])); } + + return 0; } static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type) diff --git a/arch/sparc/kernel/irq_64.c 
b/arch/sparc/kernel/irq_64.c index 5deabe921a4..e5e78f9cfc9 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -318,10 +318,12 @@ static void sun4u_irq_enable(unsigned int virt_irq) } } -static void sun4u_set_affinity(unsigned int virt_irq, +static int sun4u_set_affinity(unsigned int virt_irq, const struct cpumask *mask) { sun4u_irq_enable(virt_irq); + + return 0; } /* Don't do anything. The desc->status check for IRQ_DISABLED in @@ -377,7 +379,7 @@ static void sun4v_irq_enable(unsigned int virt_irq) ino, err); } -static void sun4v_set_affinity(unsigned int virt_irq, +static int sun4v_set_affinity(unsigned int virt_irq, const struct cpumask *mask) { unsigned int ino = virt_irq_table[virt_irq].dev_ino; @@ -388,6 +390,8 @@ static void sun4v_set_affinity(unsigned int virt_irq, if (err != HV_EOK) printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): " "err(%d)\n", ino, cpuid, err); + + return 0; } static void sun4v_irq_disable(unsigned int virt_irq) @@ -445,7 +449,7 @@ static void sun4v_virq_enable(unsigned int virt_irq) dev_handle, dev_ino, err); } -static void sun4v_virt_set_affinity(unsigned int virt_irq, +static int sun4v_virt_set_affinity(unsigned int virt_irq, const struct cpumask *mask) { unsigned long cpuid, dev_handle, dev_ino; @@ -461,6 +465,8 @@ static void sun4v_virt_set_affinity(unsigned int virt_irq, printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): " "err(%d)\n", dev_handle, dev_ino, cpuid, err); + + return 0; } static void sun4v_virq_disable(unsigned int virt_irq) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9fbf0f7ec7e..5c7630b40a5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -574,13 +574,14 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); } -static void +static int set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; unsigned long flags; unsigned int dest; unsigned int irq; + int ret = -1; irq = desc->irq; cfg = desc->chip_data; @@ -591,18 +592,21 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); + ret = 0; } spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; } -static void +static int set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc; desc = irq_to_desc(irq); - set_ioapic_affinity_irq_desc(desc, mask); + return set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@ -2348,24 +2352,25 @@ static int ioapic_retrigger_irq(unsigned int irq) * Real vector that is used for interrupting cpu will be coming from * the interrupt-remapping table entry. 
*/ -static void +static int migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; unsigned int dest; unsigned int irq; + int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return; + return ret; irq = desc->irq; if (get_irte(irq, &irte)) - return; + return ret; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; + return ret; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2381,27 +2386,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) send_cleanup_vector(cfg); cpumask_copy(desc->affinity, mask); + + return 0; } /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - migrate_ioapic_irq_desc(desc, mask); + return migrate_ioapic_irq_desc(desc, mask); } -static void set_ir_ioapic_affinity_irq(unsigned int irq, +static int set_ir_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - set_ir_ioapic_affinity_irq_desc(desc, mask); + return set_ir_ioapic_affinity_irq_desc(desc, mask); } #else -static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { + return 0; } #endif @@ -3318,7 +3326,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3327,7 +3335,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3339,13 +3347,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); + + return 0; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. 
*/ -static void +static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@ -3354,11 +3364,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irte irte; if (get_irte(irq, &irte)) - return; + return -1; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3375,6 +3385,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) */ if (cfg->move_in_progress) send_cleanup_vector(cfg); + + return 0; } #endif @@ -3528,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq) #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3537,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3549,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3582,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3591,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3603,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3659,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3667,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); + + return 0; } #endif diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 73348c4047e..4a9cc92d4d1 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -702,7 +702,7 @@ static unsigned int iosapic_startup_irq(unsigned int irq) } #ifdef CONFIG_SMP -static void iosapic_set_affinity_irq(unsigned int irq, +static int iosapic_set_affinity_irq(unsigned int irq, const struct cpumask *dest) { struct vector_info *vi = iosapic_get_vector(irq); @@ -712,7 +712,7 @@ static void iosapic_set_affinity_irq(unsigned int irq, dest_cpu = cpu_check_affinity(irq, dest); if (dest_cpu < 0) - return; + return -1; cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu)); vi->txn_addr = txn_affinity_addr(irq, dest_cpu); @@ -724,6 +724,8 @@ static void 
iosapic_set_affinity_irq(unsigned int irq, iosapic_set_irt_data(vi, &dummy_d0, &d1); iosapic_wr_irt_entry(vi, d0, d1); spin_unlock_irqrestore(&iosapic_lock, flags); + + return 0; } #endif diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 30963af5dba..33389880279 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -688,13 +688,13 @@ void rebind_evtchn_irq(int evtchn, int irq) } /* Rebind an evtchn so that it gets delivered to a specific cpu */ -static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); if (!VALID_EVTCHN(evtchn)) - return; + return -1; /* Send future instances of this interrupt to other vcpu. */ bind_vcpu.port = evtchn; @@ -707,13 +707,15 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) */ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) bind_evtchn_to_cpu(evtchn, tcpu); -} + return 0; +} -static void set_affinity_irq(unsigned irq, const struct cpumask *dest) +static int set_affinity_irq(unsigned irq, const struct cpumask *dest) { unsigned tcpu = cpumask_first(dest); - rebind_irq_to_cpu(irq, tcpu); + + return rebind_irq_to_cpu(irq, tcpu); } int resend_irq_on_evtchn(unsigned int irq) diff --git a/include/linux/irq.h b/include/linux/irq.h index 2a34cd6281d..8e4c18b2915 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -117,7 +117,7 @@ struct irq_chip { void (*eoi)(unsigned int irq); void (*end)(unsigned int irq); - void (*set_affinity)(unsigned int irq, + int (*set_affinity)(unsigned int irq, const struct cpumask *dest); int (*retrigger)(unsigned int irq); int (*set_type)(unsigned int irq, unsigned int flow_type); -- cgit v1.2.3-70-g09d2 From 85ac16d033370caf6f48d743c8dc8103700f5cc5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:00:38 -0700 Subject: x86/irq: change irq_desc_alloc() to take node instead of cpu This simplifies the node awareness of the code. All our allocators only deal with NUMA node ID locality, not with CPU ids, anyway - so there's no need to maintain (and transform) a CPU id all across the IRQ layer. v2: keep move_irq_desc related [ Impact: cleanup, prepare IRQ code to be NUMA-aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W.
Biederman" Cc: Rusty Russell Cc: Jeremy Fitzhardinge LKML-Reference: <49F65536.2020300@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++++++----------------------- arch/x86/lguest/boot.c | 2 +- drivers/pci/intr_remapping.c | 15 +++++------ drivers/xen/events.c | 2 +- include/linux/interrupt.h | 2 +- include/linux/irq.h | 16 +++++------- kernel/irq/handle.c | 28 ++++++++------------ kernel/irq/internals.h | 2 +- kernel/irq/numa_migrate.c | 36 +++++++++----------------- kernel/softirq.c | 2 +- 10 files changed, 66 insertions(+), 97 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5c7630b40a5..560b887ba27 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -129,12 +129,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +static struct irq_pin_list *get_one_free_irq_2_pin(int node) { struct irq_pin_list *pin; - int node; - - node = cpu_to_node(cpu); pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); @@ -209,12 +206,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) return cfg; } -static struct irq_cfg *get_one_free_irq_cfg(int cpu) +static struct irq_cfg *get_one_free_irq_cfg(int node) { struct irq_cfg *cfg; - int node; - - node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { @@ -235,13 +229,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) return cfg; } -int arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; cfg = desc->chip_data; if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(cpu); + desc->chip_data = get_one_free_irq_cfg(node); if (!desc->chip_data) { printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); @@ -253,7 +247,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) /* for move_irq_desc */ static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) { struct irq_pin_list *old_entry, *head, *tail, *entry; @@ -262,7 +256,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) if (!old_entry) return; - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) return; @@ -272,7 +266,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { entry = head; while (entry) { @@ -312,12 +306,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) } void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(cpu); + cfg = get_one_free_irq_cfg(node); if (!cfg) return; @@ -328,7 +322,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); - init_copy_irq_2_pin(old_cfg, cfg, cpu); + init_copy_irq_2_pin(old_cfg, cfg, node); } static void free_irq_cfg(struct irq_cfg *old_cfg) @@ -615,13 +609,13 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list *entry; entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", apic, pin); @@ -641,7 +635,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(cpu); + entry->next = get_one_free_irq_2_pin(node); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -650,7 +644,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { @@ -670,7 +664,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, /* why? call replace before add? */ if (!replaced) - add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); + add_pin_to_irq_node(cfg, node, newapic, newpin); } static inline void io_apic_modify_irq(struct irq_cfg *cfg, @@ -1612,7 +1606,7 @@ static void __init setup_IO_APIC_irqs(void) int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1647,13 +1641,13 @@ static void __init setup_IO_APIC_irqs(void) apic->multi_timer_check(apic_id, irq)) continue; - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); continue; } cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); + add_pin_to_irq_node(cfg, node, apic_id, pin); setup_IO_APIC_irq(apic_id, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); @@ -2863,7 +2857,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -2929,7 +2923,7 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? 
*/ if (no_pin1) { - add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); + add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { /* for edge trigger, setup_IO_APIC_irq already @@ -2966,7 +2960,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); enable_8259A_irq(0); if (timer_irq_works()) { @@ -3185,7 +3179,7 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); struct irq_desc *desc_new = NULL; irq = 0; @@ -3194,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want) spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_cpu(new, cpu); + desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; @@ -3968,7 +3962,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p { struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -3976,7 +3970,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p return -EINVAL; } - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc %d\n", irq); return 0; @@ -3987,7 +3981,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p */ if (irq >= NR_IRQS_LEGACY) { cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); + add_pin_to_irq_node(cfg, node, ioapic, pin); } setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ca7ec44bafc..45acbcf2568 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -636,7 +636,7 @@ static void __init lguest_init_IRQ(void) void lguest_setup_irq(unsigned int irq) { - irq_to_desc_alloc_cpu(irq, 0); + irq_to_desc_alloc_node(irq, 0); set_irq_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); } diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index f5e0ea724a6..9eff36a293e 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -23,15 +23,12 @@ struct irq_2_iommu { }; #ifdef CONFIG_GENERIC_HARDIRQS -static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu) +static struct irq_2_iommu *get_one_free_irq_2_iommu(int node) { struct irq_2_iommu *iommu; - int node; - - node = cpu_to_node(cpu); iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node); - printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node); + printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node); return iommu; } @@ -48,7 +45,7 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq) return desc->irq_2_iommu; } -static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) +static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; struct irq_2_iommu *irq_iommu; @@ -56,7 +53,7 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) /* * alloc irq 
desc if not allocated already. */ - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); return NULL; @@ -65,14 +62,14 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) irq_iommu = desc->irq_2_iommu; if (!irq_iommu) - desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu); + desc->irq_2_iommu = get_one_free_irq_2_iommu(node); return desc->irq_2_iommu; } static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) { - return irq_2_iommu_alloc_cpu(irq, boot_cpu_id); + return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id)); } #else /* !CONFIG_SPARSE_IRQ */ diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 33389880279..be437c2bc94 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -335,7 +335,7 @@ static int find_unbound_irq(void) if (irq == nr_irqs) panic("No available IRQ to bind to: increase nr_irqs!\n"); - desc = irq_to_desc_alloc_cpu(irq, 0); + desc = irq_to_desc_alloc_node(irq, 0); if (WARN_ON(desc == NULL)) return -1; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 91bb76f44f1..ff374ceface 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -566,6 +566,6 @@ struct irq_desc; extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); -extern int arch_init_chip_data(struct irq_desc *desc, int cpu); +extern int arch_init_chip_data(struct irq_desc *desc, int node); #endif diff --git a/include/linux/irq.h b/include/linux/irq.h index 8e4c18b2915..a09baf8f9d9 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -187,7 +187,7 @@ struct irq_desc { spinlock_t lock; #ifdef CONFIG_SMP cpumask_var_t affinity; - unsigned int cpu; + unsigned int node; #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_var_t pending_mask; #endif @@ -201,16 +201,16 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; extern void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu); + struct irq_desc *desc, int node); extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); #ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; #else /* CONFIG_SPARSE_IRQ */ -extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); +extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node); #endif /* CONFIG_SPARSE_IRQ */ -extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); +extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); /* * Migration helpers for obsolete names, they will go away: @@ -422,12 +422,10 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); * Allocates affinity and pending_mask cpumask if required. * Returns true if successful (or not required). 
*/ -static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int node, bool boot) { #ifdef CONFIG_CPUMASK_OFFSTACK - int node; - if (boot) { alloc_bootmem_cpumask_var(&desc->affinity); @@ -437,8 +435,6 @@ static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, return true; } - node = cpu_to_node(cpu); - if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) return false; @@ -494,7 +490,7 @@ static inline void free_desc_masks(struct irq_desc *old_desc, #else /* !CONFIG_SMP */ -static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int node, bool boot) { return true; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3e0cbc44bd7..a6368db2618 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -81,12 +81,10 @@ static struct irq_desc irq_desc_init = { .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; -void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +void init_kstat_irqs(struct irq_desc *desc, int node, int nr) { - int node; void *ptr; - node = cpu_to_node(cpu); ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); /* @@ -94,33 +92,32 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) * init_copy_kstat_irqs() could still use old one */ if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", - cpu, node); + printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); desc->kstat_irqs = ptr; } } -static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP - desc->cpu = cpu; + desc->node = node; #endif lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, cpu, nr_cpu_ids); + init_kstat_irqs(desc, node, nr_cpu_ids); if (!desc->kstat_irqs) { printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } init_desc_masks(desc); - arch_init_chip_data(desc, cpu); + arch_init_chip_data(desc, node); } /* @@ -189,11 +186,10 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; unsigned long flags; - int node; if (irq >= nr_irqs) { WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", @@ -212,15 +208,13 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) if (desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", - irq, cpu, node); + printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - init_one_irq_desc(irq, desc, cpu); + init_one_irq_desc(irq, desc, node); irq_desc_ptrs[irq] = desc; @@ -270,7 +264,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? 
irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { return irq_to_desc(irq); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index de5f412f6a9..73468253143 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; -extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); extern spinlock_t sparse_irq_lock; diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index ce72bc3f4ce..2f69bee57bf 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -15,9 +15,9 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc, - int cpu, int nr) + int node, int nr) { - init_kstat_irqs(desc, cpu, nr); + init_kstat_irqs(desc, node, nr); if (desc->kstat_irqs != old_desc->kstat_irqs) memcpy(desc->kstat_irqs, old_desc->kstat_irqs, @@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) } static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for migration.\n", irq); return false; } spin_lock_init(&desc->lock); - desc->cpu = cpu; + desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, cpu); + arch_init_copy_chip_data(old_desc, desc, node); return true; } @@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) } static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int cpu) + int node) { struct irq_desc *desc; unsigned int irq; unsigned long flags; - int node; irq = old_desc->irq; @@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, if (desc && old_desc != desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { printk(KERN_ERR "irq %d: can not get new irq_desc " @@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { + if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { /* still use old one */ kfree(desc); desc = old_desc; @@ -107,24 +105,14 @@ out_unlock: return desc; } -struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - int old_cpu; - int node, old_node; - /* those all static, do move them */ if (desc->irq < NR_IRQS_LEGACY) return desc; - old_cpu = desc->cpu; - if (old_cpu != cpu) { - node = cpu_to_node(cpu); - old_node = cpu_to_node(old_cpu); - if (old_node != node) - desc = __real_move_irq_desc(desc, cpu); - else - desc->cpu = cpu; - } + if (desc->node != 
node) + desc = __real_move_irq_desc(desc, node); return desc; } diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd34851..f674f332a02 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void) return 0; } -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +int __weak arch_init_chip_data(struct irq_desc *desc, int node) { return 0; } -- cgit v1.2.3-70-g09d2 From a2f809b08ae4dddc1015c7dcd8659e5729e45b3e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:01:20 -0700 Subject: irq: change ACPI GSI APIs to also take a device argument We want to use dev_to_node() later on, to be aware of the 'home node' of the GSI in question. [ Impact: cleanup, prepare the IRQ code to be more NUMA aware ] Signed-off-by: Yinghai Lu Acked-by: Len Brown Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: Len Brown Cc: Bjorn Helgaas Cc: Tony Luck Cc: linux-acpi@vger.kernel.org Cc: linux-ia64@vger.kernel.org LKML-Reference: <49F65560.20904@kernel.org> Signed-off-by: Ingo Molnar --- arch/ia64/kernel/acpi.c | 5 +++-- arch/x86/include/asm/io_apic.h | 4 ++-- arch/x86/include/asm/mpspec.h | 4 +++- arch/x86/kernel/acpi/boot.c | 8 ++++---- arch/x86/kernel/apic/io_apic.c | 3 ++- drivers/acpi/pci_irq.c | 5 +++-- drivers/char/hpet.c | 4 ++-- drivers/pnp/pnpacpi/rsparser.c | 2 +- include/linux/acpi.h | 2 +- 9 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 5510317db37..baec6f00f7f 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -636,7 +636,7 @@ void __init acpi_numa_arch_fixup(void) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM) return gsi; @@ -678,7 +678,8 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) fadt = (struct acpi_table_fadt *)fadt_header; - acpi_register_gsi(fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); + acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, + ACPI_ACTIVE_LOW); return 0; } diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9d826e43601..07f2913ba5d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -154,8 +154,8 @@ extern int timer_through_8259; extern int io_apic_get_unique_id(int ioapic, int apic_id); extern int io_apic_get_version(int ioapic); extern int io_apic_get_redir_entries(int ioapic); -extern int io_apic_set_pci_routing(int ioapic, int pin, int irq, - int edge_level, int active_high_low); +extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, + int irq, int edge_level, int active_high_low); #endif /* CONFIG_ACPI */ extern int (*ioapic_renumber_irq)(int ioapic, int irq); diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 642fc7fc8cd..3ea1f531f53 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -72,7 +72,9 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); -extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); +struct device; +extern int mp_register_gsi(struct device *dev, u32 gsi, int 
edge_level, + int active_high_low); extern int acpi_probe_gsi(void); #ifdef CONFIG_X86_IO_APIC extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d7f80..6ee96b5530f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -522,7 +522,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -539,7 +539,7 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -1158,7 +1158,7 @@ void __init mp_config_acpi_legacy_irqs(void) } } -int mp_register_gsi(u32 gsi, int triggering, int polarity) +int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; int ioapic_pin; @@ -1253,7 +1253,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) } } #endif - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); return gsi; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 560b887ba27..d9346622601 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3958,7 +3958,8 @@ int __init io_apic_get_version(int ioapic) } #endif -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) { struct irq_desc *desc; struct irq_cfg *cfg; diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 51b9f8280f8..2faa9e2ac89 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -401,7 +401,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev) /* Interrupt Line values above 0xF are forbidden */ if (dev->irq > 0 && (dev->irq <= 0xF)) { printk(" - using IRQ %d\n", dev->irq); - acpi_register_gsi(dev->irq, ACPI_LEVEL_SENSITIVE, + acpi_register_gsi(&dev->dev, dev->irq, + ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); return 0; } else { @@ -410,7 +411,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev) } } - rc = acpi_register_gsi(gsi, triggering, polarity); + rc = acpi_register_gsi(&dev->dev, gsi, triggering, polarity); if (rc < 0) { dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n", pin_name(pin)); diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 340ba4f9dc5..4a9f3492b92 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -224,7 +224,7 @@ static void hpet_timer_set_irq(struct hpet_dev *devp) break; } - gsi = acpi_register_gsi(irq, ACPI_LEVEL_SENSITIVE, + gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); if (gsi > 0) break; @@ -939,7 +939,7 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data) irqp = &res->data.extended_irq; for (i = 0; i < irqp->interrupt_count; i++) { - irq = acpi_register_gsi(irqp->interrupts[i], + irq = acpi_register_gsi(NULL, irqp->interrupts[i], irqp->triggering, irqp->polarity); if (irq < 0) return AE_ERROR; diff --git 
a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index adf17856bac..7f207f335be 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -123,7 +123,7 @@ static void pnpacpi_parse_allocated_irqresource(struct pnp_dev *dev, } flags = irq_flags(triggering, polarity, shareable); - irq = acpi_register_gsi(gsi, triggering, polarity); + irq = acpi_register_gsi(&dev->dev, gsi, triggering, polarity); if (irq >= 0) pcibios_penalize_isa_irq(irq, 1); else diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 88be890ee3c..51b4b0a5ce8 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -119,7 +119,7 @@ extern int pci_mmcfg_config_num; extern int sbf_port; extern unsigned long acpi_realmode_flags; -int acpi_register_gsi (u32 gsi, int triggering, int polarity); +int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); #ifdef CONFIG_X86_IO_APIC -- cgit v1.2.3-70-g09d2 From d047f53a2ecce37e3bdf79eac5a326fbaadb3628 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:02:23 -0700 Subject: x86/irq: change MSI irq_desc to be more numa aware Try to get the irq_desc on the home node in create_irq_nr(). v2: don't check if we can move it when sparse_irq is not used v3: use move_irq_desc() if that node is not what we want [ Impact: optimization, make MSI IRQ descriptors more NUMA aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F6559F.7070005@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 17 +++++++++++++---- include/linux/irq.h | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 82376e021b5..9cd4806cdf5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3172,14 +3172,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want) +unsigned int create_irq_nr(unsigned int irq_want, int node) { /* Allocate an unused irq */ unsigned int irq; unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int node = cpu_to_node(boot_cpu_id); struct irq_desc *desc_new = NULL; irq = 0; @@ -3197,6 +3196,13 @@ unsigned int create_irq_nr(unsigned int irq_want) if (cfg_new->vector != 0) continue; + +#ifdef CONFIG_NUMA_IRQ_DESC + /* different node? */ + if (desc_new->node != node) + desc_new = move_irq_desc(desc_new, node); +#endif + if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; break; @@ -3214,11 +3220,12 @@ unsigned int create_irq_nr(unsigned int irq_want) int create_irq(void) { + int node = cpu_to_node(boot_cpu_id); unsigned int irq_want; int irq; irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) irq = -1; @@ -3476,15 +3483,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) unsigned int irq_want; struct intel_iommu *iommu = NULL; int index = 0; + int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; + node = dev_to_node(&dev->dev); irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) return -1; irq_want = irq + 1; diff --git a/include/linux/irq.h 
b/include/linux/irq.h index a09baf8f9d9..4b95ddb5304 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -376,7 +376,7 @@ extern void set_irq_noprobe(unsigned int irq); extern void set_irq_probe(unsigned int irq); /* Handle dynamic irq creation and destruction */ -extern unsigned int create_irq_nr(unsigned int irq_want); +extern unsigned int create_irq_nr(unsigned int irq_want, int node); extern int create_irq(void); extern void destroy_irq(unsigned int irq); -- cgit v1.2.3-70-g09d2 From 6a321cb370ad3db4ba6e405e638b3a42c41089b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2009 04:43:42 -0700 Subject: net: netif_tx_queue_stopped too expensive netif_tx_queue_stopped(txq) is false most of the time, yet it is very expensive on SMP. static inline int netif_tx_queue_stopped(const struct netdev_queue *dev_queue) { return test_bit(__QUEUE_STATE_XOFF, &dev_queue->state); } I saw this while hunting with oprofile in the bnx2 driver's bnx2_tx_int(). We probably should split "struct netdev_queue" into two parts, one being read mostly. __netif_tx_lock() touches _xmit_lock & xmit_lock_owner; these deserve a separate cache line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f8c3619d551..505a3c6cb12 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -447,12 +447,18 @@ enum netdev_queue_state_t }; struct netdev_queue { +/* + * read mostly part + */ struct net_device *dev; struct Qdisc *qdisc; unsigned long state; - spinlock_t _xmit_lock; - int xmit_lock_owner; struct Qdisc *qdisc_sleeping; +/* + * write mostly part + */ + spinlock_t _xmit_lock ____cacheline_aligned_in_smp; + int xmit_lock_owner; } ____cacheline_aligned_in_smp; -- cgit v1.2.3-70-g09d2 From abc5c44d6284fab8fb21bcfc52c0f16f980637df Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 23 Apr 2009 19:31:25 -0400 Subject: SUNRPC: Fix error return value of svc_addr_len() The svc_addr_len() helper function returns -EAFNOSUPPORT if it doesn't recognize the address family of the passed-in socket address. However, the return type of this function is size_t, which means -EAFNOSUPPORT is turned into a very large positive value in this case. The check in svc_udp_recvfrom() to see if the return value is less than zero therefore won't work at all. Additionally, handle_connect_req() passes this value directly to memset(). This could cause memset() to clobber a large chunk of memory if svc_addr_len() has returned an error. Currently, however, the address family of these addresses is known to be supported long before handle_connect_req() is called, so this isn't a real risk. Change the error return value of svc_addr_len() to zero, which fits in the range of size_t and is safer to pass to memset() directly. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 5 +++-- net/sunrpc/svcsock.c | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0d9cb6ef28b..d790c52525c 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -118,7 +118,7 @@ static inline unsigned short svc_addr_port(const struct sockaddr *sa) return 0; } -static inline size_t svc_addr_len(struct sockaddr *sa) +static inline size_t svc_addr_len(const struct sockaddr *sa) { switch (sa->sa_family) { case AF_INET: @@ -126,7 +126,8 @@ static inline size_t svc_addr_len(struct sockaddr *sa) case AF_INET6: return sizeof(struct sockaddr_in6); } - return -EAFNOSUPPORT; + + return 0; } static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index af3198814c1..8b083283413 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -426,13 +426,14 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) long all[SVC_PKTINFO_SPACE / sizeof(long)]; } buffer; struct cmsghdr *cmh = &buffer.hdr; - int err, len; struct msghdr msg = { .msg_name = svc_addr(rqstp), .msg_control = cmh, .msg_controllen = sizeof(buffer), .msg_flags = MSG_DONTWAIT, }; + size_t len; + int err; if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* udp sockets need large rcvbuf as all pending @@ -464,8 +465,8 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) return -EAGAIN; } len = svc_addr_len(svc_addr(rqstp)); - if (len < 0) - return len; + if (len == 0) + return -EAFNOSUPPORT; rqstp->rq_addrlen = len; if (skb->tstamp.tv64 == 0) { skb->tstamp = ktime_get_real(); -- cgit v1.2.3-70-g09d2 From 335c54bdc4d3bacdbd619ec95cd0b352435bd37f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 23 Apr 2009 19:32:25 -0400 Subject: NFSD: Prevent a buffer overflow in svc_xprt_names() The svc_xprt_names() function can overflow its buffer if it's so near the end of the passed in buffer that the "name too long" string still doesn't fit. Of course, it could never tell if it was near the end of the passed in buffer, since its only caller passes in zero as the buffer length. Let's make this API a little safer. Change svc_xprt_names() so it *always* checks for a buffer overflow, and change its only caller to pass in the correct buffer length. If svc_xprt_names() does overflow its buffer, it now fails with an ENAMETOOLONG errno, instead of trying to write a message at the end of the buffer. I don't like this much, but I can't figure out a clean way that's always safe to return some of the names, *and* an indication that the buffer was not long enough. The displayed error when doing a 'cat /proc/fs/nfsd/portlist' is "File name too long". Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsctl.c | 2 +- include/linux/sunrpc/svc_xprt.h | 2 +- net/sunrpc/svc_xprt.c | 56 ++++++++++++++++++++++++++++------------- 3 files changed, 41 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index e051847b93f..6a1cd908e6b 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -918,7 +918,7 @@ static ssize_t __write_ports_names(char *buf) { if (nfsd_serv == NULL) return 0; - return svc_xprt_names(nfsd_serv, buf, 0); + return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); } /* diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index d790c52525c..2223ae0b5ed 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -83,7 +83,7 @@ int svc_port_is_privileged(struct sockaddr *sin); int svc_print_xprts(char *buf, int maxlen); struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, const sa_family_t af, const unsigned short port); -int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); +int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen); static inline void svc_xprt_get(struct svc_xprt *xprt) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index f200393ac87..6f33d33cc06 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1098,36 +1098,58 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, } EXPORT_SYMBOL_GPL(svc_find_xprt); -/* - * Format a buffer with a list of the active transports. A zero for - * the buflen parameter disables target buffer overflow checking. +static int svc_one_xprt_name(const struct svc_xprt *xprt, + char *pos, int remaining) +{ + int len; + + len = snprintf(pos, remaining, "%s %u\n", + xprt->xpt_class->xcl_name, + svc_xprt_local_port(xprt)); + if (len >= remaining) + return -ENAMETOOLONG; + return len; +} + +/** + * svc_xprt_names - format a buffer with a list of transport names + * @serv: pointer to an RPC service + * @buf: pointer to a buffer to be filled in + * @buflen: length of buffer to be filled in + * + * Fills in @buf with a string containing a list of transport names, + * each name terminated with '\n'. + * + * Returns positive length of the filled-in string on success; otherwise + * a negative errno value is returned if an error occurs. 
*/ -int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen) +int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen) { struct svc_xprt *xprt; - char xprt_str[64]; - int totlen = 0; - int len; + int len, totlen; + char *pos; /* Sanity check args */ if (!serv) return 0; spin_lock_bh(&serv->sv_lock); + + pos = buf; + totlen = 0; list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { - len = snprintf(xprt_str, sizeof(xprt_str), - "%s %d\n", xprt->xpt_class->xcl_name, - svc_xprt_local_port(xprt)); - /* If the string was truncated, replace with error string */ - if (len >= sizeof(xprt_str)) - strcpy(xprt_str, "name-too-long\n"); - /* Don't overflow buffer */ - len = strlen(xprt_str); - if (buflen && (len + totlen >= buflen)) + len = svc_one_xprt_name(xprt, pos, buflen - totlen); + if (len < 0) { + *buf = '\0'; + totlen = len; + } + if (len <= 0) break; - strcpy(buf+totlen, xprt_str); + + pos += len; totlen += len; } + spin_unlock_bh(&serv->sv_lock); return totlen; } -- cgit v1.2.3-70-g09d2 From bfba9ab4c64f0e5c33930711e6c073c285e01fcf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 23 Apr 2009 19:32:33 -0400 Subject: SUNRPC: pass buffer size to svc_addsock() Adjust the synopsis of svc_addsock() to pass in the size of the output buffer. Add a documenting comment. This is a cosmetic change for now. A subsequent patch will make sure the buffer length is passed to one_sock_name(), where the length will actually be useful. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 2 +- include/linux/sunrpc/svcsock.h | 3 ++- net/sunrpc/svcsock.c | 16 +++++++++++++--- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6a1cd908e6b..1f1c2159b80 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -943,7 +943,7 @@ static ssize_t __write_ports_addfd(char *buf) if (err != 0) goto out; - err = svc_addsock(nfsd_serv, fd, buf); + err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); if (err < 0) lockd_down(); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 483e10380aa..e23241c53f4 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -39,7 +39,8 @@ int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); int svc_sock_names(char *buf, struct svc_serv *serv, char *toclose); -int svc_addsock(struct svc_serv *serv, int fd, char *name_return); +int svc_addsock(struct svc_serv *serv, const int fd, + char *name_return, const size_t len); void svc_init_xprt_sock(void); void svc_cleanup_xprt_sock(void); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 8b083283413..6bec1e25b54 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1128,9 +1128,19 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, return svsk; } -int svc_addsock(struct svc_serv *serv, - int fd, - char *name_return) +/** + * svc_addsock - add a listener socket to an RPC service + * @serv: pointer to RPC service to which to add a new listener + * @fd: file descriptor of the new listener + * @name_return: pointer to buffer to fill in with name of listener + * @len: size of the buffer + * + * Fills in socket name and returns positive length of name if successful. + * Name is terminated with '\n'. On error, returns a negative errno + * value. 
+ */ +int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, + const size_t len) { int err = 0; struct socket *so = sockfd_lookup(fd, &err); -- cgit v1.2.3-70-g09d2 From 8435d34dbbe75678c3cdad3d53b1e7996a79b3bf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 23 Apr 2009 19:32:40 -0400 Subject: SUNRPC: pass buffer size to svc_sock_names() Adjust the synopsis of svc_sock_names() to pass in the size of the output buffer. Add a documenting comment. This is a cosmetic change for now. A subsequent patch will make sure the buffer length is passed to one_sock_name(), where the length will actually be useful. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 3 ++- include/linux/sunrpc/svcsock.h | 4 +++- net/sunrpc/svcsock.c | 19 +++++++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1f1c2159b80..b64a7fbfccf 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -966,7 +966,8 @@ static ssize_t __write_ports_delfd(char *buf) return -ENOMEM; if (nfsd_serv != NULL) - len = svc_sock_names(buf, nfsd_serv, toclose); + len = svc_sock_names(nfsd_serv, buf, + SIMPLE_TRANSACTION_LIMIT, toclose); if (len >= 0) lockd_down(); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index e23241c53f4..82716313894 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -38,7 +38,9 @@ int svc_recv(struct svc_rqst *, long); int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); -int svc_sock_names(char *buf, struct svc_serv *serv, char *toclose); +int svc_sock_names(struct svc_serv *serv, char *buf, + const size_t buflen, + const char *toclose); int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, const size_t len); void svc_init_xprt_sock(void); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 6bec1e25b54..032b52ea954 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -259,8 +259,23 @@ static int one_sock_name(char *buf, struct svc_sock *svsk) return len; } -int -svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) +/** + * svc_sock_names - construct a list of listener names in a string + * @serv: pointer to RPC service + * @buf: pointer to a buffer to fill in with socket names + * @buflen: size of the buffer to be filled + * @toclose: pointer to '\0'-terminated C string containing the name + * of a listener to be closed + * + * Fills in @buf with a '\n'-separated list of names of listener + * sockets. If @toclose is not NULL, the socket named by @toclose + * is closed, and is not included in the output list. + * + * Returns positive length of the socket name string, or a negative + * errno value on error. + */ +int svc_sock_names(struct svc_serv *serv, char *buf, const size_t buflen, + const char *toclose) { struct svc_sock *svsk, *closesk = NULL; int len = 0; -- cgit v1.2.3-70-g09d2 From 4ed0d3e6c64cfd9ba4ceb2099b10d1cf8ece4320 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 24 Apr 2009 17:30:20 -0700 Subject: Intel IOMMU Pass Through Support The patch adds the kernel parameter intel_iommu=pt to set up pass through mode in the context mapping entry. This disables DMAR in the Linux kernel, but KVM still runs on VT-d and interrupt remapping still works. In this mode, the kernel uses swiotlb for DMA API functions but other VT-d functionality is enabled for KVM. 
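Note that pass through can only be enabled when every IOMMU unit in the system advertises it in hardware. A minimal sketch of that capability check, modelled on the ecap_pass_through() macro (bit 6 of the extended capability register) and the init_dmars() loop in this patch; the helper and its array argument are illustrative, not kernel API:

	/* Pass through requires support from *all* IOMMU units (ECAP bit 6). */
	#define ECAP_PASS_THROUGH(e)	(((e) >> 6) & 0x1)

	static int all_iommus_support_pt(const unsigned long long *ecap, int nr_units)
	{
		int i;

		for (i = 0; i < nr_units; i++)
			if (!ECAP_PASS_THROUGH(ecap[i]))
				return 0;	/* one unit without PT disables the mode */
		return 1;
	}
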
KVM always uses multi-level translation page tables in VT-d. By default, pass through mode is disabled in the kernel. This is useful when people don't want to enable VT-d DMAR in the kernel but still want to use KVM and interrupt remapping, for reasons such as DMAR performance concerns or debugging. Signed-off-by: Fenghua Yu Acked-by: Weidong Han Signed-off-by: David Woodhouse --- Documentation/kernel-parameters.txt | 1 + arch/ia64/include/asm/iommu.h | 1 + arch/ia64/kernel/pci-swiotlb.c | 2 +- arch/x86/include/asm/iommu.h | 1 + arch/x86/kernel/pci-dma.c | 6 ++ arch/x86/kernel/pci-swiotlb.c | 3 +- drivers/pci/dmar.c | 11 ++- drivers/pci/intel-iommu.c | 180 ++++++++++++++++++++++++++---------- include/linux/dma_remapping.h | 8 ++ include/linux/intel-iommu.h | 2 + 10 files changed, 165 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 600cdd72900..fa4faeb7597 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -965,6 +965,7 @@ and is between 256 and 4096 characters. It is defined in the file nomerge forcesac soft + pt [x86, IA64] io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h index 0490794fe4a..37d41ca5645 100644 --- a/arch/ia64/include/asm/iommu.h +++ b/arch/ia64/include/asm/iommu.h @@ -9,6 +9,7 @@ extern void pci_iommu_shutdown(void); extern void no_iommu_init(void); extern int force_iommu, no_iommu; extern int iommu_detected; +extern int iommu_pass_through; extern void iommu_dma_init(void); extern void machvec_init(const char *name); diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c index 285aae8431c..223abb13410 100644 --- a/arch/ia64/kernel/pci-swiotlb.c +++ b/arch/ia64/kernel/pci-swiotlb.c @@ -46,7 +46,7 @@ void __init swiotlb_dma_init(void) void __init pci_swiotlb_init(void) { - if (!iommu_detected) { + if (!iommu_detected || iommu_pass_through) { #ifdef CONFIG_IA64_GENERIC swiotlb = 1; printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n"); diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index af326a2975b..fd6d21bbee6 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -6,6 +6,7 @@ extern void no_iommu_init(void); extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; +extern int iommu_pass_through; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 745579bc825..8cad0d85424 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -160,6 +160,8 @@ again: return page_address(page); } +extern int iommu_pass_through; + /* * See for the iommu kernel parameter * documentation. 
@@ -209,6 +211,10 @@ static __init int iommu_setup(char *p) #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) swiotlb = 1; + if (!strncmp(p, "pt", 2)) { + iommu_pass_through = 1; + return 1; + } #endif gart_parse_options(p); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 221a3853e26..3a0c51e0ba6 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || + iommu_pass_through) swiotlb = 1; #endif if (swiotlb_force) diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index fa3a11365ec..d3d86b749ee 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -515,6 +515,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) u32 ver; static int iommu_allocated = 0; int agaw = 0; + int msagaw = 0; iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); if (!iommu) @@ -535,12 +536,20 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) agaw = iommu_calculate_agaw(iommu); if (agaw < 0) { printk(KERN_ERR - "Cannot get a valid agaw for iommu (seq_id = %d)\n", + "Cannot get a valid agaw for iommu (seq_id = %d)\n", + iommu->seq_id); + goto error; + } + msagaw = iommu_calculate_max_sagaw(iommu); + if (msagaw < 0) { + printk(KERN_ERR + "Cannot get a valid max agaw for iommu (seq_id = %d)\n", iommu->seq_id); goto error; } #endif iommu->agaw = agaw; + iommu->msagaw = msagaw; /* the registers might be more than one page */ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 001b328adf8..13121821db7 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -53,6 +53,8 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 +#define MAX_AGAW_WIDTH 64 + #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) @@ -127,8 +129,6 @@ static inline void context_set_fault_enable(struct context_entry *context) context->lo &= (((u64)-1) << 2) | 1; } -#define CONTEXT_TT_MULTI_LEVEL 0 - static inline void context_set_translation_type(struct context_entry *context, unsigned long value) { @@ -288,6 +288,7 @@ int dmar_disabled = 1; static int __initdata dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; +int iommu_pass_through; #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) static DEFINE_SPINLOCK(device_domain_lock); @@ -397,17 +398,13 @@ void free_iova_mem(struct iova *iova) static inline int width_to_agaw(int width); -/* calculate agaw for each iommu. - * "SAGAW" may be different across iommus, use a default agaw, and - * get a supported less agaw for iommus that don't support the default agaw. - */ -int iommu_calculate_agaw(struct intel_iommu *iommu) +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) { unsigned long sagaw; int agaw = -1; sagaw = cap_sagaw(iommu->cap); - for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { if (test_bit(agaw, &sagaw)) break; @@ -416,6 +413,24 @@ int iommu_calculate_agaw(struct intel_iommu *iommu) return agaw; } +/* + * Calculate max SAGAW for each iommu. + */ +int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); +} + +/* + * calculate agaw for each iommu. 
+ * "SAGAW" may be different across iommus, use a default agaw, and + * get a supported less agaw for iommus that don't support the default agaw. + */ +int iommu_calculate_agaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); +} + /* in native case, each domain is related to only one iommu */ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) { @@ -1321,8 +1336,8 @@ static void domain_exit(struct dmar_domain *domain) free_domain_mem(domain); } -static int domain_context_mapping_one(struct dmar_domain *domain, - int segment, u8 bus, u8 devfn) +static int domain_context_mapping_one(struct dmar_domain *domain, int segment, + u8 bus, u8 devfn, int translation) { struct context_entry *context; unsigned long flags; @@ -1335,7 +1350,10 @@ static int domain_context_mapping_one(struct dmar_domain *domain, pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + BUG_ON(!domain->pgd); + BUG_ON(translation != CONTEXT_TT_PASS_THROUGH && + translation != CONTEXT_TT_MULTI_LEVEL); iommu = device_to_iommu(segment, bus, devfn); if (!iommu) @@ -1395,9 +1413,18 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } context_set_domain_id(context, id); - context_set_address_width(context, iommu->agaw); - context_set_address_root(context, virt_to_phys(pgd)); - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); + + /* + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. + */ + if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) { + context_set_address_width(context, iommu->agaw); + context_set_address_root(context, virt_to_phys(pgd)); + } else + context_set_address_width(context, iommu->msagaw); + + context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); domain_flush_cache(domain, context, sizeof(*context)); @@ -1422,13 +1449,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } static int -domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) +domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev, + int translation) { int ret; struct pci_dev *tmp, *parent; ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus), - pdev->bus->number, pdev->devfn); + pdev->bus->number, pdev->devfn, + translation); if (ret) return ret; @@ -1442,7 +1471,7 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) ret = domain_context_mapping_one(domain, pci_domain_nr(parent->bus), parent->bus->number, - parent->devfn); + parent->devfn, translation); if (ret) return ret; parent = parent->bus->self; @@ -1450,12 +1479,14 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */ return domain_context_mapping_one(domain, pci_domain_nr(tmp->subordinate), - tmp->subordinate->number, 0); + tmp->subordinate->number, 0, + translation); else /* this is a legacy PCI bridge */ return domain_context_mapping_one(domain, pci_domain_nr(tmp->bus), tmp->bus->number, - tmp->devfn); + tmp->devfn, + translation); } static int domain_context_mapped(struct pci_dev *pdev) @@ -1752,7 +1783,7 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev, goto error; /* context entry init */ - ret = domain_context_mapping(domain, pdev); + ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL); if (!ret) return 0; 
error: @@ -1853,6 +1884,23 @@ static inline void iommu_prepare_isa(void) } #endif /* !CONFIG_DMAR_FLPY_WA */ +/* Initialize each context entry as pass through. */ +static int __init init_context_pass_through(void) +{ + struct pci_dev *pdev = NULL; + struct dmar_domain *domain; + int ret; + + for_each_pci_dev(pdev) { + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); + ret = domain_context_mapping(domain, pdev, + CONTEXT_TT_PASS_THROUGH); + if (ret) + return ret; + } + return 0; +} + static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; @@ -1860,6 +1908,7 @@ static int __init init_dmars(void) struct pci_dev *pdev; struct intel_iommu *iommu; int i, ret; + int pass_through = 1; /* * for each drhd @@ -1913,7 +1962,15 @@ static int __init init_dmars(void) printk(KERN_ERR "IOMMU: allocate root entry failed\n"); goto error; } + if (!ecap_pass_through(iommu->ecap)) + pass_through = 0; } + if (iommu_pass_through) + if (!pass_through) { + printk(KERN_INFO + "Pass Through is not supported by hardware.\n"); + iommu_pass_through = 0; + } /* * Start from the sane iommu hardware state. @@ -1976,37 +2033,57 @@ static int __init init_dmars(void) "IOMMU: enable interrupt remapping failed\n"); } #endif + /* + * If pass through is set and enabled, context entries of all pci + * devices are initialized with the pass through translation type. + */ + if (iommu_pass_through) { + ret = init_context_pass_through(); + if (ret) { + printk(KERN_ERR "IOMMU: Pass through init failed.\n"); + iommu_pass_through = 0; + } + } /* - * For each rmrr - * for each dev attached to rmrr - * do - * locate drhd for dev, alloc domain for dev - * allocate free domain - * allocate page table entries for rmrr - * if context not allocated for bus - * allocate and init context - * set present in root table for this bus - * init context with domain, translation etc - * endfor - * endfor + * If pass through is not set or not enabled, set up context entries + * for identity mappings for rmrr, gfx, and isa. */ - for_each_rmrr_units(rmrr) { - for (i = 0; i < rmrr->devices_cnt; i++) { - pdev = rmrr->devices[i]; - /* some BIOS lists non-exist devices in DMAR table */ - if (!pdev) - continue; - ret = iommu_prepare_rmrr_dev(rmrr, pdev); - if (ret) - printk(KERN_ERR + if (!iommu_pass_through) { + /* + * For each rmrr + * for each dev attached to rmrr + * do + * locate drhd for dev, alloc domain for dev + * allocate free domain + * allocate page table entries for rmrr + * if context not allocated for bus + * allocate and init context + * set present in root table for this bus + * init context with domain, translation etc + * endfor + * endfor + */ + for_each_rmrr_units(rmrr) { + for (i = 0; i < rmrr->devices_cnt; i++) { + pdev = rmrr->devices[i]; + /* + * some BIOSes list non-existent devices in + * the DMAR table. + */ + if (!pdev) + continue; + ret = iommu_prepare_rmrr_dev(rmrr, pdev); + if (ret) + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + } } - } - iommu_prepare_gfx_mapping(); + iommu_prepare_gfx_mapping(); - iommu_prepare_isa(); + iommu_prepare_isa(); + } /* * for each drhd @@ -2117,7 +2194,8 @@ get_valid_domain_for_dev(struct pci_dev *pdev) /* make sure context mapping is ok */ if (unlikely(!domain_context_mapped(pdev))) { - ret = domain_context_mapping(domain, pdev); + ret = domain_context_mapping(domain, pdev, + CONTEXT_TT_MULTI_LEVEL); if (ret) { printk(KERN_ERR "Domain context map for %s failed", @@ -2786,7 +2864,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. 
* Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || swiotlb || dmar_disabled) + if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -2806,7 +2884,15 @@ int __init intel_iommu_init(void) init_timer(&unmap_timer); force_iommu = 1; - dma_ops = &intel_dma_ops; + + if (!iommu_pass_through) { + printk(KERN_INFO + "Multi-level page-table translation for DMAR.\n"); + dma_ops = &intel_dma_ops; + } else + printk(KERN_INFO + "DMAR: Pass through translation for DMAR.\n"); + init_iommu_sysfs(); register_iommu(&intel_iommu_ops); @@ -3146,7 +3232,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return -EFAULT; } - ret = domain_context_mapping(dmar_domain, pdev); + ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); if (ret) return ret; diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 1a455f1f86d..e0a03aff63d 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -13,6 +13,9 @@ #define DMA_PTE_WRITE (2) #define DMA_PTE_SNP (1 << 11) +#define CONTEXT_TT_MULTI_LEVEL 0 +#define CONTEXT_TT_PASS_THROUGH 2 + struct intel_iommu; struct dmar_domain; struct root_entry; @@ -21,11 +24,16 @@ extern void free_dmar_iommu(struct intel_iommu *iommu); #ifdef CONFIG_DMAR extern int iommu_calculate_agaw(struct intel_iommu *iommu); +extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) { return 0; } +static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ + return 0; +} #endif extern int dmar_disabled; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index aa8c5317123..7246971a7fe 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -120,6 +120,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16) #define ecap_coherent(e) ((e) & 0x1) #define ecap_qis(e) ((e) & 0x2) +#define ecap_pass_through(e) ((e >> 6) & 0x1) #define ecap_eim_support(e) ((e >> 4) & 0x1) #define ecap_ir_support(e) ((e >> 3) & 0x1) #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) @@ -302,6 +303,7 @@ struct intel_iommu { spinlock_t register_lock; /* protect register handling */ int seq_id; /* sequence id of the iommu */ int agaw; /* agaw of this iommu */ + int msagaw; /* max sagaw of this iommu */ unsigned int irq; unsigned char name[13]; /* Device Name */ -- cgit v1.2.3-70-g09d2 From 0f9a623dd6c9b5b4dd00c232f29525bfc7a8ecf2 Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:51 +0100 Subject: tracing: x86, mmiotrace: only register for die notifier when tracer active Follow up to afcfe024aebd74b0984a41af9a34e009cf5badaf in Linus' tree ("x86: mmiotrace: quieten spurious warning message") Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-5-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 27 ++++++++++++++++++++++----- arch/x86/mm/mmio-mod.c | 2 ++ include/linux/mmiotrace.h | 2 ++ 3 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index a769d1a2d93..256ce643b0b 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -311,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = 
&get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", + /* + * debug traps without an active context are due to either + * something external causing them (f.e. using a debugger while + * mmio tracing enabled), or erroneous behaviour + */ + pr_warning("kmmio: unexpected debug trap on CPU %d.\n", smp_processor_id()); goto out; } @@ -529,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p) } EXPORT_SYMBOL(unregister_kmmio_probe); -static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, - void *args) +static int +kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) { struct die_args *arg = args; @@ -545,11 +550,23 @@ static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; -static int __init init_kmmio(void) +int kmmio_init(void) { int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); } -fs_initcall(init_kmmio); /* should be before device_initcall() */ + +void kmmio_cleanup(void) +{ + int i; + + unregister_die_notifier(&nb_die); + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) { + WARN_ONCE(!list_empty(&kmmio_page_table[i]), + KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"); + } +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index c9342ed8b40..132772a8ec5 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -451,6 +451,7 @@ void enable_mmiotrace(void) if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); + kmmio_init(); enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); @@ -473,6 +474,7 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); + kmmio_cleanup(); pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 3d1b7bde128..97491f78b08 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -30,6 +30,8 @@ extern unsigned int kmmio_count; extern int register_kmmio_probe(struct kmmio_probe *p); extern void unregister_kmmio_probe(struct kmmio_probe *p); +extern int kmmio_init(void); +extern void kmmio_cleanup(void); #ifdef CONFIG_MMIOTRACE /* kmmio is active by some kmmio_probes? */ -- cgit v1.2.3-70-g09d2 From 30e673b230f9d556eb81ef68a7b1a08c8b3b142c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:47 -0500 Subject: tracing/filters: move preds into event_filter object Create a new event_filter object, and move the pred-related members out of the call and subsystem objects and into the filter object - the details of the filter implementation don't need to be exposed in the call and subsystem in any case, and it will also help make the new parser implementation a little cleaner. 
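In condensed form, the layout after this patch (taken from the trace.h hunks below) hides the predicates behind an opaque pointer, with a cheap flag on the hot path; the fragment is a summary of the patch, not additional code:

	struct event_filter {
		int n_preds;
		struct filter_pred **preds;
	};

	/* Hot path: test the cheap filter_active flag before touching
	 * the opaque call->filter pointer. */
	if (unlikely(call->filter_active) && !filter_match_preds(call, rec))
		ring_buffer_discard_commit(buffer, event);
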
[ Impact: refactor trace-filter code to prepare for new features ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905887.6416.119.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- kernel/trace/trace.h | 10 ++-- kernel/trace/trace_events.c | 3 +- kernel/trace/trace_events_filter.c | 107 +++++++++++++++++++++++-------------- 4 files changed, 76 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 78a9ba24cbf..46a27f2695a 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -101,8 +101,8 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; - int n_preds; - struct filter_pred **preds; + int filter_active; + void *filter; void *mod; #ifdef CONFIG_EVENT_PROFILE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7d55bcf50e4..1fb7d6ccadf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -731,12 +731,16 @@ struct ftrace_event_field { int size; }; +struct event_filter { + int n_preds; + struct filter_pred **preds; +}; + struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - int n_preds; - struct filter_pred **preds; + void *filter; }; struct filter_pred; @@ -774,7 +778,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index be4d3a437c1..1cd1f37373d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -757,8 +757,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) list_add(&system->list, &event_subsystems); - system->preds = NULL; - system->n_preds = 0; + system->filter = NULL; entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 65418288f95..1e861eca3d0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -93,11 +93,12 @@ static int filter_pred_none(struct filter_pred *pred, void *event) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { + struct event_filter *filter = call->filter; int i, matched, and_failed = 0; struct filter_pred *pred; - for (i = 0; i < call->n_preds; i++) { - pred = call->preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; if (and_failed && !pred->or) continue; matched = pred->fn(pred, rec); @@ -115,20 +116,20 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) } EXPORT_SYMBOL_GPL(filter_match_preds); -static void __filter_print_preds(struct filter_pred **preds, int n_preds, +static void __filter_print_preds(struct event_filter *filter, struct trace_seq *s) { - char *field_name; struct filter_pred *pred; + char *field_name; int i; - if (!n_preds) { + if (!filter || !filter->n_preds) { trace_seq_printf(s, "none\n"); return; } - for (i = 0; i < n_preds; i++) { - pred = preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; field_name = pred->field_name; 
if (i) trace_seq_printf(s, pred->or ? "|| " : "&& "); @@ -144,7 +145,7 @@ static void __filter_print_preds(struct filter_pred **preds, int n_preds, void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(call->preds, call->n_preds, s); + __filter_print_preds(call->filter, s); mutex_unlock(&filter_mutex); } @@ -152,7 +153,7 @@ void filter_print_subsystem_preds(struct event_subsystem *system, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(system->preds, system->n_preds, s); + __filter_print_preds(system->filter, s); mutex_unlock(&filter_mutex); } @@ -200,12 +201,14 @@ static int filter_set_pred(struct filter_pred *dest, static void __filter_disable_preds(struct ftrace_event_call *call) { + struct event_filter *filter = call->filter; int i; - call->n_preds = 0; + call->filter_active = 0; + filter->n_preds = 0; for (i = 0; i < MAX_FILTER_PRED; i++) - call->preds[i]->fn = filter_pred_none; + filter->preds[i]->fn = filter_pred_none; } void filter_disable_preds(struct ftrace_event_call *call) @@ -217,32 +220,39 @@ void filter_disable_preds(struct ftrace_event_call *call) int init_preds(struct ftrace_event_call *call) { + struct event_filter *filter; struct filter_pred *pred; int i; - call->n_preds = 0; - - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!call->preds) + filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!call->filter) return -ENOMEM; + call->filter_active = 0; + filter->n_preds = 0; + + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!filter->preds) + goto oom; + for (i = 0; i < MAX_FILTER_PRED; i++) { pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; - call->preds[i] = pred; + filter->preds[i] = pred; } return 0; oom: for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) - filter_free_pred(call->preds[i]); + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); } - kfree(call->preds); - call->preds = NULL; + kfree(filter->preds); + kfree(call->filter); + call->filter = NULL; return -ENOMEM; } @@ -250,15 +260,16 @@ EXPORT_SYMBOL_GPL(init_preds); static void __filter_free_subsystem_preds(struct event_subsystem *system) { + struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (system->n_preds) { - for (i = 0; i < system->n_preds; i++) - filter_free_pred(system->preds[i]); - kfree(system->preds); - system->preds = NULL; - system->n_preds = 0; + if (filter && filter->n_preds) { + for (i = 0; i < filter->n_preds; i++) + filter_free_pred(filter->preds[i]); + kfree(filter->preds); + kfree(filter); + system->filter = NULL; } list_for_each_entry(call, &ftrace_events, list) { @@ -281,21 +292,23 @@ static int filter_add_pred_fn(struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { + struct event_filter *filter = call->filter; int idx, err; - if (call->n_preds && !pred->compound) + if (filter->n_preds && !pred->compound) __filter_disable_preds(call); - if (call->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) return -ENOSPC; - idx = call->n_preds; - filter_clear_pred(call->preds[idx]); - err = filter_set_pred(call->preds[idx], pred, fn); + idx = filter->n_preds; + filter_clear_pred(filter->preds[idx]); + err = filter_set_pred(filter->preds[idx], pred, fn); if (err) return err; - call->n_preds++; + filter->n_preds++; + call->filter_active = 1; return 0; } @@ -366,29 +379,41 @@ int 
filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { + struct event_filter *filter = system->filter; struct ftrace_event_call *call; mutex_lock(&filter_mutex); - if (system->n_preds && !pred->compound) + if (filter && filter->n_preds && !pred->compound) { __filter_free_subsystem_preds(system); + filter = NULL; + } - if (!system->n_preds) { - system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + if (!filter) { + system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!system->filter) { + mutex_unlock(&filter_mutex); + return -ENOMEM; + } + filter = system->filter; + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) { + + if (!filter->preds) { + kfree(system->filter); + system->filter = NULL; mutex_unlock(&filter_mutex); return -ENOMEM; } } - if (system->n_preds == MAX_FILTER_PRED) { + if (filter->n_preds == MAX_FILTER_PRED) { mutex_unlock(&filter_mutex); return -ENOSPC; } - system->preds[system->n_preds] = pred; - system->n_preds++; + filter->preds[filter->n_preds] = pred; + filter->n_preds++; list_for_each_entry(call, &ftrace_events, list) { int err; @@ -401,8 +426,8 @@ int filter_add_subsystem_pred(struct event_subsystem *system, err = __filter_add_pred(call, pred); if (err == -ENOMEM) { - system->preds[system->n_preds] = NULL; - system->n_preds--; + filter->preds[filter->n_preds] = NULL; + filter->n_preds--; mutex_unlock(&filter_mutex); return err; } -- cgit v1.2.3-70-g09d2 From a118e4d1402f1349fe3d953493e4168a300a752d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:53 -0500 Subject: tracing/filters: distinguish between signed and unsigned fields The new filter comparison ops need to be able to distinguish between signed and unsigned field types, so add an is_signed flag/param to the event field struct/trace_define_fields(). Also define a simple macro, is_signed_type(), to determine the signedness at compile time, used in the trace macros. If the is_signed_type() macro won't work with a specific type, a new, slightly modified version of TRACE_FIELD(), called TRACE_FIELD_SIGN(), allows the signedness to be set explicitly.
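For reference, is_signed_type() exploits the conversion of -1: cast to a signed type it stays negative, cast to an unsigned type it wraps to the type's maximum. A minimal userspace sketch (illustrative only, not part of the patch; for aggregate types such as ktime_t, handled via TRACE_FIELD_SIGN() in the diff below, the cast would not even compile):

#include <stdio.h>

#define is_signed_type(type) (((type)(-1)) < 0)

int main(void)
{
	printf("int:           %d\n", is_signed_type(int));           /* 1: (int)-1 < 0 */
	printf("unsigned char: %d\n", is_signed_type(unsigned char)); /* 0: wraps to 255 */
	printf("char:          %d\n", is_signed_type(char));          /* implementation-defined */
	return 0;
}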
[ Impact: extend trace-filter code for new feature ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905893.6416.120.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 7 ++++--- include/trace/ftrace.h | 16 ++++++++-------- kernel/trace/trace.h | 1 + kernel/trace/trace_event_types.h | 4 ++-- kernel/trace/trace_events.c | 3 ++- kernel/trace/trace_export.c | 29 ++++++++++++++++++++++------- 6 files changed, 39 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 46a27f2695a..e61a7403f3d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -122,8 +122,9 @@ extern int filter_current_check_discard(struct ftrace_event_call *call, struct ring_buffer_event *event); extern int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); + char *name, int offset, int size, int is_signed); +#define is_signed_type(type) (((type)(-1)) < 0) /* * The double __builtin_constant_p is because gcc will give us an error @@ -144,10 +145,10 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) -#define __common_field(type, item) \ +#define __common_field(type, item, is_signed) \ ret = trace_define_field(event_call, #type, "common_" #item, \ offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ + sizeof(field.ent.item), is_signed); \ if (ret) \ return ret; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1e681142f1d..edb02bc9f8f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -225,7 +225,7 @@ ftrace_format_##call(struct trace_seq *s) \ #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (ret) \ return ret; @@ -234,7 +234,7 @@ ftrace_format_##call(struct trace_seq *s) \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), 0); \ if (ret) \ return ret; @@ -242,7 +242,7 @@ ftrace_format_##call(struct trace_seq *s) \ #define __string(item, src) \ ret = trace_define_field(event_call, "__str_loc", #item, \ offsetof(typeof(field), __str_loc_##item), \ - sizeof(field.__str_loc_##item)); + sizeof(field.__str_loc_##item), 0); #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ @@ -253,11 +253,11 @@ ftrace_define_fields_##call(void) \ struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ - __common_field(int, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ + __common_field(int, type, 1); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ \ tstruct; \ \ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1fb7d6ccadf..866d0108fd2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -729,6 +729,7 @@ struct ftrace_event_field { char *type; int offset; int size; + int is_signed; }; struct event_filter { diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index cfcecc4fd86..5e32e375134 100644 --- a/kernel/trace/trace_event_types.h +++ 
b/kernel/trace/trace_event_types.h @@ -141,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_STRUCT( - TRACE_FIELD(ktime_t, state_data.stamp, stamp) - TRACE_FIELD(ktime_t, state_data.end, end) + TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) + TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1cd1f37373d..bbbea747937 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -26,7 +26,7 @@ static DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size) + char *name, int offset, int size, int is_signed) { struct ftrace_event_field *field; @@ -44,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type, field->offset = offset; field->size = size; + field->is_signed = is_signed; list_add(&field->link, &call->fields); return 0; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 0cb1a142c74..d06cf898dc8 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -50,6 +50,9 @@ extern void __bad_type_size(void); if (!ret) \ return 0; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) #undef TP_RAW_FMT #define TP_RAW_FMT(args...) args @@ -98,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) + #undef TP_CMD #define TP_CMD(cmd...) cmd @@ -149,7 +156,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (ret) \ return ret; @@ -157,7 +164,15 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), 0); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed); \ if (ret) \ return ret; @@ -173,11 +188,11 @@ ftrace_define_fields_##call(void) \ struct args field; \ int ret; \ \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ + __common_field(unsigned char, type, 0); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ \ tstruct; \ \ -- cgit v1.2.3-70-g09d2 From 8b3725621074040d380664964ffbc40610aef8c6 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:59 -0500 Subject: tracing/filters: a better event parser Replace the current event parser hack with a better one. 
Filters are no longer specified predicate by predicate, but all at once and can use parens and any of the following operators: numeric fields: ==, !=, <, <=, >, >= string fields: ==, != predicates can be combined with the logical operators: &&, || examples: "common_preempt_count > 4" > filter "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter If there was an error, the erroneous string along with an error message can be seen by looking at the filter, e.g.: ((sig >= 10 && sig < 15) || dsig == 17) && comm != bash ^ parse_error: Field not found Currently the caret for an error always appears at the beginning of the filter; a real position should be used, but the error message should be useful even without it. To clear a filter, '0' can be written to the filter file. Filters can also be set or cleared for a complete subsystem by writing the same filter as would be written to an individual event to the filter file at the root of the subsystem. Note, however, that if any event in the subsystem lacks a field specified in the filter being set, the set will fail and all filters in the subsystem are automatically cleared. This change from the previous version was made because using only the fields that happen to exist for a given event would most likely result in a meaningless filter. Because the logical operators are now implemented as predicates, the maximum number of predicates in a filter was increased from 8 to 16. [ Impact: add new, extended trace-filter implementation ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905899.6416.121.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.h | 66 ++- kernel/trace/trace_events.c | 86 ++- kernel/trace/trace_events_filter.c | 1020 ++++++++++++++++++++++++++++-------- 4 files changed, 884 insertions(+), 290 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index e61a7403f3d..5fff40c9ff5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -112,7 +112,7 @@ struct ftrace_event_call { #endif }; -#define MAX_FILTER_PRED 8 +#define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 128 extern int init_preds(struct ftrace_event_call *call); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 866d0108fd2..7736fe8c1b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -735,6 +735,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; struct filter_pred **preds; + char *filter_string; }; struct event_subsystem { @@ -746,7 +747,8 @@ struct event_subsystem { struct filter_pred; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, + int val1, int val2); struct filter_pred { filter_pred_fn_t fn; @@ -756,23 +758,18 @@ struct filter_pred { char *field_name; int offset; int not; - int or; - int compound; - int clear; + int op; + int pop_n; }; -extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); -extern int filter_parse(char **pbuf, struct filter_pred *pred); -extern int filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred); -extern void filter_disable_preds(struct ftrace_event_call *call); -extern void filter_free_subsystem_preds(struct event_subsystem
*system); -extern void filter_print_subsystem_preds(struct event_subsystem *system, +extern int apply_event_filter(struct ftrace_event_call *call, + char *filter_string); +extern int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); -extern int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -787,6 +784,47 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bbbea747937..f789ca540fe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -492,7 +492,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call, s); + print_event_filter(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -505,40 +505,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_disable_preds(call); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_pred(call, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_event_filter(call, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } - - filter_free_pred(pred); *ppos += cnt; @@ -562,7 +548,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_subsystem_preds(system, s); + print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -575,38 +561,26 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct event_subsystem *system = filp->private_data; - char buf[64], *pbuf = buf; - struct 
filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_free_subsystem_preds(system); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_subsystem_pred(system, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_subsystem_event_filter(system, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } *ppos += cnt; @@ -760,11 +734,21 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->filter = NULL; + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) { + pr_warning("Could not allocate filter for subsystem " + "'%s'\n", name); + return system->entry; + } + entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); - if (!entry) + if (!entry) { + kfree(system->filter); + system->filter = NULL; pr_warning("Could not create debugfs " "'%s/filter' entry\n", name); + } return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1e861eca3d0..f49486687ee 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -29,51 +29,130 @@ static DEFINE_MUTEX(filter_mutex); -static int filter_pred_64(struct filter_pred *pred, void *event) +enum filter_op_ids { - u64 *addr = (u64 *)(event + pred->offset); - u64 val = (u64)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_32(struct filter_pred *pred, void *event) -{ - u32 *addr = (u32 *)(event + pred->offset); - u32 val = (u32)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_16(struct filter_pred *pred, void *event) + OP_OR, + OP_AND, + OP_NE, + OP_EQ, + OP_LT, + OP_LE, + OP_GT, + OP_GE, + OP_NONE, + OP_OPEN_PAREN, +}; + +struct filter_op { + int id; + char *string; + int precedence; +}; + +static struct filter_op filter_ops[] = { + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, +}; + +enum { + FILT_ERR_NONE, + FILT_ERR_INVALID_OP, + FILT_ERR_UNBALANCED_PAREN, + FILT_ERR_TOO_MANY_OPERANDS, + FILT_ERR_OPERAND_TOO_LONG, + FILT_ERR_FIELD_NOT_FOUND, + FILT_ERR_ILLEGAL_FIELD_OP, + FILT_ERR_ILLEGAL_INTVAL, + FILT_ERR_BAD_SUBSYS_FILTER, + FILT_ERR_TOO_MANY_PREDS, + FILT_ERR_MISSING_FIELD, + FILT_ERR_INVALID_FILTER, +}; + +static char *err_text[] = { + "No error", + "Invalid operator", + "Unbalanced parens", + "Too many operands", + "Operand too long", + "Field not found", + "Illegal operation for field type", + "Illegal integer value", + "Couldn't find or set field in one of a subsystem's events", + "Too many terms in predicate expression", + "Missing field name and/or value", + "Meaningless filter expression", +}; + +struct opstack_op { + int op; + struct list_head list; +}; + +struct postfix_elt { + int op; + char *operand; + struct list_head list; +}; 
+ +struct filter_parse_state { + struct filter_op *ops; + struct list_head opstack; + struct list_head postfix; + int lasterr; + int lasterr_pos; + + struct { + char *string; + unsigned int cnt; + unsigned int tail; + } infix; + + struct { + char string[MAX_FILTER_STR_VAL]; + int pos; + unsigned int tail; + } operand; +}; + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +static int filter_pred_and(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) { - u16 *addr = (u16 *)(event + pred->offset); - u16 val = (u16)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; + return val1 && val2; } -static int filter_pred_8(struct filter_pred *pred, void *event) +static int filter_pred_or(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) { - u8 *addr = (u8 *)(event + pred->offset); - u8 val = (u8)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; + return val1 || val2; } -static int filter_pred_string(struct filter_pred *pred, void *event) +static int filter_pred_string(struct filter_pred *pred, void *event, + int val1, int val2) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -85,7 +164,8 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } -static int filter_pred_none(struct filter_pred *pred, void *event) +static int filter_pred_none(struct filter_pred *pred, void *event, + int val1, int val2) { return 0; } @@ -94,66 +174,119 @@ static int filter_pred_none(struct filter_pred *pred, void *event) int filter_match_preds(struct ftrace_event_call *call, void *rec) { struct event_filter *filter = call->filter; - int i, matched, and_failed = 0; + int match, top = 0, val1 = 0, val2 = 0; + int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int i; for (i = 0; i < filter->n_preds; i++) { pred = filter->preds[i]; - if (and_failed && !pred->or) + if (!pred->pop_n) { + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; + } + if (pred->pop_n > top) { + WARN_ON_ONCE(1); + return 0; + } + val1 = stack[--top]; + val2 = stack[--top]; + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; } - if (and_failed) - return 0; - - return 1; + return stack[--top]; } EXPORT_SYMBOL_GPL(filter_match_preds); -static void __filter_print_preds(struct event_filter *filter, - struct trace_seq *s) +static void parse_error(struct filter_parse_state *ps, int err, int pos) { - struct filter_pred *pred; - char *field_name; - int i; + ps->lasterr = err; + ps->lasterr_pos = pos; +} - if (!filter || !filter->n_preds) { - trace_seq_printf(s, "none\n"); +static void remove_filter_string(struct event_filter *filter) +{ + kfree(filter->filter_string); + filter->filter_string = NULL; +} + +static int replace_filter_string(struct event_filter *filter, + char *filter_string) +{ + kfree(filter->filter_string); + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + return -ENOMEM; + + return 0; +} + +static int 
append_filter_string(struct event_filter *filter, + char *string) +{ + int newlen; + char *new_filter_string; + + BUG_ON(!filter->filter_string); + newlen = strlen(filter->filter_string) + strlen(string) + 1; + new_filter_string = kmalloc(newlen, GFP_KERNEL); + if (!new_filter_string) + return -ENOMEM; + + strcpy(new_filter_string, filter->filter_string); + strcat(new_filter_string, string); + kfree(filter->filter_string); + filter->filter_string = new_filter_string; + + return 0; +} + +static void append_filter_err(struct filter_parse_state *ps, + struct event_filter *filter) +{ + int pos = ps->lasterr_pos; + char *buf, *pbuf; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return; - } - for (i = 0; i < filter->n_preds; i++) { - pred = filter->preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_len) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } + append_filter_string(filter, "\n"); + memset(buf, ' ', PAGE_SIZE); + if (pos > PAGE_SIZE - 128) + pos = 0; + buf[pos] = '^'; + pbuf = &buf[pos] + 1; + + sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); + append_filter_string(filter, buf); + free_page((unsigned long) buf); } -void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { + struct event_filter *filter = call->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(call->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } -void filter_print_subsystem_preds(struct event_subsystem *system, +void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { + struct event_filter *filter = system->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(system->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } @@ -170,7 +303,7 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } -void filter_free_pred(struct filter_pred *pred) +static void filter_free_pred(struct filter_pred *pred) { if (!pred) return; @@ -191,15 +324,17 @@ static int filter_set_pred(struct filter_pred *dest, filter_pred_fn_t fn) { *dest = *src; - dest->field_name = kstrdup(src->field_name, GFP_KERNEL); - if (!dest->field_name) - return -ENOMEM; + if (src->field_name) { + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + } dest->fn = fn; return 0; } -static void __filter_disable_preds(struct ftrace_event_call *call) +static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; @@ -211,13 +346,6 @@ static void __filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void filter_disable_preds(struct ftrace_event_call *call) -{ - mutex_lock(&filter_mutex); - __filter_disable_preds(call); - mutex_unlock(&filter_mutex); -} - int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -258,48 +386,43 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -static void __filter_free_subsystem_preds(struct event_subsystem *system) +static void 
filter_free_subsystem_preds(struct event_subsystem *system) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (filter && filter->n_preds) { + if (filter->n_preds) { for (i = 0; i < filter->n_preds; i++) filter_free_pred(filter->preds[i]); kfree(filter->preds); - kfree(filter); - system->filter = NULL; + filter->preds = NULL; + filter->n_preds = 0; } list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; - if (!strcmp(call->system, system->name)) - __filter_disable_preds(call); + if (!strcmp(call->system, system->name)) { + filter_disable_preds(call); + remove_filter_string(call->filter); + } } } -void filter_free_subsystem_preds(struct event_subsystem *system) -{ - mutex_lock(&filter_mutex); - __filter_free_subsystem_preds(system); - mutex_unlock(&filter_mutex); -} - -static int filter_add_pred_fn(struct ftrace_event_call *call, +static int filter_add_pred_fn(struct filter_parse_state *ps, + struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { struct event_filter *filter = call->filter; int idx, err; - if (filter->n_preds && !pred->compound) - __filter_disable_preds(call); - - if (filter->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; + } idx = filter->n_preds; filter_clear_pred(filter->preds[idx]); @@ -321,94 +444,132 @@ static int is_string_field(const char *type) return 0; } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) +static int is_legal_op(struct ftrace_event_field *field, int op) +{ + if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + return 0; + + return 1; +} + +static filter_pred_fn_t select_comparison_fn(int op, int field_size, + int field_is_signed) +{ + filter_pred_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_64; + else if (field_is_signed) + fn = filter_pred_s64; + else + fn = filter_pred_u64; + break; + case 4: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_32; + else if (field_is_signed) + fn = filter_pred_s32; + else + fn = filter_pred_u32; + break; + case 2: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_16; + else if (field_is_signed) + fn = filter_pred_s16; + else + fn = filter_pred_u16; + break; + case 1: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_8; + else if (field_is_signed) + fn = filter_pred_s8; + else + fn = filter_pred_u8; + break; + } + + return fn; +} + +static int filter_add_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; + pred->fn = filter_pred_none; + + if (pred->op == OP_AND) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_and); + } else if (pred->op == OP_OR) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_or); + } + field = find_event_field(call, pred->field_name); - if (!field) + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return -EINVAL; + } - pred->fn = filter_pred_none; pred->offset = field->offset; + if (!is_legal_op(field, pred->op)) { + parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); + return -EINVAL; + } + if (is_string_field(field->type)) { fn = filter_pred_string; pred->str_len = field->size; - return filter_add_pred_fn(call, pred, fn); + if (pred->op == OP_NE) + pred->not = 1; + return filter_add_pred_fn(ps, call, 
pred, fn); } else { - if (strict_strtoull(pred->str_val, 0, &val)) + if (strict_strtoull(pred->str_val, 0, &val)) { + parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; + } pred->val = val; } - switch (field->size) { - case 8: - fn = filter_pred_64; - break; - case 4: - fn = filter_pred_32; - break; - case 2: - fn = filter_pred_16; - break; - case 1: - fn = filter_pred_8; - break; - default: + fn = select_comparison_fn(pred->op, field->size, field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - return filter_add_pred_fn(call, pred, fn); -} - -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) -{ - int err; - - mutex_lock(&filter_mutex); - err = __filter_add_pred(call, pred); - mutex_unlock(&filter_mutex); + if (pred->op == OP_NE) + pred->not = 1; - return err; + return filter_add_pred_fn(ps, call, pred, fn); } -int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred) +static int filter_add_subsystem_pred(struct filter_parse_state *ps, + struct event_subsystem *system, + struct filter_pred *pred, + char *filter_string) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; - mutex_lock(&filter_mutex); - - if (filter && filter->n_preds && !pred->compound) { - __filter_free_subsystem_preds(system); - filter = NULL; - } - - if (!filter) { - system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!system->filter) { - mutex_unlock(&filter_mutex); - return -ENOMEM; - } - filter = system->filter; + if (!filter->preds) { filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!filter->preds) { - kfree(system->filter); - system->filter = NULL; - mutex_unlock(&filter_mutex); + if (!filter->preds) return -ENOMEM; - } } if (filter->n_preds == MAX_FILTER_PRED) { - mutex_unlock(&filter_mutex); + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -424,97 +585,508 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (strcmp(call->system, system->name)) continue; - err = __filter_add_pred(call, pred); - if (err == -ENOMEM) { - filter->preds[filter->n_preds] = NULL; - filter->n_preds--; - mutex_unlock(&filter_mutex); + err = filter_add_pred(ps, call, pred); + if (err) { + filter_free_subsystem_preds(system); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return err; } + replace_filter_string(call->filter, filter_string); } - mutex_unlock(&filter_mutex); + return 0; +} + +static void parse_init(struct filter_parse_state *ps, + struct filter_op *ops, + char *infix_string) +{ + memset(ps, '\0', sizeof(*ps)); + + ps->infix.string = infix_string; + ps->infix.cnt = strlen(infix_string); + ps->ops = ops; + + INIT_LIST_HEAD(&ps->opstack); + INIT_LIST_HEAD(&ps->postfix); +} + +static char infix_next(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + + return ps->infix.string[ps->infix.tail++]; +} + +static char infix_peek(struct filter_parse_state *ps) +{ + if (ps->infix.tail == strlen(ps->infix.string)) + return 0; + + return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, + int a, int b) +{ + return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ + int i; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (ps->ops[i].string[0] == c) + return 1; + } return 0; } -/* - * 
The filter format can be - * - 0, which means remove all filter preds - * - [||/&&] ==/!= - */ -int filter_parse(char **pbuf, struct filter_pred *pred) -{ - char *tok, *val_str = NULL; - int tok_n = 0; - - while ((tok = strsep(pbuf, " \n"))) { - if (tok_n == 0) { - if (!strcmp(tok, "0")) { - pred->clear = 1; - return 0; - } else if (!strcmp(tok, "&&")) { - pred->or = 0; - pred->compound = 1; - } else if (!strcmp(tok, "||")) { - pred->or = 1; - pred->compound = 1; - } else - pred->field_name = tok; - tok_n = 1; - continue; +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ + char nextc = infix_peek(ps); + char opstr[3]; + int i; + + opstr[0] = firstc; + opstr[1] = nextc; + opstr[2] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) { + infix_advance(ps); + return ps->ops[i].id; } - if (tok_n == 1) { - if (!pred->field_name) - pred->field_name = tok; - else if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; + } + + opstr[1] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) + return ps->ops[i].id; + } + + return OP_NONE; +} + +static inline void clear_operand_string(struct filter_parse_state *ps) +{ + memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); + ps->operand.tail = 0; +} + +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ + if (ps->operand.tail == MAX_FILTER_STR_VAL) + return -EINVAL; + + ps->operand.string[ps->operand.tail++] = c; + + return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ + struct opstack_op *opstack_op; + + opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); + if (!opstack_op) + return -ENOMEM; + + opstack_op->op = op; + list_add(&opstack_op->list, &ps->opstack); + + return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ + return list_empty(&ps->opstack); +} + +static int filter_opstack_top(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + + return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + int op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + op = opstack_op->op; + list_del(&opstack_op->list); + + kfree(opstack_op); + + return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ + while (!filter_opstack_empty(ps)) + filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ + return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = OP_NONE; + elt->operand = kstrdup(operand, GFP_KERNEL); + if (!elt->operand) { + kfree(elt); + return -ENOMEM; + } + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = op; + elt->operand = NULL; + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static void postfix_clear(struct filter_parse_state 
*ps) +{ + struct postfix_elt *elt; + + while (!list_empty(&ps->postfix)) { + elt = list_first_entry(&ps->postfix, struct postfix_elt, list); + kfree(elt->operand); + list_del(&elt->list); + } +} + +static int filter_parse(struct filter_parse_state *ps) +{ + int op, top_op; + char ch; + + while ((ch = infix_next(ps))) { + if (isspace(ch)) + continue; + + if (is_op_char(ps, ch)) { + op = infix_get_op(ps, ch); + if (op == OP_NONE) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - tok_n = 2; + + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_top(ps); + if (!is_precedence_lower(ps, top_op, op)) { + top_op = filter_opstack_pop(ps); + postfix_append_op(ps, top_op); + continue; + } + break; + } + + filter_opstack_push(ps, op); continue; } - if (tok_n == 2) { - if (pred->compound) { - if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; - return -EINVAL; - } - } else { - val_str = tok; - break; /* done */ + + if (ch == '(') { + filter_opstack_push(ps, OP_OPEN_PAREN); + continue; + } + + if (ch == ')') { + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + top_op = filter_opstack_pop(ps); + while (top_op != OP_NONE) { + if (top_op == OP_OPEN_PAREN) + break; + postfix_append_op(ps, top_op); + top_op = filter_opstack_pop(ps); + } + if (top_op == OP_NONE) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; } - tok_n = 3; continue; } - if (tok_n == 3) { - val_str = tok; - break; /* done */ + if (append_operand_char(ps, ch)) { + parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); + return -EINVAL; + } + } + + if (strlen(curr_operand(ps))) + postfix_append_operand(ps, curr_operand(ps)); + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_pop(ps); + if (top_op == OP_NONE) + break; + if (top_op == OP_OPEN_PAREN) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + postfix_append_op(ps, top_op); + } + + return 0; +} + +static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->field_name = kstrdup(operand1, GFP_KERNEL); + if (!pred->field_name) { + kfree(pred); + return NULL; + } + + strcpy(pred->str_val, operand2); + pred->str_len = strlen(operand2); + + pred->op = op; + + return pred; +} + +static struct filter_pred *create_logical_pred(int op) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->op = op; + + return pred; +} + +static int check_preds(struct filter_parse_state *ps) +{ + int n_normal_preds = 0, n_logical_preds = 0; + struct postfix_elt *elt; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + + if (elt->op == OP_AND || elt->op == OP_OR) { + n_logical_preds++; + continue; } + n_normal_preds++; } - if (!val_str || !strlen(val_str) - || strlen(val_str) >= MAX_FILTER_STR_VAL) { - pred->field_name = NULL; + if (!n_normal_preds || n_logical_preds >= n_normal_preds) { + parse_error(ps, FILT_ERR_INVALID_FILTER, 0); return -EINVAL; } - strcpy(pred->str_val, val_str); - pred->str_len = strlen(val_str); + return 0; +} - pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!pred->field_name) - return -ENOMEM; +static int 
replace_preds(struct event_subsystem *system, + struct ftrace_event_call *call, + struct filter_parse_state *ps, + char *filter_string) +{ + char *operand1 = NULL, *operand2 = NULL; + struct filter_pred *pred; + struct postfix_elt *elt; + int err; + + err = check_preds(ps); + if (err) + return err; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) { + if (!operand1) + operand1 = elt->operand; + else if (!operand2) + operand2 = elt->operand; + else { + parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); + return -EINVAL; + } + continue; + } + + if (elt->op == OP_AND || elt->op == OP_OR) { + pred = create_logical_pred(elt->op); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, + pred, filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + continue; + } + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + return -EINVAL; + } + + pred = create_pred(elt->op, operand1, operand2); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, pred, + filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + } return 0; } +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable_preds(call); + remove_filter_string(call->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_disable_preds(call); + replace_filter_string(call->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, call->filter); + goto out; + } + + err = replace_preds(NULL, call, ps, filter_string); + if (err) + append_filter_err(ps, call->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} + +int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(system); + remove_filter_string(system->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_free_subsystem_preds(system); + replace_filter_string(system->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + err = replace_preds(system, NULL, ps, filter_string); + if (err) + append_filter_err(ps, system->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} -- cgit v1.2.3-70-g09d2 From 829b42dd395c5801f6ae87da87ecbdcfd5ef1a6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:46:59 +0200 Subject: perf_counter, x86: declare perf_max_counters only for CONFIG_PERF_COUNTERS This is only needed when CONFIG_PERF_COUNTERS is enabled.
[ Impact: cleanup ] Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241002046-8832-3-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 98143288530..be10b3ffe32 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -512,12 +512,13 @@ struct perf_cpu_context { int recursion[4]; }; +#ifdef CONFIG_PERF_COUNTERS + /* * Set by architecture code: */ extern int perf_max_counters; -#ifdef CONFIG_PERF_COUNTERS extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); -- cgit v1.2.3-70-g09d2 From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus easier to handle. Where it was appropriate, names of functions and variables have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 25 +++++++------- arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++++----------- include/linux/perf_counter.h | 9 +++-- kernel/perf_counter.c | 68 ++++++++++++++++++-------------------- 4 files changed, 66 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bd76d0fa2c3..d9bbe5efc64 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -256,7 +256,7 @@ static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) return 0; } -static void power_perf_read(struct perf_counter *counter) +static void power_pmu_read(struct perf_counter *counter) { long val, delta, prev; @@ -405,7 +405,7 @@ void hw_perf_restore(u64 disable) for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { - power_perf_read(counter); + power_pmu_read(counter); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; } @@ -477,7 +477,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) counter->oncpu = cpu; counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; if (is_software_counter(counter)) - counter->hw_ops->enable(counter); + counter->pmu->enable(counter); } /* @@ -533,7 +533,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, * re-enable the PMU in order to get hw_perf_restore to do the * actual work of reconfiguring the PMU. */ -static int power_perf_enable(struct perf_counter *counter) +static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; @@ -573,7 +573,7 @@ static int power_perf_enable(struct perf_counter *counter) /* * Remove a counter from the PMU. 
*/ -static void power_perf_disable(struct perf_counter *counter) +static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; @@ -583,7 +583,7 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_save(flags); pmudis = hw_perf_save_disable(); - power_perf_read(counter); + power_pmu_read(counter); cpuhw = &__get_cpu_var(cpu_hw_counters); for (i = 0; i < cpuhw->n_counters; ++i) { @@ -607,10 +607,10 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_restore(flags); } -struct hw_perf_counter_ops power_perf_ops = { - .enable = power_perf_enable, - .disable = power_perf_disable, - .read = power_perf_read +struct pmu power_pmu = { + .enable = power_pmu_enable, + .disable = power_pmu_disable, + .read = power_pmu_read, }; /* Number of perf_counters counting hardware events */ @@ -631,8 +631,7 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { unsigned long ev; struct perf_counter *ctrs[MAX_HWCOUNTERS]; @@ -705,7 +704,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &power_perf_ops; + return &power_pmu; } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ad663d5ad2d..95de980c74a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -515,8 +515,8 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__pmc_generic_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) +__x86_pmu_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -591,8 +591,8 @@ __pmc_fixed_enable(struct perf_counter *counter, } static void -__pmc_generic_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +__x86_pmu_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); @@ -626,7 +626,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static int pmc_generic_enable(struct perf_counter *counter) +static int x86_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -667,7 +667,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; /* @@ -676,7 +676,7 @@ try_generic: barrier(); __hw_perf_counter_set_period(counter, hwc, idx); - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); return 0; } @@ -731,13 +731,13 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void pmc_generic_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ 
-767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); } /* @@ -805,7 +805,7 @@ again: perf_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __pmc_generic_disable(counter, &counter->hw, bit); + __x86_pmu_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); @@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void pmc_generic_read(struct perf_counter *counter) +static void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .enable = pmc_generic_enable, - .disable = pmc_generic_disable, - .read = pmc_generic_read, +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, }; -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { int err; @@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &x86_perf_counter_ops; + return &pmu; } /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index be10b3ffe32..c3db52dc876 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -334,9 +334,9 @@ struct hw_perf_counter { struct perf_counter; /** - * struct hw_perf_counter_ops - performance counter hw ops + * struct pmu - generic performance monitoring unit */ -struct hw_perf_counter_ops { +struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); @@ -381,7 +381,7 @@ struct perf_counter { struct list_head sibling_list; int nr_siblings; struct perf_counter *group_leader; - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; enum perf_counter_active_state state; enum perf_counter_active_state prev_state; @@ -519,8 +519,7 @@ struct perf_cpu_context { */ extern int perf_max_counters; -extern const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter); +extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 09396098dd0..582108addef 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -52,8 +52,7 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { return NULL; } @@ -124,7 +123,7 @@ counter_sched_out(struct perf_counter *counter, counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_stopped = ctx->time; - counter->hw_ops->disable(counter); + counter->pmu->disable(counter); counter->oncpu = -1; if (!is_software_counter(counter)) @@ -417,7 +416,7 @@ counter_sched_in(struct perf_counter *counter, */ smp_wmb(); - if (counter->hw_ops->enable(counter)) { + if (counter->pmu->enable(counter)) { 
counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; return -EAGAIN; @@ -1096,7 +1095,7 @@ static void __read(void *info) local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); - counter->hw_ops->read(counter); + counter->pmu->read(counter); update_counter_times(counter); local_irq_restore(flags); } @@ -1922,7 +1921,7 @@ static void perf_counter_output(struct perf_counter *counter, leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) - sub->hw_ops->read(sub); + sub->pmu->read(sub); group_entry.event = sub->hw_event.config; group_entry.counter = atomic64_read(&sub->count); @@ -2264,7 +2263,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) struct pt_regs *regs; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->hw_ops->read(counter); + counter->pmu->read(counter); regs = get_irq_regs(); /* @@ -2410,7 +2409,7 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_generic = { +static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, @@ -2460,7 +2459,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) cpu_clock_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_clock = { +static const struct pmu perf_ops_cpu_clock = { .enable = cpu_clock_perf_counter_enable, .disable = cpu_clock_perf_counter_disable, .read = cpu_clock_perf_counter_read, @@ -2522,7 +2521,7 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, time); } -static const struct hw_perf_counter_ops perf_ops_task_clock = { +static const struct pmu perf_ops_task_clock = { .enable = task_clock_perf_counter_enable, .disable = task_clock_perf_counter_disable, .read = task_clock_perf_counter_read, @@ -2574,7 +2573,7 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { +static const struct pmu perf_ops_cpu_migrations = { .enable = cpu_migrations_perf_counter_enable, .disable = cpu_migrations_perf_counter_disable, .read = cpu_migrations_perf_counter_read, @@ -2600,8 +2599,7 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) ftrace_profile_disable(perf_event_id(&counter->hw_event)); } -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { int event_id = perf_event_id(&counter->hw_event); int ret; @@ -2616,18 +2614,16 @@ tp_perf_counter_init(struct perf_counter *counter) return &perf_ops_generic; } #else -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { return NULL; } #endif -static const struct hw_perf_counter_ops * -sw_perf_counter_init(struct perf_counter *counter) +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; - const struct hw_perf_counter_ops *hw_ops = NULL; + const struct pmu *pmu = NULL; struct hw_perf_counter *hwc = &counter->hw; /* @@ -2639,7 +2635,7 @@ sw_perf_counter_init(struct perf_counter *counter) */ 
switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2650,9 +2646,9 @@ sw_perf_counter_init(struct perf_counter *counter) * use the cpu_clock counter instead. */ if (counter->ctx->task) - hw_ops = &perf_ops_task_clock; + pmu = &perf_ops_task_clock; else - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2661,18 +2657,18 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_generic; + pmu = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_cpu_migrations; + pmu = &perf_ops_cpu_migrations; break; } - if (hw_ops) + if (pmu) hwc->irq_period = hw_event->irq_period; - return hw_ops; + return pmu; } /* @@ -2685,7 +2681,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, struct perf_counter *group_leader, gfp_t gfpflags) { - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; struct perf_counter *counter; long err; @@ -2713,46 +2709,46 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; counter->group_leader = group_leader; - counter->hw_ops = NULL; + counter->pmu = NULL; counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; - hw_ops = NULL; + pmu = NULL; if (perf_event_raw(hw_event)) { - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); goto done; } switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); break; case PERF_TYPE_SOFTWARE: - hw_ops = sw_perf_counter_init(counter); + pmu = sw_perf_counter_init(counter); break; case PERF_TYPE_TRACEPOINT: - hw_ops = tp_perf_counter_init(counter); + pmu = tp_perf_counter_init(counter); break; } done: err = 0; - if (!hw_ops) + if (!pmu) err = -EINVAL; - else if (IS_ERR(hw_ops)) - err = PTR_ERR(hw_ops); + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); if (err) { kfree(counter); return ERR_PTR(err); } - counter->hw_ops = hw_ops; + counter->pmu = pmu; if (counter->hw_event.mmap) atomic_inc(&nr_mmap_tracking); -- cgit v1.2.3-70-g09d2 From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:17 +0200 Subject: perf_counter, x86: consistent use of type int for counter index The type of counter index is sometimes implemented as unsigned int. This patch changes this to have a consistent usage of int. 
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- include/linux/perf_counter.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7fd4a35515..d8beebeb270 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config) static inline void __pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter, static inline void __x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, static inline void __pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - unsigned int idx = hwc->idx; + int idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c3db52dc876..41aed427005 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -318,7 +318,7 @@ struct hw_perf_counter { unsigned long config_base; unsigned long counter_base; int nmi; - unsigned int idx; + int idx; }; union { /* software */ atomic64_t count; -- cgit v1.2.3-70-g09d2 From b1fca26631f76a5e8b18435a43f5d82b8734da4b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 23 Mar 2009 18:22:09 +0100 Subject: mutex: add atomic_dec_and_mutex_lock() Much like the atomic_dec_and_lock() function, in which we take and hold a spin_lock if we drop the atomic to 0, this function takes and holds the mutex if we dec the atomic to 0.
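As a usage illustration (a minimal sketch, not part of this patch; the refcounted object and its mutex below are hypothetical):

#include <linux/mutex.h>
#include <asm/atomic.h>

static atomic_t foo_refs = ATOMIC_INIT(1);
static DEFINE_MUTEX(foo_mutex);

/* Drop a reference; only the final put enters the locked teardown path. */
static void foo_put(void)
{
	if (!atomic_dec_and_mutex_lock(&foo_refs, &foo_mutex))
		return;	/* not the last reference; the mutex was not taken */

	/* the count hit 0 and foo_mutex is now held: safe to tear down */
	mutex_unlock(&foo_mutex);
}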
Signed-off-by: Eric Paris Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.410913479@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3069ec7e0ab..93054fc3635 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} + #endif -- cgit v1.2.3-70-g09d2 From c654b8a9cba6002aad1c01919e4928a79a4a6dcf Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 16 Apr 2009 17:33:25 -0400 Subject: nfsd: support ext4 i_version ext4 supports a real NFSv4 change attribute, which is bumped whenever the ctime would be updated, including times when two updates arrive within a jiffy of each other. (Note that although ext4 has space for nanosecond-precision ctime, the real resolution is lower: it actually uses jiffies as the time-source.) This ensures clients will invalidate their caches when they need to. There is some fear that keeping the i_version up-to-date could have performance drawbacks, so for now it's turned on only by a mount option. We hope to do something better eventually. Signed-off-by: J. 
Bruce Fields Cc: Theodore Tso --- fs/nfsd/nfs3xdr.c | 1 + fs/nfsd/nfs4xdr.c | 63 ++++++++++++++++++++++++++++++---------------- include/linux/nfsd/nfsfh.h | 7 ++++++ include/linux/nfsd/xdr4.h | 17 ++++++++++--- 4 files changed, 63 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 17d0dd99720..01d4ec1c88e 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -272,6 +272,7 @@ void fill_post_wcc(struct svc_fh *fhp) err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &fhp->fh_post_attr); + fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; if (err) fhp->fh_post_saved = 0; else diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 4a71fcd3f03..12d36a7361c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1490,13 +1490,41 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) memcpy(p, ptr, nbytes); \ p += XDR_QUADLEN(nbytes); \ }} while (0) -#define WRITECINFO(c) do { \ - *p++ = htonl(c.atomic); \ - *p++ = htonl(c.before_ctime_sec); \ - *p++ = htonl(c.before_ctime_nsec); \ - *p++ = htonl(c.after_ctime_sec); \ - *p++ = htonl(c.after_ctime_nsec); \ -} while (0) + +static void write32(__be32 **p, u32 n) +{ + *(*p)++ = n; +} + +static void write64(__be32 **p, u64 n) +{ + write32(p, (u32)(n >> 32)); + write32(p, (u32)n); +} + +static void write_change(__be32 **p, struct kstat *stat, struct inode *inode) +{ + if (IS_I_VERSION(inode)) { + write64(p, inode->i_version); + } else { + write32(p, stat->ctime.tv_sec); + write32(p, stat->ctime.tv_nsec); + } +} + +static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) +{ + write32(p, c->atomic); + if (c->change_supported) { + write64(p, c->before_change); + write64(p, c->after_change); + } else { + write32(p, c->before_ctime_sec); + write32(p, c->before_ctime_nsec); + write32(p, c->after_ctime_sec); + write32(p, c->after_ctime_nsec); + } +} #define RESERVE_SPACE(nbytes) do { \ p = resp->p; \ @@ -1849,16 +1877,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME); } if (bmval0 & FATTR4_WORD0_CHANGE) { - /* - * Note: This _must_ be consistent with the scheme for writing - * change_info, so any changes made here must be reflected there - * as well. (See xdr4.h:set_change_info() and the WRITECINFO() - * macro above.) 
- */ if ((buflen -= 8) < 0) goto out_resource; - WRITE32(stat.ctime.tv_sec); - WRITE32(stat.ctime.tv_nsec); + write_change(&p, &stat, dentry->d_inode); } if (bmval0 & FATTR4_WORD0_SIZE) { if ((buflen -= 8) < 0) @@ -2364,7 +2385,7 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ if (!nfserr) { RESERVE_SPACE(32); - WRITECINFO(create->cr_cinfo); + write_cinfo(&p, &create->cr_cinfo); WRITE32(2); WRITE32(create->cr_bmval[0]); WRITE32(create->cr_bmval[1]); @@ -2475,7 +2496,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li if (!nfserr) { RESERVE_SPACE(20); - WRITECINFO(link->li_cinfo); + write_cinfo(&p, &link->li_cinfo); ADJUST_ARGS(); } return nfserr; @@ -2493,7 +2514,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op nfsd4_encode_stateid(resp, &open->op_stateid); RESERVE_SPACE(40); - WRITECINFO(open->op_cinfo); + write_cinfo(&p, &open->op_cinfo); WRITE32(open->op_rflags); WRITE32(2); WRITE32(open->op_bmval[0]); @@ -2771,7 +2792,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ if (!nfserr) { RESERVE_SPACE(20); - WRITECINFO(remove->rm_cinfo); + write_cinfo(&p, &remove->rm_cinfo); ADJUST_ARGS(); } return nfserr; @@ -2784,8 +2805,8 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ if (!nfserr) { RESERVE_SPACE(40); - WRITECINFO(rename->rn_sinfo); - WRITECINFO(rename->rn_tinfo); + write_cinfo(&p, &rename->rn_sinfo); + write_cinfo(&p, &rename->rn_tinfo); ADJUST_ARGS(); } return nfserr; diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index afa19016c4a..8f641c90845 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -151,9 +151,15 @@ typedef struct svc_fh { __u64 fh_pre_size; /* size before operation */ struct timespec fh_pre_mtime; /* mtime before oper */ struct timespec fh_pre_ctime; /* ctime before oper */ + /* + * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) + * to find out if it is valid. 
+ */ + u64 fh_pre_change; /* Post-op attributes saved in fh_unlock */ struct kstat fh_post_attr; /* full attrs after operation */ + u64 fh_post_change; /* nfsv4 change; see above */ #endif /* CONFIG_NFSD_V3 */ } svc_fh; @@ -298,6 +304,7 @@ fill_pre_wcc(struct svc_fh *fhp) fhp->fh_pre_mtime = inode->i_mtime; fhp->fh_pre_ctime = inode->i_ctime; fhp->fh_pre_size = inode->i_size; + fhp->fh_pre_change = inode->i_version; fhp->fh_pre_saved = 1; } } diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index f80d6013fdc..d0f050f01ec 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -64,10 +64,13 @@ static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) struct nfsd4_change_info { u32 atomic; + bool change_supported; u32 before_ctime_sec; u32 before_ctime_nsec; + u64 before_change; u32 after_ctime_sec; u32 after_ctime_nsec; + u64 after_change; }; struct nfsd4_access { @@ -503,10 +506,16 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); cinfo->atomic = 1; - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); + if (cinfo->change_supported) { + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + } else { + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + } } int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); -- cgit v1.2.3-70-g09d2 From 3cef9ab266a932899e756f7e1ea7a988a97bf3b2 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 23 Feb 2009 21:42:10 -0800 Subject: nfsd4: lookup up callback cred only once Lookup the callback cred once and then use it for all subsequent callbacks. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 26 ++++++++++++++++++++++++++ fs/nfsd/nfs4state.c | 4 ++++ include/linux/nfsd/state.h | 1 + 3 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 711c6282151..cc10ed35ac8 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -415,6 +415,22 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason) (int)clp->cl_name.len, clp->cl_name.data, reason); } +static struct rpc_cred *lookup_cb_cred(struct nfs4_callback *cb) +{ + struct auth_cred acred = { + .machine_cred = 1 + }; + + /* + * Note in the gss case this doesn't actually have to wait for a + * gss upcall (or any calls to the client); this just creates a + * non-uptodate cred which the rpc state machine will fill in with + * a refresh_upcall later. 
+ */ + return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred, + RPCAUTH_LOOKUP_NEW); +} + static int do_probe_callback(void *data) { struct nfs4_client *clp = data; @@ -423,9 +439,18 @@ static int do_probe_callback(void *data) .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], .rpc_argp = clp, }; + struct rpc_cred *cred; int status; + cred = lookup_cb_cred(cb); + if (IS_ERR(cred)) { + status = PTR_ERR(cred); + goto out; + } + cb->cb_cred = cred; + msg.rpc_cred = cb->cb_cred; status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT); +out: if (status) warn_no_callback_path(clp, status); else @@ -475,6 +500,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], .rpc_argp = cbr, + .rpc_cred = clp->cl_callback.cb_cred }; int retries = 1; int status = 0; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7e1fcc3aade..b205c7d7bc6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -643,6 +643,10 @@ shutdown_callback_client(struct nfs4_client *clp) clp->cl_callback.cb_client = NULL; rpc_shutdown_client(clnt); } + if (clp->cl_callback.cb_cred) { + put_rpccred(clp->cl_callback.cb_cred); + clp->cl_callback.cb_cred = NULL; + } } static inline void diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 4d61c873fee..8d882a3eb4b 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -97,6 +97,7 @@ struct nfs4_callback { /* RPC client info */ atomic_t cb_set; /* successful CB_NULL call */ struct rpc_clnt * cb_client; + struct rpc_cred * cb_cred; }; /* Maximum number of slots per session. 128 is useful for long haul TCP */ -- cgit v1.2.3-70-g09d2 From 3bcac0263f0b45e67a64034ebcb69eb9abb742f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Apr 2009 13:45:05 +0100 Subject: SELinux: Don't flush inherited SIGKILL during execve() Don't flush inherited SIGKILL during execve() in SELinux's post cred commit hook. This isn't really a security problem: if the SIGKILL came before the credentials were changed, then we were right to receive it at the time, and should honour it; if it came after the creds were changed, then we definitely should honour it; and in any case, all that will happen is that the process will be scrapped before it ever returns to userspace. Signed-off-by: David Howells Signed-off-by: Oleg Nesterov Signed-off-by: James Morris --- include/linux/sched.h | 1 + kernel/signal.c | 11 ++++++++--- security/selinux/hooks.c | 9 +++++---- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1d19c025f9d..d3b787c7aef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1875,6 +1875,7 @@ extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); +extern void __flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); diff --git a/kernel/signal.c b/kernel/signal.c index 1c8814481a1..f93efec14ff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -238,14 +238,19 @@ void flush_sigqueue(struct sigpending *queue) /* * Flush all pending signals for a task. 
*/ +void __flush_signals(struct task_struct *t) +{ + clear_tsk_thread_flag(t, TIF_SIGPENDING); + flush_sigqueue(&t->pending); + flush_sigqueue(&t->signal->shared_pending); +} + void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); - clear_tsk_thread_flag(t, TIF_SIGPENDING); - flush_sigqueue(&t->pending); - flush_sigqueue(&t->signal->shared_pending); + __flush_signals(t); spin_unlock_irqrestore(&t->sighand->siglock, flags); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index dd19ba81201..5a345115036 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2394,11 +2394,12 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) memset(&itimer, 0, sizeof itimer); for (i = 0; i < 3; i++) do_setitimer(i, &itimer, NULL); - flush_signals(current); spin_lock_irq(¤t->sighand->siglock); - flush_signal_handlers(current, 1); - sigemptyset(¤t->blocked); - recalc_sigpending(); + if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { + __flush_signals(current); + flush_signal_handlers(current, 1); + sigemptyset(¤t->blocked); + } spin_unlock_irq(¤t->sighand->siglock); } -- cgit v1.2.3-70-g09d2 From 0821c71751ef88f4251d7206e76ce497ee267a2d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 29 Apr 2009 08:02:59 +0000 Subject: ethtool: Add port type PORT_OTHER Add a PORT_OTHER to represent all other physical port types. Current NICs generally do not allow switching between multiple port types in software so specific types should not be needed. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/ethtool.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 131b127b70f..5ccb6bd660c 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -605,6 +605,7 @@ struct ethtool_ops { #define PORT_MII 0x02 #define PORT_FIBRE 0x03 #define PORT_BNC 0x04 +#define PORT_OTHER 0xff /* Which transceiver to use. */ #define XCVR_INTERNAL 0x00 -- cgit v1.2.3-70-g09d2 From 52c94dfae11d9ffd70b7bd003a36a4e210f2866a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 29 Apr 2009 08:04:14 +0000 Subject: mdio: Add register definitions for MDIO (clause 45) IEEE 802.3 clause 45 specifies the MDIO interface and registers for use in 10G and other PHYs, similar to the MII management interface. PHYs may have up to 32 MMDs corresponding to different sub-layers and functions, each with up to 65536 registers. These are addressed by PRTAD (similar to the MII PHY address) and DEVAD. Define a mapping for specifying PRTAD and DEVAD through the existing MII ioctls. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/mdio.h | 237 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 include/linux/mdio.h (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h new file mode 100644 index 00000000000..d57ddb08a1b --- /dev/null +++ b/include/linux/mdio.h @@ -0,0 +1,237 @@ +/* + * linux/mdio.h: definitions for MDIO (clause 45) transceivers + * Copyright 2006-2009 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. 
+ */ + +#ifndef __LINUX_MDIO_H__ +#define __LINUX_MDIO_H__ + +#include + +/* MDIO Manageable Devices (MMDs). */ +#define MDIO_MMD_PMAPMD 1 /* Physical Medium Attachment/ + * Physical Medium Dependent */ +#define MDIO_MMD_WIS 2 /* WAN Interface Sublayer */ +#define MDIO_MMD_PCS 3 /* Physical Coding Sublayer */ +#define MDIO_MMD_PHYXS 4 /* PHY Extender Sublayer */ +#define MDIO_MMD_DTEXS 5 /* DTE Extender Sublayer */ +#define MDIO_MMD_TC 6 /* Transmission Convergence */ +#define MDIO_MMD_AN 7 /* Auto-Negotiation */ +#define MDIO_MMD_C22EXT 29 /* Clause 22 extension */ +#define MDIO_MMD_VEND1 30 /* Vendor specific 1 */ +#define MDIO_MMD_VEND2 31 /* Vendor specific 2 */ + +/* Generic MDIO registers. */ +#define MDIO_CTRL1 MII_BMCR +#define MDIO_STAT1 MII_BMSR +#define MDIO_DEVID1 MII_PHYSID1 +#define MDIO_DEVID2 MII_PHYSID2 +#define MDIO_SPEED 4 /* Speed ability */ +#define MDIO_DEVS1 5 /* Devices in package */ +#define MDIO_DEVS2 6 +#define MDIO_CTRL2 7 /* 10G control 2 */ +#define MDIO_STAT2 8 /* 10G status 2 */ +#define MDIO_PMA_TXDIS 9 /* 10G PMA/PMD transmit disable */ +#define MDIO_PMA_RXDET 10 /* 10G PMA/PMD receive signal detect */ +#define MDIO_PMA_EXTABLE 11 /* 10G PMA/PMD extended ability */ +#define MDIO_PKGID1 14 /* Package identifier */ +#define MDIO_PKGID2 15 +#define MDIO_AN_ADVERTISE 16 /* AN advertising (base page) */ +#define MDIO_AN_LPA 19 /* AN LP abilities (base page) */ +#define MDIO_PHYXS_LNSTAT 24 /* PHY XGXS lane state */ + +/* Media-dependent registers. */ +#define MDIO_PMA_10GBT_TXPWR 131 /* 10GBASE-T TX power control */ +#define MDIO_PCS_10GBX_STAT1 24 /* 10GBASE-X PCS status 1 */ +#define MDIO_PCS_10GBRT_STAT1 32 /* 10GBASE-R/-T PCS status 1 */ +#define MDIO_PCS_10GBRT_STAT2 33 /* 10GBASE-R/-T PCS status 2 */ +#define MDIO_AN_10GBT_CTRL 32 /* 10GBASE-T auto-negotiation control */ +#define MDIO_AN_10GBT_STAT 33 /* 10GBASE-T auto-negotiation status */ + +/* Control register 1. */ +/* Enable extended speed selection */ +#define MDIO_CTRL1_SPEEDSELEXT (BMCR_SPEED1000 | BMCR_SPEED100) +/* All speed selection bits */ +#define MDIO_CTRL1_SPEEDSEL (MDIO_CTRL1_SPEEDSELEXT | 0x003c) +#define MDIO_CTRL1_FULLDPLX BMCR_FULLDPLX +#define MDIO_CTRL1_LPOWER BMCR_PDOWN +#define MDIO_CTRL1_RESET BMCR_RESET +#define MDIO_PMA_CTRL1_LOOPBACK 0x0001 +#define MDIO_PMA_CTRL1_SPEED1000 BMCR_SPEED1000 +#define MDIO_PMA_CTRL1_SPEED100 BMCR_SPEED100 +#define MDIO_PCS_CTRL1_LOOPBACK BMCR_LOOPBACK +#define MDIO_PHYXS_CTRL1_LOOPBACK BMCR_LOOPBACK +#define MDIO_AN_CTRL1_RESTART BMCR_ANRESTART +#define MDIO_AN_CTRL1_ENABLE BMCR_ANENABLE +#define MDIO_AN_CTRL1_XNP 0x2000 /* Enable extended next page */ + +/* 10 Gb/s */ +#define MDIO_CTRL1_SPEED10G (MDIO_CTRL1_SPEEDSELEXT | 0x00) +/* 10PASS-TS/2BASE-TL */ +#define MDIO_CTRL1_SPEED10P2B (MDIO_CTRL1_SPEEDSELEXT | 0x04) + +/* Status register 1. */ +#define MDIO_STAT1_LPOWERABLE 0x0002 /* Low-power ability */ +#define MDIO_STAT1_LSTATUS BMSR_LSTATUS +#define MDIO_STAT1_FAULT 0x0080 /* Fault */ +#define MDIO_AN_STAT1_LPABLE 0x0001 /* Link partner AN ability */ +#define MDIO_AN_STAT1_ABLE BMSR_ANEGCAPABLE +#define MDIO_AN_STAT1_RFAULT BMSR_RFAULT +#define MDIO_AN_STAT1_COMPLETE BMSR_ANEGCOMPLETE +#define MDIO_AN_STAT1_PAGE 0x0040 /* Page received */ +#define MDIO_AN_STAT1_XNP 0x0080 /* Extended next page status */ + +/* Speed register. 
*/ +#define MDIO_SPEED_10G 0x0001 /* 10G capable */ +#define MDIO_PMA_SPEED_2B 0x0002 /* 2BASE-TL capable */ +#define MDIO_PMA_SPEED_10P 0x0004 /* 10PASS-TS capable */ +#define MDIO_PMA_SPEED_1000 0x0010 /* 1000M capable */ +#define MDIO_PMA_SPEED_100 0x0020 /* 100M capable */ +#define MDIO_PMA_SPEED_10 0x0040 /* 10M capable */ +#define MDIO_PCS_SPEED_10P2B 0x0002 /* 10PASS-TS/2BASE-TL capable */ + +/* Device present registers. */ +#define MDIO_DEVS_PRESENT(devad) (1 << (devad)) +#define MDIO_DEVS_PMAPMD MDIO_DEVS_PRESENT(MDIO_MMD_PMAPMD) +#define MDIO_DEVS_WIS MDIO_DEVS_PRESENT(MDIO_MMD_WIS) +#define MDIO_DEVS_PCS MDIO_DEVS_PRESENT(MDIO_MMD_PCS) +#define MDIO_DEVS_PHYXS MDIO_DEVS_PRESENT(MDIO_MMD_PHYXS) +#define MDIO_DEVS_DTEXS MDIO_DEVS_PRESENT(MDIO_MMD_DTEXS) +#define MDIO_DEVS_TC MDIO_DEVS_PRESENT(MDIO_MMD_TC) +#define MDIO_DEVS_AN MDIO_DEVS_PRESENT(MDIO_MMD_AN) +#define MDIO_DEVS_C22EXT MDIO_DEVS_PRESENT(MDIO_MMD_C22EXT) + +/* Control register 2. */ +#define MDIO_PMA_CTRL2_TYPE 0x000f /* PMA/PMD type selection */ +#define MDIO_PMA_CTRL2_10GBCX4 0x0000 /* 10GBASE-CX4 type */ +#define MDIO_PMA_CTRL2_10GBEW 0x0001 /* 10GBASE-EW type */ +#define MDIO_PMA_CTRL2_10GBLW 0x0002 /* 10GBASE-LW type */ +#define MDIO_PMA_CTRL2_10GBSW 0x0003 /* 10GBASE-SW type */ +#define MDIO_PMA_CTRL2_10GBLX4 0x0004 /* 10GBASE-LX4 type */ +#define MDIO_PMA_CTRL2_10GBER 0x0005 /* 10GBASE-ER type */ +#define MDIO_PMA_CTRL2_10GBLR 0x0006 /* 10GBASE-LR type */ +#define MDIO_PMA_CTRL2_10GBSR 0x0007 /* 10GBASE-SR type */ +#define MDIO_PMA_CTRL2_10GBLRM 0x0008 /* 10GBASE-LRM type */ +#define MDIO_PMA_CTRL2_10GBT 0x0009 /* 10GBASE-T type */ +#define MDIO_PMA_CTRL2_10GBKX4 0x000a /* 10GBASE-KX4 type */ +#define MDIO_PMA_CTRL2_10GBKR 0x000b /* 10GBASE-KR type */ +#define MDIO_PMA_CTRL2_1000BT 0x000c /* 1000BASE-T type */ +#define MDIO_PMA_CTRL2_1000BKX 0x000d /* 1000BASE-KX type */ +#define MDIO_PMA_CTRL2_100BTX 0x000e /* 100BASE-TX type */ +#define MDIO_PMA_CTRL2_10BT 0x000f /* 10BASE-T type */ +#define MDIO_PCS_CTRL2_TYPE 0x0003 /* PCS type selection */ +#define MDIO_PCS_CTRL2_10GBR 0x0000 /* 10GBASE-R type */ +#define MDIO_PCS_CTRL2_10GBX 0x0001 /* 10GBASE-X type */ +#define MDIO_PCS_CTRL2_10GBW 0x0002 /* 10GBASE-W type */ +#define MDIO_PCS_CTRL2_10GBT 0x0003 /* 10GBASE-T type */ + +/* Status register 2. 
*/ +#define MDIO_STAT2_RXFAULT 0x0400 /* Receive fault */ +#define MDIO_STAT2_TXFAULT 0x0800 /* Transmit fault */ +#define MDIO_STAT2_DEVPRST 0xc000 /* Device present */ +#define MDIO_STAT2_DEVPRST_VAL 0x8000 /* Device present value */ +#define MDIO_PMA_STAT2_LBABLE 0x0001 /* PMA loopback ability */ +#define MDIO_PMA_STAT2_10GBEW 0x0002 /* 10GBASE-EW ability */ +#define MDIO_PMA_STAT2_10GBLW 0x0004 /* 10GBASE-LW ability */ +#define MDIO_PMA_STAT2_10GBSW 0x0008 /* 10GBASE-SW ability */ +#define MDIO_PMA_STAT2_10GBLX4 0x0010 /* 10GBASE-LX4 ability */ +#define MDIO_PMA_STAT2_10GBER 0x0020 /* 10GBASE-ER ability */ +#define MDIO_PMA_STAT2_10GBLR 0x0040 /* 10GBASE-LR ability */ +#define MDIO_PMA_STAT2_10GBSR 0x0080 /* 10GBASE-SR ability */ +#define MDIO_PMD_STAT2_TXDISAB 0x0100 /* PMD TX disable ability */ +#define MDIO_PMA_STAT2_EXTABLE 0x0200 /* Extended abilities */ +#define MDIO_PMA_STAT2_RXFLTABLE 0x1000 /* Receive fault ability */ +#define MDIO_PMA_STAT2_TXFLTABLE 0x2000 /* Transmit fault ability */ +#define MDIO_PCS_STAT2_10GBR 0x0001 /* 10GBASE-R capable */ +#define MDIO_PCS_STAT2_10GBX 0x0002 /* 10GBASE-X capable */ +#define MDIO_PCS_STAT2_10GBW 0x0004 /* 10GBASE-W capable */ +#define MDIO_PCS_STAT2_RXFLTABLE 0x1000 /* Receive fault ability */ +#define MDIO_PCS_STAT2_TXFLTABLE 0x2000 /* Transmit fault ability */ + +/* Transmit disable register. */ +#define MDIO_PMD_TXDIS_GLOBAL 0x0001 /* Global PMD TX disable */ +#define MDIO_PMD_TXDIS_0 0x0002 /* PMD TX disable 0 */ +#define MDIO_PMD_TXDIS_1 0x0004 /* PMD TX disable 1 */ +#define MDIO_PMD_TXDIS_2 0x0008 /* PMD TX disable 2 */ +#define MDIO_PMD_TXDIS_3 0x0010 /* PMD TX disable 3 */ + +/* Receive signal detect register. */ +#define MDIO_PMD_RXDET_GLOBAL 0x0001 /* Global PMD RX signal detect */ +#define MDIO_PMD_RXDET_0 0x0002 /* PMD RX signal detect 0 */ +#define MDIO_PMD_RXDET_1 0x0004 /* PMD RX signal detect 1 */ +#define MDIO_PMD_RXDET_2 0x0008 /* PMD RX signal detect 2 */ +#define MDIO_PMD_RXDET_3 0x0010 /* PMD RX signal detect 3 */ + +/* Extended abilities register. */ +#define MDIO_PMA_EXTABLE_10GCX4 0x0001 /* 10GBASE-CX4 ability */ +#define MDIO_PMA_EXTABLE_10GBLRM 0x0002 /* 10GBASE-LRM ability */ +#define MDIO_PMA_EXTABLE_10GBT 0x0004 /* 10GBASE-T ability */ +#define MDIO_PMA_EXTABLE_10GBKX4 0x0008 /* 10GBASE-KX4 ability */ +#define MDIO_PMA_EXTABLE_10GBKR 0x0010 /* 10GBASE-KR ability */ +#define MDIO_PMA_EXTABLE_1000BT 0x0020 /* 1000BASE-T ability */ +#define MDIO_PMA_EXTABLE_1000BKX 0x0040 /* 1000BASE-KX ability */ +#define MDIO_PMA_EXTABLE_100BTX 0x0080 /* 100BASE-TX ability */ +#define MDIO_PMA_EXTABLE_10BT 0x0100 /* 10BASE-T ability */ + +/* PHY XGXS lane state register. */ +#define MDIO_PHYXS_LNSTAT_SYNC0 0x0001 +#define MDIO_PHYXS_LNSTAT_SYNC1 0x0002 +#define MDIO_PHYXS_LNSTAT_SYNC2 0x0004 +#define MDIO_PHYXS_LNSTAT_SYNC3 0x0008 +#define MDIO_PHYXS_LNSTAT_ALIGN 0x1000 + +/* PMA 10GBASE-T TX power register. */ +#define MDIO_PMA_10GBT_TXPWR_SHORT 0x0001 /* Short-reach mode */ + +/* PCS 10GBASE-R/-T status register 1. */ +#define MDIO_PCS_10GBRT_STAT1_BLKLK 0x0001 /* Block lock attained */ + +/* PCS 10GBASE-R/-T status register 2. */ +#define MDIO_PCS_10GBRT_STAT2_ERR 0x00ff +#define MDIO_PCS_10GBRT_STAT2_BER 0x3f00 + +/* AN 10GBASE-T control register. */ +#define MDIO_AN_10GBT_CTRL_ADV10G 0x1000 /* Advertise 10GBASE-T */ + +/* AN 10GBASE-T status register. */ +#define MDIO_AN_10GBT_STAT_LPTRR 0x0200 /* LP training reset req. 
*/ +#define MDIO_AN_10GBT_STAT_LPLTABLE 0x0400 /* LP loop timing ability */ +#define MDIO_AN_10GBT_STAT_LP10G 0x0800 /* LP is 10GBT capable */ +#define MDIO_AN_10GBT_STAT_REMOK 0x1000 /* Remote OK */ +#define MDIO_AN_10GBT_STAT_LOCOK 0x2000 /* Local OK */ +#define MDIO_AN_10GBT_STAT_MS 0x4000 /* Master/slave config */ +#define MDIO_AN_10GBT_STAT_MSFLT 0x8000 /* Master/slave config fault */ + +/* Mapping between MDIO PRTAD/DEVAD and mii_ioctl_data::phy_id */ + +#define MDIO_PHY_ID_C45 0x8000 +#define MDIO_PHY_ID_PRTAD 0x03e0 +#define MDIO_PHY_ID_DEVAD 0x001f +#define MDIO_PHY_ID_C45_MASK \ + (MDIO_PHY_ID_C45 | MDIO_PHY_ID_PRTAD | MDIO_PHY_ID_DEVAD) + +static inline __u16 mdio_phy_id_c45(int prtad, int devad) +{ + return MDIO_PHY_ID_C45 | (prtad << 5) | devad; +} + +static inline bool mdio_phy_id_is_c45(int phy_id) +{ + return (phy_id & MDIO_PHY_ID_C45) && !(phy_id & ~MDIO_PHY_ID_C45_MASK); +} + +static inline __u16 mdio_phy_id_prtad(int phy_id) +{ + return (phy_id & MDIO_PHY_ID_PRTAD) >> 5; +} + +static inline __u16 mdio_phy_id_devad(int phy_id) +{ + return phy_id & MDIO_PHY_ID_DEVAD; +} + +#endif /* __LINUX_MDIO_H__ */ -- cgit v1.2.3-70-g09d2 From 1b1c2e95103ce391c2ea39a9460968fcb73deb30 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 29 Apr 2009 08:04:46 +0000 Subject: mdio: Add generic MDIO (clause 45) support functions These roughly mirror many of the MII library functions and are based on code from the sfc driver. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/Kconfig | 3 + drivers/net/Makefile | 1 + drivers/net/mdio.c | 359 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mdio.h | 66 ++++++++++ 4 files changed, 429 insertions(+) create mode 100644 drivers/net/mdio.c (limited to 'include/linux') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index bc7eef12d95..2f81db51b45 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2443,6 +2443,9 @@ menuconfig NETDEV_10000 if NETDEV_10000 +config MDIO + tristate + config CHELSIO_T1 tristate "Chelsio 10Gb Ethernet support" depends on PCI diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 80420f6d079..dcd5f15dd9e 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -95,6 +95,7 @@ obj-$(CONFIG_SH_ETH) += sh_eth.o # obj-$(CONFIG_MII) += mii.o +obj-$(CONFIG_MDIO) += mdio.o obj-$(CONFIG_PHYLIB) += phy/ obj-$(CONFIG_SUNDANCE) += sundance.o diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c new file mode 100644 index 00000000000..0604a25748e --- /dev/null +++ b/drivers/net/mdio.c @@ -0,0 +1,359 @@ +/* + * mdio.c: Generic support for MDIO-compatible transceivers + * Copyright 2006-2009 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include + +/** + * mdio45_probe - probe for an MDIO (clause 45) device + * @mdio: MDIO interface + * @prtad: Expected PHY address + * + * This sets @prtad and @mmds in the MDIO interface if successful. + * Returns 0 on success, negative on error. + */ +int mdio45_probe(struct mdio_if_info *mdio, int prtad) +{ + int mmd, stat2, devs1, devs2; + + /* Assume PHY must have at least one of PMA/PMD, WIS, PCS, PHY + * XS or DTE XS; give up if none is present. */ + for (mmd = 1; mmd <= 5; mmd++) { + /* Is this MMD present? 
*/ + stat2 = mdio->mdio_read(mdio->dev, prtad, mmd, MDIO_STAT2); + if (stat2 < 0 || + (stat2 & MDIO_STAT2_DEVPRST) != MDIO_STAT2_DEVPRST_VAL) + continue; + + /* It should tell us about all the other MMDs */ + devs1 = mdio->mdio_read(mdio->dev, prtad, mmd, MDIO_DEVS1); + devs2 = mdio->mdio_read(mdio->dev, prtad, mmd, MDIO_DEVS2); + if (devs1 < 0 || devs2 < 0) + continue; + + mdio->prtad = prtad; + mdio->mmds = devs1 | (devs2 << 16); + return 0; + } + + return -ENODEV; +} +EXPORT_SYMBOL(mdio45_probe); + +/** + * mdio_set_flag - set or clear flag in an MDIO register + * @mdio: MDIO interface + * @prtad: PHY address + * @devad: MMD address + * @addr: Register address + * @mask: Mask for flag (single bit set) + * @sense: New value of flag + * + * This debounces changes: it does not write the register if the flag + * already has the proper value. Returns 0 on success, negative on error. + */ +int mdio_set_flag(const struct mdio_if_info *mdio, + int prtad, int devad, u16 addr, int mask, + bool sense) +{ + int old_val = mdio->mdio_read(mdio->dev, prtad, devad, addr); + int new_val; + + if (old_val < 0) + return old_val; + if (sense) + new_val = old_val | mask; + else + new_val = old_val & ~mask; + if (old_val == new_val) + return 0; + return mdio->mdio_write(mdio->dev, prtad, devad, addr, new_val); +} +EXPORT_SYMBOL(mdio_set_flag); + +/** + * mdio_link_ok - is link status up/OK + * @mdio: MDIO interface + * @mmd_mask: Mask for MMDs to check + * + * Returns 1 if the PHY reports link status up/OK, 0 otherwise. + * @mmd_mask is normally @mdio->mmds, but if loopback is enabled + * the MMDs being bypassed should be excluded from the mask. + */ +int mdio45_links_ok(const struct mdio_if_info *mdio, u32 mmd_mask) +{ + int devad, reg; + + if (!mmd_mask) { + /* Use absence of XGMII faults in lieu of link state */ + reg = mdio->mdio_read(mdio->dev, mdio->prtad, + MDIO_MMD_PHYXS, MDIO_STAT2); + return reg >= 0 && !(reg & MDIO_STAT2_RXFAULT); + } + + for (devad = 0; mmd_mask; devad++) { + if (mmd_mask & (1 << devad)) { + mmd_mask &= ~(1 << devad); + + /* Read twice because link state is latched and a + * read moves the current state into the register */ + mdio->mdio_read(mdio->dev, mdio->prtad, + devad, MDIO_STAT1); + reg = mdio->mdio_read(mdio->dev, mdio->prtad, + devad, MDIO_STAT1); + if (reg < 0 || !(reg & MDIO_STAT1_LSTATUS)) + return false; + } + } + + return true; +} +EXPORT_SYMBOL(mdio45_links_ok); + +/** + * mdio45_nway_restart - restart auto-negotiation for this interface + * @mdio: MDIO interface + * + * Returns 0 on success, negative on error. 
+ */ +int mdio45_nway_restart(const struct mdio_if_info *mdio) +{ + if (!(mdio->mmds & MDIO_DEVS_AN)) + return -EOPNOTSUPP; + + mdio_set_flag(mdio, mdio->prtad, MDIO_MMD_AN, MDIO_CTRL1, + MDIO_AN_CTRL1_RESTART, true); + return 0; +} +EXPORT_SYMBOL(mdio45_nway_restart); + +static u32 mdio45_get_an(const struct mdio_if_info *mdio, u16 addr) +{ + u32 result = 0; + int reg; + + reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_AN, addr); + if (reg & ADVERTISE_10HALF) + result |= ADVERTISED_10baseT_Half; + if (reg & ADVERTISE_10FULL) + result |= ADVERTISED_10baseT_Full; + if (reg & ADVERTISE_100HALF) + result |= ADVERTISED_100baseT_Half; + if (reg & ADVERTISE_100FULL) + result |= ADVERTISED_100baseT_Full; + return result; +} + +/** + * mdio45_ethtool_gset_npage - get settings for ETHTOOL_GSET + * @mdio: MDIO interface + * @ecmd: Ethtool request structure + * @npage_adv: Modes currently advertised on next pages + * @npage_lpa: Modes advertised by link partner on next pages + * + * Since the CSRs for auto-negotiation using next pages are not fully + * standardised, this function does not attempt to decode them. The + * caller must pass them in. + */ +void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, + struct ethtool_cmd *ecmd, + u32 npage_adv, u32 npage_lpa) +{ + int reg; + + ecmd->transceiver = XCVR_INTERNAL; + ecmd->phy_address = mdio->prtad; + + reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD, + MDIO_CTRL2); + switch (reg & MDIO_PMA_CTRL2_TYPE) { + case MDIO_PMA_CTRL2_10GBT: + case MDIO_PMA_CTRL2_1000BT: + case MDIO_PMA_CTRL2_100BTX: + case MDIO_PMA_CTRL2_10BT: + ecmd->port = PORT_TP; + ecmd->supported = SUPPORTED_TP; + reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD, + MDIO_SPEED); + if (reg & MDIO_SPEED_10G) + ecmd->supported |= SUPPORTED_10000baseT_Full; + if (reg & MDIO_PMA_SPEED_1000) + ecmd->supported |= (SUPPORTED_1000baseT_Full | + SUPPORTED_1000baseT_Half); + if (reg & MDIO_PMA_SPEED_100) + ecmd->supported |= (SUPPORTED_100baseT_Full | + SUPPORTED_100baseT_Half); + if (reg & MDIO_PMA_SPEED_10) + ecmd->supported |= (SUPPORTED_10baseT_Full | + SUPPORTED_10baseT_Half); + ecmd->advertising = ADVERTISED_TP; + break; + + case MDIO_PMA_CTRL2_10GBCX4: + case MDIO_PMA_CTRL2_10GBKX4: + case MDIO_PMA_CTRL2_10GBKR: + case MDIO_PMA_CTRL2_1000BKX: + ecmd->port = PORT_OTHER; + ecmd->supported = 0; + ecmd->advertising = 0; + break; + + /* All the other defined modes are flavours of optical */ + default: + ecmd->port = PORT_FIBRE; + ecmd->supported = SUPPORTED_FIBRE; + ecmd->advertising = ADVERTISED_FIBRE; + break; + } + + if (mdio->mmds & MDIO_DEVS_AN) { + ecmd->supported |= SUPPORTED_Autoneg; + reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_AN, + MDIO_CTRL1); + if (reg & MDIO_AN_CTRL1_ENABLE) { + ecmd->autoneg = AUTONEG_ENABLE; + ecmd->advertising |= + ADVERTISED_Autoneg | + mdio45_get_an(mdio, MDIO_AN_ADVERTISE) | + npage_adv; + } else { + ecmd->autoneg = AUTONEG_DISABLE; + } + } else { + ecmd->autoneg = AUTONEG_DISABLE; + } + + if (ecmd->autoneg) { + u32 modes = 0; + + /* If AN is complete and successful, report best common + * mode, otherwise report best advertised mode. 
*/ + if (mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_AN, + MDIO_STAT1) & + MDIO_AN_STAT1_COMPLETE) + modes = (ecmd->advertising & + (mdio45_get_an(mdio, MDIO_AN_LPA) | + npage_lpa)); + if (modes == 0) + modes = ecmd->advertising; + + if (modes & ADVERTISED_10000baseT_Full) { + ecmd->speed = SPEED_10000; + ecmd->duplex = DUPLEX_FULL; + } else if (modes & (ADVERTISED_1000baseT_Full | + ADVERTISED_1000baseT_Half)) { + ecmd->speed = SPEED_1000; + ecmd->duplex = !!(modes & ADVERTISED_1000baseT_Full); + } else if (modes & (ADVERTISED_100baseT_Full | + ADVERTISED_100baseT_Half)) { + ecmd->speed = SPEED_100; + ecmd->duplex = !!(modes & ADVERTISED_100baseT_Full); + } else { + ecmd->speed = SPEED_10; + ecmd->duplex = !!(modes & ADVERTISED_10baseT_Full); + } + } else { + /* Report forced settings */ + reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD, + MDIO_CTRL1); + ecmd->speed = (((reg & MDIO_PMA_CTRL1_SPEED1000) ? 100 : 1) * + ((reg & MDIO_PMA_CTRL1_SPEED100) ? 100 : 10)); + ecmd->duplex = (reg & MDIO_CTRL1_FULLDPLX || + ecmd->speed == SPEED_10000); + } +} +EXPORT_SYMBOL(mdio45_ethtool_gset_npage); + +/** + * mdio_mii_ioctl - MII ioctl interface for MDIO (clause 22 or 45) PHYs + * @mdio: MDIO interface + * @mii_data: MII ioctl data structure + * @cmd: MII ioctl command + * + * Returns 0 on success, negative on error. + */ +int mdio_mii_ioctl(const struct mdio_if_info *mdio, + struct mii_ioctl_data *mii_data, int cmd) +{ + int prtad, devad; + u16 addr = mii_data->reg_num; + + /* Validate/convert cmd to one of SIOC{G,S}MIIREG */ + switch (cmd) { + case SIOCGMIIPHY: + if (mdio->prtad == MDIO_PRTAD_NONE) + return -EOPNOTSUPP; + mii_data->phy_id = mdio->prtad; + cmd = SIOCGMIIREG; + break; + case SIOCGMIIREG: + break; + case SIOCSMIIREG: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + break; + default: + return -EOPNOTSUPP; + } + + /* Validate/convert phy_id */ + if ((mdio->mode_support & MDIO_SUPPORTS_C45) && + mdio_phy_id_is_c45(mii_data->phy_id)) { + prtad = mdio_phy_id_prtad(mii_data->phy_id); + devad = mdio_phy_id_devad(mii_data->phy_id); + } else if ((mdio->mode_support & MDIO_SUPPORTS_C22) && + mii_data->phy_id < 0x20) { + prtad = mii_data->phy_id; + devad = MDIO_DEVAD_NONE; + addr &= 0x1f; + } else if ((mdio->mode_support & MDIO_EMULATE_C22) && + mdio->prtad != MDIO_PRTAD_NONE && + mii_data->phy_id == mdio->prtad) { + /* Remap commonly-used MII registers. 
*/ + prtad = mdio->prtad; + switch (addr) { + case MII_BMCR: + case MII_BMSR: + case MII_PHYSID1: + case MII_PHYSID2: + devad = __ffs(mdio->mmds); + break; + case MII_ADVERTISE: + case MII_LPA: + if (!(mdio->mmds & MDIO_DEVS_AN)) + return -EINVAL; + devad = MDIO_MMD_AN; + if (addr == MII_ADVERTISE) + addr = MDIO_AN_ADVERTISE; + else + addr = MDIO_AN_LPA; + break; + default: + return -EINVAL; + } + } else { + return -EINVAL; + } + + if (cmd == SIOCGMIIREG) { + int rc = mdio->mdio_read(mdio->dev, prtad, devad, addr); + if (rc < 0) + return rc; + mii_data->val_out = rc; + return 0; + } else { + return mdio->mdio_write(mdio->dev, prtad, devad, addr, + mii_data->val_in); + } +} +EXPORT_SYMBOL(mdio_mii_ioctl); diff --git a/include/linux/mdio.h b/include/linux/mdio.h index d57ddb08a1b..ba41537eaa8 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -234,4 +234,70 @@ static inline __u16 mdio_phy_id_devad(int phy_id) return phy_id & MDIO_PHY_ID_DEVAD; } +#ifdef __KERNEL__ + +/** + * struct mdio_if_info - Ethernet controller MDIO interface + * @prtad: PRTAD of the PHY (%MDIO_PRTAD_NONE if not present/unknown) + * @mmds: Mask of MMDs expected to be present in the PHY. This must be + * non-zero unless @prtad = %MDIO_PRTAD_NONE. + * @mode_support: MDIO modes supported. If %MDIO_SUPPORTS_C22 is set then + * MII register access will be passed through with @devad = + * %MDIO_DEVAD_NONE. If %MDIO_EMULATE_C22 is set then access to + * commonly used clause 22 registers will be translated into + * clause 45 registers. + * @dev: Net device structure + * @mdio_read: Register read function; returns value or negative error code + * @mdio_write: Register write function; returns 0 or negative error code + */ +struct mdio_if_info { + int prtad; + u32 __bitwise mmds; + unsigned mode_support; + + struct net_device *dev; + int (*mdio_read)(struct net_device *dev, int prtad, int devad, + u16 addr); + int (*mdio_write)(struct net_device *dev, int prtad, int devad, + u16 addr, u16 val); +}; + +#define MDIO_PRTAD_NONE (-1) +#define MDIO_DEVAD_NONE (-1) +#define MDIO_SUPPORTS_C22 1 +#define MDIO_SUPPORTS_C45 2 +#define MDIO_EMULATE_C22 4 + +struct ethtool_cmd; +struct ethtool_pauseparam; +extern int mdio45_probe(struct mdio_if_info *mdio, int prtad); +extern int mdio_set_flag(const struct mdio_if_info *mdio, + int prtad, int devad, u16 addr, int mask, + bool sense); +extern int mdio45_links_ok(const struct mdio_if_info *mdio, u32 mmds); +extern int mdio45_nway_restart(const struct mdio_if_info *mdio); +extern void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, + struct ethtool_cmd *ecmd, + u32 npage_adv, u32 npage_lpa); + +/** + * mdio45_ethtool_gset - get settings for ETHTOOL_GSET + * @mdio: MDIO interface + * @ecmd: Ethtool request structure + * + * Since the CSRs for auto-negotiation using next pages are not fully + * standardised, this function does not attempt to decode them. Use + * mdio45_ethtool_gset_npage() to specify advertisement bits from next + * pages. 
+ */ +static inline void mdio45_ethtool_gset(const struct mdio_if_info *mdio, + struct ethtool_cmd *ecmd) +{ + mdio45_ethtool_gset_npage(mdio, ecmd, 0, 0); +} + +extern int mdio_mii_ioctl(const struct mdio_if_info *mdio, + struct mii_ioctl_data *mii_data, int cmd); + +#endif /* __KERNEL__ */ #endif /* __LINUX_MDIO_H__ */ -- cgit v1.2.3-70-g09d2 From 44c22ee91b56d7cad3b48c439dd96aad2e910fbc Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 29 Apr 2009 08:15:05 +0000 Subject: mii: Simplify mii_resolve_flowctrl_fdx() This is a shorter and more comprehensible formulation of the conditions for each flow control mode. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/mii.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index ad748588faf..14ecb2e114f 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -250,18 +250,12 @@ static inline u8 mii_resolve_flowctrl_fdx(u16 lcladv, u16 rmtadv) { u8 cap = 0; - if (lcladv & ADVERTISE_PAUSE_CAP) { - if (lcladv & ADVERTISE_PAUSE_ASYM) { - if (rmtadv & LPA_PAUSE_CAP) - cap = FLOW_CTRL_TX | FLOW_CTRL_RX; - else if (rmtadv & LPA_PAUSE_ASYM) - cap = FLOW_CTRL_RX; - } else { - if (rmtadv & LPA_PAUSE_CAP) - cap = FLOW_CTRL_TX | FLOW_CTRL_RX; - } - } else if (lcladv & ADVERTISE_PAUSE_ASYM) { - if ((rmtadv & LPA_PAUSE_CAP) && (rmtadv & LPA_PAUSE_ASYM)) + if (lcladv & rmtadv & ADVERTISE_PAUSE_CAP) { + cap = FLOW_CTRL_TX | FLOW_CTRL_RX; + } else if (lcladv & rmtadv & ADVERTISE_PAUSE_ASYM) { + if (lcladv & ADVERTISE_PAUSE_CAP) + cap = FLOW_CTRL_RX; + else if (rmtadv & ADVERTISE_PAUSE_CAP) cap = FLOW_CTRL_TX; } -- cgit v1.2.3-70-g09d2 From a8c30832b5b12e5d4e9d1c20cdac3cc2880e08b8 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 29 Apr 2009 08:19:03 +0000 Subject: mii: Add mii_advertise_flowctrl() This converts flow control capabilities to an advertising mask and can be useful in combination with mii_resolve_flowctrl_fdx(). Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/mii.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 14ecb2e114f..359fba88027 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -239,6 +239,22 @@ static inline unsigned int mii_duplex (unsigned int duplex_lock, return 0; } +/** + * mii_advertise_flowctrl - get flow control advertisement flags + * @cap: Flow control capabilities (FLOW_CTRL_RX, FLOW_CTRL_TX or both) + */ +static inline u16 mii_advertise_flowctrl(int cap) +{ + u16 adv = 0; + + if (cap & FLOW_CTRL_RX) + adv = ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; + if (cap & FLOW_CTRL_TX) + adv ^= ADVERTISE_PAUSE_ASYM; + + return adv; +} + /** * mii_resolve_flowctrl_fdx * @lcladv: value of MII ADVERTISE register -- cgit v1.2.3-70-g09d2 From af2a3eac2fe6a6d8e9fdf6927284b34466a7d808 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 29 Apr 2009 08:19:36 +0000 Subject: mdio: Add mdio45_ethtool_spauseparam_an() This implements the ETHTOOL_SPAUSEPARAM operation for MDIO (clause 45) PHYs with auto-negotiation MMDs. Signed-off-by: Ben Hutchings Signed-off-by: David S.
Miller --- drivers/net/mdio.c | 30 ++++++++++++++++++++++++++++++ include/linux/mdio.h | 3 +++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c index 0604a25748e..2fb0d16b61f 100644 --- a/drivers/net/mdio.c +++ b/drivers/net/mdio.c @@ -274,6 +274,36 @@ void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, } EXPORT_SYMBOL(mdio45_ethtool_gset_npage); +/** + * mdio45_ethtool_spauseparam_an - set auto-negotiated pause parameters + * @mdio: MDIO interface + * @ecmd: Ethtool request structure + * + * This function assumes that the PHY has an auto-negotiation MMD. It + * will enable and disable advertising of flow control as appropriate. + */ +void mdio45_ethtool_spauseparam_an(const struct mdio_if_info *mdio, + const struct ethtool_pauseparam *ecmd) +{ + int adv, old_adv; + + WARN_ON(!(mdio->mmds & MDIO_DEVS_AN)); + + old_adv = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_AN, + MDIO_AN_ADVERTISE); + adv = old_adv & ~(ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM); + if (ecmd->autoneg) + adv |= mii_advertise_flowctrl( + (ecmd->rx_pause ? FLOW_CTRL_RX : 0) | + (ecmd->tx_pause ? FLOW_CTRL_TX : 0)); + if (adv != old_adv) { + mdio->mdio_write(mdio->dev, mdio->prtad, MDIO_MMD_AN, + MDIO_AN_ADVERTISE, adv); + mdio45_nway_restart(mdio); + } +} +EXPORT_SYMBOL(mdio45_ethtool_spauseparam_an); + /** * mdio_mii_ioctl - MII ioctl interface for MDIO (clause 22 or 45) PHYs * @mdio: MDIO interface diff --git a/include/linux/mdio.h b/include/linux/mdio.h index ba41537eaa8..5645c0f863d 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -279,6 +279,9 @@ extern int mdio45_nway_restart(const struct mdio_if_info *mdio); extern void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, struct ethtool_cmd *ecmd, u32 npage_adv, u32 npage_lpa); +extern void +mdio45_ethtool_spauseparam_an(const struct mdio_if_info *mdio, + const struct ethtool_pauseparam *ecmd); /** * mdio45_ethtool_gset - get settings for ETHTOOL_GSET -- cgit v1.2.3-70-g09d2 From 0c09c1a49cc7b819b33566a49d9901f7cfdd6889 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 29 Apr 2009 08:21:53 +0000 Subject: ethtool/mdio: Report MDIO mode support and link partner advertising Add mdio_support and lp_advertising fields to ethtool_cmd. Set these in mdio45_ethtool_gset{,_npage}(). Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/mdio.c | 19 ++++++++++++------- include/linux/ethtool.h | 4 +++- include/linux/mdio.h | 5 +++-- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c index 2fb0d16b61f..ee383c25693 100644 --- a/drivers/net/mdio.c +++ b/drivers/net/mdio.c @@ -173,6 +173,8 @@ void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, ecmd->transceiver = XCVR_INTERNAL; ecmd->phy_address = mdio->prtad; + ecmd->mdio_support = + mdio->mode_support & (MDIO_SUPPORTS_C45 | MDIO_SUPPORTS_C22); reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD, MDIO_CTRL2); @@ -235,16 +237,19 @@ void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, if (ecmd->autoneg) { u32 modes = 0; + int an_stat = mdio->mdio_read(mdio->dev, mdio->prtad, + MDIO_MMD_AN, MDIO_STAT1); /* If AN is complete and successful, report best common * mode, otherwise report best advertised mode. 
*/ - if (mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_AN, - MDIO_STAT1) & - MDIO_AN_STAT1_COMPLETE) - modes = (ecmd->advertising & - (mdio45_get_an(mdio, MDIO_AN_LPA) | - npage_lpa)); - if (modes == 0) + if (an_stat & MDIO_AN_STAT1_COMPLETE) { + ecmd->lp_advertising = + mdio45_get_an(mdio, MDIO_AN_LPA) | npage_lpa; + if (an_stat & MDIO_AN_STAT1_LPABLE) + ecmd->lp_advertising |= ADVERTISED_Autoneg; + modes = ecmd->advertising & ecmd->lp_advertising; + } + if ((modes & ~ADVERTISED_Autoneg) == 0) modes = ecmd->advertising; if (modes & ADVERTISED_10000baseT_Full) { diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 5ccb6bd660c..14e6bc86011 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -26,11 +26,13 @@ struct ethtool_cmd { __u8 phy_address; __u8 transceiver; /* Which transceiver to use */ __u8 autoneg; /* Enable or disable autonegotiation */ + __u8 mdio_support; __u32 maxtxpkt; /* Tx pkts before generating tx int */ __u32 maxrxpkt; /* Rx pkts before generating rx int */ __u16 speed_hi; __u16 reserved2; - __u32 reserved[3]; + __u32 lp_advertising; /* Features the link partner advertises */ + __u32 reserved[2]; }; static inline void ethtool_cmd_speed_set(struct ethtool_cmd *ep, diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 5645c0f863d..1bff2f2d0e1 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -234,6 +234,9 @@ static inline __u16 mdio_phy_id_devad(int phy_id) return phy_id & MDIO_PHY_ID_DEVAD; } +#define MDIO_SUPPORTS_C22 1 +#define MDIO_SUPPORTS_C45 2 + #ifdef __KERNEL__ /** @@ -264,8 +267,6 @@ struct mdio_if_info { #define MDIO_PRTAD_NONE (-1) #define MDIO_DEVAD_NONE (-1) -#define MDIO_SUPPORTS_C22 1 -#define MDIO_SUPPORTS_C45 2 #define MDIO_EMULATE_C22 4 struct ethtool_cmd; -- cgit v1.2.3-70-g09d2 From 894b19a6b343ce3589237167a56e6df0fe72ef0d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 29 Apr 2009 08:25:57 +0000 Subject: ethtool/mdio: Support backplane mode negotiation Compile-tested only. Signed-off-by: Ben Hutchings Signed-off-by: David S. 
Miller --- drivers/net/mdio.c | 30 +++++++++++++++++++++++++----- include/linux/ethtool.h | 10 ++++++++++ include/linux/mdio.h | 5 +++++ 3 files changed, 40 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c index ee383c25693..66483035f68 100644 --- a/drivers/net/mdio.c +++ b/drivers/net/mdio.c @@ -202,12 +202,29 @@ void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, break; case MDIO_PMA_CTRL2_10GBCX4: + ecmd->port = PORT_OTHER; + ecmd->supported = 0; + ecmd->advertising = 0; + break; + case MDIO_PMA_CTRL2_10GBKX4: case MDIO_PMA_CTRL2_10GBKR: case MDIO_PMA_CTRL2_1000BKX: ecmd->port = PORT_OTHER; - ecmd->supported = 0; - ecmd->advertising = 0; + ecmd->supported = SUPPORTED_Backplane; + reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD, + MDIO_PMA_EXTABLE); + if (reg & MDIO_PMA_EXTABLE_10GBKX4) + ecmd->supported |= SUPPORTED_10000baseKX4_Full; + if (reg & MDIO_PMA_EXTABLE_10GBKR) + ecmd->supported |= SUPPORTED_10000baseKR_Full; + if (reg & MDIO_PMA_EXTABLE_1000BKX) + ecmd->supported |= SUPPORTED_1000baseKX_Full; + reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD, + MDIO_PMA_10GBR_FECABLE); + if (reg & MDIO_PMA_10GBR_FECABLE_ABLE) + ecmd->supported |= SUPPORTED_10000baseR_FEC; + ecmd->advertising = ADVERTISED_Backplane; break; /* All the other defined modes are flavours of optical */ @@ -252,13 +269,16 @@ void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, if ((modes & ~ADVERTISED_Autoneg) == 0) modes = ecmd->advertising; - if (modes & ADVERTISED_10000baseT_Full) { + if (modes & (ADVERTISED_10000baseT_Full | + ADVERTISED_10000baseKX4_Full | + ADVERTISED_10000baseKR_Full)) { ecmd->speed = SPEED_10000; ecmd->duplex = DUPLEX_FULL; } else if (modes & (ADVERTISED_1000baseT_Full | - ADVERTISED_1000baseT_Half)) { + ADVERTISED_1000baseT_Half | + ADVERTISED_1000baseKX_Full)) { ecmd->speed = SPEED_1000; - ecmd->duplex = !!(modes & ADVERTISED_1000baseT_Full); + ecmd->duplex = !(modes & ADVERTISED_1000baseT_Half); } else if (modes & (ADVERTISED_100baseT_Full | ADVERTISED_100baseT_Half)) { ecmd->speed = SPEED_100; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 14e6bc86011..380b04272bf 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -565,6 +565,11 @@ struct ethtool_ops { #define SUPPORTED_Pause (1 << 13) #define SUPPORTED_Asym_Pause (1 << 14) #define SUPPORTED_2500baseX_Full (1 << 15) +#define SUPPORTED_Backplane (1 << 16) +#define SUPPORTED_1000baseKX_Full (1 << 17) +#define SUPPORTED_10000baseKX4_Full (1 << 18) +#define SUPPORTED_10000baseKR_Full (1 << 19) +#define SUPPORTED_10000baseR_FEC (1 << 20) /* Indicates what features are advertised by the interface. */ #define ADVERTISED_10baseT_Half (1 << 0) @@ -583,6 +588,11 @@ struct ethtool_ops { #define ADVERTISED_Pause (1 << 13) #define ADVERTISED_Asym_Pause (1 << 14) #define ADVERTISED_2500baseX_Full (1 << 15) +#define ADVERTISED_Backplane (1 << 16) +#define ADVERTISED_1000baseKX_Full (1 << 17) +#define ADVERTISED_10000baseKX4_Full (1 << 18) +#define ADVERTISED_10000baseKR_Full (1 << 19) +#define ADVERTISED_10000baseR_FEC (1 << 20) /* The following are all involved in forcing a particular link * mode for the device for setting things. When getting the diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 1bff2f2d0e1..26b4eb3bbee 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -46,6 +46,7 @@ /* Media-dependent registers. 
From a511e3f968c462a55ef58697257f5347c73d306e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 29 Apr 2009 15:59:58 -0700 Subject: mutex: add atomic_dec_and_mutex_lock(), fix include/linux/mutex.h:136: warning: 'mutex_lock' declared inline after being called include/linux/mutex.h:136: warning: previous declaration of 'mutex_lock' was here uninline it. [ Impact: clean up and uninline, address compiler warning ] Signed-off-by: Andrew Morton Cc: Al Viro Cc: Christoph Hellwig Cc: Eric Paris Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <200904292318.n3TNIsi6028340@imap1.linux-foundation.org> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 24 +----------------------- kernel/mutex.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 93054fc3635..878cab4f5fc 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -150,28 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); - -/** - * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 - * @cnt: the atomic which we are to dec - * @lock: the mutex to return holding if we dec to 0 - * - * return true and hold lock if we dec to 0, return false otherwise - */ -static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) -{ - /* dec if we can't possibly hit 0 */ - if (atomic_add_unless(cnt, -1, 1)) - return 0; - /* we might hit 0, so take the lock */ - mutex_lock(lock); - if (!atomic_dec_and_test(cnt)) { - /* when we actually did the dec, we didn't hit 0 */ - mutex_unlock(lock); - return 0; - } - /* we hit 0, and we hold the lock */ - return 1; -} +extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); #endif diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b5e9f..e2d25e9e62a 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -471,5 +471,28 @@ int __sched mutex_trylock(struct mutex *lock) return ret; } - EXPORT_SYMBOL(mutex_trylock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); -- cgit v1.2.3-70-g09d2
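The helper's contract, returning nonzero with the mutex held only when the count drops to zero, makes it a natural fit for last-reference teardown of an object on a mutex-protected list. A minimal hypothetical sketch, with every name invented:

#include <asm/atomic.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct cache_entry {
        atomic_t refs;
        struct list_head link;
};

static LIST_HEAD(cache_list);
static DEFINE_MUTEX(cache_lock);

static void cache_entry_put(struct cache_entry *e)
{
        /* Fast path: the count stayed above zero, no lock was taken */
        if (!atomic_dec_and_mutex_lock(&e->refs, &cache_lock))
                return;

        /* Slow path: the count hit zero and cache_lock is now held */
        list_del(&e->link);
        mutex_unlock(&cache_lock);
        kfree(e);
}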
From ba9c22f2c01cf5c88beed5a6b9e07d42e10bd358 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 20 Apr 2009 22:22:22 -0700 Subject: futex: remove FUTEX_REQUEUE_PI (non CMP) The new requeue PI futex op codes were modeled after the existing FUTEX_REQUEUE and FUTEX_CMP_REQUEUE calls. I was unaware at the time that FUTEX_REQUEUE was only around for compatibility reasons and shouldn't be used in new code. Ulrich Drepper elaborates on this in his "Futexes are Tricky" paper: http://people.redhat.com/drepper/futex.pdf. The deprecated call doesn't catch changes to the futex corresponding to the destination futex, which can lead to deadlock. Therefore, I feel it best to remove FUTEX_REQUEUE_PI and leave only FUTEX_CMP_REQUEUE_PI, as there are not yet any existing users of the API. This patch does change the OP code value of FUTEX_CMP_REQUEUE_PI to 12 from 13. Since my test case is the only known user of this API, I felt this was the right thing to do, rather than leave a hole in the enumeration. I chose to continue using the _CMP_ modifier in the OP code to make it explicit to the user that the test is being done. Builds, boots, and ran several hundred iterations of requeue_pi.c. Signed-off-by: Darren Hart LKML-Reference: <49ED580E.1050502@us.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/futex.h | 4 +--- kernel/futex.c | 6 +----- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index b05519ca9e5..34956c8fdeb 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -24,8 +24,7 @@ union ktime; #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 #define FUTEX_WAIT_REQUEUE_PI 11 -#define FUTEX_REQUEUE_PI 12 -#define FUTEX_CMP_REQUEUE_PI 13 +#define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -43,7 +42,6 @@ union ktime; #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) -#define FUTEX_REQUEUE_PI_PRIVATE (FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG) #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) diff --git a/kernel/futex.c b/kernel/futex.c index 6d2daa46f9f..aec8bf89bf4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2555,9 +2555,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, clockrt, uaddr2); break; - case FUTEX_REQUEUE_PI: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1); - break; case FUTEX_CMP_REQUEUE_PI: ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 1); @@ -2596,8 +2593,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI || - cmd == FUTEX_WAKE_OP) + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -- cgit v1.2.3-70-g09d2
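To make the renumbering concrete, here is a hedged userspace sketch of the waker side under the new numbering; glibc provides no futex() wrapper, the helper name is invented, the waiter is assumed to be blocked in FUTEX_WAIT_REQUEUE_PI on cond_futex, and the installed headers must already carry the new op code:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Wake one waiter and requeue up to nr_requeue others from cond_futex
 * onto the PI futex pi_mutex. val3 carries the value we expect to find
 * at cond_futex; if it changed underneath us, the kernel refuses with
 * -EAGAIN instead of proceeding blindly, which is the test the _CMP_
 * in the name advertises. As the syscall comment in the patch notes,
 * nr_requeue travels in the utime slot.
 */
static long cmp_requeue_pi(unsigned int *cond_futex, unsigned int expected,
                           unsigned int *pi_mutex, unsigned long nr_requeue)
{
        return syscall(SYS_futex, cond_futex, FUTEX_CMP_REQUEUE_PI,
                       1 /* nr_wake */, nr_requeue, pi_mutex, expected);
}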
From 62ab4505e3efaf67784f84059e0fb9cedb1728ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:06 +0000 Subject: signals: implement sys_rt_tgsigqueueinfo sys_kill has the per-thread counterpart sys_tgkill. sigqueueinfo is missing a thread-directed counterpart. Such an interface is important for migrating applications from other OSes which have per-thread delivery implemented. Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: Ulrich Drepper --- include/linux/compat.h | 2 ++ include/linux/signal.h | 2 ++ kernel/compat.c | 11 +++++++++++ kernel/signal.c | 26 ++++++++++++++++++++++++++ 4 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index f2ded21f9a3..af931ee43dd 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -222,6 +222,8 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from); int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from); int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); +long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, + struct compat_siginfo __user *uinfo); static inline int compat_timeval_compare(struct compat_timeval *lhs, struct compat_timeval *rhs) diff --git a/include/linux/signal.h b/include/linux/signal.h index 84f997f8aa5..c7552836bd9 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -235,6 +235,8 @@ static inline int valid_signal(unsigned long sig) extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); +extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, + siginfo_t *info); extern long do_sigpending(void __user *, unsigned long); extern int sigprocmask(int, sigset_t *, sigset_t *); extern int show_unhandled_signals; diff --git a/kernel/compat.c b/kernel/compat.c index 42d56544460..f6c204f07ea 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, } +asmlinkage long +compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, + struct compat_siginfo __user *uinfo) +{ + siginfo_t info; + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/signal.c b/kernel/signal.c index 56d27acad87..f79b3b9f837 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2369,6 +2369,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return kill_proc_info(sig, &info, pid); } +long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + /* Not even root can pretend to send signals from the kernel. Nor can they impersonate a kill(), which adds source info.
*/ + if (info->si_code >= 0) + return -EPERM; + info->si_signo = sig; + + return do_send_specific(tgid, pid, sig, info); +} + +SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, + siginfo_t __user *, uinfo) +{ + siginfo_t info; + + if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) + return -EFAULT; + + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *t = current; -- cgit v1.2.3-70-g09d2 From c33a0bc4e41ef169d6e807d8abb9502544b518e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:16 +0200 Subject: perf_counter: fix race in perf_output_* When two (or more) contexts output to the same buffer, it is possible to observe half written output. Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing perf_counter_overflow(). If CPU1 does a wakeup and exposes head to user-space, then CPU2 can observe the data CPU0 is still writing. [ Impact: fix occasionally corrupted profiling records ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.007821627@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 +- kernel/perf_counter.c | 130 +++++++++++++++++++++++++++++++++---------- 2 files changed, 105 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 41aed427005..f776851f8c4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -358,10 +358,13 @@ struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; /* nr of data pages */ - atomic_t wakeup; /* POLL_ for wakeups */ + atomic_t poll; /* POLL_ for wakeups */ atomic_t head; /* write position */ atomic_t events; /* event limit */ + atomic_t wakeup_head; /* completed head */ + atomic_t lock; /* concurrent writes */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 75f2b6c8239..8660ae57953 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; struct perf_mmap_data *data; - unsigned int events; + unsigned int events = POLL_HUP; rcu_read_lock(); data = rcu_dereference(counter->data); if (data) - events = atomic_xchg(&data->wakeup, 0); - else - events = POLL_HUP; + events = atomic_xchg(&data->poll, 0); rcu_read_unlock(); poll_wait(file, &counter->waitq, wait); @@ -1568,22 +1566,6 @@ static const struct file_operations perf_fops = { void perf_counter_wakeup(struct perf_counter *counter) { - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) { - atomic_set(&data->wakeup, POLL_IN); - /* - * Ensure all data writes are issued before updating the - * user-space data head information. The matching rmb() - * will be in userspace after reading this value. 
- */ - smp_wmb(); - data->user_page->data_head = atomic_read(&data->head); - } - rcu_read_unlock(); - wake_up_all(&counter->waitq); if (counter->pending_kill) { @@ -1721,10 +1703,14 @@ struct perf_output_handle { int wakeup; int nmi; int overflow; + int locked; + unsigned long flags; }; -static inline void __perf_output_wakeup(struct perf_output_handle *handle) +static void perf_output_wakeup(struct perf_output_handle *handle) { + atomic_set(&handle->data->poll, POLL_IN); + if (handle->nmi) { handle->counter->pending_wakeup = 1; perf_pending_queue(&handle->counter->pending, @@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle) perf_counter_wakeup(handle->counter); } +/* + * Curious locking construct. + * + * We need to ensure a later event doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int head, cpu; + + if (handle->wakeup) + data->wakeup_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_xchg(&data->wakeup_head, 0))) { + data->user_page->data_head = head; + handle->wakeup = 1; + } + + /* + * NMI can happen here, which means we can miss a wakeup_head update. + */ + + cpu = atomic_xchg(&data->lock, 0); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_read(&data->wakeup_head))) { + /* + * Since we had it locked, we can lock it again. 
+ */ + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + goto again; + } + + if (handle->wakeup) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, int nmi, int overflow) @@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; + handle->data = data; handle->counter = counter; handle->nmi = nmi; handle->overflow = overflow; @@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data->nr_pages) goto fail; + perf_output_lock(handle); + do { offset = head = atomic_read(&data->head); head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); - handle->data = data; handle->offset = offset; handle->head = head; handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); @@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - __perf_output_wakeup(handle); + perf_output_wakeup(handle); out: rcu_read_unlock(); @@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle) { - int wakeup_events = handle->counter->hw_event.wakeup_events; + struct perf_counter *counter = handle->counter; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = counter->hw_event.wakeup_events; if (handle->overflow && wakeup_events) { - int events = atomic_inc_return(&handle->data->events); + int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { - atomic_sub(wakeup_events, &handle->data->events); - __perf_output_wakeup(handle); + atomic_sub(wakeup_events, &data->events); + handle->wakeup = 1; } - } else if (handle->wakeup) - __perf_output_wakeup(handle); + } + + perf_output_unlock(handle); rcu_read_unlock(); } -- cgit v1.2.3-70-g09d2 From 15e957d08dd4a841359cfec59ecb74041e0097aa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Apr 2009 01:17:50 -0700 Subject: x86/irq: use move_irq_desc() in create_irq_nr() move_irq_desc() will try to move irq_desc to the home node if the allocated one is not correct, in create_irq_nr(). ( This can happen on devices that are on different nodes that are using MSI, when drivers are loaded and unloaded randomly. ) v2: fix non-smp build v3: add NUMA_IRQ_DESC to eliminate #ifdefs [ Impact: improve irq descriptor locality on NUMA systems ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F95EAE.2050903@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ arch/x86/kernel/apic/io_apic.c | 6 +----- include/linux/irq.h | 11 +++++++++-- kernel/irq/Makefile | 2 +- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e1b2543f8ed..674e21e9f0a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -274,6 +274,10 @@ config SPARSE_IRQ If you don't know what to do here, say N. 
+config NUMA_IRQ_DESC + def_bool y + depends on SPARSE_IRQ && NUMA + config X86_MPPARSE bool "Enable MPS table" if ACPI default y diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9cd4806cdf5..e583291fe6c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3197,11 +3197,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (cfg_new->vector != 0) continue; -#ifdef CONFIG_NUMA_IRQ_DESC - /* different node ?*/ - if (desc_new->node != node) - desc = move_irq_desc(desc, node); -#endif + desc_new = move_irq_desc(desc_new, node); if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; diff --git a/include/linux/irq.h b/include/linux/irq.h index 4b95ddb5304..eedbb8e5e0c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -206,9 +206,16 @@ extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc #ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; -#else /* CONFIG_SPARSE_IRQ */ +#endif + +#ifdef CONFIG_NUMA_IRQ_DESC extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node); -#endif /* CONFIG_SPARSE_IRQ */ +#else +static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) +{ + return desc; +} +#endif extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2f065277f8e..7d047808419 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o +obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o -- cgit v1.2.3-70-g09d2 From c237dc0303bcf6f4cc2e0efe4fe4e341c6f34dac Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 29 Apr 2009 19:09:19 -0400 Subject: nfsd4: rename callback struct to cb_conn I want to use the name for a struct that actually does represent a single callback. (Actually, I've never been sure it helps to have a separate struct for the callback information. Some day maybe those fields could just be dumped into struct nfs4_client. I don't know.) Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4callback.c | 16 ++++++++-------- fs/nfsd/nfs4state.c | 22 +++++++++++----------- include/linux/nfsd/state.h | 4 ++-- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 0aaf68beedb..ed860d7ddd1 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -369,7 +369,7 @@ static int max_cb_time(void) int setup_callback_client(struct nfs4_client *clp) { struct sockaddr_in addr; - struct nfs4_callback *cb = &clp->cl_callback; + struct nfs4_cb_conn *cb = &clp->cl_cb_conn; struct rpc_timeout timeparms = { .to_initval = max_cb_time(), .to_retries = 0, @@ -422,7 +422,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) if (task->tk_status) warn_no_callback_path(clp, task->tk_status); else - atomic_set(&clp->cl_callback.cb_set, 1); + atomic_set(&clp->cl_cb_conn.cb_set, 1); put_nfs4_client(clp); } @@ -430,7 +430,7 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = { .rpc_call_done = nfsd4_cb_probe_done, }; -static struct rpc_cred *lookup_cb_cred(struct nfs4_callback *cb) +static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb) { struct auth_cred acred = { .machine_cred = 1 @@ -448,7 +448,7 @@ static struct rpc_cred *lookup_cb_cred(struct nfs4_callback *cb) void do_probe_callback(struct nfs4_client *clp) { - struct nfs4_callback *cb = &clp->cl_callback; + struct nfs4_cb_conn *cb = &clp->cl_cb_conn; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], .rpc_argp = clp, @@ -480,7 +480,7 @@ nfsd4_probe_callback(struct nfs4_client *clp) { int status; - BUG_ON(atomic_read(&clp->cl_callback.cb_set)); + BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set)); status = setup_callback_client(clp); if (status) { @@ -501,12 +501,12 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_client; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; + struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; struct nfs4_cb_recall *cbr = &dp->dl_recall; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], .rpc_argp = cbr, - .rpc_cred = clp->cl_callback.cb_cred + .rpc_cred = clp->cl_cb_conn.cb_cred }; int retries = 1; int status = 0; @@ -519,7 +519,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) switch (status) { case -EIO: /* Network partition? */ - atomic_set(&clp->cl_callback.cb_set, 0); + atomic_set(&clp->cl_cb_conn.cb_set, 0); case -EBADHANDLE: case -NFS4ERR_BAD_STATEID: /* Race: client probably got cb_recall diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b205c7d7bc6..d7b5e6b8920 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -182,7 +182,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; - struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; + struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn; dprintk("NFSD alloc_init_deleg\n"); if (fp->fi_had_conflict) @@ -633,19 +633,19 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) static void shutdown_callback_client(struct nfs4_client *clp) { - struct rpc_clnt *clnt = clp->cl_callback.cb_client; + struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; if (clnt) { /* * Callback threads take a reference on the client, so there * should be no outstanding callbacks at this point. 
*/ - clp->cl_callback.cb_client = NULL; + clp->cl_cb_conn.cb_client = NULL; rpc_shutdown_client(clnt); } - if (clp->cl_callback.cb_cred) { - put_rpccred(clp->cl_callback.cb_cred); - clp->cl_callback.cb_cred = NULL; + if (clp->cl_cb_conn.cb_cred) { + put_rpccred(clp->cl_cb_conn.cb_cred); + clp->cl_cb_conn.cb_cred = NULL; } } @@ -719,7 +719,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir) return NULL; memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_count, 1); - atomic_set(&clp->cl_callback.cb_set, 0); + atomic_set(&clp->cl_cb_conn.cb_set, 0); INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); @@ -971,7 +971,7 @@ parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigne static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) { - struct nfs4_callback *cb = &clp->cl_callback; + struct nfs4_cb_conn *cb = &clp->cl_cb_conn; /* Currently, we only support tcp for the callback channel */ if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) @@ -1691,7 +1691,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, else { /* XXX: We just turn off callbacks until we can handle * change request correctly. */ - atomic_set(&conf->cl_callback.cb_set, 0); + atomic_set(&conf->cl_cb_conn.cb_set, 0); expire_client(unconf); status = nfs_ok; @@ -2425,7 +2425,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta { struct nfs4_delegation *dp; struct nfs4_stateowner *sop = stp->st_stateowner; - struct nfs4_callback *cb = &sop->so_client->cl_callback; + struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn; struct file_lock fl, *flp = &fl; int status, flag = 0; @@ -2617,7 +2617,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) - && !atomic_read(&clp->cl_callback.cb_set)) + && !atomic_read(&clp->cl_cb_conn.cb_set)) goto out; status = nfs_ok; out: diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 8d882a3eb4b..563c367a301 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -88,7 +88,7 @@ struct nfs4_delegation { #define dl_fh dl_recall.cbr_fh /* client delegation callback info */ -struct nfs4_callback { +struct nfs4_cb_conn { /* SETCLIENTID info */ u32 cb_addr; unsigned short cb_port; @@ -186,7 +186,7 @@ struct nfs4_client { struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ - struct nfs4_callback cl_callback; /* callback info */ + struct nfs4_cb_conn cl_cb_conn; /* callback info */ atomic_t cl_count; /* ref count */ u32 cl_firststate; /* recovery dir creation */ -- cgit v1.2.3-70-g09d2 From b53d40c5070bffde1b2bcaf848412a50d8894794 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 1 May 2009 19:50:00 -0400 Subject: nfsd4: eliminate struct nfs4_cb_recall The nfs4_cb_recall struct is used only in nfs4_delegation, so its pointer to the containing delegation is unnecessary--we could just use container_of(). But there's no real reason to have this a separate struct at all--just move these fields to nfs4_delegation. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 24 +++++++++++------------- fs/nfsd/nfs4state.c | 5 ++--- include/linux/nfsd/state.h | 18 +++++------------- 3 files changed, 18 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ed860d7ddd1..2509305f6f5 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -215,18 +215,18 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) } static int -encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) +encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp) { __be32 *p; - int len = cb_rec->cbr_fh.fh_size; + int len = dp->dl_fh.fh_size; - RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); + RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); WRITE32(OP_CB_RECALL); - WRITE32(cb_rec->cbr_stateid.si_generation); - WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); - WRITE32(cb_rec->cbr_trunc); + WRITE32(dp->dl_stateid.si_generation); + WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); + WRITE32(dp->dl_trunc); WRITE32(len); - WRITEMEM(&cb_rec->cbr_fh.fh_base, len); + WRITEMEM(&dp->dl_fh.fh_base, len); return 0; } @@ -241,11 +241,11 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) } static int -nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args) +nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args) { struct xdr_stream xdr; struct nfs4_cb_compound_hdr hdr = { - .ident = args->cbr_ident, + .ident = args->dl_ident, .nops = 1, }; @@ -502,17 +502,15 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_client; struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; - struct nfs4_cb_recall *cbr = &dp->dl_recall; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], - .rpc_argp = cbr, + .rpc_argp = dp, .rpc_cred = clp->cl_cb_conn.cb_cred }; int retries = 1; int status = 0; - cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ - cbr->cbr_dp = dp; + dp->dl_trunc = 0; /* XXX need to implement truncate optimization */ status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); while (retries--) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d7b5e6b8920..3e5345e01b1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -203,9 +203,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f get_file(stp->st_vfs_file); dp->dl_vfs_file = stp->st_vfs_file; dp->dl_type = type; - dp->dl_recall.cbr_dp = NULL; - dp->dl_recall.cbr_ident = cb->cb_ident; - dp->dl_recall.cbr_trunc = 0; + dp->dl_ident = cb->cb_ident; + dp->dl_trunc = 0; dp->dl_stateid.si_boot = get_seconds(); dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 563c367a301..233b60d39b8 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -61,15 +61,6 @@ typedef struct { #define si_stateownerid si_opaque.so_stateownerid #define si_fileid si_opaque.so_fileid - -struct nfs4_cb_recall { - u32 cbr_ident; - int cbr_trunc; - stateid_t cbr_stateid; - struct knfsd_fh cbr_fh; - struct nfs4_delegation *cbr_dp; -}; - struct nfs4_delegation { struct list_head dl_perfile; struct list_head dl_perclnt; @@ -81,12 +72,13 @@ struct nfs4_delegation { struct file *dl_vfs_file; u32 dl_type; time_t dl_time; - struct nfs4_cb_recall dl_recall; +/* For recall: 
*/ + u32 dl_ident; + int dl_trunc; + stateid_t dl_stateid; + struct knfsd_fh dl_fh; }; -#define dl_stateid dl_recall.cbr_stateid -#define dl_fh dl_recall.cbr_fh - /* client delegation callback info */ struct nfs4_cb_conn { /* SETCLIENTID info */ -- cgit v1.2.3-70-g09d2 From 6707bd3d420f53ae8f090dac871843f6f43c9980 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 1 May 2009 19:57:46 -0400 Subject: nfsd4: remove unused dl_trunc There's no point in keeping this field around--it's always zero. (Background: the protocol allows you to tell the client that the file is about to be truncated, as an optimization to save the client from writing back dirty pages that will just be discarded. We don't implement this hint. If we do some day, adding this field back in will be the least of the work involved.) Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 4 +--- fs/nfsd/nfs4state.c | 1 - include/linux/nfsd/state.h | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 2509305f6f5..0420b5e6e20 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -224,7 +224,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp) WRITE32(OP_CB_RECALL); WRITE32(dp->dl_stateid.si_generation); WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); - WRITE32(dp->dl_trunc); + WRITE32(0); /* truncate optimization not implemented */ WRITE32(len); WRITEMEM(&dp->dl_fh.fh_base, len); return 0; @@ -510,8 +510,6 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) int retries = 1; int status = 0; - dp->dl_trunc = 0; /* XXX need to implement truncate optimization */ - status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); while (retries--) { switch (status) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3e5345e01b1..cbb16e191d5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -204,7 +204,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_vfs_file = stp->st_vfs_file; dp->dl_type = type; dp->dl_ident = cb->cb_ident; - dp->dl_trunc = 0; dp->dl_stateid.si_boot = get_seconds(); dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 233b60d39b8..346b603072c 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -74,7 +74,6 @@ struct nfs4_delegation { time_t dl_time; /* For recall: */ u32 dl_ident; - int dl_trunc; stateid_t dl_stateid; struct knfsd_fh dl_fh; }; -- cgit v1.2.3-70-g09d2 From 3aea09dc9106407d8bc18e593fbffda9ad632844 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 1 May 2009 20:11:12 -0400 Subject: nfsd4: track recall retries in nfs4_delegation Move this out of a local variable into the nfs4_delegation object in preparation for making this an async rpc call (at which point we'll need any state like this in a common object that's preserved across function calls). Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 4 ++-- include/linux/nfsd/state.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 0420b5e6e20..b88b207d75d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -507,11 +507,11 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) .rpc_argp = dp, .rpc_cred = clp->cl_cb_conn.cb_cred }; - int retries = 1; int status = 0; + dp->dl_retries = 1; status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); - while (retries--) { + while (dp->dl_retries--) { switch (status) { case -EIO: /* Network partition? */ diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 346b603072c..c0c49215ddc 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -76,6 +76,7 @@ struct nfs4_delegation { u32 dl_ident; stateid_t dl_stateid; struct knfsd_fh dl_fh; + int dl_retries; }; /* client delegation callback info */ -- cgit v1.2.3-70-g09d2 From a25cbd045a2ffc42787d4dbcbb9c7118f5f42732 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 1 May 2009 14:45:46 +0900 Subject: clocksource: setup mult_orig in clocksource_enable() Setup clocksource mult_orig in clocksource_enable(). Clocksource drivers can save power by keeping the device clock disabled while the clocksource is unused. In practice this means that the enable() and disable() callbacks perform clk_enable() and clk_disable(). The enable() callback may also use clk_get_rate() to get the clock rate from the clock framework. This information can then be used to calculate the shift and mult variables. Currently the mult_orig variable is set up from mult at registration time only. This conflicts with the above case, since the clock is disabled and the mult variable is not yet calculated at the time of registration. Moving the mult_orig setup code to clocksource_enable() allows us to handle both the common case with no enable() callback and the mult-changed-after-enable() case. [ Impact: allow dynamic clock source usage ] Signed-off-by: Magnus Damm LKML-Reference: <20090501054546.8193.10688.sendpatchset@rx1.opensource.se> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 10 +++++++++- kernel/time/clocksource.c | 3 --- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 5a40d14daa9..c56457c8334 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -288,7 +288,15 @@ static inline cycle_t clocksource_read(struct clocksource *cs) */ static inline int clocksource_enable(struct clocksource *cs) { - return cs->enable ? cs->enable(cs) : 0; + int ret = 0; + + if (cs->enable) + ret = cs->enable(cs); + + /* save mult_orig on enable */ + cs->mult_orig = cs->mult; + + return ret; } /** diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ecfd7b5187e..80189f6f1c5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; - /* save mult_orig on registration */ - c->mult_orig = c->mult; - spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) -- cgit v1.2.3-70-g09d2
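Between the two patches, a hedged sketch of the driver-side pattern this change unblocks (all names invented): the device clock stays off until the clocksource is enabled, and mult is derived from the rate only at that point, after which clocksource_enable() snapshots it into mult_orig:

#include <linux/clk.h>
#include <linux/clocksource.h>

static struct clk *example_clk;         /* assumed clk_get()'d at probe */

static int example_cs_enable(struct clocksource *cs)
{
        int ret = clk_enable(example_clk);

        if (ret)
                return ret;

        /* The rate is only meaningful once the clock is running, so
         * mult is computed here rather than at registration time. */
        cs->mult = clocksource_hz2mult(clk_get_rate(example_clk), cs->shift);
        return 0;
}

static void example_cs_disable(struct clocksource *cs)
{
        clk_disable(example_clk);
}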
From 7d27558c4138ac6b3684dea35c2f4379b940a7dd Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 1 May 2009 13:10:26 -0700 Subject: timekeeping: create arch_gettimeoffset infrastructure Some arches don't supply their own clocksource. This is mainly the case in architectures that get their inter-tick times by reading the counter on their interval timer. Since these timers wrap every tick, they're not really useful as clocksources. Wrapping them to act like one is possible but not very efficient. So we provide a callout these arches can implement for use with the jiffies clocksource to provide finer than tick granular time. [ Impact: ease the migration to generic time keeping ] Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/time.h | 15 +++++++++++++++ kernel/time/timekeeping.c | 7 +++++++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index 242f62499bb..ea16c1a01d5 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -113,6 +113,21 @@ struct timespec current_kernel_time(void); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) +/* Some architectures do not supply their own clocksource. + * This is mainly the case in architectures that get their + * inter-tick times by reading the counter on their interval + * timer. Since these timers wrap every tick, they're not really + * useful as clocksources. Wrapping them to act like one is possible + * but not very efficient. So we provide a callout these arches + * can implement for use with the jiffies clocksource to provide + * finer than tick granular time.
+ */ +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +extern u32 arch_gettimeoffset(void); +#else +static inline u32 arch_gettimeoffset(void) { return 0; } +#endif + extern void do_gettimeofday(struct timeval *tv); extern int do_settimeofday(struct timespec *tv); extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..e97c50f8458 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -77,6 +77,10 @@ static void clocksource_forward_now(void) clock->cycle_last = cycle_now; nsec = cyc2ns(clock, cycle_delta); + + /* If arch requires, add in gettimeoffset() */ + nsec += arch_gettimeoffset(); + timespec_add_ns(&xtime, nsec); nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; @@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts) /* convert to nanoseconds: */ nsecs = cyc2ns(clock, cycle_delta); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + } while (read_seqretry(&xtime_lock, seq)); timespec_add_ns(ts, nsecs); -- cgit v1.2.3-70-g09d2 From d5ed4c2e5ce9f5f6fd6a5a39ee1196a1f8a46eed Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 30 Apr 2009 07:02:49 +0000 Subject: clocksource: SuperH MTU2 Timer driver This patch adds a MTU2 driver for the SuperH architecture. The MTU2 driver is a platform driver with early platform support to allow using a MTU2 channel as only clockevent during system bootup. Clocksource on sh2a is currently unsupported due to code generation issues with 64-bit math, so at this point only periodic clockevent support is in place. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 11 ++ drivers/clocksource/Makefile | 1 + drivers/clocksource/sh_mtu2.c | 357 ++++++++++++++++++++++++++++++++++++++++++ include/linux/sh_mtu2.h | 12 ++ 4 files changed, 381 insertions(+) create mode 100644 drivers/clocksource/sh_mtu2.c create mode 100644 include/linux/sh_mtu2.h (limited to 'include/linux') diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 6f91478826d..6d0dd378eca 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -113,6 +113,9 @@ config SYS_SUPPORTS_PCI config SYS_SUPPORTS_CMT bool +config SYS_SUPPORTS_MTU2 + bool + config STACKTRACE_SUPPORT def_bool y @@ -478,6 +481,14 @@ config SH_MTU2 help This enables the use of the MTU2 as the system timer. +config SH_TIMER_MTU2 + bool "MTU2 timer driver" + depends on SYS_SUPPORTS_MTU2 && !SH_MTU2 + default y + select GENERIC_CLOCKEVENTS + help + This enables build of the MTU2 timer driver. 
+ config SH_TIMER_IRQ int default "28" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \ diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 1efb2879a94..9785586dc8c 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o obj-$(CONFIG_SCx200HR_TIMER) += scx200_hrt.o obj-$(CONFIG_SH_TIMER_CMT) += sh_cmt.o +obj-$(CONFIG_SH_TIMER_MTU2) += sh_mtu2.o diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c new file mode 100644 index 00000000000..420566f4c50 --- /dev/null +++ b/drivers/clocksource/sh_mtu2.c @@ -0,0 +1,357 @@ +/* + * SuperH Timer Support - MTU2 + * + * Copyright (C) 2009 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sh_mtu2_priv { + void __iomem *mapbase; + struct clk *clk; + struct irqaction irqaction; + struct platform_device *pdev; + unsigned long rate; + unsigned long periodic; + struct clock_event_device ced; +}; + +static DEFINE_SPINLOCK(sh_mtu2_lock); + +#define TSTR -1 /* shared register */ +#define TCR 0 /* channel register */ +#define TMDR 1 /* channel register */ +#define TIOR 2 /* channel register */ +#define TIER 3 /* channel register */ +#define TSR 4 /* channel register */ +#define TCNT 5 /* channel register */ +#define TGR 6 /* channel register */ + +static unsigned long mtu2_reg_offs[] = { + [TCR] = 0, + [TMDR] = 1, + [TIOR] = 2, + [TIER] = 4, + [TSR] = 5, + [TCNT] = 6, + [TGR] = 8, +}; + +static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr) +{ + struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == TSTR) + return ioread8(base + cfg->channel_offset); + + offs = mtu2_reg_offs[reg_nr]; + + if ((reg_nr == TCNT) || (reg_nr == TGR)) + return ioread16(base + offs); + else + return ioread8(base + offs); +} + +static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr, + unsigned long value) +{ + struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == TSTR) { + iowrite8(value, base + cfg->channel_offset); + return; + } + + offs = mtu2_reg_offs[reg_nr]; + + if ((reg_nr == TCNT) || (reg_nr == TGR)) + iowrite16(value, base + offs); + else + iowrite8(value, base + offs); +} + +static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start) +{ + struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + unsigned long flags, value; + + /* start stop register shared by multiple timer channels */ + spin_lock_irqsave(&sh_mtu2_lock, flags); + value = sh_mtu2_read(p, TSTR); + + if (start) + value |= 1 << cfg->timer_bit; + else + value &= ~(1 << 
cfg->timer_bit); + + sh_mtu2_write(p, TSTR, value); + spin_unlock_irqrestore(&sh_mtu2_lock, flags); +} + +static int sh_mtu2_enable(struct sh_mtu2_priv *p) +{ + struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + int ret; + + /* enable clock */ + ret = clk_enable(p->clk); + if (ret) { + pr_err("sh_mtu2: cannot enable clock \"%s\"\n", cfg->clk); + return ret; + } + + /* make sure channel is disabled */ + sh_mtu2_start_stop_ch(p, 0); + + p->rate = clk_get_rate(p->clk) / 64; + p->periodic = (p->rate + HZ/2) / HZ; + + /* "Periodic Counter Operation" */ + sh_mtu2_write(p, TCR, 0x23); /* TGRA clear, divide clock by 64 */ + sh_mtu2_write(p, TIOR, 0); + sh_mtu2_write(p, TGR, p->periodic); + sh_mtu2_write(p, TCNT, 0); + sh_mtu2_write(p, TMDR, 0); + sh_mtu2_write(p, TIER, 0x01); + + /* enable channel */ + sh_mtu2_start_stop_ch(p, 1); + + return 0; +} + +static void sh_mtu2_disable(struct sh_mtu2_priv *p) +{ + /* disable channel */ + sh_mtu2_start_stop_ch(p, 0); + + /* stop clock */ + clk_disable(p->clk); +} + +static irqreturn_t sh_mtu2_interrupt(int irq, void *dev_id) +{ + struct sh_mtu2_priv *p = dev_id; + + /* acknowledge interrupt */ + sh_mtu2_read(p, TSR); + sh_mtu2_write(p, TSR, 0xfe); + + /* notify clockevent layer */ + p->ced.event_handler(&p->ced); + return IRQ_HANDLED; +} + +static struct sh_mtu2_priv *ced_to_sh_mtu2(struct clock_event_device *ced) +{ + return container_of(ced, struct sh_mtu2_priv, ced); +} + +static void sh_mtu2_clock_event_mode(enum clock_event_mode mode, + struct clock_event_device *ced) +{ + struct sh_mtu2_priv *p = ced_to_sh_mtu2(ced); + int disabled = 0; + + /* deal with old setting first */ + switch (ced->mode) { + case CLOCK_EVT_MODE_PERIODIC: + sh_mtu2_disable(p); + disabled = 1; + break; + default: + break; + } + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + pr_info("sh_mtu2: %s used for periodic clock events\n", + ced->name); + sh_mtu2_enable(p); + break; + case CLOCK_EVT_MODE_UNUSED: + if (!disabled) + sh_mtu2_disable(p); + break; + case CLOCK_EVT_MODE_SHUTDOWN: + default: + break; + } +} + +static void sh_mtu2_register_clockevent(struct sh_mtu2_priv *p, + char *name, unsigned long rating) +{ + struct clock_event_device *ced = &p->ced; + int ret; + + memset(ced, 0, sizeof(*ced)); + + ced->name = name; + ced->features = CLOCK_EVT_FEAT_PERIODIC; + ced->rating = rating; + ced->cpumask = cpumask_of(0); + ced->set_mode = sh_mtu2_clock_event_mode; + + ret = setup_irq(p->irqaction.irq, &p->irqaction); + if (ret) { + pr_err("sh_mtu2: failed to request irq %d\n", + p->irqaction.irq); + return; + } + + pr_info("sh_mtu2: %s used for clock events\n", ced->name); + clockevents_register_device(ced); +} + +int sh_mtu2_register(struct sh_mtu2_priv *p, char *name, + unsigned long clockevent_rating) +{ + if (clockevent_rating) + sh_mtu2_register_clockevent(p, name, clockevent_rating); + + return 0; +} + +static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev) +{ + struct sh_mtu2_config *cfg = pdev->dev.platform_data; + struct resource *res; + int irq, ret; + ret = -ENXIO; + + memset(p, 0, sizeof(*p)); + p->pdev = pdev; + + if (!cfg) { + dev_err(&p->pdev->dev, "missing platform data\n"); + goto err0; + } + + platform_set_drvdata(pdev, p); + + res = platform_get_resource(p->pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&p->pdev->dev, "failed to get I/O memory\n"); + goto err0; + } + + irq = platform_get_irq(p->pdev, 0); + if (irq < 0) { + dev_err(&p->pdev->dev, "failed to get irq\n"); + goto err0; + } + + /* map memory, let mapbase point to 
our channel */ + p->mapbase = ioremap_nocache(res->start, resource_size(res)); + if (p->mapbase == NULL) { + pr_err("sh_mtu2: failed to remap I/O memory\n"); + goto err0; + } + + /* setup data for setup_irq() (too early for request_irq()) */ + p->irqaction.name = cfg->name; + p->irqaction.handler = sh_mtu2_interrupt; + p->irqaction.dev_id = p; + p->irqaction.irq = irq; + p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL; + p->irqaction.mask = CPU_MASK_NONE; + + /* get hold of clock */ + p->clk = clk_get(&p->pdev->dev, cfg->clk); + if (IS_ERR(p->clk)) { + pr_err("sh_mtu2: cannot get clock \"%s\"\n", cfg->clk); + ret = PTR_ERR(p->clk); + goto err1; + } + + return sh_mtu2_register(p, cfg->name, cfg->clockevent_rating); + err1: + iounmap(p->mapbase); + err0: + return ret; +} + +static int __devinit sh_mtu2_probe(struct platform_device *pdev) +{ + struct sh_mtu2_priv *p = platform_get_drvdata(pdev); + struct sh_mtu2_config *cfg = pdev->dev.platform_data; + int ret; + + if (p) { + pr_info("sh_mtu2: %s kept as earlytimer\n", cfg->name); + return 0; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) { + dev_err(&pdev->dev, "failed to allocate driver data\n"); + return -ENOMEM; + } + + ret = sh_mtu2_setup(p, pdev); + if (ret) { + kfree(p); + platform_set_drvdata(pdev, NULL); + } + return ret; +} + +static int __devexit sh_mtu2_remove(struct platform_device *pdev) +{ + return -EBUSY; /* cannot unregister clockevent */ +} + +static struct platform_driver sh_mtu2_device_driver = { + .probe = sh_mtu2_probe, + .remove = __devexit_p(sh_mtu2_remove), + .driver = { + .name = "sh_mtu2", + } +}; + +static int __init sh_mtu2_init(void) +{ + return platform_driver_register(&sh_mtu2_device_driver); +} + +static void __exit sh_mtu2_exit(void) +{ + platform_driver_unregister(&sh_mtu2_device_driver); +} + +early_platform_init("earlytimer", &sh_mtu2_device_driver); +module_init(sh_mtu2_init); +module_exit(sh_mtu2_exit); + +MODULE_AUTHOR("Magnus Damm"); +MODULE_DESCRIPTION("SuperH MTU2 Timer Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/sh_mtu2.h b/include/linux/sh_mtu2.h new file mode 100644 index 00000000000..a98876340ff --- /dev/null +++ b/include/linux/sh_mtu2.h @@ -0,0 +1,12 @@ +#ifndef __SH_MTU2_H__ +#define __SH_MTU2_H__ + +struct sh_mtu2_config { + char *name; + int channel_offset; + int timer_bit; + char *clk; + unsigned long clockevent_rating; +}; + +#endif /* __SH_MTU2_H__ */ -- cgit v1.2.3-70-g09d2 From 9570ef20423b549757aa484ad388f9a7d5bdc4d9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 1 May 2009 06:51:00 +0000 Subject: clocksource: SuperH TMU Timer driver This patch adds a TMU driver for the SuperH architecture. The TMU driver is a platform driver with early platform support to allow using a TMU channel as clockevent or clocksource during system bootup or later. Clocksource or clockevent can be selected. Both periodic and oneshot clockevents are supported. 
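One detail worth flagging before the diff: the TMU counts down, while the clocksource core expects a monotonically increasing cycle count, so the driver's read hook returns the bitwise complement of TCNT. A one-function illustration of why that works for a 32-bit down-counter:

#include <linux/clocksource.h>

/*
 * For a 32-bit value, 0xffffffff - tcnt never borrows, so the
 * subtraction equals the bitwise complement the driver uses:
 * tcnt ^ 0xffffffff == 0xffffffff - tcnt.
 */
static inline cycle_t tmu_count_to_cycles(unsigned long tcnt)
{
        return tcnt ^ 0xffffffff;
}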
Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 12 ++ drivers/clocksource/Makefile | 1 + drivers/clocksource/sh_tmu.c | 461 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sh_tmu.h | 13 ++ 4 files changed, 487 insertions(+) create mode 100644 drivers/clocksource/sh_tmu.c create mode 100644 include/linux/sh_tmu.h (limited to 'include/linux') diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index bda3dc1b0c2..1f74737c4f2 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -116,6 +116,9 @@ config SYS_SUPPORTS_CMT config SYS_SUPPORTS_MTU2 bool +config SYS_SUPPORTS_TMU + bool + config STACKTRACE_SUPPORT def_bool y @@ -470,6 +473,15 @@ config SH_TMU help This enables the use of the TMU as the system timer. +config SH_TIMER_TMU + bool "TMU timer driver" + depends on !SH_TMU && SYS_SUPPORTS_TMU + default y + select GENERIC_TIME + select GENERIC_CLOCKEVENTS + help + This enables the build of the TMU timer driver. + config SH_TIMER_CMT bool "CMT timer driver" depends on SYS_SUPPORTS_CMT diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 9785586dc8c..eef216f7f61 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o obj-$(CONFIG_SCx200HR_TIMER) += scx200_hrt.o obj-$(CONFIG_SH_TIMER_CMT) += sh_cmt.o obj-$(CONFIG_SH_TIMER_MTU2) += sh_mtu2.o +obj-$(CONFIG_SH_TIMER_TMU) += sh_tmu.o diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c new file mode 100644 index 00000000000..21bd77aa6a3 --- /dev/null +++ b/drivers/clocksource/sh_tmu.c @@ -0,0 +1,461 @@ +/* + * SuperH Timer Support - TMU + * + * Copyright (C) 2009 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sh_tmu_priv { + void __iomem *mapbase; + struct clk *clk; + struct irqaction irqaction; + struct platform_device *pdev; + unsigned long rate; + unsigned long periodic; + struct clock_event_device ced; + struct clocksource cs; +}; + +static DEFINE_SPINLOCK(sh_tmu_lock); + +#define TSTR -1 /* shared register */ +#define TCOR 0 /* channel register */ +#define TCNT 1 /* channel register */ +#define TCR 2 /* channel register */ + +static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr) +{ + struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == TSTR) + return ioread8(base - cfg->channel_offset); + + offs = reg_nr << 2; + + if (reg_nr == TCR) + return ioread16(base + offs); + else + return ioread32(base + offs); +} + +static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr, + unsigned long value) +{ + struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == TSTR) { + iowrite8(value, base - cfg->channel_offset); + return; + } + + offs = reg_nr << 2; + + if (reg_nr == TCR) + iowrite16(value, base + offs); + else + iowrite32(value, base + offs); +} + +static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start) +{ + struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + unsigned long flags, value; + + /* start stop register shared by multiple timer channels */ + spin_lock_irqsave(&sh_tmu_lock, flags); + value = sh_tmu_read(p, TSTR); + + if (start) + value |= 1 << cfg->timer_bit; + else + value &= ~(1 << cfg->timer_bit); + + sh_tmu_write(p, TSTR, value); + spin_unlock_irqrestore(&sh_tmu_lock, flags); +} + +static int sh_tmu_enable(struct sh_tmu_priv *p) +{ + struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + int ret; + + /* enable clock */ + ret = clk_enable(p->clk); + if (ret) { + pr_err("sh_tmu: cannot enable clock \"%s\"\n", cfg->clk); + return ret; + } + + /* make sure channel is disabled */ + sh_tmu_start_stop_ch(p, 0); + + /* maximum timeout */ + sh_tmu_write(p, TCOR, 0xffffffff); + sh_tmu_write(p, TCNT, 0xffffffff); + + /* configure channel to parent clock / 4, irq off */ + p->rate = clk_get_rate(p->clk) / 4; + sh_tmu_write(p, TCR, 0x0000); + + /* enable channel */ + sh_tmu_start_stop_ch(p, 1); + + return 0; +} + +static void sh_tmu_disable(struct sh_tmu_priv *p) +{ + /* disable channel */ + sh_tmu_start_stop_ch(p, 0); + + /* stop clock */ + clk_disable(p->clk); +} + +static void sh_tmu_set_next(struct sh_tmu_priv *p, unsigned long delta, + int periodic) +{ + /* stop timer */ + sh_tmu_start_stop_ch(p, 0); + + /* acknowledge interrupt */ + sh_tmu_read(p, TCR); + + /* enable interrupt */ + sh_tmu_write(p, TCR, 0x0020); + + /* reload delta value in case of periodic timer */ + if (periodic) + sh_tmu_write(p, TCOR, delta); + else + sh_tmu_write(p, TCOR, 0); + + sh_tmu_write(p, TCNT, delta); + + /* start timer */ + sh_tmu_start_stop_ch(p, 1); +} + +static irqreturn_t sh_tmu_interrupt(int irq, void *dev_id) +{ + struct sh_tmu_priv *p = dev_id; + + /* disable or acknowledge interrupt */ + if (p->ced.mode == CLOCK_EVT_MODE_ONESHOT) + sh_tmu_write(p, TCR, 
0x0000); + else + sh_tmu_write(p, TCR, 0x0020); + + /* notify clockevent layer */ + p->ced.event_handler(&p->ced); + return IRQ_HANDLED; +} + +static struct sh_tmu_priv *cs_to_sh_tmu(struct clocksource *cs) +{ + return container_of(cs, struct sh_tmu_priv, cs); +} + +static cycle_t sh_tmu_clocksource_read(struct clocksource *cs) +{ + struct sh_tmu_priv *p = cs_to_sh_tmu(cs); + + return sh_tmu_read(p, TCNT) ^ 0xffffffff; +} + +static int sh_tmu_clocksource_enable(struct clocksource *cs) +{ + struct sh_tmu_priv *p = cs_to_sh_tmu(cs); + int ret; + + ret = sh_tmu_enable(p); + if (ret) + return ret; + + /* TODO: calculate good shift from rate and counter bit width */ + cs->shift = 10; + cs->mult = clocksource_hz2mult(p->rate, cs->shift); + return 0; +} + +static void sh_tmu_clocksource_disable(struct clocksource *cs) +{ + sh_tmu_disable(cs_to_sh_tmu(cs)); +} + +static int sh_tmu_register_clocksource(struct sh_tmu_priv *p, + char *name, unsigned long rating) +{ + struct clocksource *cs = &p->cs; + + cs->name = name; + cs->rating = rating; + cs->read = sh_tmu_clocksource_read; + cs->enable = sh_tmu_clocksource_enable; + cs->disable = sh_tmu_clocksource_disable; + cs->mask = CLOCKSOURCE_MASK(32); + cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; + pr_info("sh_tmu: %s used as clock source\n", cs->name); + clocksource_register(cs); + return 0; +} + +static struct sh_tmu_priv *ced_to_sh_tmu(struct clock_event_device *ced) +{ + return container_of(ced, struct sh_tmu_priv, ced); +} + +static void sh_tmu_clock_event_start(struct sh_tmu_priv *p, int periodic) +{ + struct clock_event_device *ced = &p->ced; + + sh_tmu_enable(p); + + /* TODO: calculate good shift from rate and counter bit width */ + + ced->shift = 32; + ced->mult = div_sc(p->rate, NSEC_PER_SEC, ced->shift); + ced->max_delta_ns = clockevent_delta2ns(0xffffffff, ced); + ced->min_delta_ns = 5000; + + if (periodic) { + p->periodic = (p->rate + HZ/2) / HZ; + sh_tmu_set_next(p, p->periodic, 1); + } +} + +static void sh_tmu_clock_event_mode(enum clock_event_mode mode, + struct clock_event_device *ced) +{ + struct sh_tmu_priv *p = ced_to_sh_tmu(ced); + int disabled = 0; + + /* deal with old setting first */ + switch (ced->mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + sh_tmu_disable(p); + disabled = 1; + break; + default: + break; + } + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + pr_info("sh_tmu: %s used for periodic clock events\n", + ced->name); + sh_tmu_clock_event_start(p, 1); + break; + case CLOCK_EVT_MODE_ONESHOT: + pr_info("sh_tmu: %s used for oneshot clock events\n", + ced->name); + sh_tmu_clock_event_start(p, 0); + break; + case CLOCK_EVT_MODE_UNUSED: + if (!disabled) + sh_tmu_disable(p); + break; + case CLOCK_EVT_MODE_SHUTDOWN: + default: + break; + } +} + +static int sh_tmu_clock_event_next(unsigned long delta, + struct clock_event_device *ced) +{ + struct sh_tmu_priv *p = ced_to_sh_tmu(ced); + + BUG_ON(ced->mode != CLOCK_EVT_MODE_ONESHOT); + + /* program new delta value */ + sh_tmu_set_next(p, delta, 0); + return 0; +} + +static void sh_tmu_register_clockevent(struct sh_tmu_priv *p, + char *name, unsigned long rating) +{ + struct clock_event_device *ced = &p->ced; + int ret; + + memset(ced, 0, sizeof(*ced)); + + ced->name = name; + ced->features = CLOCK_EVT_FEAT_PERIODIC; + ced->features |= CLOCK_EVT_FEAT_ONESHOT; + ced->rating = rating; + ced->cpumask = cpumask_of(0); + ced->set_next_event = sh_tmu_clock_event_next; + ced->set_mode = sh_tmu_clock_event_mode; + + ret = setup_irq(p->irqaction.irq, &p->irqaction); 
+ if (ret) { + pr_err("sh_tmu: failed to request irq %d\n", + p->irqaction.irq); + return; + } + + pr_info("sh_tmu: %s used for clock events\n", ced->name); + clockevents_register_device(ced); +} + +static int sh_tmu_register(struct sh_tmu_priv *p, char *name, + unsigned long clockevent_rating, + unsigned long clocksource_rating) +{ + if (clockevent_rating) + sh_tmu_register_clockevent(p, name, clockevent_rating); + else if (clocksource_rating) + sh_tmu_register_clocksource(p, name, clocksource_rating); + + return 0; +} + +static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev) +{ + struct sh_tmu_config *cfg = pdev->dev.platform_data; + struct resource *res; + int irq, ret; + ret = -ENXIO; + + memset(p, 0, sizeof(*p)); + p->pdev = pdev; + + if (!cfg) { + dev_err(&p->pdev->dev, "missing platform data\n"); + goto err0; + } + + platform_set_drvdata(pdev, p); + + res = platform_get_resource(p->pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&p->pdev->dev, "failed to get I/O memory\n"); + goto err0; + } + + irq = platform_get_irq(p->pdev, 0); + if (irq < 0) { + dev_err(&p->pdev->dev, "failed to get irq\n"); + goto err0; + } + + /* map memory, let mapbase point to our channel */ + p->mapbase = ioremap_nocache(res->start, resource_size(res)); + if (p->mapbase == NULL) { + pr_err("sh_tmu: failed to remap I/O memory\n"); + goto err0; + } + + /* setup data for setup_irq() (too early for request_irq()) */ + p->irqaction.name = cfg->name; + p->irqaction.handler = sh_tmu_interrupt; + p->irqaction.dev_id = p; + p->irqaction.irq = irq; + p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL; + p->irqaction.mask = CPU_MASK_NONE; + + /* get hold of clock */ + p->clk = clk_get(&p->pdev->dev, cfg->clk); + if (IS_ERR(p->clk)) { + pr_err("sh_tmu: cannot get clock \"%s\"\n", cfg->clk); + ret = PTR_ERR(p->clk); + goto err1; + } + + return sh_tmu_register(p, cfg->name, + cfg->clockevent_rating, + cfg->clocksource_rating); + err1: + iounmap(p->mapbase); + err0: + return ret; +} + +static int __devinit sh_tmu_probe(struct platform_device *pdev) +{ + struct sh_tmu_priv *p = platform_get_drvdata(pdev); + struct sh_tmu_config *cfg = pdev->dev.platform_data; + int ret; + + if (p) { + pr_info("sh_tmu: %s kept as earlytimer\n", cfg->name); + return 0; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) { + dev_err(&pdev->dev, "failed to allocate driver data\n"); + return -ENOMEM; + } + + ret = sh_tmu_setup(p, pdev); + if (ret) { + kfree(p); + platform_set_drvdata(pdev, NULL); + } + return ret; +} + +static int __devexit sh_tmu_remove(struct platform_device *pdev) +{ + return -EBUSY; /* cannot unregister clockevent and clocksource */ +} + +static struct platform_driver sh_tmu_device_driver = { + .probe = sh_tmu_probe, + .remove = __devexit_p(sh_tmu_remove), + .driver = { + .name = "sh_tmu", + } +}; + +static int __init sh_tmu_init(void) +{ + return platform_driver_register(&sh_tmu_device_driver); +} + +static void __exit sh_tmu_exit(void) +{ + platform_driver_unregister(&sh_tmu_device_driver); +} + +early_platform_init("earlytimer", &sh_tmu_device_driver); +module_init(sh_tmu_init); +module_exit(sh_tmu_exit); + +MODULE_AUTHOR("Magnus Damm"); +MODULE_DESCRIPTION("SuperH TMU Timer Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/sh_tmu.h b/include/linux/sh_tmu.h new file mode 100644 index 00000000000..b1195e8a37d --- /dev/null +++ b/include/linux/sh_tmu.h @@ -0,0 +1,13 @@ +#ifndef __SH_TMU_H__ +#define __SH_TMU_H__ + +struct sh_tmu_config { + char *name; + unsigned 
long channel_offset; + int timer_bit; + char *clk; + unsigned long clockevent_rating; + unsigned long clocksource_rating; +}; + +#endif /* __SH_TMU_H__ */ -- cgit v1.2.3-70-g09d2 From 46a12f7426d71cabc08972cf8d3ffdd441d26a3a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 3 May 2009 17:57:17 +0900 Subject: sh: Consolidate MTU2/CMT/TMU timer platform data. All of the SH timers use a roughly identical structure for platform data, which presently is broken out for each block. Consolidate all of these definitions, as there is no reason for them to be broken out in the first place. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh2/setup-sh7619.c | 6 +++--- arch/sh/kernel/cpu/sh2a/setup-mxg.c | 8 ++++---- arch/sh/kernel/cpu/sh2a/setup-sh7201.c | 8 ++++---- arch/sh/kernel/cpu/sh2a/setup-sh7203.c | 11 +++++------ arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 13 ++++++------- arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 4 ++-- arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 4 ++-- arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 11 +++++------ arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 4 ++-- arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 4 ++-- drivers/clocksource/sh_cmt.c | 14 +++++++------- drivers/clocksource/sh_mtu2.c | 14 +++++++------- drivers/clocksource/sh_tmu.c | 14 +++++++------- include/linux/sh_cmt.h | 13 ------------- include/linux/sh_mtu2.h | 12 ------------ include/linux/sh_timer.h | 13 +++++++++++++ include/linux/sh_tmu.h | 13 ------------- 17 files changed, 69 insertions(+), 97 deletions(-) delete mode 100644 include/linux/sh_cmt.h delete mode 100644 include/linux/sh_mtu2.h create mode 100644 include/linux/sh_timer.h delete mode 100644 include/linux/sh_tmu.h (limited to 'include/linux') diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c index d70c263eb07..94ac27fc223 100644 --- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include enum { @@ -111,7 +111,7 @@ static struct platform_device eth_device = { .resource = eth_resources, }; -static struct sh_cmt_config cmt0_platform_data = { +static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x02, .timer_bit = 0, @@ -143,7 +143,7 @@ static struct platform_device cmt0_device = { .num_resources = ARRAY_SIZE(cmt0_resources), }; -static struct sh_cmt_config cmt1_platform_data = { +static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x08, .timer_bit = 1, diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c index 870030aa05b..a452d964906 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c +++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include enum { UNUSED = 0, @@ -114,7 +114,7 @@ static struct intc_mask_reg mask_registers[] __initdata = { static DECLARE_INTC_DESC(intc_desc, "mxg", vectors, groups, mask_registers, prio_registers, NULL); -static struct sh_mtu2_config mtu2_0_platform_data = { +static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, @@ -145,7 +145,7 @@ static struct platform_device mtu2_0_device = { .num_resources = ARRAY_SIZE(mtu2_0_resources), }; -static struct sh_mtu2_config mtu2_1_platform_data = { +static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, @@ -176,7 +176,7 @@ static struct platform_device mtu2_1_device = { .num_resources = 
ARRAY_SIZE(mtu2_1_resources), }; -static struct sh_mtu2_config mtu2_2_platform_data = { +static struct sh_timer_config mtu2_2_platform_data = { .name = "MTU2_2", .channel_offset = 0x80, .timer_bit = 2, diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c index 074aa9062e4..772358b7685 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include enum { @@ -251,7 +251,7 @@ static struct platform_device rtc_device = { .resource = rtc_resources, }; -static struct sh_mtu2_config mtu2_0_platform_data = { +static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, @@ -282,7 +282,7 @@ static struct platform_device mtu2_0_device = { .num_resources = ARRAY_SIZE(mtu2_0_resources), }; -static struct sh_mtu2_config mtu2_1_platform_data = { +static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, @@ -313,7 +313,7 @@ static struct platform_device mtu2_1_device = { .num_resources = ARRAY_SIZE(mtu2_1_resources), }; -static struct sh_mtu2_config mtu2_2_platform_data = { +static struct sh_timer_config mtu2_2_platform_data = { .name = "MTU2_2", .channel_offset = 0x80, .timer_bit = 2, diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c index 3448164b573..d7493418ba6 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c @@ -11,8 +11,7 @@ #include #include #include -#include -#include +#include #include enum { @@ -208,7 +207,7 @@ static struct platform_device sci_device = { }, }; -static struct sh_cmt_config cmt0_platform_data = { +static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x02, .timer_bit = 0, @@ -240,7 +239,7 @@ static struct platform_device cmt0_device = { .num_resources = ARRAY_SIZE(cmt0_resources), }; -static struct sh_cmt_config cmt1_platform_data = { +static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x08, .timer_bit = 1, @@ -272,7 +271,7 @@ static struct platform_device cmt1_device = { .num_resources = ARRAY_SIZE(cmt1_resources), }; -static struct sh_mtu2_config mtu2_0_platform_data = { +static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, @@ -303,7 +302,7 @@ static struct platform_device mtu2_0_device = { .num_resources = ARRAY_SIZE(mtu2_0_resources), }; -static struct sh_mtu2_config mtu2_1_platform_data = { +static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c index e700559b6b8..2fc6bff5c5f 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include enum { @@ -168,7 +167,7 @@ static struct platform_device sci_device = { }, }; -static struct sh_cmt_config cmt0_platform_data = { +static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x02, .timer_bit = 0, @@ -200,7 +199,7 @@ static struct platform_device cmt0_device = { .num_resources = ARRAY_SIZE(cmt0_resources), }; -static struct sh_cmt_config cmt1_platform_data = { +static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x08, .timer_bit 
= 1, @@ -232,7 +231,7 @@ static struct platform_device cmt1_device = { .num_resources = ARRAY_SIZE(cmt1_resources), }; -static struct sh_mtu2_config mtu2_0_platform_data = { +static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, @@ -263,7 +262,7 @@ static struct platform_device mtu2_0_device = { .num_resources = ARRAY_SIZE(mtu2_0_resources), }; -static struct sh_mtu2_config mtu2_1_platform_data = { +static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, @@ -294,7 +293,7 @@ static struct platform_device mtu2_1_device = { .num_resources = ARRAY_SIZE(mtu2_1_resources), }; -static struct sh_mtu2_config mtu2_2_platform_data = { +static struct sh_timer_config mtu2_2_platform_data = { .name = "MTU2_2", .channel_offset = 0x80, .timer_bit = 2, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c index cb5b4db1ca2..f327b7eaf47 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include static struct resource iic0_resources[] = { @@ -141,7 +141,7 @@ static struct platform_device jpu_device = { .num_resources = ARRAY_SIZE(jpu_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index 2a771f48e9e..7361ea989b9 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include static struct resource iic_resources[] = { @@ -148,7 +148,7 @@ static struct platform_device veu1_device = { .num_resources = ARRAY_SIZE(veu1_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index 512735c5cc8..624e8e5a605 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include #include @@ -178,7 +177,7 @@ static struct platform_device jpu_device = { .num_resources = ARRAY_SIZE(jpu_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, @@ -210,7 +209,7 @@ static struct platform_device cmt_device = { .num_resources = ARRAY_SIZE(cmt_resources), }; -static struct sh_tmu_config tmu0_platform_data = { +static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, @@ -241,7 +240,7 @@ static struct platform_device tmu0_device = { .num_resources = ARRAY_SIZE(tmu0_resources), }; -static struct sh_tmu_config tmu1_platform_data = { +static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, @@ -272,7 +271,7 @@ static struct platform_device tmu1_device = { .num_resources = ARRAY_SIZE(tmu1_resources), }; -static struct sh_tmu_config tmu2_platform_data = { +static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c 
b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index dbb44949ed1..fb9b5427a06 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -101,7 +101,7 @@ static struct platform_device veu1_device = { .num_resources = ARRAY_SIZE(veu1_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 8429396acb5..e074951b88b 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -230,7 +230,7 @@ static struct platform_device veu1_device = { .num_resources = ARRAY_SIZE(veu1_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 4ff1508e5ab..aeb8c9b27b5 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include struct sh_cmt_priv { void __iomem *mapbase; @@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(sh_cmt_lock); static inline unsigned long sh_cmt_read(struct sh_cmt_priv *p, int reg_nr) { - struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -83,7 +83,7 @@ static inline unsigned long sh_cmt_read(struct sh_cmt_priv *p, int reg_nr) static inline void sh_cmt_write(struct sh_cmt_priv *p, int reg_nr, unsigned long value) { - struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -131,7 +131,7 @@ static unsigned long sh_cmt_get_counter(struct sh_cmt_priv *p, static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start) { - struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; unsigned long flags, value; /* start stop register shared by multiple timer channels */ @@ -149,7 +149,7 @@ static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start) static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate) { - struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; int ret; /* enable clock */ @@ -560,7 +560,7 @@ int sh_cmt_register(struct sh_cmt_priv *p, char *name, static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev) { - struct sh_cmt_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; struct resource *res; int irq, ret; ret = -ENXIO; @@ -638,7 +638,7 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev) static int __devinit sh_cmt_probe(struct platform_device *pdev) { struct sh_cmt_priv *p = platform_get_drvdata(pdev); - struct sh_cmt_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; int ret; if (p) { diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c index 420566f4c50..ef02185a827 100644 --- a/drivers/clocksource/sh_mtu2.c +++ 
b/drivers/clocksource/sh_mtu2.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include struct sh_mtu2_priv { void __iomem *mapbase; @@ -63,7 +63,7 @@ static unsigned long mtu2_reg_offs[] = { static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr) { - struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -81,7 +81,7 @@ static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr) static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr, unsigned long value) { - struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -100,7 +100,7 @@ static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr, static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start) { - struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; unsigned long flags, value; /* start stop register shared by multiple timer channels */ @@ -118,7 +118,7 @@ static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start) static int sh_mtu2_enable(struct sh_mtu2_priv *p) { - struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; int ret; /* enable clock */ @@ -243,7 +243,7 @@ int sh_mtu2_register(struct sh_mtu2_priv *p, char *name, static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev) { - struct sh_mtu2_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; struct resource *res; int irq, ret; ret = -ENXIO; @@ -303,7 +303,7 @@ static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev) static int __devinit sh_mtu2_probe(struct platform_device *pdev) { struct sh_mtu2_priv *p = platform_get_drvdata(pdev); - struct sh_mtu2_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; int ret; if (p) { diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c index 21bd77aa6a3..d6ea4398bf6 100644 --- a/drivers/clocksource/sh_tmu.c +++ b/drivers/clocksource/sh_tmu.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include struct sh_tmu_priv { void __iomem *mapbase; @@ -51,7 +51,7 @@ static DEFINE_SPINLOCK(sh_tmu_lock); static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr) { - struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -69,7 +69,7 @@ static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr) static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr, unsigned long value) { - struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -88,7 +88,7 @@ static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr, static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start) { - struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; unsigned long flags, value; /* start stop register shared by multiple timer channels */ @@ -106,7 +106,7 @@ static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start) static int 
sh_tmu_enable(struct sh_tmu_priv *p) { - struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; int ret; /* enable clock */ @@ -345,7 +345,7 @@ static int sh_tmu_register(struct sh_tmu_priv *p, char *name, static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev) { - struct sh_tmu_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; struct resource *res; int irq, ret; ret = -ENXIO; @@ -407,7 +407,7 @@ static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev) static int __devinit sh_tmu_probe(struct platform_device *pdev) { struct sh_tmu_priv *p = platform_get_drvdata(pdev); - struct sh_tmu_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; int ret; if (p) { diff --git a/include/linux/sh_cmt.h b/include/linux/sh_cmt.h deleted file mode 100644 index 68cacde5954..00000000000 --- a/include/linux/sh_cmt.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __SH_CMT_H__ -#define __SH_CMT_H__ - -struct sh_cmt_config { - char *name; - unsigned long channel_offset; - int timer_bit; - char *clk; - unsigned long clockevent_rating; - unsigned long clocksource_rating; -}; - -#endif /* __SH_CMT_H__ */ diff --git a/include/linux/sh_mtu2.h b/include/linux/sh_mtu2.h deleted file mode 100644 index a98876340ff..00000000000 --- a/include/linux/sh_mtu2.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __SH_MTU2_H__ -#define __SH_MTU2_H__ - -struct sh_mtu2_config { - char *name; - int channel_offset; - int timer_bit; - char *clk; - unsigned long clockevent_rating; -}; - -#endif /* __SH_MTU2_H__ */ diff --git a/include/linux/sh_timer.h b/include/linux/sh_timer.h new file mode 100644 index 00000000000..864bd56bd3b --- /dev/null +++ b/include/linux/sh_timer.h @@ -0,0 +1,13 @@ +#ifndef __SH_TIMER_H__ +#define __SH_TIMER_H__ + +struct sh_timer_config { + char *name; + long channel_offset; + int timer_bit; + char *clk; + unsigned long clockevent_rating; + unsigned long clocksource_rating; +}; + +#endif /* __SH_TIMER_H__ */ diff --git a/include/linux/sh_tmu.h b/include/linux/sh_tmu.h deleted file mode 100644 index b1195e8a37d..00000000000 --- a/include/linux/sh_tmu.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __SH_TMU_H__ -#define __SH_TMU_H__ - -struct sh_tmu_config { - char *name; - unsigned long channel_offset; - int timer_bit; - char *clk; - unsigned long clockevent_rating; - unsigned long clocksource_rating; -}; - -#endif /* __SH_TMU_H__ */ -- cgit v1.2.3-70-g09d2 From 0d905bca23aca5c86a10ee101bcd3b1abbd40b25 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:13:30 +0200 Subject: perf_counter: initialize the per-cpu context earlier percpu scheduling for perfcounters wants to take the context lock, but that lock first needs to be initialized. Currently it is an early_initcall() - but that is too late, the task tick runs much sooner than that. Call it explicitly from the scheduler init sequence instead. 
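For illustration, the ordering bug reduces to a classic use-before-init of a lock; a user-space sketch (POSIX threads stand in for the scheduler tick, and all names are invented for the example):

#include <pthread.h>
#include <stdio.h>

static struct {
	pthread_mutex_t lock;
	int nr_active;
} percpu_ctx;

static void *tick(void *unused)
{
	/* undefined behaviour if the mutex were initialized too late */
	pthread_mutex_lock(&percpu_ctx.lock);
	percpu_ctx.nr_active++;
	pthread_mutex_unlock(&percpu_ctx.lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* the fix: initialize before the first tick, not in a late initcall */
	pthread_mutex_init(&percpu_ctx.lock, NULL);

	pthread_create(&t, NULL, tick, NULL);
	pthread_join(t, NULL);
	printf("nr_active=%d\n", percpu_ctx.nr_active);
	return 0;
}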
[ Impact: fix access-before-init crash ] LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- kernel/perf_counter.c | 5 +---- kernel/sched.c | 5 ++++- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f776851f8c4..a356fa69796 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -573,6 +573,8 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; +extern void perf_counter_init(void); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -600,9 +602,10 @@ perf_counter_mmap(unsigned long addr, unsigned long len, static inline void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } + unsigned long pgoff, struct file *file) { } static inline void perf_counter_comm(struct task_struct *tsk) { } +static inline void perf_counter_init(void) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b9679c36bcc..fcdafa234a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3265,15 +3265,12 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { .notifier_call = perf_cpu_notify, }; -static int __init perf_counter_init(void) +void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); - - return 0; } -early_initcall(perf_counter_init); static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) { diff --git a/kernel/sched.c b/kernel/sched.c index 2f600e30dcf..a728976a3a6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -8996,7 +8997,7 @@ void __init sched_init(void) * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). @@ -9097,6 +9098,8 @@ void __init sched_init(void) alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ + perf_counter_init(); + scheduler_running = 1; } -- cgit v1.2.3-70-g09d2 From f0d2c681ac0a85142fc8abe65fc33fcad35cb9b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 13:43:37 -0400 Subject: ring-buffer: add counters for commit overrun and nmi dropped entries The condition the WARN_ON in the ring buffer checks for (a commit being preempted while the buffer fills up with preceding writes) can happen in normal operation. The WARN_ON makes it look like a bug; worse, because it does not stop tracing and calls printk, which can itself recurse, it is prone to deadlock (the WARN_ON is not in a position to recurse safely). This patch removes the WARN_ON and replaces it with a counter that can be retrieved by a tracer. This counter is called commit_overrun. While at it, I added an nmi_dropped counter to count any time an NMI entry is dropped because the NMI could not take the spinlock.
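For illustration, a tracer can total the new per-cpu counters using only the accessors this patch adds; an in-kernel sketch (the summing helper itself is hypothetical, not part of the patch):

static unsigned long rb_total_dropped(struct ring_buffer *buffer)
{
	unsigned long dropped = 0;
	int cpu;

	/* the accessors below return 0 for cpus outside buffer->cpumask */
	for_each_possible_cpu(cpu) {
		dropped += ring_buffer_nmi_dropped_cpu(buffer, cpu);
		dropped += ring_buffer_commit_overrun_cpu(buffer, cpu);
	}
	return dropped;
}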
[ Impact: prevent deadlock by printing normal case warning ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 52 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 1c2f80911fb..f1345828c7c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -153,6 +153,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer); unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3e86da9b2a0..26e1359fe19 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -402,6 +402,8 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long nmi_dropped; + unsigned long commit_overrun; unsigned long overrun; unsigned long entries; u64 write_stamp; @@ -1216,8 +1218,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * simply fail. */ if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; goto out_reset; + } } else __raw_spin_lock(&cpu_buffer->lock); @@ -1238,8 +1242,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * about it. 
*/ if (unlikely(next_page == commit_page)) { - /* This can easily happen on small ring buffers */ - WARN_ON_ONCE(buffer->pages > 2); + cpu_buffer->commit_overrun++; goto out_reset; } @@ -1925,6 +1928,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); +/** + * ring_buffer_nmi_dropped_cpu - get the number of NMI entries that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of dropped NMI entries from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->nmi_dropped; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->commit_overrun; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer @@ -2595,6 +2639,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; + cpu_buffer->nmi_dropped = 0; + cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; cpu_buffer->entries = 0; -- cgit v1.2.3-70-g09d2 From c66de4a5be7913247bd83d79168f8e4420c9cfbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:22 +0200 Subject: perf_counter: uncouple data_head updates from wakeups Keep data_head up-to-date irrespective of notifications. This fixes the case where you disable a counter and don't get a notification for the last few pending events, and it also allows polling usage.
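For illustration, the read side can now drain the buffer on its own schedule instead of waiting for a wakeup; a hedged user-space sketch, assuming mp is the mmap()ed struct perf_counter_mmap_page, data/mask describe the data pages, and records use the struct perf_event_header framing (function and variable names are invented):

static unsigned int tail;	/* reader-private read position */

static void drain(volatile struct perf_counter_mmap_page *mp,
		  unsigned char *data, unsigned int mask,
		  void (*consume)(struct perf_event_header *hdr))
{
	unsigned int head = mp->data_head;	/* written by the kernel */

	__sync_synchronize();	/* the user-space rmb() referred to above */

	while (tail != head) {
		struct perf_event_header *hdr = (void *)&data[tail & mask];

		consume(hdr);
		tail += hdr->size;	/* each record carries its own size */
	}
}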
[ Impact: increase precision of perfcounter mmap-ed fields ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155436.925084300@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++- kernel/perf_counter.c | 20 +++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a356fa69796..17b63105f2a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -362,9 +362,11 @@ struct perf_mmap_data { atomic_t head; /* write position */ atomic_t events; /* event limit */ - atomic_t wakeup_head; /* completed head */ + atomic_t done_head; /* completed head */ atomic_t lock; /* concurrent writes */ + atomic_t wakeup; /* needs a wakeup */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5f86a1156c9..ba5e921e1f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1696,7 +1696,6 @@ struct perf_output_handle { struct perf_mmap_data *data; unsigned int offset; unsigned int head; - int wakeup; int nmi; int overflow; int locked; @@ -1752,8 +1751,7 @@ static void perf_output_unlock(struct perf_output_handle *handle) struct perf_mmap_data *data = handle->data; int head, cpu; - if (handle->wakeup) - data->wakeup_head = data->head; + data->done_head = data->head; if (!handle->locked) goto out; @@ -1764,13 +1762,11 @@ again: * before we publish the new head, matched by a rmb() in userspace when * reading this position. */ - while ((head = atomic_xchg(&data->wakeup_head, 0))) { + while ((head = atomic_xchg(&data->done_head, 0))) data->user_page->data_head = head; - handle->wakeup = 1; - } /* - * NMI can happen here, which means we can miss a wakeup_head update. + * NMI can happen here, which means we can miss a done_head update. */ cpu = atomic_xchg(&data->lock, 0); @@ -1779,7 +1775,7 @@ again: /* * Therefore we have to validate we did not indeed do so. */ - if (unlikely(atomic_read(&data->wakeup_head))) { + if (unlikely(atomic_read(&data->done_head))) { /* * Since we had it locked, we can lock it again. */ @@ -1789,7 +1785,7 @@ again: goto again; } - if (handle->wakeup) + if (atomic_xchg(&data->wakeup, 0)) perf_output_wakeup(handle); out: local_irq_restore(handle->flags); @@ -1824,7 +1820,9 @@ static int perf_output_begin(struct perf_output_handle *handle, handle->offset = offset; handle->head = head; - handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); + + if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) + atomic_set(&data->wakeup, 1); return 0; @@ -1882,7 +1880,7 @@ static void perf_output_end(struct perf_output_handle *handle) int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &data->events); - handle->wakeup = 1; + atomic_set(&data->wakeup, 1); } } -- cgit v1.2.3-70-g09d2 From 6de6a7b95705b859b61430fa3afa1403034eb3e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:23 +0200 Subject: perf_counter: add ioctl(PERF_COUNTER_IOC_RESET) Provide a way to reset an existing counter - this eases PAPI libraries around perfcounters. Similar to read() it doesn't collapse pending child counters. 
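Intended usage from user space looks roughly like this (a sketch, not part of the patch; fd is assumed to come from the perf_counter_open() syscall, and the header providing PERF_COUNTER_IOC_RESET is assumed to be visible to the build):

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/perf_counter.h>	/* PERF_COUNTER_IOC_RESET */

static void measure(int fd)
{
	unsigned long long count;

	if (ioctl(fd, PERF_COUNTER_IOC_RESET) < 0)	/* count restarts at 0 */
		perror("PERF_COUNTER_IOC_RESET");

	/* ... run the code being measured ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("events since reset: %llu\n", count);
}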
[ Impact: new perfcounter fd ioctl method to reset counters ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155437.022272933@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 17b63105f2a..0fcbf34a4f7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -160,6 +160,7 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) #define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) /* * Structure of the page that can be mapped via mmap diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ba5e921e1f3..6e6834e0587 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1288,6 +1288,11 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } +static void perf_counter_reset(struct perf_counter *counter) +{ + atomic_set(&counter->count, 0); +} + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -1303,6 +1308,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_REFRESH: perf_counter_refresh(counter, arg); break; + case PERF_COUNTER_IOC_RESET: + perf_counter_reset(counter); + break; default: err = -ENOTTY; } -- cgit v1.2.3-70-g09d2 From c5078f78b455fbf67ea71442c7e7ca8acf9ff095 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:24 +0200 Subject: perf_counter: provide an mlock threshold Provide a threshold to relax the mlock accounting, increasing usability. Each counter gets perf_counter_mlock_kb for free. 
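A worked example of the resulting accounting (standalone sketch; 4 KiB pages and the default allowance of 128 kb are assumed):

#include <stdio.h>

int main(void)
{
	int page_shift = 12;		/* 4 KiB pages, illustrative */
	long mlock_kb = 128;		/* sysctl_perf_counter_mlock default */
	long nr_pages = 64;		/* data pages mmap()ed for one counter */
	long free_pages = mlock_kb >> (page_shift - 10);	/* 32 pages free */
	long extra = nr_pages - free_pages;

	if (extra < 0)
		extra = 0;
	/* only the remainder is charged against RLIMIT_MEMLOCK */
	printf("pages charged: %ld\n", extra);	/* prints 32 */
	return 0;
}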
[ Impact: allow more mmap buffering ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.112113632@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 15 +++++++++++---- kernel/sysctl.c | 8 ++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0fcbf34a4f7..00081d84169 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -358,6 +358,7 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; /* nr of data pages */ + int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ atomic_t head; /* write position */ @@ -575,6 +576,7 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; +extern int sysctl_perf_counter_mlock; extern void perf_counter_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 6e6834e0587..2d134273830 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,6 +44,7 @@ static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ +int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1461,7 +1462,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - vma->vm_mm->locked_vm -= counter->data->nr_pages + 1; + vma->vm_mm->locked_vm -= counter->data->nr_locked; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); } @@ -1480,6 +1481,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long nr_pages; unsigned long locked, lock_limit; int ret = 0; + long extra; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; @@ -1507,8 +1509,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - locked = vma->vm_mm->locked_vm; - locked += nr_pages + 1; + extra = nr_pages /* + 1 only account the data pages */; + extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + if (extra < 0) + extra = 0; + + locked = vma->vm_mm->locked_vm + extra; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; @@ -1524,7 +1530,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; atomic_set(&counter->mmap_count, 1); - vma->vm_mm->locked_vm += nr_pages + 1; + vma->vm_mm->locked_vm += extra; + counter->data->nr_locked = extra; unlock: mutex_unlock(&counter->mmap_mutex); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8203d70928d..3b05c2b088d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -920,6 +920,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_mlock_kb", + .data = &sysctl_perf_counter_mlock, + .maxlen = sizeof(sysctl_perf_counter_mlock), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3-70-g09d2 From f001fde5eadd915f4858d22ed70d7040f48767cf Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 May 2009 02:48:28 +0000 Subject: net: introduce a list of 
device addresses dev_addr_list (v6) v5 -> v6 (current): -removed so far unused static functions -corrected dev_addr_del_multiple to call del instead of add v4 -> v5: -added device address type (suggested by davem) -removed refcounting (better to have simpler code than to save potentially a few bytes) v3 -> v4: -changed kzalloc to kmalloc in __hw_addr_add_ii() -ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init() v2 -> v3: -removed unnecessary rcu read locking -moved dev_addr_flush() calling to ensure no null dereference of dev_addr v1 -> v2: -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush -removed unnecessary rcu_read locking in dev_addr_init -use compare_ether_addr_64bits instead of compare_ether_addr -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr -use call_rcu instead of rcu_synchronize -moved is_etherdev_addr into __KERNEL__ ifdef This patch introduces a new list in struct net_device and brings a set of functions to handle the work with the device address list. The list is a replacement for the original dev_addr field, because in some situations there is a need to carry several device addresses with the net device. To be backward compatible, dev_addr is made to point to the first member of the list so original drivers see no difference. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 27 +++++ include/linux/netdevice.h | 37 ++++++- net/core/dev.c | 250 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 312 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index a1f17abba7d..3d7a6687d24 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2], return compare_ether_addr(addr1, addr2); #endif } + +/** + * is_etherdev_addr - Tell if a given Ethernet address belongs to the device. + * @dev: Pointer to a device structure + * @addr: Pointer to a six-byte array containing the Ethernet address + * + * Compare passed address with all addresses of the device. Return true if the + * address is one of the device addresses. + * + * Note that this function calls compare_ether_addr_64bits() so take care of + * the right padding.
+ */ +static inline bool is_etherdev_addr(const struct net_device *dev, + const u8 addr[6 + 2]) +{ + struct netdev_hw_addr *ha; + int res = 1; + + rcu_read_lock(); + for_each_dev_addr(dev, ha) { + res = compare_ether_addr_64bits(addr, ha->addr); + if (!res) + break; + } + rcu_read_unlock(); + return !res; +} #endif /* __KERNEL__ */ /** diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ff42aba403c..02882e2ebd4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -210,6 +210,16 @@ struct dev_addr_list #define dmi_users da_users #define dmi_gusers da_gusers +struct netdev_hw_addr { + struct list_head list; + unsigned char addr[MAX_ADDR_LEN]; + unsigned char type; +#define NETDEV_HW_ADDR_T_LAN 1 +#define NETDEV_HW_ADDR_T_SAN 2 +#define NETDEV_HW_ADDR_T_SLAVE 3 + struct rcu_head rcu_head; +}; + struct hh_cache { struct hh_cache *hh_next; /* Next entry */ @@ -784,8 +794,11 @@ struct net_device */ unsigned long last_rx; /* Time of last Rx */ /* Interface address info used in eth_type_trans() */ - unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast - because most packets are unicast) */ + unsigned char *dev_addr; /* hw address, (before bcast + because most packets are + unicast) */ + + struct list_head dev_addr_list; /* list of device hw addresses */ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ @@ -1791,6 +1804,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev) spin_unlock_bh(&dev->addr_list_lock); } +/* + * dev_addr_list walker. Should be used only for read access. Call with + * rcu_read_lock held. + */ +#define for_each_dev_addr(dev, ha) \ + list_for_each_entry_rcu(ha, &dev->dev_addr_list, list) + /* These functions live elsewhere (drivers/net/net_init.c, but related) */ extern void ether_setup(struct net_device *dev); @@ -1803,6 +1823,19 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, alloc_netdev_mq(sizeof_priv, name, setup, 1) extern int register_netdev(struct net_device *dev); extern void unregister_netdev(struct net_device *dev); + +/* Functions used for device addresses handling */ +extern int dev_addr_add(struct net_device *dev, unsigned char *addr, + unsigned char addr_type); +extern int dev_addr_del(struct net_device *dev, unsigned char *addr, + unsigned char addr_type); +extern int dev_addr_add_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type); +extern int dev_addr_del_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type); + /* Functions used for secondary unicast and multicast support */ extern void dev_set_rx_mode(struct net_device *dev); extern void __dev_set_rx_mode(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index 3c8073fe970..637ea71b0a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3434,6 +3434,252 @@ void dev_set_rx_mode(struct net_device *dev) netif_addr_unlock_bh(dev); } +/* hw addresses list handling functions */ + +static int __hw_addr_add(struct list_head *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + int alloc_size; + + if (addr_len > MAX_ADDR_LEN) + return -EINVAL; + + alloc_size = sizeof(*ha); + if (alloc_size < L1_CACHE_BYTES) + alloc_size = L1_CACHE_BYTES; + ha = kmalloc(alloc_size, GFP_ATOMIC); + if (!ha) + return -ENOMEM; + memcpy(ha->addr, addr, addr_len); + ha->type = addr_type; + list_add_tail_rcu(&ha->list, list); + return 0; +} + +static void ha_rcu_free(struct rcu_head 
*head) +{ + struct netdev_hw_addr *ha; + + ha = container_of(head, struct netdev_hw_addr, rcu_head); + kfree(ha); +} + +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + struct netdev_hw_addr *ha; + int i = 0; + + list_for_each_entry(ha, list, list) { + if (i++ != ignore_index && + !memcmp(ha->addr, addr, addr_len) && + (ha->type == addr_type || !addr_type)) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + return 0; + } + } + return -ENOENT; +} + +static int __hw_addr_add_multiple_ii(struct list_head *to_list, + struct list_head *from_list, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + int err; + struct netdev_hw_addr *ha, *ha2; + unsigned char type; + + list_for_each_entry(ha, from_list, list) { + type = addr_type ? addr_type : ha->type; + err = __hw_addr_add(to_list, ha->addr, addr_len, type); + if (err) + goto unroll; + } + return 0; + +unroll: + list_for_each_entry(ha2, from_list, list) { + if (ha2 == ha) + break; + type = addr_type ? addr_type : ha2->type; + __hw_addr_del_ii(to_list, ha2->addr, addr_len, type, + ignore_index); + } + return err; +} + +static void __hw_addr_del_multiple_ii(struct list_head *to_list, + struct list_head *from_list, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + struct netdev_hw_addr *ha; + unsigned char type; + + list_for_each_entry(ha, from_list, list) { + type = addr_type ? addr_type : ha->type; + __hw_addr_del_ii(to_list, ha->addr, addr_len, type, + ignore_index); + } +} + +static void __hw_addr_flush(struct list_head *list) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, list, list) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + } +} + +/* Device addresses handling functions */ + +static void dev_addr_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->dev_addr_list); + dev->dev_addr = NULL; +} + +static int dev_addr_init(struct net_device *dev) +{ + unsigned char addr[MAX_ADDR_LEN]; + struct netdev_hw_addr *ha; + int err; + + /* rtnl_mutex must be held here */ + + INIT_LIST_HEAD(&dev->dev_addr_list); + memset(addr, 0, sizeof(addr)); + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr), + NETDEV_HW_ADDR_T_LAN); + if (!err) { + /* + * Get the first (previously created) address from the list + * and set dev_addr pointer to this location. + */ + ha = list_first_entry(&dev->dev_addr_list, + struct netdev_hw_addr, list); + dev->dev_addr = ha->addr; + } + return err; +} + +/** + * dev_addr_add - Add a device address + * @dev: device + * @addr: address to add + * @addr_type: address type + * + * Add a device address to the device. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len, + addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add); + +/** + * dev_addr_del - Release a device address. + * @dev: device + * @addr: address to delete + * @addr_type: address type + * + * Remove a matching device address from the device. + * + * The caller must hold the rtnl_mutex.
+ */ +int dev_addr_del(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, + addr_type, 0); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_del); + +/** + * dev_addr_add_multiple - Add device addresses from another device + * @to_dev: device to which addresses will be added + * @from_dev: device from which addresses will be added + * @addr_type: address type - 0 means the type will be taken from from_dev + * + * Add the device addresses of one device to another. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type, 0); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add_multiple); + +/** + * dev_addr_del_multiple - Delete device addresses listed in another device + * @to_dev: device from which the addresses will be deleted + * @from_dev: device supplying the list of addresses to delete + * @addr_type: address type - 0 means the type will be taken from from_dev + * + * Delete from to_dev the addresses listed in from_dev. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + __hw_addr_del_multiple_ii(&to_dev->dev_addr_list, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type, 0); + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return 0; +} +EXPORT_SYMBOL(dev_addr_del_multiple); + +/* unicast and multicast addresses handling functions */ + int __dev_addr_delete(struct dev_addr_list **list, int *count, void *addr, int alen, int glbl) { @@ -4776,6 +5022,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; + dev_addr_init(dev); netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); @@ -4801,6 +5048,9 @@ void free_netdev(struct net_device *dev) kfree(dev->_tx); + /* Flush device addresses */ + dev_addr_flush(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); -- cgit v1.2.3-70-g09d2 From 2df75e415709ad12862028916c772c1f377f6a7c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:33:04 +0800 Subject: tracing/events: fix memory leak when unloading module When unloading a module, memory allocated by init_preds() and trace_define_field() is not freed.
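The leak and its fix reduce to a simple pairing rule: whatever the define step allocates and links onto the per-event list, the destroy step must walk and free. A standalone sketch (names invented) mirroring the trace_destroy_fields() added below:

#include <stdlib.h>
#include <string.h>

struct field {
	struct field *next;
	char *type;
	char *name;
};

static struct field *fields;	/* stands in for call->fields */

static int define_field(const char *type, const char *name)
{
	struct field *f = malloc(sizeof(*f));

	if (!f)
		return -1;
	f->type = strdup(type);
	f->name = strdup(name);
	f->next = fields;
	fields = f;
	return 0;
}

/* the missing piece: without this, every define_field() allocation leaks */
static void destroy_fields(void)
{
	struct field *f, *next;

	for (f = fields; f; f = next) {
		next = f->next;
		free(f->type);
		free(f->name);
		free(f);
	}
	fields = NULL;
}

int main(void)
{
	define_field("unsigned long", "ip");
	destroy_fields();	/* called at module-unload time in the patch */
	return 0;
}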
[ Impact: fix memory leak ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4A00F6E0.3040503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 1 + kernel/trace/trace_events.c | 18 ++++++++++++++++++ kernel/trace/trace_events_filter.c | 22 +++++++++++++++------- 3 files changed, 34 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5fff40c9ff5..662c1becf36 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -116,6 +116,7 @@ struct ftrace_event_call { #define MAX_FILTER_STR_VAL 128 extern int init_preds(struct ftrace_event_call *call); +extern void destroy_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern int filter_current_check_discard(struct ftrace_event_call *call, void *rec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f789ca540fe..f251a150e75 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -60,6 +60,22 @@ err: } EXPORT_SYMBOL_GPL(trace_define_field); +#ifdef CONFIG_MODULES + +static void trace_destroy_fields(struct ftrace_event_call *call) +{ + struct ftrace_event_field *field, *next; + + list_for_each_entry_safe(field, next, &call->fields, link) { + list_del(&field->link); + kfree(field->type); + kfree(field->name); + kfree(field); + } +} + +#endif /* CONFIG_MODULES */ + static void ftrace_clear_events(void) { struct ftrace_event_call *call; @@ -925,6 +941,8 @@ static void trace_module_remove_events(struct module *mod) unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); } } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f49486687ee..ce07b818671 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -346,6 +346,20 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } +void destroy_preds(struct ftrace_event_call *call) +{ + struct event_filter *filter = call->filter; + int i; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + kfree(filter); + call->filter = NULL; +} + int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -374,13 +388,7 @@ int init_preds(struct ftrace_event_call *call) return 0; oom: - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); - kfree(call->filter); - call->filter = NULL; + destroy_preds(call); return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From 22cfbbfd9f67b67fe073010f51cb71d3632387d5 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 6 May 2009 11:43:57 +0200 Subject: ntp: adjust SHIFT_PLL to improve NTP convergence The conversion to the ntpv4 reference model f19923937321244e7dc334767eb4b67e0e3d5c74 ("ntp: convert to the NTP4 reference model") in 2.6.19 added nanosecond resolution the adjtimex interface, but also changed the "stiffness" of the frequency adjustments, causing NTP convergence time to greatly increase. SHIFT_PLL, which reduces the stiffness of the freq adjustments, was designed to be inversely linked to HZ, and the reference value of 4 was designed for Unix systems using HZ=100. 
However, Linux's clock steering code is mostly independent of HZ. So
this patch reduces the SHIFT_PLL value from 4 to 2, which causes NTPd
behavior to match kernels prior to 2.6.19, greatly reducing convergence
times, and improving close synchronization through environmental thermal
changes.

The patch also changes some l's to L's in nearby code to avoid misreading
50l as 501.

[ Impact: tweak NTP algorithm for faster convergence ]

Signed-off-by: John Stultz
Acked-by: Rik van Riel
Cc: zippel@linux-m68k.org
Signed-off-by: Andrew Morton
LKML-Reference: <200905051956.n45JuVo9025575@imap1.linux-foundation.org>
Signed-off-by: Ingo Molnar
---
 include/linux/timex.h | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index aa3475fcff6..0daf9611ef4 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -170,17 +170,37 @@ struct timex {
 #include
 
 /*
- * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen
- * for a slightly underdamped convergence characteristic. SHIFT_KH
- * establishes the damping of the FLL and is chosen by wisdom and black
- * art.
+ * SHIFT_PLL is used as a dampening factor to define how much we
+ * adjust the frequency correction for a given offset in PLL mode.
+ * It is also used in dampening the offset correction, to define how
+ * much of the current value in time_offset we correct for each
+ * second. Changing this value changes the stiffness of the ntp
+ * adjustment code. A lower value makes it more flexible, reducing
+ * NTP convergence time. A higher value makes it stiffer, increasing
+ * convergence time, but making the clock more stable.
 *
- * MAXTC establishes the maximum time constant of the PLL. With the
- * SHIFT_KG and SHIFT_KF values given and a time constant range from
- * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours,
- * respectively.
+ * In David Mills' nanokernel reference implementation SHIFT_PLL is 4.
+ * However, this seems to make the convergence time much too long.
+ *
+ * https://lists.ntp.org/pipermail/hackers/2008-January/003487.html
+ *
+ * In the above mailing list discussion, it seems the value of 4
+ * was appropriate for other Unix systems with HZ=100, and that
+ * SHIFT_PLL should be decreased as HZ increases. However, Linux's
+ * clock steering implementation is HZ independent.
+ *
+ * Through experimentation, a SHIFT_PLL value of 2 was found to allow
+ * for fast convergence (very similar to the NTPv3 code used prior to
+ * v2.6.19), with good clock stability.
+ *
+ *
+ * SHIFT_FLL is used as a dampening factor to define how much we
+ * adjust the frequency correction for a given offset in FLL mode.
+ * In David Mills' nanokernel reference implementation SHIFT_FLL is 2.
+ *
+ * MAXTC establishes the maximum time constant of the PLL.
 */
-#define SHIFT_PLL	4	/* PLL frequency factor (shift) */
+#define SHIFT_PLL	2	/* PLL frequency factor (shift) */
 #define SHIFT_FLL	2	/* FLL frequency factor (shift) */
 #define MAXTC		10	/* maximum time constant (shift) */
 
@@ -192,10 +212,10 @@ struct timex {
 #define SHIFT_USEC	16	/* frequency offset scale (shift) */
 #define PPM_SCALE	((s64)NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
 #define PPM_SCALE_INV_SHIFT	19
-#define PPM_SCALE_INV	((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
+#define PPM_SCALE_INV	((1LL << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
			 PPM_SCALE + 1)
 
-#define MAXPHASE	500000000l	/* max phase error (ns) */
+#define MAXPHASE	500000000L	/* max phase error (ns) */
 #define MAXFREQ	500000		/* max frequency error (ns/s) */
 #define MAXFREQ_SCALED	((s64)MAXFREQ << NTP_SCALE_SHIFT)
 #define MINSEC	256	/* min interval between updates (s) */
-- 
cgit v1.2.3-70-g09d2


From a42aaa3bbce85ac487ad4fad5db99e8e91b7aac1 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle"
Date: Mon, 4 May 2009 16:27:26 -0400
Subject: blktrace: correct remap names

This attempts to clarify the names used during block I/O remap
operations (partition, volume manager). It correctly matches up the
/from/ information for both device & sector. It builds on the concept
from Kosaki Motohiro and extends it to include better naming for the
"device_from" field.

[ Impact: cleanup ]

Signed-off-by: Alan D. Brunelle
Reviewed-by: Li Zefan
Reviewed-by: KOSAKI Motohiro
Cc: Jens Axboe
Cc: Arnaldo Carvalho de Melo
LKML-Reference: <49FF4FAE.3000301@hp.com>
Signed-off-by: Ingo Molnar
---
 include/linux/blktrace_api.h |  4 ++--
 include/trace/block.h        |  4 ++--
 kernel/trace/blktrace.c      | 24 ++++++++++++------------
 3 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 62763c95285..82b4636030e 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -116,9 +116,9 @@ struct blk_io_trace {
  * The remap event
  */
 struct blk_io_trace_remap {
-	__be32 device;
 	__be32 device_from;
-	__be64 sector;
+	__be32 device_to;
+	__be64 sector_from;
 };
 
 enum {
diff --git a/include/trace/block.h b/include/trace/block.h
index 25b7068b819..87f6456fd32 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -70,7 +70,7 @@ DECLARE_TRACE(block_split,
 
 DECLARE_TRACE(block_remap,
	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t from, sector_t to),
-	TP_ARGS(q, bio, dev, from, to));
+		 sector_t to, sector_t from),
+	TP_ARGS(q, bio, dev, to, from));
 
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c32062bd10b..f8d46d6f5d3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -830,8 +830,8 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  * @q: queue the io is for
  * @bio: the source bio
  * @dev: target device
- * @from: source sector
- * @to: target sector
+ * @to: target sector
+ * @from: source sector
 *
 * Description:
 *     Device mapper or raid target sometimes need to split a bio because
 *
 **/
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				dev_t dev, sector_t from, sector_t to)
+				dev_t dev, sector_t to, sector_t from)
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
 
@@ -847,9 +847,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 	if (likely(!bt))
 		return;
 
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
+	r.device_from = cpu_to_be32(dev);
+	r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
@@ -1028,11 +1028,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
			  struct blk_io_trace_remap *r)
 {
 	const struct blk_io_trace_remap *__r = pdu_start(ent);
-	__u64 sector = __r->sector;
+	__u64 sector_from = __r->sector_from;
 
-	r->device = be32_to_cpu(__r->device);
 	r->device_from = be32_to_cpu(__r->device_from);
-	r->sector = be64_to_cpu(sector);
+	r->device_to = be32_to_cpu(__r->device_to);
+	r->sector_from = be64_to_cpu(sector_from);
 }
 
 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1148,13 +1148,13 @@ static int blk_log_with_error(struct trace_seq *s,
 
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 {
-	struct blk_io_trace_remap r = { .device = 0, };
+	struct blk_io_trace_remap r = { .device_from = 0, };
 
 	get_pdu_remap(ent, &r);
 	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
-				t_sector(ent),
-				t_sec(ent), MAJOR(r.device), MINOR(r.device),
-				(unsigned long long)r.sector);
+				t_sector(ent), t_sec(ent),
+				MAJOR(r.device_from), MINOR(r.device_from),
+				(unsigned long long)r.sector_from);
 }
 
 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
-- 
cgit v1.2.3-70-g09d2


From 2f01a1f58889fbfeb68b1bc1b52e4197f3333490 Mon Sep 17 00:00:00 2001
From: Kalle Valo
Date: Wed, 29 Apr 2009 23:33:31 +0300
Subject: wl12xx: add driver

wl12xx is a driver for the TI wl1251 802.11 chipset, designed for
embedded devices, supporting both SDIO and SPI buses. Currently the
driver supports only SPI. Adding support for the 1253 (the 5 GHz
version) should be relatively easy. More information here:

http://focus.ti.com/general/docs/wtbu/wtbuproductcontent.tsp?contentId=4711&navigationId=12494&templateId=6123

(Collapsed original sequence of pre-merge patches into single commit for
initial merge. -- JWL)

Signed-off-by: Kalle Valo
Signed-off-by: Bob Copeland
Signed-off-by: John W.
Linville --- drivers/net/wireless/Kconfig | 1 + drivers/net/wireless/Makefile | 2 + drivers/net/wireless/wl12xx/Kconfig | 11 + drivers/net/wireless/wl12xx/Makefile | 4 + drivers/net/wireless/wl12xx/acx.c | 689 ++++++++++++++ drivers/net/wireless/wl12xx/acx.h | 1245 +++++++++++++++++++++++++ drivers/net/wireless/wl12xx/boot.c | 295 ++++++ drivers/net/wireless/wl12xx/boot.h | 40 + drivers/net/wireless/wl12xx/cmd.c | 353 ++++++++ drivers/net/wireless/wl12xx/cmd.h | 265 ++++++ drivers/net/wireless/wl12xx/debugfs.c | 508 +++++++++++ drivers/net/wireless/wl12xx/debugfs.h | 33 + drivers/net/wireless/wl12xx/event.c | 127 +++ drivers/net/wireless/wl12xx/event.h | 121 +++ drivers/net/wireless/wl12xx/init.c | 200 ++++ drivers/net/wireless/wl12xx/init.h | 40 + drivers/net/wireless/wl12xx/main.c | 1358 ++++++++++++++++++++++++++++ drivers/net/wireless/wl12xx/ps.c | 151 ++++ drivers/net/wireless/wl12xx/ps.h | 36 + drivers/net/wireless/wl12xx/reg.h | 745 +++++++++++++++ drivers/net/wireless/wl12xx/rx.c | 208 +++++ drivers/net/wireless/wl12xx/rx.h | 122 +++ drivers/net/wireless/wl12xx/spi.c | 358 ++++++++ drivers/net/wireless/wl12xx/spi.h | 109 +++ drivers/net/wireless/wl12xx/tx.c | 557 ++++++++++++ drivers/net/wireless/wl12xx/tx.h | 215 +++++ drivers/net/wireless/wl12xx/wl1251.c | 709 +++++++++++++++ drivers/net/wireless/wl12xx/wl1251.h | 165 ++++ drivers/net/wireless/wl12xx/wl12xx.h | 409 +++++++++ drivers/net/wireless/wl12xx/wl12xx_80211.h | 156 ++++ include/linux/spi/wl12xx.h | 31 + 31 files changed, 9263 insertions(+) create mode 100644 drivers/net/wireless/wl12xx/Kconfig create mode 100644 drivers/net/wireless/wl12xx/Makefile create mode 100644 drivers/net/wireless/wl12xx/acx.c create mode 100644 drivers/net/wireless/wl12xx/acx.h create mode 100644 drivers/net/wireless/wl12xx/boot.c create mode 100644 drivers/net/wireless/wl12xx/boot.h create mode 100644 drivers/net/wireless/wl12xx/cmd.c create mode 100644 drivers/net/wireless/wl12xx/cmd.h create mode 100644 drivers/net/wireless/wl12xx/debugfs.c create mode 100644 drivers/net/wireless/wl12xx/debugfs.h create mode 100644 drivers/net/wireless/wl12xx/event.c create mode 100644 drivers/net/wireless/wl12xx/event.h create mode 100644 drivers/net/wireless/wl12xx/init.c create mode 100644 drivers/net/wireless/wl12xx/init.h create mode 100644 drivers/net/wireless/wl12xx/main.c create mode 100644 drivers/net/wireless/wl12xx/ps.c create mode 100644 drivers/net/wireless/wl12xx/ps.h create mode 100644 drivers/net/wireless/wl12xx/reg.h create mode 100644 drivers/net/wireless/wl12xx/rx.c create mode 100644 drivers/net/wireless/wl12xx/rx.h create mode 100644 drivers/net/wireless/wl12xx/spi.c create mode 100644 drivers/net/wireless/wl12xx/spi.h create mode 100644 drivers/net/wireless/wl12xx/tx.c create mode 100644 drivers/net/wireless/wl12xx/tx.h create mode 100644 drivers/net/wireless/wl12xx/wl1251.c create mode 100644 drivers/net/wireless/wl12xx/wl1251.h create mode 100644 drivers/net/wireless/wl12xx/wl12xx.h create mode 100644 drivers/net/wireless/wl12xx/wl12xx_80211.h create mode 100644 include/linux/spi/wl12xx.h (limited to 'include/linux') diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig index 2d8434f409b..91be3e7bf13 100644 --- a/drivers/net/wireless/Kconfig +++ b/drivers/net/wireless/Kconfig @@ -500,5 +500,6 @@ source "drivers/net/wireless/b43legacy/Kconfig" source "drivers/net/wireless/zd1211rw/Kconfig" source "drivers/net/wireless/rt2x00/Kconfig" source "drivers/net/wireless/orinoco/Kconfig" +source 
"drivers/net/wireless/wl12xx/Kconfig" endmenu diff --git a/drivers/net/wireless/Makefile b/drivers/net/wireless/Makefile index 0625e91b599..f2b1861e6bc 100644 --- a/drivers/net/wireless/Makefile +++ b/drivers/net/wireless/Makefile @@ -58,3 +58,5 @@ obj-$(CONFIG_P54_COMMON) += p54/ obj-$(CONFIG_ATH_COMMON) += ath/ obj-$(CONFIG_MAC80211_HWSIM) += mac80211_hwsim.o + +obj-$(CONFIG_WL12XX) += wl12xx/ diff --git a/drivers/net/wireless/wl12xx/Kconfig b/drivers/net/wireless/wl12xx/Kconfig new file mode 100644 index 00000000000..20a9633569f --- /dev/null +++ b/drivers/net/wireless/wl12xx/Kconfig @@ -0,0 +1,11 @@ +config WL12XX + tristate "TI wl1251/wl1271 support" + depends on MAC80211 && WLAN_80211 && SPI_MASTER && EXPERIMENTAL + select FW_LOADER + select CRC7 + ---help--- + This module adds support for wireless adapters based on + TI wl1251/wl1271 chipsets. + + If you choose to build a module, it'll be called wl12xx. Say N if + unsure. diff --git a/drivers/net/wireless/wl12xx/Makefile b/drivers/net/wireless/wl12xx/Makefile new file mode 100644 index 00000000000..d43de27dc54 --- /dev/null +++ b/drivers/net/wireless/wl12xx/Makefile @@ -0,0 +1,4 @@ +wl12xx-objs = main.o spi.o event.o tx.o rx.o \ + ps.o cmd.o acx.o boot.o init.o wl1251.o \ + debugfs.o +obj-$(CONFIG_WL12XX) += wl12xx.o diff --git a/drivers/net/wireless/wl12xx/acx.c b/drivers/net/wireless/wl12xx/acx.c new file mode 100644 index 00000000000..1cfd458ad5a --- /dev/null +++ b/drivers/net/wireless/wl12xx/acx.c @@ -0,0 +1,689 @@ +#include "acx.h" + +#include +#include +#include + +#include "wl12xx.h" +#include "wl12xx_80211.h" +#include "reg.h" +#include "spi.h" +#include "ps.h" + +int wl12xx_acx_frame_rates(struct wl12xx *wl, u8 ctrl_rate, u8 ctrl_mod, + u8 mgt_rate, u8 mgt_mod) +{ + int ret; + struct acx_fw_gen_frame_rates rates; + + wl12xx_debug(DEBUG_ACX, "acx frame rates"); + + rates.header.id = ACX_FW_GEN_FRAME_RATES; + rates.header.len = sizeof(struct acx_fw_gen_frame_rates) - + sizeof(struct acx_header); + + rates.tx_ctrl_frame_rate = ctrl_rate; + rates.tx_ctrl_frame_mod = ctrl_mod; + rates.tx_mgt_frame_rate = mgt_rate; + rates.tx_mgt_frame_mod = mgt_mod; + + ret = wl12xx_cmd_configure(wl, &rates, sizeof(rates)); + if (ret < 0) { + wl12xx_error("Failed to set FW rates and modulation"); + return ret; + } + + return 0; +} + + +int wl12xx_acx_station_id(struct wl12xx *wl) +{ + int ret, i; + struct dot11_station_id mac; + + wl12xx_debug(DEBUG_ACX, "acx dot11_station_id"); + + mac.header.id = DOT11_STATION_ID; + mac.header.len = sizeof(mac) - sizeof(struct acx_header); + + for (i = 0; i < ETH_ALEN; i++) + mac.mac[i] = wl->mac_addr[ETH_ALEN - 1 - i]; + + ret = wl12xx_cmd_configure(wl, &mac, sizeof(mac)); + if (ret < 0) + return ret; + + return 0; +} + +int wl12xx_acx_default_key(struct wl12xx *wl, u8 key_id) +{ + struct acx_dot11_default_key default_key; + int ret; + + wl12xx_debug(DEBUG_ACX, "acx dot11_default_key (%d)", key_id); + + default_key.header.id = DOT11_DEFAULT_KEY; + default_key.header.len = sizeof(default_key) - + sizeof(struct acx_header); + + default_key.id = key_id; + + ret = wl12xx_cmd_configure(wl, &default_key, sizeof(default_key)); + if (ret < 0) { + wl12xx_error("Couldnt set default key"); + return ret; + } + + wl->default_key = key_id; + + return 0; +} + +int wl12xx_acx_wake_up_conditions(struct wl12xx *wl, u8 listen_interval) +{ + struct acx_wake_up_condition wake_up; + + wl12xx_debug(DEBUG_ACX, "acx wake up conditions"); + + wake_up.header.id = ACX_WAKE_UP_CONDITIONS; + wake_up.header.len = sizeof(wake_up) - 
+		sizeof(struct acx_header);
+
+	wake_up.wake_up_event = WAKE_UP_EVENT_DTIM_BITMAP;
+	wake_up.listen_interval = listen_interval;
+
+	return wl12xx_cmd_configure(wl, &wake_up, sizeof(wake_up));
+}
+
+int wl12xx_acx_sleep_auth(struct wl12xx *wl, u8 sleep_auth)
+{
+	int ret;
+	struct acx_sleep_auth auth;
+
+	wl12xx_debug(DEBUG_ACX, "acx sleep auth");
+
+	auth.header.id = ACX_SLEEP_AUTH;
+	auth.header.len = sizeof(auth) - sizeof(struct acx_header);
+
+	auth.sleep_auth = sleep_auth;
+
+	ret = wl12xx_cmd_configure(wl, &auth, sizeof(auth));
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+int wl12xx_acx_fw_version(struct wl12xx *wl, char *buf, size_t len)
+{
+	struct wl12xx_command cmd;
+	struct acx_revision *rev;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx fw rev");
+
+	memset(&cmd, 0, sizeof(cmd));
+
+	ret = wl12xx_cmd_interrogate(wl, ACX_FW_REV, sizeof(*rev), &cmd);
+	if (ret < 0) {
+		wl12xx_warning("ACX_FW_REV interrogate failed");
+		return ret;
+	}
+
+	rev = (struct acx_revision *) &cmd.parameters;
+
+	/* be careful with the buffer sizes */
+	strncpy(buf, rev->fw_version, min(len, sizeof(rev->fw_version)));
+
+	/*
+	 * if the firmware version string is exactly
+	 * sizeof(rev->fw_version) long or fw_len is less than
+	 * sizeof(rev->fw_version) it won't be null terminated
+	 */
+	buf[min(len, sizeof(rev->fw_version)) - 1] = '\0';
+
+	return 0;
+}
+
+int wl12xx_acx_tx_power(struct wl12xx *wl, int power)
+{
+	struct acx_current_tx_power ie;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx dot11_cur_tx_pwr");
+
+	if (power < 0 || power > 25)
+		return -EINVAL;
+
+	memset(&ie, 0, sizeof(ie));
+
+	ie.header.id = DOT11_CUR_TX_PWR;
+	ie.header.len = sizeof(ie) - sizeof(struct acx_header);
+	ie.current_tx_power = power * 10;
+
+	ret = wl12xx_cmd_configure(wl, &ie, sizeof(ie));
+	if (ret < 0) {
+		wl12xx_warning("configure of tx power failed: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_feature_cfg(struct wl12xx *wl)
+{
+	struct acx_feature_config feature;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx feature cfg");
+
+	memset(&feature, 0, sizeof(feature));
+
+	feature.header.id = ACX_FEATURE_CFG;
+	feature.header.len = sizeof(feature) - sizeof(struct acx_header);
+
+	/* DF_ENCRYPTION_DISABLE and DF_SNIFF_MODE_ENABLE are disabled */
+	feature.data_flow_options = 0;
+	feature.options = 0;
+
+	ret = wl12xx_cmd_configure(wl, &feature, sizeof(feature));
+	if (ret < 0)
+		wl12xx_error("Couldn't set HW encryption");
+
+	return ret;
+}
+
+int wl12xx_acx_mem_map(struct wl12xx *wl, void *mem_map, size_t len)
+{
+	struct wl12xx_command cmd;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx mem map");
+
+	ret = wl12xx_cmd_interrogate(wl, ACX_MEM_MAP, len, &cmd);
+	if (ret < 0)
+		return ret;
+	else if (cmd.status != CMD_STATUS_SUCCESS)
+		return -EIO;
+
+	memcpy(mem_map, &cmd.parameters, len);
+
+	return 0;
+}
+
+int wl12xx_acx_data_path_params(struct wl12xx *wl,
+				struct acx_data_path_params_resp *data_path)
+{
+	struct acx_data_path_params params;
+	struct wl12xx_command cmd;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx data path params");
+
+	params.rx_packet_ring_chunk_size = DP_RX_PACKET_RING_CHUNK_SIZE;
+	params.tx_packet_ring_chunk_size = DP_TX_PACKET_RING_CHUNK_SIZE;
+
+	params.rx_packet_ring_chunk_num = DP_RX_PACKET_RING_CHUNK_NUM;
+	params.tx_packet_ring_chunk_num = DP_TX_PACKET_RING_CHUNK_NUM;
+
+	params.tx_complete_threshold = 1;
+
+	params.tx_complete_ring_depth = FW_TX_CMPLT_BLOCK_SIZE;
+
+	params.tx_complete_timeout = DP_TX_COMPLETE_TIME_OUT;
+
+	params.header.id = ACX_DATA_PATH_PARAMS;
+	params.header.len = sizeof(params) - sizeof(struct acx_header);
+
+	ret = wl12xx_cmd_configure(wl, &params, sizeof(params));
+	if (ret < 0)
+		return ret;
+
+
+	ret = wl12xx_cmd_interrogate(wl, ACX_DATA_PATH_PARAMS,
+				     sizeof(struct acx_data_path_params_resp),
+				     &cmd);
+
+	if (ret < 0) {
+		wl12xx_warning("failed to read data path parameters: %d", ret);
+		return ret;
+	} else if (cmd.status != CMD_STATUS_SUCCESS) {
+		wl12xx_warning("data path parameter acx status failed");
+		return -EIO;
+	}
+
+	memcpy(data_path, &cmd.parameters, sizeof(*data_path));
+
+	return 0;
+}
+
+int wl12xx_acx_rx_msdu_life_time(struct wl12xx *wl, u32 life_time)
+{
+	struct rx_msdu_lifetime msdu_lifetime;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx rx msdu life time");
+
+	msdu_lifetime.header.id = DOT11_RX_MSDU_LIFE_TIME;
+	msdu_lifetime.header.len = sizeof(msdu_lifetime) -
+		sizeof(struct acx_header);
+	msdu_lifetime.lifetime = life_time;
+
+	ret = wl12xx_cmd_configure(wl, &msdu_lifetime, sizeof(msdu_lifetime));
+	if (ret < 0) {
+		wl12xx_warning("failed to set rx msdu life time: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_rx_config(struct wl12xx *wl, u32 config, u32 filter)
+{
+	struct acx_rx_config rx_config;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx rx config");
+
+	rx_config.header.id = ACX_RX_CFG;
+	rx_config.header.len = sizeof(rx_config) - sizeof(struct acx_header);
+	rx_config.config_options = config;
+	rx_config.filter_options = filter;
+
+	ret = wl12xx_cmd_configure(wl, &rx_config, sizeof(rx_config));
+	if (ret < 0) {
+		wl12xx_warning("failed to set rx config: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_pd_threshold(struct wl12xx *wl)
+{
+	struct acx_packet_detection packet_detection;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx data pd threshold");
+
+	/* FIXME: threshold value not set */
+	packet_detection.header.id = ACX_PD_THRESHOLD;
+	packet_detection.header.len = sizeof(packet_detection) -
+		sizeof(struct acx_header);
+
+	ret = wl12xx_cmd_configure(wl, &packet_detection,
+				   sizeof(packet_detection));
+	if (ret < 0) {
+		wl12xx_warning("failed to set pd threshold: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_slot(struct wl12xx *wl, enum acx_slot_type slot_time)
+{
+	struct acx_slot slot;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx slot");
+
+	slot.header.id = ACX_SLOT;
+	slot.header.len = sizeof(slot) - sizeof(struct acx_header);
+
+	slot.wone_index = STATION_WONE_INDEX;
+	slot.slot_time = slot_time;
+
+	ret = wl12xx_cmd_configure(wl, &slot, sizeof(slot));
+	if (ret < 0) {
+		wl12xx_warning("failed to set slot time: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_group_address_tbl(struct wl12xx *wl)
+{
+	struct multicast_grp_addr_start multicast;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx group address tbl");
+
+	/* MAC filtering */
+	multicast.header.id = DOT11_GROUP_ADDRESS_TBL;
+	multicast.header.len = sizeof(multicast) - sizeof(struct acx_header);
+
+	multicast.enabled = 0;
+	multicast.num_groups = 0;
+	memset(multicast.mac_table, 0, ADDRESS_GROUP_MAX_LEN);
+
+	ret = wl12xx_cmd_configure(wl, &multicast, sizeof(multicast));
+	if (ret < 0) {
+		wl12xx_warning("failed to set group addr table: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_service_period_timeout(struct wl12xx *wl)
+{
+	struct acx_rx_timeout rx_timeout;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx service period timeout");
+
+	/* RX timeout */
+	rx_timeout.header.id = ACX_SERVICE_PERIOD_TIMEOUT;
+	rx_timeout.header.len = sizeof(rx_timeout) -
+		sizeof(struct acx_header);
+
+	rx_timeout.ps_poll_timeout = RX_TIMEOUT_PS_POLL_DEF;
+	rx_timeout.upsd_timeout = RX_TIMEOUT_UPSD_DEF;
+
+	ret = wl12xx_cmd_configure(wl, &rx_timeout, sizeof(rx_timeout));
+	if (ret < 0) {
+		wl12xx_warning("failed to set service period timeout: %d",
+			       ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_rts_threshold(struct wl12xx *wl, u16 rts_threshold)
+{
+	struct acx_rts_threshold rts;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx rts threshold");
+
+	rts.header.id = DOT11_RTS_THRESHOLD;
+	rts.header.len = sizeof(rts) - sizeof(struct acx_header);
+
+	rts.threshold = rts_threshold;
+
+	ret = wl12xx_cmd_configure(wl, &rts, sizeof(rts));
+	if (ret < 0) {
+		wl12xx_warning("failed to set rts threshold: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_beacon_filter_opt(struct wl12xx *wl)
+{
+	struct acx_beacon_filter_option beacon_filter;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx beacon filter opt");
+
+	beacon_filter.header.id = ACX_BEACON_FILTER_OPT;
+	beacon_filter.header.len = sizeof(beacon_filter) -
+		sizeof(struct acx_header);
+
+	beacon_filter.enable = 0;
+	beacon_filter.max_num_beacons = 0;
+
+	ret = wl12xx_cmd_configure(wl, &beacon_filter, sizeof(beacon_filter));
+	if (ret < 0) {
+		wl12xx_warning("failed to set beacon filter opt: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_beacon_filter_table(struct wl12xx *wl)
+{
+	struct acx_beacon_filter_ie_table ie_table;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx beacon filter table");
+
+	ie_table.header.id = ACX_BEACON_FILTER_TABLE;
+	ie_table.header.len = sizeof(ie_table) - sizeof(struct acx_header);
+
+	ie_table.num_ie = 0;
+	memset(ie_table.table, 0, BEACON_FILTER_TABLE_MAX_SIZE);
+
+	ret = wl12xx_cmd_configure(wl, &ie_table, sizeof(ie_table));
+	if (ret < 0) {
+		wl12xx_warning("failed to set beacon filter table: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_sg_enable(struct wl12xx *wl)
+{
+	struct acx_bt_wlan_coex pta;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx sg enable");
+
+	pta.header.id = ACX_SG_ENABLE;
+	pta.header.len = sizeof(pta) - sizeof(struct acx_header);
+
+	pta.enable = SG_ENABLE;
+
+	ret = wl12xx_cmd_configure(wl, &pta, sizeof(pta));
+	if (ret < 0) {
+		wl12xx_warning("failed to set softgemini enable: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_sg_cfg(struct wl12xx *wl)
+{
+	struct acx_bt_wlan_coex_param param;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx sg cfg");
+
+	/* BT-WLAN coexistence parameters */
+	param.header.id = ACX_SG_CFG;
+	param.header.len = sizeof(param) - sizeof(struct acx_header);
+
+	param.min_rate = RATE_INDEX_24MBPS;
+	param.bt_hp_max_time = PTA_BT_HP_MAXTIME_DEF;
+	param.wlan_hp_max_time = PTA_WLAN_HP_MAX_TIME_DEF;
+	param.sense_disable_timer = PTA_SENSE_DISABLE_TIMER_DEF;
+	param.rx_time_bt_hp = PTA_PROTECTIVE_RX_TIME_DEF;
+	param.tx_time_bt_hp = PTA_PROTECTIVE_TX_TIME_DEF;
+	param.rx_time_bt_hp_fast = PTA_PROTECTIVE_RX_TIME_FAST_DEF;
+	param.tx_time_bt_hp_fast = PTA_PROTECTIVE_TX_TIME_FAST_DEF;
+	param.wlan_cycle_fast = PTA_CYCLE_TIME_FAST_DEF;
+	param.bt_anti_starvation_period = PTA_ANTI_STARVE_PERIOD_DEF;
+	param.next_bt_lp_packet = PTA_TIMEOUT_NEXT_BT_LP_PACKET_DEF;
+	param.wake_up_beacon = PTA_TIME_BEFORE_BEACON_DEF;
+	param.hp_dm_max_guard_time = PTA_HPDM_MAX_TIME_DEF;
+	param.next_wlan_packet = PTA_TIME_OUT_NEXT_WLAN_DEF;
+	param.antenna_type = PTA_ANTENNA_TYPE_DEF;
+	param.signal_type = PTA_SIGNALING_TYPE_DEF;
+	param.afh_leverage_on = PTA_AFH_LEVERAGE_ON_DEF;
+	param.quiet_cycle_num = PTA_NUMBER_QUIET_CYCLE_DEF;
+	param.max_cts = PTA_MAX_NUM_CTS_DEF;
+	param.wlan_packets_num = PTA_NUMBER_OF_WLAN_PACKETS_DEF;
+	param.bt_packets_num = PTA_NUMBER_OF_BT_PACKETS_DEF;
+	param.missed_rx_avalanche = PTA_RX_FOR_AVALANCHE_DEF;
+	param.wlan_elp_hp = PTA_ELP_HP_DEF;
+	param.bt_anti_starvation_cycles = PTA_ANTI_STARVE_NUM_CYCLE_DEF;
+	param.ack_mode_dual_ant = PTA_ACK_MODE_DEF;
+	param.pa_sd_enable = PTA_ALLOW_PA_SD_DEF;
+	param.pta_auto_mode_enable = PTA_AUTO_MODE_NO_CTS_DEF;
+	param.bt_hp_respected_num = PTA_BT_HP_RESPECTED_DEF;
+
+	ret = wl12xx_cmd_configure(wl, &param, sizeof(param));
+	if (ret < 0) {
+		wl12xx_warning("failed to set sg config: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_cca_threshold(struct wl12xx *wl)
+{
+	struct acx_energy_detection detection;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx cca threshold");
+
+	detection.header.id = ACX_CCA_THRESHOLD;
+	detection.header.len = sizeof(detection) - sizeof(struct acx_header);
+
+	detection.rx_cca_threshold = CCA_THRSH_DISABLE_ENERGY_D;
+	detection.tx_energy_detection = 0;
+
+	ret = wl12xx_cmd_configure(wl, &detection, sizeof(detection));
+	if (ret < 0) {
+		wl12xx_warning("failed to set cca threshold: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_bcn_dtim_options(struct wl12xx *wl)
+{
+	struct acx_beacon_broadcast bb;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx bcn dtim options");
+
+	bb.header.id = ACX_BCN_DTIM_OPTIONS;
+	bb.header.len = sizeof(bb) - sizeof(struct acx_header);
+
+	bb.beacon_rx_timeout = BCN_RX_TIMEOUT_DEF_VALUE;
+	bb.broadcast_timeout = BROADCAST_RX_TIMEOUT_DEF_VALUE;
+	bb.rx_broadcast_in_ps = RX_BROADCAST_IN_PS_DEF_VALUE;
+	bb.ps_poll_threshold = CONSECUTIVE_PS_POLL_FAILURE_DEF;
+
+	ret = wl12xx_cmd_configure(wl, &bb, sizeof(bb));
+	if (ret < 0) {
+		wl12xx_warning("failed to set bcn dtim options: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_aid(struct wl12xx *wl, u16 aid)
+{
+	struct acx_aid acx_aid;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx aid");
+
+	acx_aid.header.id = ACX_AID;
+	acx_aid.header.len = sizeof(acx_aid) - sizeof(struct acx_header);
+
+	acx_aid.aid = aid;
+
+	ret = wl12xx_cmd_configure(wl, &acx_aid, sizeof(acx_aid));
+	if (ret < 0) {
+		wl12xx_warning("failed to set aid: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_event_mbox_mask(struct wl12xx *wl, u32 event_mask)
+{
+	struct acx_event_mask mask;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx event mbox mask");
+
+	mask.header.id = ACX_EVENT_MBOX_MASK;
+	mask.header.len = sizeof(mask) - sizeof(struct acx_header);
+
+	/* high event mask is unused */
+	mask.high_event_mask = 0xffffffff;
+
+	mask.event_mask = event_mask;
+
+	ret = wl12xx_cmd_configure(wl, &mask, sizeof(mask));
+	if (ret < 0) {
+		wl12xx_warning("failed to set event mbox mask: %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int wl12xx_acx_set_preamble(struct wl12xx *wl, enum acx_preamble_type preamble)
+{
+	struct acx_preamble ie;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx_set_preamble");
+
+	memset(&ie, 0, sizeof(ie));
+
+	ie.header.id = ACX_PREAMBLE_TYPE;
+	ie.header.len = sizeof(ie) - sizeof(struct acx_header);
+	ie.preamble = preamble;
+	ret = wl12xx_cmd_configure(wl, &ie, sizeof(ie));
+	if (ret < 0) {
+		wl12xx_warning("Setting of preamble failed: %d", ret);
+		return ret;
+	}
+	return 0;
+}
+
+int wl12xx_acx_cts_protect(struct wl12xx *wl,
+			   enum acx_ctsprotect_type ctsprotect)
+{
+	struct acx_ctsprotect ie;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx_set_ctsprotect");
+
+	memset(&ie, 0, sizeof(ie));
+
+	ie.header.id = ACX_CTS_PROTECTION;
+	ie.header.len = sizeof(ie) - sizeof(struct acx_header);
+	ie.ctsprotect = ctsprotect;
+	ret = wl12xx_cmd_configure(wl, &ie, sizeof(ie));
+	if (ret < 0) {
+		wl12xx_warning("Setting of ctsprotect failed: %d", ret);
+		return ret;
+	}
+	return 0;
+}
+
+int wl12xx_acx_statistics(struct wl12xx *wl, struct acx_statistics *stats)
+{
+	struct wl12xx_command *answer;
+	int ret;
+
+	wl12xx_debug(DEBUG_ACX, "acx statistics");
+
+	answer = kmalloc(sizeof(*answer), GFP_KERNEL);
+	if (!answer) {
+		wl12xx_warning("could not allocate memory for acx statistics");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = wl12xx_cmd_interrogate(wl, ACX_STATISTICS, sizeof(*answer),
+				     answer);
+	if (ret < 0) {
+		wl12xx_warning("acx statistics failed: %d", ret);
+		goto out;
+	}
+
+	memcpy(stats, answer->parameters, sizeof(*stats));
+
+out:
+	kfree(answer);
+	return ret;
+}
diff --git a/drivers/net/wireless/wl12xx/acx.h b/drivers/net/wireless/wl12xx/acx.h
new file mode 100644
index 00000000000..fb2d2340993
--- /dev/null
+++ b/drivers/net/wireless/wl12xx/acx.h
@@ -0,0 +1,1245 @@
+/*
+ * This file is part of wl12xx
+ *
+ * Copyright (c) 1998-2007 Texas Instruments Incorporated
+ * Copyright (C) 2008 Nokia Corporation
+ *
+ * Contact: Kalle Valo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#ifndef __WL12XX_ACX_H__
+#define __WL12XX_ACX_H__
+
+#include "wl12xx.h"
+
+/* Target's information element */
+struct acx_header {
+	u16 id;
+	u16 len;
+};
+
+struct acx_error_counter {
+	struct acx_header header;
+
+	/* The number of PLCP errors since the last time this */
+	/* information element was interrogated. This field is */
+	/* automatically cleared when it is interrogated.*/
+	u32 PLCP_error;
+
+	/* The number of FCS errors since the last time this */
+	/* information element was interrogated. This field is */
+	/* automatically cleared when it is interrogated.*/
+	u32 FCS_error;
+
+	/* The number of MPDUs without PLCP header errors received*/
+	/* since the last time this information element was interrogated. */
+	/* This field is automatically cleared when it is interrogated.*/
+	u32 valid_frame;
+
+	/* the number of missed sequence numbers in the sequential */
+	/* values of frame sequence numbers */
+	u32 seq_num_miss;
+} __attribute__ ((packed));
+
+struct acx_revision {
+	struct acx_header header;
+
+	/*
+	 * The WiLink firmware version, an ASCII string x.x.x.x,
+	 * that uniquely identifies the current firmware.
+	 * The left most digit is incremented each time a
+	 * significant change is made to the firmware, such as
+	 * code redesign or new platform support.
+	 * The second digit is incremented when major enhancements
+	 * are added or major fixes are made.
+	 * The third digit is incremented for each GA release.
+	 * The fourth digit is incremented for each build.
+	 * The first two digits identify a firmware release version,
+	 * in other words, a unique set of features.
+ * The first three digits identify a GA release. + */ + char fw_version[20]; + + /* + * This 4 byte field specifies the WiLink hardware version. + * bits 0 - 15: Reserved. + * bits 16 - 23: Version ID - The WiLink version ID + * (1 = first spin, 2 = second spin, and so on). + * bits 24 - 31: Chip ID - The WiLink chip ID. + */ + u32 hw_version; +} __attribute__ ((packed)); + +enum wl12xx_psm_mode { + /* Active mode */ + WL12XX_PSM_CAM = 0, + + /* Power save mode */ + WL12XX_PSM_PS = 1, + + /* Extreme low power */ + WL12XX_PSM_ELP = 2, +}; + +struct acx_sleep_auth { + struct acx_header header; + + /* The sleep level authorization of the device. */ + /* 0 - Always active*/ + /* 1 - Power down mode: light / fast sleep*/ + /* 2 - ELP mode: Deep / Max sleep*/ + u8 sleep_auth; + u8 padding[3]; +} __attribute__ ((packed)); + +#define TIM_ELE_ID 5 +#define PARTIAL_VBM_MAX 251 + +struct tim { + u8 identity; + u8 length; + u8 dtim_count; + u8 dtim_period; + u8 bitmap_ctrl; + u8 pvb_field[PARTIAL_VBM_MAX]; /* Partial Virtual Bitmap */ +} __attribute__ ((packed)); + +/* Virtual Bit Map update */ +struct vbm_update_request { + __le16 len; + u8 padding[2]; + struct tim tim; +} __attribute__ ((packed)); + +enum { + HOSTIF_PCI_MASTER_HOST_INDIRECT, + HOSTIF_PCI_MASTER_HOST_DIRECT, + HOSTIF_SLAVE, + HOSTIF_PKT_RING, + HOSTIF_DONTCARE = 0xFF +}; + +#define DEFAULT_UCAST_PRIORITY 0 +#define DEFAULT_RX_Q_PRIORITY 0 +#define DEFAULT_NUM_STATIONS 1 +#define DEFAULT_RXQ_PRIORITY 0 /* low 0 .. 15 high */ +#define DEFAULT_RXQ_TYPE 0x07 /* All frames, Data/Ctrl/Mgmt */ +#define TRACE_BUFFER_MAX_SIZE 256 + +#define DP_RX_PACKET_RING_CHUNK_SIZE 1600 +#define DP_TX_PACKET_RING_CHUNK_SIZE 1600 +#define DP_RX_PACKET_RING_CHUNK_NUM 2 +#define DP_TX_PACKET_RING_CHUNK_NUM 2 +#define DP_TX_COMPLETE_TIME_OUT 20 +#define FW_TX_CMPLT_BLOCK_SIZE 16 + +struct acx_data_path_params { + struct acx_header header; + + u16 rx_packet_ring_chunk_size; + u16 tx_packet_ring_chunk_size; + + u8 rx_packet_ring_chunk_num; + u8 tx_packet_ring_chunk_num; + + /* + * Maximum number of packets that can be gathered + * in the TX complete ring before an interrupt + * is generated. + */ + u8 tx_complete_threshold; + + /* Number of pending TX complete entries in cyclic ring.*/ + u8 tx_complete_ring_depth; + + /* + * Max num microseconds since a packet enters the TX + * complete ring until an interrupt is generated. + */ + u32 tx_complete_timeout; +} __attribute__ ((packed)); + + +struct acx_data_path_params_resp { + struct acx_header header; + + u16 rx_packet_ring_chunk_size; + u16 tx_packet_ring_chunk_size; + + u8 rx_packet_ring_chunk_num; + u8 tx_packet_ring_chunk_num; + + u8 pad[2]; + + u32 rx_packet_ring_addr; + u32 tx_packet_ring_addr; + + u32 rx_control_addr; + u32 tx_control_addr; + + u32 tx_complete_addr; +} __attribute__ ((packed)); + +#define TX_MSDU_LIFETIME_MIN 0 +#define TX_MSDU_LIFETIME_MAX 3000 +#define TX_MSDU_LIFETIME_DEF 512 +#define RX_MSDU_LIFETIME_MIN 0 +#define RX_MSDU_LIFETIME_MAX 0xFFFFFFFF +#define RX_MSDU_LIFETIME_DEF 512000 + +struct rx_msdu_lifetime { + struct acx_header header; + + /* + * The maximum amount of time, in TU, before the + * firmware discards the MSDU. + */ + u32 lifetime; +} __attribute__ ((packed)); + +/* + * RX Config Options Table + * Bit Definition + * === ========== + * 31:14 Reserved + * 13 Copy RX Status - when set, write three receive status words + * to top of rx'd MPDUs. 
+ *	When cleared, do not write three status words (added rev 1.5)
+ *	12	Reserved
+ *	11	RX Complete upon FCS error - when set, give rx complete
+ *	interrupt for FCS errors, after the rx filtering, e.g. unicast
+ *	frames not to us with FCS error will not generate an interrupt.
+ *	10	SSID Filter Enable - When set, the WiLink discards all beacon,
+ *	        probe request, and probe response frames with an SSID that does
+ *	        not match the SSID specified by the host in the START/JOIN
+ *	        command.
+ *	        When clear, the WiLink receives frames with any SSID.
+ *	9	Broadcast Filter Enable - When set, the WiLink discards all
+ *	        broadcast frames. When clear, the WiLink receives all received
+ *	        broadcast frames.
+ *	8:6	Reserved
+ *	5	BSSID Filter Enable - When set, the WiLink discards any frames
+ *	        with a BSSID that does not match the BSSID specified by the
+ *	        host.
+ *	        When clear, the WiLink receives frames from any BSSID.
+ *	4	MAC Addr Filter - When set, the WiLink discards any frames
+ *	        with a destination address that does not match the MAC address
+ *	        of the adaptor.
+ *	        When clear, the WiLink receives frames destined to any MAC
+ *	        address.
+ *	3	Promiscuous - When set, the WiLink receives all valid frames
+ *	        (i.e., all frames that pass the FCS check).
+ *	        When clear, only frames that pass the other filters specified
+ *	        are received.
+ *	2	FCS - When set, the WiLink includes the FCS with the received
+ *	        frame.
+ *	        When cleared, the FCS is discarded.
+ *	1	PLCP header - When set, write all data from baseband to frame
+ *	        buffer including PHY header.
+ *	0	Reserved - Always equal to 0.
+ *
+ * RX Filter Options Table
+ * Bit		Definition
+ * ===		==========
+ * 31:12	Reserved - Always equal to 0.
+ *	11	Association - When set, the WiLink receives all association
+ *	        related frames (association request/response, reassociation
+ *	        request/response, and disassociation). When clear, these frames
+ *	        are discarded.
+ *	10	Auth/De-auth - When set, the WiLink receives all authentication
+ *	        and de-authentication frames. When clear, these frames are
+ *	        discarded.
+ *	9	Beacon - When set, the WiLink receives all beacon frames.
+ *	        When clear, these frames are discarded.
+ *	8	Contention Free - When set, the WiLink receives all contention
+ *	        free frames.
+ *	        When clear, these frames are discarded.
+ *	7	Control - When set, the WiLink receives all control frames.
+ *	        When clear, these frames are discarded.
+ *	6	Data - When set, the WiLink receives all data frames.
+ *	        When clear, these frames are discarded.
+ *	5	FCS Error - When set, the WiLink receives frames that have FCS
+ *	        errors.
+ *	        When clear, these frames are discarded.
+ *	4	Management - When set, the WiLink receives all management
+ *	        frames.
+ *	        When clear, these frames are discarded.
+ *	3	Probe Request - When set, the WiLink receives all probe request
+ *	        frames.
+ *	        When clear, these frames are discarded.
+ *	2	Probe Response - When set, the WiLink receives all probe
+ *	        response frames.
+ *	        When clear, these frames are discarded.
+ *	1	RTS/CTS/ACK - When set, the WiLink receives all RTS, CTS and ACK
+ *	        frames.
+ *	        When clear, these frames are discarded.
+ *	0	Rsvd Type/Sub Type - When set, the WiLink receives all frames
+ *	        that have reserved frame types and sub types as defined by the
+ *	        802.11 specification.
+ *	        When clear, these frames are discarded.
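+ *
+ * Example (illustrative only; the bit choices below are assumptions
+ * derived from the tables above, not values taken from TI firmware
+ * documentation): to filter on BSSID while receiving beacon, probe
+ * response and data frames, a caller could pass
+ *	config = CFG_BSSID_FILTER_EN
+ *	filter = CFG_RX_BCN_EN | CFG_RX_PRSP_EN | CFG_RX_DATA_EN
+ * to wl12xx_acx_rx_config().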
+ */
+struct acx_rx_config {
+	struct acx_header header;
+
+	u32 config_options;
+	u32 filter_options;
+} __attribute__ ((packed));
+
+enum {
+	QOS_AC_BE = 0,
+	QOS_AC_BK,
+	QOS_AC_VI,
+	QOS_AC_VO,
+	QOS_HIGHEST_AC_INDEX = QOS_AC_VO,
+};
+
+#define MAX_NUM_OF_AC		(QOS_HIGHEST_AC_INDEX+1)
+#define FIRST_AC_INDEX		QOS_AC_BE
+#define MAX_NUM_OF_802_1d_TAGS	8
+#define AC_PARAMS_MAX_TSID	15
+#define MAX_APSD_CONF		0xffff
+
+#define QOS_TX_HIGH_MIN		(0)
+#define QOS_TX_HIGH_MAX		(100)
+
+#define QOS_TX_HIGH_BK_DEF	(25)
+#define QOS_TX_HIGH_BE_DEF	(35)
+#define QOS_TX_HIGH_VI_DEF	(35)
+#define QOS_TX_HIGH_VO_DEF	(35)
+
+#define QOS_TX_LOW_BK_DEF	(15)
+#define QOS_TX_LOW_BE_DEF	(25)
+#define QOS_TX_LOW_VI_DEF	(25)
+#define QOS_TX_LOW_VO_DEF	(25)
+
+struct acx_tx_queue_qos_config {
+	struct acx_header header;
+
+	u8 qid;
+	u8 pad[3];
+
+	/* Max number of blocks allowed in the queue */
+	u16 high_threshold;
+
+	/* Lowest memory blocks guaranteed for this queue */
+	u16 low_threshold;
+} __attribute__ ((packed));
+
+struct acx_packet_detection {
+	struct acx_header header;
+
+	u32 threshold;
+} __attribute__ ((packed));
+
+
+enum acx_slot_type {
+	SLOT_TIME_LONG = 0,
+	SLOT_TIME_SHORT = 1,
+	DEFAULT_SLOT_TIME = SLOT_TIME_SHORT,
+	MAX_SLOT_TIMES = 0xFF
+};
+
+#define STATION_WONE_INDEX	0
+
+struct acx_slot {
+	struct acx_header header;
+
+	u8 wone_index; /* Reserved */
+	u8 slot_time;
+	u8 reserved[6];
+} __attribute__ ((packed));
+
+
+#define ADDRESS_GROUP_MAX	(8)
+#define ADDRESS_GROUP_MAX_LEN	(ETH_ALEN * ADDRESS_GROUP_MAX)
+
+struct multicast_grp_addr_start {
+	struct acx_header header;
+
+	u8 enabled;
+	u8 num_groups;
+	u8 pad[2];
+	u8 mac_table[ADDRESS_GROUP_MAX_LEN];
+} __attribute__ ((packed));
+
+
+#define RX_TIMEOUT_PS_POLL_MIN	0
+#define RX_TIMEOUT_PS_POLL_MAX	(200000)
+#define RX_TIMEOUT_PS_POLL_DEF	(15)
+#define RX_TIMEOUT_UPSD_MIN	0
+#define RX_TIMEOUT_UPSD_MAX	(200000)
+#define RX_TIMEOUT_UPSD_DEF	(15)
+
+struct acx_rx_timeout {
+	struct acx_header header;
+
+	/*
+	 * The longest time the STA will wait to receive
+	 * traffic from the AP after a PS-poll has been
+	 * transmitted.
+	 */
+	u16 ps_poll_timeout;
+
+	/*
+	 * The longest time the STA will wait to receive
+	 * traffic from the AP after a frame has been sent
+	 * from an UPSD enabled queue.
+	 */
+	u16 upsd_timeout;
+} __attribute__ ((packed));
+
+#define RTS_THRESHOLD_MIN	0
+#define RTS_THRESHOLD_MAX	4096
+#define RTS_THRESHOLD_DEF	2347
+
+struct acx_rts_threshold {
+	struct acx_header header;
+
+	u16 threshold;
+	u8 pad[2];
+} __attribute__ ((packed));
+
+struct acx_beacon_filter_option {
+	struct acx_header header;
+
+	u8 enable;
+
+	/*
+	 * The number of beacons without the unicast TIM
+	 * bit set that the firmware buffers before
+	 * signaling the host about ready frames.
+	 * When set to 0 and the filter is enabled, beacons
+	 * without the unicast TIM bit set are dropped.
+	 */
+	u8 max_num_beacons;
+	u8 pad[2];
+} __attribute__ ((packed));
+
+/*
+ * ACXBeaconFilterEntry (not 221)
+ * Byte Offset     Size (Bytes)    Definition
+ * ===========     ============    ==========
+ * 0               1               IE identifier
+ * 1               1               Treatment bit mask
+ *
+ * ACXBeaconFilterEntry (221)
+ * Byte Offset     Size (Bytes)    Definition
+ * ===========     ============    ==========
+ * 0               1               IE identifier
+ * 1               1               Treatment bit mask
+ * 2               3               OUI
+ * 5               1               Type
+ * 6               2               Version
+ *
+ *
+ * Treatment bit mask - The information element handling:
+ * bit 0 - The information element is compared and transferred
+ * in case of change.
+ * bit 1 - The information element is transferred to the host
+ * with each appearance or disappearance.
+ * Note that both bits can be set at the same time.
+ */
+#define	BEACON_FILTER_TABLE_MAX_IE_NUM			(32)
+#define BEACON_FILTER_TABLE_MAX_VENDOR_SPECIFIC_IE_NUM	(6)
+#define BEACON_FILTER_TABLE_IE_ENTRY_SIZE		(2)
+#define BEACON_FILTER_TABLE_EXTRA_VENDOR_SPECIFIC_IE_SIZE (6)
+#define BEACON_FILTER_TABLE_MAX_SIZE ((BEACON_FILTER_TABLE_MAX_IE_NUM * \
+			    BEACON_FILTER_TABLE_IE_ENTRY_SIZE) + \
+			   (BEACON_FILTER_TABLE_MAX_VENDOR_SPECIFIC_IE_NUM * \
+			    BEACON_FILTER_TABLE_EXTRA_VENDOR_SPECIFIC_IE_SIZE))
+
+struct acx_beacon_filter_ie_table {
+	struct acx_header header;
+
+	u8 num_ie;
+	u8 table[BEACON_FILTER_TABLE_MAX_SIZE];
+	u8 pad[3];
+} __attribute__ ((packed));
+
+enum {
+	SG_ENABLE = 0,
+	SG_DISABLE,
+	SG_SENSE_NO_ACTIVITY,
+	SG_SENSE_ACTIVE
+};
+
+struct acx_bt_wlan_coex {
+	struct acx_header header;
+
+	/*
+	 * 0 -> PTA enabled
+	 * 1 -> PTA disabled
+	 * 2 -> sense no active mode, i.e.
+	 *      an interrupt is sent upon
+	 *      BT activity.
+	 * 3 -> PTA is switched on in response
+	 *      to the interrupt sending.
+	 */
+	u8 enable;
+	u8 pad[3];
+} __attribute__ ((packed));
+
+#define PTA_ANTENNA_TYPE_DEF		(0)
+#define PTA_BT_HP_MAXTIME_DEF		(2000)
+#define PTA_WLAN_HP_MAX_TIME_DEF	(5000)
+#define PTA_SENSE_DISABLE_TIMER_DEF	(1350)
+#define PTA_PROTECTIVE_RX_TIME_DEF	(1500)
+#define PTA_PROTECTIVE_TX_TIME_DEF	(1500)
+#define PTA_TIMEOUT_NEXT_BT_LP_PACKET_DEF (3000)
+#define PTA_SIGNALING_TYPE_DEF		(1)
+#define PTA_AFH_LEVERAGE_ON_DEF		(0)
+#define PTA_NUMBER_QUIET_CYCLE_DEF	(0)
+#define PTA_MAX_NUM_CTS_DEF		(3)
+#define PTA_NUMBER_OF_WLAN_PACKETS_DEF	(2)
+#define PTA_NUMBER_OF_BT_PACKETS_DEF	(2)
+#define PTA_PROTECTIVE_RX_TIME_FAST_DEF	(1500)
+#define PTA_PROTECTIVE_TX_TIME_FAST_DEF	(3000)
+#define PTA_CYCLE_TIME_FAST_DEF		(8700)
+#define PTA_RX_FOR_AVALANCHE_DEF	(5)
+#define PTA_ELP_HP_DEF			(0)
+#define PTA_ANTI_STARVE_PERIOD_DEF	(500)
+#define PTA_ANTI_STARVE_NUM_CYCLE_DEF	(4)
+#define PTA_ALLOW_PA_SD_DEF		(1)
+#define PTA_TIME_BEFORE_BEACON_DEF	(6300)
+#define PTA_HPDM_MAX_TIME_DEF		(1600)
+#define PTA_TIME_OUT_NEXT_WLAN_DEF	(2550)
+#define PTA_AUTO_MODE_NO_CTS_DEF	(0)
+#define PTA_BT_HP_RESPECTED_DEF		(3)
+#define PTA_WLAN_RX_MIN_RATE_DEF	(24)
+#define PTA_ACK_MODE_DEF		(1)
+
+struct acx_bt_wlan_coex_param {
+	struct acx_header header;
+
+	/*
+	 * The minimum rate of a received WLAN packet in the STA,
+	 * during protective mode, of which a new BT-HP request
+	 * during this Rx will always be respected and gain the antenna.
+	 */
+	u32 min_rate;
+
+	/* Max time the BT HP will be respected. */
+	u16 bt_hp_max_time;
+
+	/* Max time the WLAN HP will be respected. */
+	u16 wlan_hp_max_time;
+
+	/*
+	 * The time between the last BT activity
+	 * and the moment when the sense mode returns
+	 * to SENSE_INACTIVE.
+	 */
+	u16 sense_disable_timer;
+
+	/* Time before the next BT HP instance */
+	u16 rx_time_bt_hp;
+	u16 tx_time_bt_hp;
+
+	/* range: 10-20000    default: 1500 */
+	u16 rx_time_bt_hp_fast;
+	u16 tx_time_bt_hp_fast;
+
+	/* range: 2000-65535  default: 8700 */
+	u16 wlan_cycle_fast;
+
+	/* range: 0 - 15000 (Msec) default: 1000 */
+	u16 bt_anti_starvation_period;
+
+	/* range 400-10000(Usec) default: 3000 */
+	u16 next_bt_lp_packet;
+
+	/* Default: worst case for BT DH5 traffic */
+	u16 wake_up_beacon;
+
+	/* range: 0-50000(Usec) default: 1050 */
+	u16 hp_dm_max_guard_time;
+
+	/*
+	 * This is to prevent both BT & WLAN antenna
+	 * starvation.
+ * Range: 100-50000(Usec) default:2550 + */ + u16 next_wlan_packet; + + /* 0 -> shared antenna */ + u8 antenna_type; + + /* + * 0 -> TI legacy + * 1 -> Palau + */ + u8 signal_type; + + /* + * BT AFH status + * 0 -> no AFH + * 1 -> from dedicated GPIO + * 2 -> AFH on (from host) + */ + u8 afh_leverage_on; + + /* + * The number of cycles during which no + * TX will be sent after 1 cycle of RX + * transaction in protective mode + */ + u8 quiet_cycle_num; + + /* + * The maximum number of CTSs that will + * be sent for receiving RX packet in + * protective mode + */ + u8 max_cts; + + /* + * The number of WLAN packets + * transferred in common mode before + * switching to BT. + */ + u8 wlan_packets_num; + + /* + * The number of BT packets + * transferred in common mode before + * switching to WLAN. + */ + u8 bt_packets_num; + + /* range: 1-255 default: 5 */ + u8 missed_rx_avalanche; + + /* range: 0-1 default: 1 */ + u8 wlan_elp_hp; + + /* range: 0 - 15 default: 4 */ + u8 bt_anti_starvation_cycles; + + u8 ack_mode_dual_ant; + + /* + * Allow PA_SD assertion/de-assertion + * during enabled BT activity. + */ + u8 pa_sd_enable; + + /* + * Enable/Disable PTA in auto mode: + * Support Both Active & P.S modes + */ + u8 pta_auto_mode_enable; + + /* range: 0 - 20 default: 1 */ + u8 bt_hp_respected_num; +} __attribute__ ((packed)); + +#define CCA_THRSH_ENABLE_ENERGY_D 0x140A +#define CCA_THRSH_DISABLE_ENERGY_D 0xFFEF + +struct acx_energy_detection { + struct acx_header header; + + /* The RX Clear Channel Assessment threshold in the PHY */ + u16 rx_cca_threshold; + u8 tx_energy_detection; + u8 pad; +} __attribute__ ((packed)); + +#define BCN_RX_TIMEOUT_DEF_VALUE 10000 +#define BROADCAST_RX_TIMEOUT_DEF_VALUE 20000 +#define RX_BROADCAST_IN_PS_DEF_VALUE 1 +#define CONSECUTIVE_PS_POLL_FAILURE_DEF 4 + +struct acx_beacon_broadcast { + struct acx_header header; + + u16 beacon_rx_timeout; + u16 broadcast_timeout; + + /* Enables receiving of broadcast packets in PS mode */ + u8 rx_broadcast_in_ps; + + /* Consecutive PS Poll failures before updating the host */ + u8 ps_poll_threshold; + u8 pad[2]; +} __attribute__ ((packed)); + +struct acx_event_mask { + struct acx_header header; + + u32 event_mask; + u32 high_event_mask; /* Unused */ +} __attribute__ ((packed)); + +#define CFG_RX_FCS BIT(2) +#define CFG_RX_ALL_GOOD BIT(3) +#define CFG_UNI_FILTER_EN BIT(4) +#define CFG_BSSID_FILTER_EN BIT(5) +#define CFG_MC_FILTER_EN BIT(6) +#define CFG_MC_ADDR0_EN BIT(7) +#define CFG_MC_ADDR1_EN BIT(8) +#define CFG_BC_REJECT_EN BIT(9) +#define CFG_SSID_FILTER_EN BIT(10) +#define CFG_RX_INT_FCS_ERROR BIT(11) +#define CFG_RX_INT_ENCRYPTED BIT(12) +#define CFG_RX_WR_RX_STATUS BIT(13) +#define CFG_RX_FILTER_NULTI BIT(14) +#define CFG_RX_RESERVE BIT(15) +#define CFG_RX_TIMESTAMP_TSF BIT(16) + +#define CFG_RX_RSV_EN BIT(0) +#define CFG_RX_RCTS_ACK BIT(1) +#define CFG_RX_PRSP_EN BIT(2) +#define CFG_RX_PREQ_EN BIT(3) +#define CFG_RX_MGMT_EN BIT(4) +#define CFG_RX_FCS_ERROR BIT(5) +#define CFG_RX_DATA_EN BIT(6) +#define CFG_RX_CTL_EN BIT(7) +#define CFG_RX_CF_EN BIT(8) +#define CFG_RX_BCN_EN BIT(9) +#define CFG_RX_AUTH_EN BIT(10) +#define CFG_RX_ASSOC_EN BIT(11) + +#define SCAN_PASSIVE BIT(0) +#define SCAN_5GHZ_BAND BIT(1) +#define SCAN_TRIGGERED BIT(2) +#define SCAN_PRIORITY_HIGH BIT(3) + +struct acx_fw_gen_frame_rates { + struct acx_header header; + + u8 tx_ctrl_frame_rate; /* RATE_* */ + u8 tx_ctrl_frame_mod; /* CCK_* or PBCC_* */ + u8 tx_mgt_frame_rate; + u8 tx_mgt_frame_mod; +} __attribute__ ((packed)); + +/* STA MAC */ +struct 
dot11_station_id {
+	struct acx_header header;
+
+	u8 mac[ETH_ALEN];
+	u8 pad[2];
+} __attribute__ ((packed));
+
+/* HW encryption keys */
+#define NUM_ACCESS_CATEGORIES_COPY	4
+#define MAX_KEY_SIZE			32
+
+/* When set, disable HW encryption */
+#define DF_ENCRYPTION_DISABLE		0x01
+/* When set, disable HW decryption */
+#define DF_SNIFF_MODE_ENABLE		0x80
+
+struct acx_feature_config {
+	struct acx_header header;
+
+	u32 options;
+	u32 data_flow_options;
+} __attribute__ ((packed));
+
+enum acx_key_action {
+	KEY_ADD_OR_REPLACE = 1,
+	KEY_REMOVE         = 2,
+	KEY_SET_ID         = 3,
+	MAX_KEY_ACTION     = 0xffff,
+};
+
+enum acx_key_type {
+	KEY_WEP_DEFAULT       = 0,
+	KEY_WEP_ADDR          = 1,
+	KEY_AES_GROUP         = 4,
+	KEY_AES_PAIRWISE      = 5,
+	KEY_WEP_GROUP         = 6,
+	KEY_TKIP_MIC_GROUP    = 10,
+	KEY_TKIP_MIC_PAIRWISE = 11,
+};
+
+/*
+ *
+ * key_type_e   key size    key format
+ * ----------   ---------   ----------
+ * 0x00         5, 13, 29   Key data
+ * 0x01         5, 13, 29   Key data
+ * 0x04         16          16 bytes of key data
+ * 0x05         16          16 bytes of key data
+ * 0x0a         32          16 bytes of TKIP key data
+ *                          8 bytes of RX MIC key data
+ *                          8 bytes of TX MIC key data
+ * 0x0b         32          16 bytes of TKIP key data
+ *                          8 bytes of RX MIC key data
+ *                          8 bytes of TX MIC key data
+ *
+ */
+
+struct acx_set_key {
+	/* Ignored for default WEP key */
+	u8 addr[ETH_ALEN];
+
+	/* key_action_e */
+	u16 key_action;
+
+	u16 reserved_1;
+
+	/* key size in bytes */
+	u8 key_size;
+
+	/* key_type_e */
+	u8 key_type;
+	u8 ssid_profile;
+
+	/*
+	 * TKIP, AES: frame's key id field.
+	 * For WEP default key: key id;
+	 */
+	u8 id;
+	u8 reserved_2[6];
+	u8 key[MAX_KEY_SIZE];
+	u16 ac_seq_num16[NUM_ACCESS_CATEGORIES_COPY];
+	u32 ac_seq_num32[NUM_ACCESS_CATEGORIES_COPY];
+} __attribute__ ((packed));
+
+struct acx_current_tx_power {
+	struct acx_header header;
+
+	u8  current_tx_power;
+	u8  padding[3];
+} __attribute__ ((packed));
+
+struct acx_dot11_default_key {
+	struct acx_header header;
+
+	u8 id;
+	u8 pad[3];
+} __attribute__ ((packed));
+
+struct acx_tsf_info {
+	struct acx_header header;
+
+	u32 current_tsf_msb;
+	u32 current_tsf_lsb;
+	u32 last_TBTT_msb;
+	u32 last_TBTT_lsb;
+	u8 last_dtim_count;
+	u8 pad[3];
+} __attribute__ ((packed));
+
+/* 802.11 PS */
+enum acx_ps_mode {
+	STATION_ACTIVE_MODE,
+	STATION_POWER_SAVE_MODE
+};
+
+struct acx_ps_params {
+	u8 ps_mode;        /* STATION_* */
+	u8 send_null_data; /* Do we have to send NULL data packet? */
+	u8 retries;        /* Number of retries for the initial NULL data packet */
+
+	/*
+	 * TUs during which the target stays awake after switching
+	 * to power save mode.
+	 */
+	u8 hang_over_period;
+	u16 null_data_rate;
+	u8 pad[2];
+} __attribute__ ((packed));
+
+enum acx_wake_up_event {
+	WAKE_UP_EVENT_BEACON_BITMAP	= 0x01, /* Wake on every Beacon*/
+	WAKE_UP_EVENT_DTIM_BITMAP	= 0x02,	/* Wake on every DTIM*/
+	WAKE_UP_EVENT_N_DTIM_BITMAP	= 0x04, /* Wake on every Nth DTIM */
+	WAKE_UP_EVENT_N_BEACONS_BITMAP	= 0x08, /* Wake on every Nth Beacon */
+	WAKE_UP_EVENT_BITS_MASK		= 0x0F
+};
+
+struct acx_wake_up_condition {
+	struct acx_header header;
+
+	u8 wake_up_event; /* Only one bit can be set */
+	u8 listen_interval;
+	u8 pad[2];
+} __attribute__ ((packed));
+
+struct acx_aid {
+	struct acx_header header;
+
+	/*
+	 * To be set when associated with an AP.
+	 */
+	u16 aid;
+	u8 pad[2];
+} __attribute__ ((packed));
+
+enum acx_preamble_type {
+	ACX_PREAMBLE_LONG = 0,
+	ACX_PREAMBLE_SHORT = 1
+};
+
+struct acx_preamble {
+	struct acx_header header;
+	/*
+	 * When set, the WiLink transmits the frames with a short preamble and
+	 * when cleared, the WiLink transmits the frames with a long preamble.
+	 */
+	u8 preamble;
+	u8 padding[3];
+} __attribute__ ((packed));
+
+enum acx_ctsprotect_type {
+	CTSPROTECT_DISABLE = 0,
+	CTSPROTECT_ENABLE = 1
+};
+
+struct acx_ctsprotect {
+	struct acx_header header;
+	u8 ctsprotect;
+	u8 padding[3];
+} __attribute__ ((packed));
+
+struct acx_tx_statistics {
+	u32 internal_desc_overflow;
+} __attribute__ ((packed));
+
+struct acx_rx_statistics {
+	u32 out_of_mem;
+	u32 hdr_overflow;
+	u32 hw_stuck;
+	u32 dropped;
+	u32 fcs_err;
+	u32 xfr_hint_trig;
+	u32 path_reset;
+	u32 reset_counter;
+} __attribute__ ((packed));
+
+struct acx_dma_statistics {
+	u32 rx_requested;
+	u32 rx_errors;
+	u32 tx_requested;
+	u32 tx_errors;
+} __attribute__ ((packed));
+
+struct acx_isr_statistics {
+	/* host command complete */
+	u32 cmd_cmplt;
+
+	/* fiqisr() */
+	u32 fiqs;
+
+	/* (INT_STS_ND & INT_TRIG_RX_HEADER) */
+	u32 rx_headers;
+
+	/* (INT_STS_ND & INT_TRIG_RX_CMPLT) */
+	u32 rx_completes;
+
+	/* (INT_STS_ND & INT_TRIG_NO_RX_BUF) */
+	u32 rx_mem_overflow;
+
+	/* (INT_STS_ND & INT_TRIG_S_RX_RDY) */
+	u32 rx_rdys;
+
+	/* irqisr() */
+	u32 irqs;
+
+	/* (INT_STS_ND & INT_TRIG_TX_PROC) */
+	u32 tx_procs;
+
+	/* (INT_STS_ND & INT_TRIG_DECRYPT_DONE) */
+	u32 decrypt_done;
+
+	/* (INT_STS_ND & INT_TRIG_DMA0) */
+	u32 dma0_done;
+
+	/* (INT_STS_ND & INT_TRIG_DMA1) */
+	u32 dma1_done;
+
+	/* (INT_STS_ND & INT_TRIG_TX_EXC_CMPLT) */
+	u32 tx_exch_complete;
+
+	/* (INT_STS_ND & INT_TRIG_COMMAND) */
+	u32 commands;
+
+	/* (INT_STS_ND & INT_TRIG_RX_PROC) */
+	u32 rx_procs;
+
+	/* (INT_STS_ND & INT_TRIG_PM_802) */
+	u32 hw_pm_mode_changes;
+
+	/* (INT_STS_ND & INT_TRIG_ACKNOWLEDGE) */
+	u32 host_acknowledges;
+
+	/* (INT_STS_ND & INT_TRIG_PM_PCI) */
+	u32 pci_pm;
+
+	/* (INT_STS_ND & INT_TRIG_ACM_WAKEUP) */
+	u32 wakeups;
+
+	/* (INT_STS_ND & INT_TRIG_LOW_RSSI) */
+	u32 low_rssi;
+} __attribute__ ((packed));
+
+struct acx_wep_statistics {
+	/* WEP address keys configured */
+	u32 addr_key_count;
+
+	/* default keys configured */
+	u32 default_key_count;
+
+	u32 reserved;
+
+	/* number of times that WEP key not found on lookup */
+	u32 key_not_found;
+
+	/* number of times that WEP key decryption failed */
+	u32 decrypt_fail;
+
+	/* WEP packets decrypted */
+	u32 packets;
+
+	/* WEP decrypt interrupts */
+	u32 interrupt;
+} __attribute__ ((packed));
+
+#define ACX_MISSED_BEACONS_SPREAD 10
+
+struct acx_pwr_statistics {
+	/* the number of entries into power save mode (both PD & ELP) */
+	u32 ps_enter;
+
+	/* the number of entries into ELP mode */
+	u32 elp_enter;
+
+	/* the number of missed beacon interrupts to the host */
+	u32 missing_bcns;
+
+	/* the number of wakeups on host access */
+	u32 wake_on_host;
+
+	/* the number of wakeups on timer expiry */
+	u32 wake_on_timer_exp;
+
+	/* the number of packets that were transmitted with PS bit set */
+	u32 tx_with_ps;
+
+	/* the number of packets that were transmitted with PS bit clear */
+	u32 tx_without_ps;
+
+	/* the number of received beacons */
+	u32 rcvd_beacons;
+
+	/* the number of entries into PowerOn (power save off) */
+	u32 power_save_off;
+
+	/* the number of entries into power save mode */
+	u16 enable_ps;
+
+	/*
+	 * the number of exits from power save, not including failed PS
transitions + */ + u16 disable_ps; + + /* + * the number of times the TSF counter was adjusted because + * of drift + */ + u32 fix_tsf_ps; + + /* Gives statistics about the spread of continuous missed beacons. + * The 16 LSB are dedicated for the PS mode. + * The 16 MSB are dedicated for the active mode. + * cont_miss_bcns_spread[0] - single missed beacon. + * cont_miss_bcns_spread[1] - two continuous missed beacons. + * cont_miss_bcns_spread[2] - three continuous missed beacons. + * ... + * cont_miss_bcns_spread[9] - ten and more continuous missed beacons. + */ + u32 cont_miss_bcns_spread[ACX_MISSED_BEACONS_SPREAD]; + + /* the number of beacons received in awake mode */ + u32 rcvd_awake_beacons; +} __attribute__ ((packed)); + +struct acx_mic_statistics { + u32 rx_pkts; + u32 calc_failure; +} __attribute__ ((packed)); + +struct acx_aes_statistics { + u32 encrypt_fail; + u32 decrypt_fail; + u32 encrypt_packets; + u32 decrypt_packets; + u32 encrypt_interrupt; + u32 decrypt_interrupt; +} __attribute__ ((packed)); + +struct acx_event_statistics { + u32 heart_beat; + u32 calibration; + u32 rx_mismatch; + u32 rx_mem_empty; + u32 rx_pool; + u32 oom_late; + u32 phy_transmit_error; + u32 tx_stuck; +} __attribute__ ((packed)); + +struct acx_ps_statistics { + u32 pspoll_timeouts; + u32 upsd_timeouts; + u32 upsd_max_sptime; + u32 upsd_max_apturn; + u32 pspoll_max_apturn; + u32 pspoll_utilization; + u32 upsd_utilization; +} __attribute__ ((packed)); + +struct acx_rxpipe_statistics { + u32 rx_prep_beacon_drop; + u32 descr_host_int_trig_rx_data; + u32 beacon_buffer_thres_host_int_trig_rx_data; + u32 missed_beacon_host_int_trig_rx_data; + u32 tx_xfr_host_int_trig_rx_data; +} __attribute__ ((packed)); + +struct acx_statistics { + struct acx_header header; + + struct acx_tx_statistics tx; + struct acx_rx_statistics rx; + struct acx_dma_statistics dma; + struct acx_isr_statistics isr; + struct acx_wep_statistics wep; + struct acx_pwr_statistics pwr; + struct acx_aes_statistics aes; + struct acx_mic_statistics mic; + struct acx_event_statistics event; + struct acx_ps_statistics ps; + struct acx_rxpipe_statistics rxpipe; +} __attribute__ ((packed)); + +enum { + ACX_WAKE_UP_CONDITIONS = 0x0002, + ACX_MEM_CFG = 0x0003, + ACX_SLOT = 0x0004, + ACX_QUEUE_HEAD = 0x0005, /* for MASTER mode only */ + ACX_AC_CFG = 0x0007, + ACX_MEM_MAP = 0x0008, + ACX_AID = 0x000A, + ACX_RADIO_PARAM = 0x000B, /* Not used */ + ACX_CFG = 0x000C, /* Not used */ + ACX_FW_REV = 0x000D, + ACX_MEDIUM_USAGE = 0x000F, + ACX_RX_CFG = 0x0010, + ACX_TX_QUEUE_CFG = 0x0011, /* FIXME: only used by wl1251 */ + ACX_BSS_IN_PS = 0x0012, /* for AP only */ + ACX_STATISTICS = 0x0013, /* Debug API */ + ACX_FEATURE_CFG = 0x0015, + ACX_MISC_CFG = 0x0017, /* Not used */ + ACX_TID_CFG = 0x001A, + ACX_BEACON_FILTER_OPT = 0x001F, + ACX_LOW_RSSI = 0x0020, + ACX_NOISE_HIST = 0x0021, + ACX_HDK_VERSION = 0x0022, /* ???
*/ + ACX_PD_THRESHOLD = 0x0023, + ACX_DATA_PATH_PARAMS = 0x0024, /* WO */ + ACX_DATA_PATH_RESP_PARAMS = 0x0024, /* RO */ + ACX_CCA_THRESHOLD = 0x0025, + ACX_EVENT_MBOX_MASK = 0x0026, +#ifdef FW_RUNNING_AS_AP + ACX_DTIM_PERIOD = 0x0027, /* for AP only */ +#else + ACX_WR_TBTT_AND_DTIM = 0x0027, /* STA only */ +#endif + ACX_ACI_OPTION_CFG = 0x0029, /* OBSOLETE (for 1251)*/ + ACX_GPIO_CFG = 0x002A, /* Not used */ + ACX_GPIO_SET = 0x002B, /* Not used */ + ACX_PM_CFG = 0x002C, /* To Be Documented */ + ACX_CONN_MONIT_PARAMS = 0x002D, + ACX_AVERAGE_RSSI = 0x002E, /* Not used */ + ACX_CONS_TX_FAILURE = 0x002F, + ACX_BCN_DTIM_OPTIONS = 0x0031, + ACX_SG_ENABLE = 0x0032, + ACX_SG_CFG = 0x0033, + ACX_ANTENNA_DIVERSITY_CFG = 0x0035, /* To Be Documented */ + ACX_LOW_SNR = 0x0037, /* To Be Documented */ + ACX_BEACON_FILTER_TABLE = 0x0038, + ACX_ARP_IP_FILTER = 0x0039, + ACX_ROAMING_STATISTICS_TBL = 0x003B, + ACX_RATE_POLICY = 0x003D, + ACX_CTS_PROTECTION = 0x003E, + ACX_SLEEP_AUTH = 0x003F, + ACX_PREAMBLE_TYPE = 0x0040, + ACX_ERROR_CNT = 0x0041, + ACX_FW_GEN_FRAME_RATES = 0x0042, + ACX_IBSS_FILTER = 0x0044, + ACX_SERVICE_PERIOD_TIMEOUT = 0x0045, + ACX_TSF_INFO = 0x0046, + ACX_CONFIG_PS_WMM = 0x0049, + ACX_ENABLE_RX_DATA_FILTER = 0x004A, + ACX_SET_RX_DATA_FILTER = 0x004B, + ACX_GET_DATA_FILTER_STATISTICS = 0x004C, + ACX_POWER_LEVEL_TABLE = 0x004D, + ACX_BET_ENABLE = 0x0050, + DOT11_STATION_ID = 0x1001, + DOT11_RX_MSDU_LIFE_TIME = 0x1004, + DOT11_CUR_TX_PWR = 0x100D, + DOT11_DEFAULT_KEY = 0x1010, + DOT11_RX_DOT11_MODE = 0x1012, + DOT11_RTS_THRESHOLD = 0x1013, + DOT11_GROUP_ADDRESS_TBL = 0x1014, + + MAX_DOT11_IE = DOT11_GROUP_ADDRESS_TBL, + + MAX_IE = 0xFFFF +}; + + +int wl12xx_acx_frame_rates(struct wl12xx *wl, u8 ctrl_rate, u8 ctrl_mod, + u8 mgt_rate, u8 mgt_mod); +int wl12xx_acx_station_id(struct wl12xx *wl); +int wl12xx_acx_default_key(struct wl12xx *wl, u8 key_id); +int wl12xx_acx_wake_up_conditions(struct wl12xx *wl, u8 listen_interval); +int wl12xx_acx_sleep_auth(struct wl12xx *wl, u8 sleep_auth); +int wl12xx_acx_fw_version(struct wl12xx *wl, char *buf, size_t len); +int wl12xx_acx_tx_power(struct wl12xx *wl, int power); +int wl12xx_acx_feature_cfg(struct wl12xx *wl); +int wl12xx_acx_mem_map(struct wl12xx *wl, void *mem_map, size_t len); +int wl12xx_acx_data_path_params(struct wl12xx *wl, + struct acx_data_path_params_resp *data_path); +int wl12xx_acx_rx_msdu_life_time(struct wl12xx *wl, u32 life_time); +int wl12xx_acx_rx_config(struct wl12xx *wl, u32 config, u32 filter); +int wl12xx_acx_pd_threshold(struct wl12xx *wl); +int wl12xx_acx_slot(struct wl12xx *wl, enum acx_slot_type slot_time); +int wl12xx_acx_group_address_tbl(struct wl12xx *wl); +int wl12xx_acx_service_period_timeout(struct wl12xx *wl); +int wl12xx_acx_rts_threshold(struct wl12xx *wl, u16 rts_threshold); +int wl12xx_acx_beacon_filter_opt(struct wl12xx *wl); +int wl12xx_acx_beacon_filter_table(struct wl12xx *wl); +int wl12xx_acx_sg_enable(struct wl12xx *wl); +int wl12xx_acx_sg_cfg(struct wl12xx *wl); +int wl12xx_acx_cca_threshold(struct wl12xx *wl); +int wl12xx_acx_bcn_dtim_options(struct wl12xx *wl); +int wl12xx_acx_aid(struct wl12xx *wl, u16 aid); +int wl12xx_acx_event_mbox_mask(struct wl12xx *wl, u32 event_mask); +int wl12xx_acx_set_preamble(struct wl12xx *wl, enum acx_preamble_type preamble); +int wl12xx_acx_cts_protect(struct wl12xx *wl, + enum acx_ctsprotect_type ctsprotect); +int wl12xx_acx_statistics(struct wl12xx *wl, struct acx_statistics *stats); + +#endif /* __WL12XX_ACX_H__ */ diff --git a/drivers/net/wireless/wl12xx/boot.c 
b/drivers/net/wireless/wl12xx/boot.c new file mode 100644 index 00000000000..48ac08c429b --- /dev/null +++ b/drivers/net/wireless/wl12xx/boot.c @@ -0,0 +1,295 @@ +/* + * This file is part of wl12xx + * + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include + +#include "reg.h" +#include "boot.h" +#include "spi.h" +#include "event.h" + +static void wl12xx_boot_enable_interrupts(struct wl12xx *wl) +{ + enable_irq(wl->irq); +} + +void wl12xx_boot_target_enable_interrupts(struct wl12xx *wl) +{ + wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_MASK, ~(wl->intr_mask)); + wl12xx_reg_write32(wl, HI_CFG, HI_CFG_DEF_VAL); +} + +int wl12xx_boot_soft_reset(struct wl12xx *wl) +{ + unsigned long timeout; + u32 boot_data; + + /* perform soft reset */ + wl12xx_reg_write32(wl, ACX_REG_SLV_SOFT_RESET, ACX_SLV_SOFT_RESET_BIT); + + /* SOFT_RESET is self clearing */ + timeout = jiffies + usecs_to_jiffies(SOFT_RESET_MAX_TIME); + while (1) { + boot_data = wl12xx_reg_read32(wl, ACX_REG_SLV_SOFT_RESET); + wl12xx_debug(DEBUG_BOOT, "soft reset bootdata 0x%x", boot_data); + if ((boot_data & ACX_SLV_SOFT_RESET_BIT) == 0) + break; + + if (time_after(jiffies, timeout)) { + /* 1.2 check pWhalBus->uSelfClearTime if the + * timeout was reached */ + wl12xx_error("soft reset timeout"); + return -1; + } + + udelay(SOFT_RESET_STALL_TIME); + } + + /* disable Rx/Tx */ + wl12xx_reg_write32(wl, ENABLE, 0x0); + + /* disable auto calibration on start */ + wl12xx_reg_write32(wl, SPARE_A2, 0xffff); + + return 0; +} + +int wl12xx_boot_init_seq(struct wl12xx *wl) +{ + u32 scr_pad6, init_data, tmp, elp_cmd, ref_freq; + + /* + * col #1: INTEGER_DIVIDER + * col #2: FRACTIONAL_DIVIDER + * col #3: ATTN_BB + * col #4: ALPHA_BB + * col #5: STOP_TIME_BB + * col #6: BB_PLL_LOOP_FILTER + */ + static const u32 LUT[REF_FREQ_NUM][LUT_PARAM_NUM] = { + + { 83, 87381, 0xB, 5, 0xF00, 3}, /* REF_FREQ_19_2 */ + { 61, 141154, 0xB, 5, 0x1450, 2}, /* REF_FREQ_26_0 */ + { 41, 174763, 0xC, 6, 0x2D00, 1}, /* REF_FREQ_38_4 */ + { 40, 0, 0xC, 6, 0x2EE0, 1}, /* REF_FREQ_40_0 */ + { 47, 162280, 0xC, 6, 0x2760, 1} /* REF_FREQ_33_6 */ + }; + + /* read NVS params */ + scr_pad6 = wl12xx_reg_read32(wl, SCR_PAD6); + wl12xx_debug(DEBUG_BOOT, "scr_pad6 0x%x", scr_pad6); + + /* read ELP_CMD */ + elp_cmd = wl12xx_reg_read32(wl, ELP_CMD); + wl12xx_debug(DEBUG_BOOT, "elp_cmd 0x%x", elp_cmd); + + /* set the BB calibration time to be 300 usec (PLL_CAL_TIME) */ + ref_freq = scr_pad6 & 0x000000FF; + wl12xx_debug(DEBUG_BOOT, "ref_freq 0x%x", ref_freq); + + wl12xx_reg_write32(wl, PLL_CAL_TIME, 0x9); + + /* + * PG 1.2: set the clock buffer time to be 210 usec (CLK_BUF_TIME) + */ + wl12xx_reg_write32(wl, CLK_BUF_TIME, 0x6); + + /* + * set the clock detect feature to work in the restart wu procedure + * (ELP_CFG_MODE[14]) and select the clock source type + * (ELP_CFG_MODE[13:12]) + */ + tmp = ((scr_pad6 &
0x0000FF00) << 4) | 0x00004000; + wl12xx_reg_write32(wl, ELP_CFG_MODE, tmp); + + /* PG 1.2: enable the BB PLL fix. Enable the PLL_LIMP_CLK_EN_CMD */ + elp_cmd |= 0x00000040; + wl12xx_reg_write32(wl, ELP_CMD, elp_cmd); + + /* PG 1.2: Set the BB PLL stable time to be 1000usec + * (PLL_STABLE_TIME) */ + wl12xx_reg_write32(wl, CFG_PLL_SYNC_CNT, 0x20); + + /* PG 1.2: read clock request time */ + init_data = wl12xx_reg_read32(wl, CLK_REQ_TIME); + + /* + * PG 1.2: set the clock request time to be ref_clk_settling_time - + * 1ms = 4ms + */ + if (init_data > 0x21) + tmp = init_data - 0x21; + else + tmp = 0; + wl12xx_reg_write32(wl, CLK_REQ_TIME, tmp); + + /* set BB PLL configurations in RF AFE */ + wl12xx_reg_write32(wl, 0x003058cc, 0x4B5); + + /* set RF_AFE_REG_5 */ + wl12xx_reg_write32(wl, 0x003058d4, 0x50); + + /* set RF_AFE_CTRL_REG_2 */ + wl12xx_reg_write32(wl, 0x00305948, 0x11c001); + + /* + * change RF PLL and BB PLL divider for VCO clock and adjust VCO + * bias current (RF_AFE_REG_13) + */ + wl12xx_reg_write32(wl, 0x003058f4, 0x1e); + + /* set BB PLL configurations */ + tmp = LUT[ref_freq][LUT_PARAM_INTEGER_DIVIDER] | 0x00017000; + wl12xx_reg_write32(wl, 0x00305840, tmp); + + /* set fractional divider according to Appendix C-BB PLL + * Calculations + */ + tmp = LUT[ref_freq][LUT_PARAM_FRACTIONAL_DIVIDER]; + wl12xx_reg_write32(wl, 0x00305844, tmp); + + /* set the initial data for the sigma delta */ + wl12xx_reg_write32(wl, 0x00305848, 0x3039); + + /* + * set the accumulator attenuation value, calibration loop1 + * (alpha), calibration loop2 (beta), calibration loop3 (gamma) and + * the VCO gain + */ + tmp = (LUT[ref_freq][LUT_PARAM_ATTN_BB] << 16) | + (LUT[ref_freq][LUT_PARAM_ALPHA_BB] << 12) | 0x1; + wl12xx_reg_write32(wl, 0x00305854, tmp); + + /* + * set the calibration stop time after holdoff time expires and set + * settling time HOLD_OFF_TIME_BB + */ + tmp = LUT[ref_freq][LUT_PARAM_STOP_TIME_BB] | 0x000A0000; + wl12xx_reg_write32(wl, 0x00305858, tmp); + + /* + * set BB PLL Loop filter capacitor3- BB_C3[2:0] and set BB PLL + * constant leakage current to linearize PFD to 0uA - + * BB_ILOOPF[7:3] + */ + tmp = LUT[ref_freq][LUT_PARAM_BB_PLL_LOOP_FILTER] | 0x00000030; + wl12xx_reg_write32(wl, 0x003058f8, tmp); + + /* + * set regulator output voltage for n divider to + * 1.35-BB_REFDIV[1:0], set charge pump current- BB_CPGAIN[4:2], + * set BB PLL Loop filter capacitor2- BB_C2[7:5], set gain of BB + * PLL auto-cal to normal mode- BB_CALGAIN_3DB[8] + */ + wl12xx_reg_write32(wl, 0x003058f0, 0x29); + + /* enable restart wakeup sequence (ELP_CMD[0]) */ + wl12xx_reg_write32(wl, ELP_CMD, elp_cmd | 0x1); + + /* restart sequence completed */ + udelay(2000); + + return 0; +} + +int wl12xx_boot_run_firmware(struct wl12xx *wl) +{ + int loop, ret; + u32 chip_id, interrupt; + + wl->chip.op_set_ecpu_ctrl(wl, ECPU_CONTROL_HALT); + + chip_id = wl12xx_reg_read32(wl, CHIP_ID_B); + + wl12xx_debug(DEBUG_BOOT, "chip id after firmware boot: 0x%x", chip_id); + + if (chip_id != wl->chip.id) { + wl12xx_error("chip id doesn't match after firmware boot"); + return -EIO; + } + + /* wait for init to complete */ + loop = 0; + while (loop++ < INIT_LOOP) { + udelay(INIT_LOOP_DELAY); + interrupt = wl12xx_reg_read32(wl, ACX_REG_INTERRUPT_NO_CLEAR); + + if (interrupt == 0xffffffff) { + wl12xx_error("error reading hardware complete " "init indication"); + return -EIO; + } + /* check that ACX_INTR_INIT_COMPLETE is enabled */ + else if (interrupt & wl->chip.intr_init_complete) { + wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_ACK, +
wl->chip.intr_init_complete); + break; + } + } + + if (loop >= INIT_LOOP) { + wl12xx_error("timeout waiting for the hardware to " + "complete initialization"); + return -EIO; + } + + /* get hardware config command mail box */ + wl->cmd_box_addr = wl12xx_reg_read32(wl, REG_COMMAND_MAILBOX_PTR); + + /* get hardware config event mail box */ + wl->event_box_addr = wl12xx_reg_read32(wl, REG_EVENT_MAILBOX_PTR); + + /* set the working partition to its "running" mode offset */ + wl12xx_set_partition(wl, + wl->chip.p_table[PART_WORK].mem.start, + wl->chip.p_table[PART_WORK].mem.size, + wl->chip.p_table[PART_WORK].reg.start, + wl->chip.p_table[PART_WORK].reg.size); + + wl12xx_debug(DEBUG_MAILBOX, "cmd_box_addr 0x%x event_box_addr 0x%x", + wl->cmd_box_addr, wl->event_box_addr); + + /* + * in case of full asynchronous mode the firmware must be ready to + * receive events from the command mailbox + */ + + /* enable gpio interrupts */ + wl12xx_boot_enable_interrupts(wl); + + wl->chip.op_target_enable_interrupts(wl); + + /* unmask all mbox events */ + wl->event_mask = 0xffffffff; + + ret = wl12xx_event_unmask(wl); + if (ret < 0) { + wl12xx_error("EVENT mask setting failed"); + return ret; + } + + wl12xx_event_mbox_config(wl); + + /* firmware startup completed */ + return 0; +} diff --git a/drivers/net/wireless/wl12xx/boot.h b/drivers/net/wireless/wl12xx/boot.h new file mode 100644 index 00000000000..4fa73132baa --- /dev/null +++ b/drivers/net/wireless/wl12xx/boot.h @@ -0,0 +1,40 @@ +/* + * This file is part of wl12xx + * + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __BOOT_H__ +#define __BOOT_H__ + +#include "wl12xx.h" + +int wl12xx_boot_soft_reset(struct wl12xx *wl); +int wl12xx_boot_init_seq(struct wl12xx *wl); +int wl12xx_boot_run_firmware(struct wl12xx *wl); +void wl12xx_boot_target_enable_interrupts(struct wl12xx *wl); + +/* number of times we try to read the INIT interrupt */ +#define INIT_LOOP 20000 + +/* delay between retries */ +#define INIT_LOOP_DELAY 50 + +#endif diff --git a/drivers/net/wireless/wl12xx/cmd.c b/drivers/net/wireless/wl12xx/cmd.c new file mode 100644 index 00000000000..f73ab602b7a --- /dev/null +++ b/drivers/net/wireless/wl12xx/cmd.c @@ -0,0 +1,353 @@ +#include "cmd.h" + +#include +#include +#include + +#include "wl12xx.h" +#include "wl12xx_80211.h" +#include "reg.h" +#include "spi.h" +#include "ps.h" + +int wl12xx_cmd_send(struct wl12xx *wl, u16 type, void *buf, size_t buf_len) +{ + struct wl12xx_command cmd; + unsigned long timeout; + size_t cmd_len; + u32 intr; + int ret = 0; + + memset(&cmd, 0, sizeof(cmd)); + cmd.id = type; + cmd.status = 0; + memcpy(cmd.parameters, buf, buf_len); + cmd_len = ALIGN(buf_len, 4) + CMDMBOX_HEADER_LEN; + + wl12xx_ps_elp_wakeup(wl); + + wl12xx_spi_mem_write(wl, wl->cmd_box_addr, &cmd, cmd_len); + + wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_TRIG, INTR_TRIG_CMD); + + timeout = jiffies + msecs_to_jiffies(WL12XX_COMMAND_TIMEOUT); + + intr = wl12xx_reg_read32(wl, ACX_REG_INTERRUPT_NO_CLEAR); + while (!(intr & wl->chip.intr_cmd_complete)) { + if (time_after(jiffies, timeout)) { + wl12xx_error("command complete timeout"); + ret = -ETIMEDOUT; + goto out; + } + + msleep(1); + + intr = wl12xx_reg_read32(wl, ACX_REG_INTERRUPT_NO_CLEAR); + } + + wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_ACK, + wl->chip.intr_cmd_complete); + +out: + wl12xx_ps_elp_sleep(wl); + + return ret; +} + +int wl12xx_cmd_test(struct wl12xx *wl, void *buf, size_t buf_len, u8 answer) +{ + int ret; + + wl12xx_debug(DEBUG_CMD, "cmd test"); + + ret = wl12xx_cmd_send(wl, CMD_TEST, buf, buf_len); + if (ret < 0) { + wl12xx_warning("TEST command failed"); + return ret; + } + + if (answer) { + struct wl12xx_command *cmd_answer; + + /* + * The test command has been received; we can now read the + * answer. The answer would be a wl12xx_command, where the + * parameter array contains the actual answer.
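+ * The returned buffer thus starts with the 16-bit id and status + * fields of struct wl12xx_command, followed by the payload itself.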
+ */ + + wl12xx_ps_elp_wakeup(wl); + + wl12xx_spi_mem_read(wl, wl->cmd_box_addr, buf, buf_len); + + wl12xx_ps_elp_sleep(wl); + + cmd_answer = buf; + if (cmd_answer->status != CMD_STATUS_SUCCESS) + wl12xx_error("TEST command answer error: %d", + cmd_answer->status); + } + + return 0; +} + + +int wl12xx_cmd_interrogate(struct wl12xx *wl, u16 ie_id, u16 ie_len, + void *answer) +{ + struct wl12xx_command *cmd; + struct acx_header header; + int ret; + + wl12xx_debug(DEBUG_CMD, "cmd interrogate"); + + header.id = ie_id; + header.len = ie_len - sizeof(header); + + ret = wl12xx_cmd_send(wl, CMD_INTERROGATE, &header, sizeof(header)); + if (ret < 0) { + wl12xx_error("INTERROGATE command failed"); + return ret; + } + + wl12xx_ps_elp_wakeup(wl); + + /* the interrogate command has been received; we can read the answer */ + wl12xx_spi_mem_read(wl, wl->cmd_box_addr, answer, + CMDMBOX_HEADER_LEN + ie_len); + + wl12xx_ps_elp_sleep(wl); + + cmd = answer; + if (cmd->status != CMD_STATUS_SUCCESS) + wl12xx_error("INTERROGATE command error: %d", + cmd->status); + + return 0; +} + +int wl12xx_cmd_configure(struct wl12xx *wl, void *ie, int ie_len) +{ + int ret; + + wl12xx_debug(DEBUG_CMD, "cmd configure"); + + ret = wl12xx_cmd_send(wl, CMD_CONFIGURE, ie, + ie_len); + if (ret < 0) { + wl12xx_warning("CONFIGURE command failed"); + return ret; + } + + return 0; +} + +int wl12xx_cmd_vbm(struct wl12xx *wl, u8 identity, + void *bitmap, u16 bitmap_len, u8 bitmap_control) +{ + struct vbm_update_request vbm; + int ret; + + wl12xx_debug(DEBUG_CMD, "cmd vbm"); + + /* Count and period will be filled by the target */ + vbm.tim.bitmap_ctrl = bitmap_control; + if (bitmap_len > PARTIAL_VBM_MAX) { + wl12xx_warning("cmd vbm len is %d B, truncating to %d", + bitmap_len, PARTIAL_VBM_MAX); + bitmap_len = PARTIAL_VBM_MAX; + } + memcpy(vbm.tim.pvb_field, bitmap, bitmap_len); + vbm.tim.identity = identity; + vbm.tim.length = bitmap_len + 3; + + vbm.len = cpu_to_le16(bitmap_len + 5); + + ret = wl12xx_cmd_send(wl, CMD_VBM, &vbm, sizeof(vbm)); + if (ret < 0) { + wl12xx_error("VBM command failed"); + return ret; + } + + return 0; +} + +int wl12xx_cmd_data_path(struct wl12xx *wl, u8 channel, u8 enable) +{ + int ret; + u16 cmd_rx, cmd_tx; + + wl12xx_debug(DEBUG_CMD, "cmd data path"); + + if (enable) { + cmd_rx = CMD_ENABLE_RX; + cmd_tx = CMD_ENABLE_TX; + } else { + cmd_rx = CMD_DISABLE_RX; + cmd_tx = CMD_DISABLE_TX; + } + + ret = wl12xx_cmd_send(wl, cmd_rx, &channel, sizeof(channel)); + if (ret < 0) { + wl12xx_error("rx %s cmd for channel %d failed", + enable ? "start" : "stop", channel); + return ret; + } + + wl12xx_debug(DEBUG_BOOT, "rx %s cmd channel %d", + enable ? "start" : "stop", channel); + + ret = wl12xx_cmd_send(wl, cmd_tx, &channel, sizeof(channel)); + if (ret < 0) { + wl12xx_error("tx %s cmd for channel %d failed", + enable ? "start" : "stop", channel); + return ret; + } + + wl12xx_debug(DEBUG_BOOT, "tx %s cmd channel %d", + enable ?
"start" : "stop", channel); + + return 0; +} + +int wl12xx_cmd_join(struct wl12xx *wl, u8 bss_type, u8 dtim_interval, + u16 beacon_interval, u8 wait) +{ + unsigned long timeout; + struct cmd_join join = {}; + int ret, i; + u8 *bssid; + + /* FIXME: this should be in main.c */ + ret = wl12xx_acx_frame_rates(wl, DEFAULT_HW_GEN_TX_RATE, + DEFAULT_HW_GEN_MODULATION_TYPE, + wl->tx_mgmt_frm_rate, + wl->tx_mgmt_frm_mod); + if (ret < 0) + return ret; + + wl12xx_debug(DEBUG_CMD, "cmd join"); + + /* Reverse order BSSID */ + bssid = (u8 *)&join.bssid_lsb; + for (i = 0; i < ETH_ALEN; i++) + bssid[i] = wl->bssid[ETH_ALEN - i - 1]; + + join.rx_config_options = wl->rx_config; + join.rx_filter_options = wl->rx_filter; + + join.basic_rate_set = RATE_MASK_1MBPS | RATE_MASK_2MBPS | + RATE_MASK_5_5MBPS | RATE_MASK_11MBPS; + + join.beacon_interval = beacon_interval; + join.dtim_interval = dtim_interval; + join.bss_type = bss_type; + join.channel = wl->channel; + join.ctrl = JOIN_CMD_CTRL_TX_FLUSH; + + ret = wl12xx_cmd_send(wl, CMD_START_JOIN, &join, sizeof(join)); + if (ret < 0) { + wl12xx_error("failed to initiate cmd join"); + return ret; + } + + timeout = msecs_to_jiffies(JOIN_TIMEOUT); + + /* + * ugly hack: we should wait for JOIN_EVENT_COMPLETE_ID but to + * simplify locking we just sleep instead, for now + */ + if (wait) + msleep(10); + + return 0; +} + +int wl12xx_cmd_ps_mode(struct wl12xx *wl, u8 ps_mode) +{ + int ret; + struct acx_ps_params ps_params; + + /* FIXME: this should be in ps.c */ + ret = wl12xx_acx_wake_up_conditions(wl, wl->listen_int); + if (ret < 0) { + wl12xx_error("Couldnt set wake up conditions"); + return ret; + } + + wl12xx_debug(DEBUG_CMD, "cmd set ps mode"); + + ps_params.ps_mode = ps_mode; + ps_params.send_null_data = 1; + ps_params.retries = 5; + ps_params.hang_over_period = 128; + ps_params.null_data_rate = 1; /* 1 Mbps */ + + ret = wl12xx_cmd_send(wl, CMD_SET_PS_MODE, &ps_params, + sizeof(ps_params)); + if (ret < 0) { + wl12xx_error("cmd set_ps_mode failed"); + return ret; + } + + return 0; +} + +int wl12xx_cmd_read_memory(struct wl12xx *wl, u32 addr, u32 len, void *answer) +{ + struct cmd_read_write_memory mem_cmd, *mem_answer; + struct wl12xx_command cmd; + int ret; + + wl12xx_debug(DEBUG_CMD, "cmd read memory"); + + memset(&mem_cmd, 0, sizeof(mem_cmd)); + mem_cmd.addr = addr; + mem_cmd.size = len; + + ret = wl12xx_cmd_send(wl, CMD_READ_MEMORY, &mem_cmd, sizeof(mem_cmd)); + if (ret < 0) { + wl12xx_error("read memory command failed: %d", ret); + return ret; + } + + /* the read command got in, we can now read the answer */ + wl12xx_spi_mem_read(wl, wl->cmd_box_addr, &cmd, + CMDMBOX_HEADER_LEN + sizeof(mem_cmd)); + + if (cmd.status != CMD_STATUS_SUCCESS) + wl12xx_error("error in read command result: %d", cmd.status); + + mem_answer = (struct cmd_read_write_memory *) cmd.parameters; + memcpy(answer, mem_answer->value, len); + + return 0; +} + +int wl12xx_cmd_template_set(struct wl12xx *wl, u16 cmd_id, + void *buf, size_t buf_len) +{ + struct wl12xx_cmd_packet_template template; + int ret; + + wl12xx_debug(DEBUG_CMD, "cmd template %d", cmd_id); + + memset(&template, 0, sizeof(template)); + + WARN_ON(buf_len > WL12XX_MAX_TEMPLATE_SIZE); + buf_len = min_t(size_t, buf_len, WL12XX_MAX_TEMPLATE_SIZE); + template.size = cpu_to_le16(buf_len); + + if (buf) + memcpy(template.template, buf, buf_len); + + ret = wl12xx_cmd_send(wl, cmd_id, &template, + sizeof(template.size) + buf_len); + if (ret < 0) { + wl12xx_warning("cmd set_template failed: %d", ret); + return ret; + } + + return 0; +} 
diff --git a/drivers/net/wireless/wl12xx/cmd.h b/drivers/net/wireless/wl12xx/cmd.h new file mode 100644 index 00000000000..aa307dcd081 --- /dev/null +++ b/drivers/net/wireless/wl12xx/cmd.h @@ -0,0 +1,265 @@ +/* + * This file is part of wl12xx + * + * Copyright (c) 1998-2007 Texas Instruments Incorporated + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __WL12XX_CMD_H__ +#define __WL12XX_CMD_H__ + +#include "wl12xx.h" + +int wl12xx_cmd_send(struct wl12xx *wl, u16 type, void *buf, size_t buf_len); +int wl12xx_cmd_test(struct wl12xx *wl, void *buf, size_t buf_len, u8 answer); +int wl12xx_cmd_interrogate(struct wl12xx *wl, u16 ie_id, u16 ie_len, + void *answer); +int wl12xx_cmd_configure(struct wl12xx *wl, void *ie, int ie_len); +int wl12xx_cmd_vbm(struct wl12xx *wl, u8 identity, + void *bitmap, u16 bitmap_len, u8 bitmap_control); +int wl12xx_cmd_data_path(struct wl12xx *wl, u8 channel, u8 enable); +int wl12xx_cmd_join(struct wl12xx *wl, u8 bss_type, u8 dtim_interval, + u16 beacon_interval, u8 wait); +int wl12xx_cmd_ps_mode(struct wl12xx *wl, u8 ps_mode); +int wl12xx_cmd_read_memory(struct wl12xx *wl, u32 addr, u32 len, void *answer); +int wl12xx_cmd_template_set(struct wl12xx *wl, u16 cmd_id, + void *buf, size_t buf_len); + +/* unit ms */ +#define WL12XX_COMMAND_TIMEOUT 2000 + +#define WL12XX_MAX_TEMPLATE_SIZE 300 + +struct wl12xx_cmd_packet_template { + __le16 size; + u8 template[WL12XX_MAX_TEMPLATE_SIZE]; +} __attribute__ ((packed)); + +enum wl12xx_commands { + CMD_RESET = 0, + CMD_INTERROGATE = 1, /* use this to read information elements */ + CMD_CONFIGURE = 2, /* use this to write information elements */ + CMD_ENABLE_RX = 3, + CMD_ENABLE_TX = 4, + CMD_DISABLE_RX = 5, + CMD_DISABLE_TX = 6, + CMD_SCAN = 8, + CMD_STOP_SCAN = 9, + CMD_VBM = 10, + CMD_START_JOIN = 11, + CMD_SET_KEYS = 12, + CMD_READ_MEMORY = 13, + CMD_WRITE_MEMORY = 14, + CMD_BEACON = 19, + CMD_PROBE_RESP = 20, + CMD_NULL_DATA = 21, + CMD_PROBE_REQ = 22, + CMD_TEST = 23, + CMD_RADIO_CALIBRATE = 25, /* OBSOLETE */ + CMD_ENABLE_RX_PATH = 27, /* OBSOLETE */ + CMD_NOISE_HIST = 28, + CMD_RX_RESET = 29, + CMD_PS_POLL = 30, + CMD_QOS_NULL_DATA = 31, + CMD_LNA_CONTROL = 32, + CMD_SET_BCN_MODE = 33, + CMD_MEASUREMENT = 34, + CMD_STOP_MEASUREMENT = 35, + CMD_DISCONNECT = 36, + CMD_SET_PS_MODE = 37, + CMD_CHANNEL_SWITCH = 38, + CMD_STOP_CHANNEL_SWITCH = 39, + CMD_AP_DISCOVERY = 40, + CMD_STOP_AP_DISCOVERY = 41, + CMD_SPS_SCAN = 42, + CMD_STOP_SPS_SCAN = 43, + CMD_HEALTH_CHECK = 45, + CMD_DEBUG = 46, + CMD_TRIGGER_SCAN_TO = 47, + + NUM_COMMANDS, + MAX_COMMAND_ID = 0xFFFF, +}; + +#define MAX_CMD_PARAMS 572 + +struct wl12xx_command { + u16 id; + u16 status; + u8 parameters[MAX_CMD_PARAMS]; +}; + +enum { + CMD_MAILBOX_IDLE = 0, + CMD_STATUS_SUCCESS = 1, + CMD_STATUS_UNKNOWN_CMD = 2, + CMD_STATUS_UNKNOWN_IE = 3, + CMD_STATUS_REJECT_MEAS_SG_ACTIVE = 11, +
CMD_STATUS_RX_BUSY = 13, + CMD_STATUS_INVALID_PARAM = 14, + CMD_STATUS_TEMPLATE_TOO_LARGE = 15, + CMD_STATUS_OUT_OF_MEMORY = 16, + CMD_STATUS_STA_TABLE_FULL = 17, + CMD_STATUS_RADIO_ERROR = 18, + CMD_STATUS_WRONG_NESTING = 19, + CMD_STATUS_TIMEOUT = 21, /* Driver internal use.*/ + CMD_STATUS_FW_RESET = 22, /* Driver internal use.*/ + MAX_COMMAND_STATUS = 0xff +}; + + +/* + * CMD_READ_MEMORY + * + * The host issues this command to read the WiLink device memory/registers. + * + * Note: The Base Band address has special handling (16-bit registers and + * addresses). For more information, see the hardware specification. + */ +/* + * CMD_WRITE_MEMORY + * + * The host issues this command to write the WiLink device memory/registers. + * + * The Base Band address has special handling (16-bit registers and + * addresses). For more information, see the hardware specification. + */ +#define MAX_READ_SIZE 256 + +struct cmd_read_write_memory { + /* The address of the memory to read from or write to. */ + u32 addr; + + /* The amount of data in bytes to read from or write to the WiLink + * device. */ + u32 size; + + /* The actual value read from or written to the WiLink. The source + of this field is the host in a WRITE command or the WiLink in a + READ command. */ + u8 value[MAX_READ_SIZE]; +}; + +#define CMDMBOX_HEADER_LEN 4 +#define CMDMBOX_INFO_ELEM_HEADER_LEN 4 + + +struct basic_scan_parameters { + u32 rx_config_options; + u32 rx_filter_options; + + /* + * Scan options: + * bit 0: When this bit is set, passive scan. + * bit 1: Band, when this bit is set we scan + * in the 5GHz band. + * bit 2: voice mode, 0 for normal scan. + * bit 3: scan priority, 1 for high priority. + */ + u16 scan_options; + + /* Number of channels to scan */ + u8 num_channels; + + /* Number of probe requests to send per channel */ + u8 num_probe_requests; + + /* Rate and modulation for probe requests */ + u16 tx_rate; + + u8 tid_trigger; + u8 ssid_len; + u32 ssid[8]; + +} __attribute__ ((packed)); + +struct basic_scan_channel_parameters { + u32 min_duration; /* in TU */ + u32 max_duration; /* in TU */ + u32 bssid_lsb; + u16 bssid_msb; + + /* + * bits 0-3: Early termination count. + * bits 4-5: Early termination condition. + */ + u8 early_termination; + + u8 tx_power_att; + u8 channel; + u8 pad[3]; +} __attribute__ ((packed)); + +/* SCAN parameters */ +#define SCAN_MAX_NUM_OF_CHANNELS 16 + +struct cmd_scan { + struct basic_scan_parameters params; + struct basic_scan_channel_parameters channels[SCAN_MAX_NUM_OF_CHANNELS]; +} __attribute__ ((packed)); + +enum { + BSS_TYPE_IBSS = 0, + BSS_TYPE_STA_BSS = 2, + BSS_TYPE_AP_BSS = 3, + MAX_BSS_TYPE = 0xFF +}; + +#define JOIN_CMD_CTRL_TX_FLUSH 0x80 /* Firmware flushes all Tx */ +#define JOIN_CMD_CTRL_EARLY_WAKEUP_ENABLE 0x01 /* Early wakeup time */ + + +struct cmd_join { + u32 bssid_lsb; + u16 bssid_msb; + u16 beacon_interval; /* in TBTTs */ + u32 rx_config_options; + u32 rx_filter_options; + + /* + * The target uses this field to determine the rate at + * which to transmit control frame responses (such as + * ACK or CTS frames). + */ + u16 basic_rate_set; + u8 dtim_interval; + u8 tx_ctrl_frame_rate; /* OBSOLETE */ + u8 tx_ctrl_frame_mod; /* OBSOLETE */ + /* + * bits 0-2: This bitwise field specifies the type + * of BSS to start or join (BSS_TYPE_*). + * bit 4: Band - The radio band in which to join + * or start.
+ * 0 - 2.4GHz band + * 1 - 5GHz band + * bits 3, 5-7: Reserved + */ + u8 bss_type; + u8 channel; + u8 ssid_len; + u8 ssid[IW_ESSID_MAX_SIZE]; + u8 ctrl; /* JOIN_CMD_CTRL_* */ + u8 tx_mgt_frame_rate; /* OBSOLETE */ + u8 tx_mgt_frame_mod; /* OBSOLETE */ + u8 reserved; +} __attribute__ ((packed)); + + +#endif /* __WL12XX_CMD_H__ */ diff --git a/drivers/net/wireless/wl12xx/debugfs.c b/drivers/net/wireless/wl12xx/debugfs.c new file mode 100644 index 00000000000..cdb368ce4da --- /dev/null +++ b/drivers/net/wireless/wl12xx/debugfs.c @@ -0,0 +1,508 @@ +/* + * This file is part of wl12xx + * + * Copyright (C) 2009 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include "debugfs.h" + +#include + +#include "wl12xx.h" +#include "acx.h" + +/* ms */ +#define WL12XX_DEBUGFS_STATS_LIFETIME 1000 + +/* debugfs macros idea from mac80211 */ + +#define DEBUGFS_READONLY_FILE(name, buflen, fmt, value...) \ +static ssize_t name## _read(struct file *file, char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + struct wl12xx *wl = file->private_data; \ + char buf[buflen]; \ + int res; \ + \ + res = scnprintf(buf, buflen, fmt "\n", ##value); \ + return simple_read_from_buffer(userbuf, count, ppos, buf, res); \ +} \ + \ +static const struct file_operations name## _ops = { \ + .read = name## _read, \ + .open = wl12xx_open_file_generic, \ +}; + +#define DEBUGFS_ADD(name, parent) \ + wl->debugfs.name = debugfs_create_file(#name, 0400, parent, \ + wl, &name## _ops); \ + if (IS_ERR(wl->debugfs.name)) { \ + ret = PTR_ERR(wl->debugfs.name); \ + wl->debugfs.name = NULL; \ + goto out; \ + } + +#define DEBUGFS_DEL(name) \ + do { \ + debugfs_remove(wl->debugfs.name); \ + wl->debugfs.name = NULL; \ + } while (0) + +#define DEBUGFS_FWSTATS_FILE(sub, name, buflen, fmt) \ +static ssize_t sub## _ ##name## _read(struct file *file, \ + char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + struct wl12xx *wl = file->private_data; \ + char buf[buflen]; \ + int res; \ + \ + wl12xx_debugfs_update_stats(wl); \ + \ + res = scnprintf(buf, buflen, fmt "\n", \ + wl->stats.fw_stats->sub.name); \ + return simple_read_from_buffer(userbuf, count, ppos, buf, res); \ +} \ + \ +static const struct file_operations sub## _ ##name## _ops = { \ + .read = sub## _ ##name## _read, \ + .open = wl12xx_open_file_generic, \ +}; + +#define DEBUGFS_FWSTATS_ADD(sub, name) \ + DEBUGFS_ADD(sub## _ ##name, wl->debugfs.fw_statistics) + +#define DEBUGFS_FWSTATS_DEL(sub, name) \ + DEBUGFS_DEL(sub## _ ##name) + +static void wl12xx_debugfs_update_stats(struct wl12xx *wl) +{ + mutex_lock(&wl->mutex); + + if (wl->state == WL12XX_STATE_ON && + time_after(jiffies, wl->stats.fw_stats_update + + msecs_to_jiffies(WL12XX_DEBUGFS_STATS_LIFETIME))) { + wl12xx_acx_statistics(wl, wl->stats.fw_stats); + wl->stats.fw_stats_update = jiffies; + } + + mutex_unlock(&wl->mutex); +} + +static int 
wl12xx_open_file_generic(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +DEBUGFS_FWSTATS_FILE(tx, internal_desc_overflow, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(rx, out_of_mem, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rx, hdr_overflow, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rx, hw_stuck, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rx, dropped, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rx, fcs_err, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rx, xfr_hint_trig, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rx, path_reset, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rx, reset_counter, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(dma, rx_requested, 20, "%u"); +DEBUGFS_FWSTATS_FILE(dma, rx_errors, 20, "%u"); +DEBUGFS_FWSTATS_FILE(dma, tx_requested, 20, "%u"); +DEBUGFS_FWSTATS_FILE(dma, tx_errors, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(isr, cmd_cmplt, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, fiqs, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, rx_headers, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, rx_mem_overflow, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, rx_rdys, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, irqs, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, tx_procs, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, decrypt_done, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, dma0_done, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, dma1_done, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, tx_exch_complete, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, commands, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, rx_procs, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, hw_pm_mode_changes, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, host_acknowledges, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, pci_pm, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, wakeups, 20, "%u"); +DEBUGFS_FWSTATS_FILE(isr, low_rssi, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(wep, addr_key_count, 20, "%u"); +DEBUGFS_FWSTATS_FILE(wep, default_key_count, 20, "%u"); +/* skipping wep.reserved */ +DEBUGFS_FWSTATS_FILE(wep, key_not_found, 20, "%u"); +DEBUGFS_FWSTATS_FILE(wep, decrypt_fail, 20, "%u"); +DEBUGFS_FWSTATS_FILE(wep, packets, 20, "%u"); +DEBUGFS_FWSTATS_FILE(wep, interrupt, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(pwr, ps_enter, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, elp_enter, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, missing_bcns, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, wake_on_host, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, wake_on_timer_exp, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, tx_with_ps, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, tx_without_ps, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, rcvd_beacons, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, power_save_off, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, enable_ps, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, disable_ps, 20, "%u"); +DEBUGFS_FWSTATS_FILE(pwr, fix_tsf_ps, 20, "%u"); +/* skipping cont_miss_bcns_spread for now */ +DEBUGFS_FWSTATS_FILE(pwr, rcvd_awake_beacons, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(mic, rx_pkts, 20, "%u"); +DEBUGFS_FWSTATS_FILE(mic, calc_failure, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(aes, encrypt_fail, 20, "%u"); +DEBUGFS_FWSTATS_FILE(aes, decrypt_fail, 20, "%u"); +DEBUGFS_FWSTATS_FILE(aes, encrypt_packets, 20, "%u"); +DEBUGFS_FWSTATS_FILE(aes, decrypt_packets, 20, "%u"); +DEBUGFS_FWSTATS_FILE(aes, encrypt_interrupt, 20, "%u"); +DEBUGFS_FWSTATS_FILE(aes, decrypt_interrupt, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(event, heart_beat, 20, "%u"); +DEBUGFS_FWSTATS_FILE(event, calibration, 20, "%u"); +DEBUGFS_FWSTATS_FILE(event, rx_mismatch, 20, "%u"); +DEBUGFS_FWSTATS_FILE(event, rx_mem_empty, 20, "%u"); +DEBUGFS_FWSTATS_FILE(event, rx_pool, 20, "%u"); +DEBUGFS_FWSTATS_FILE(event, oom_late, 20, "%u"); +DEBUGFS_FWSTATS_FILE(event, phy_transmit_error, 20, "%u"); 
+DEBUGFS_FWSTATS_FILE(event, tx_stuck, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(ps, pspoll_timeouts, 20, "%u"); +DEBUGFS_FWSTATS_FILE(ps, upsd_timeouts, 20, "%u"); +DEBUGFS_FWSTATS_FILE(ps, upsd_max_sptime, 20, "%u"); +DEBUGFS_FWSTATS_FILE(ps, upsd_max_apturn, 20, "%u"); +DEBUGFS_FWSTATS_FILE(ps, pspoll_max_apturn, 20, "%u"); +DEBUGFS_FWSTATS_FILE(ps, pspoll_utilization, 20, "%u"); +DEBUGFS_FWSTATS_FILE(ps, upsd_utilization, 20, "%u"); + +DEBUGFS_FWSTATS_FILE(rxpipe, rx_prep_beacon_drop, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rxpipe, descr_host_int_trig_rx_data, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rxpipe, beacon_buffer_thres_host_int_trig_rx_data, + 20, "%u"); +DEBUGFS_FWSTATS_FILE(rxpipe, missed_beacon_host_int_trig_rx_data, 20, "%u"); +DEBUGFS_FWSTATS_FILE(rxpipe, tx_xfr_host_int_trig_rx_data, 20, "%u"); + +DEBUGFS_READONLY_FILE(retry_count, 20, "%u", wl->stats.retry_count); +DEBUGFS_READONLY_FILE(excessive_retries, 20, "%u", + wl->stats.excessive_retries); + +static ssize_t tx_queue_len_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct wl12xx *wl = file->private_data; + u32 queue_len; + char buf[20]; + int res; + + queue_len = skb_queue_len(&wl->tx_queue); + + res = scnprintf(buf, sizeof(buf), "%u\n", queue_len); + return simple_read_from_buffer(userbuf, count, ppos, buf, res); +} + +static const struct file_operations tx_queue_len_ops = { + .read = tx_queue_len_read, + .open = wl12xx_open_file_generic, +}; + +static void wl12xx_debugfs_delete_files(struct wl12xx *wl) +{ + DEBUGFS_FWSTATS_DEL(tx, internal_desc_overflow); + + DEBUGFS_FWSTATS_DEL(rx, out_of_mem); + DEBUGFS_FWSTATS_DEL(rx, hdr_overflow); + DEBUGFS_FWSTATS_DEL(rx, hw_stuck); + DEBUGFS_FWSTATS_DEL(rx, dropped); + DEBUGFS_FWSTATS_DEL(rx, fcs_err); + DEBUGFS_FWSTATS_DEL(rx, xfr_hint_trig); + DEBUGFS_FWSTATS_DEL(rx, path_reset); + DEBUGFS_FWSTATS_DEL(rx, reset_counter); + + DEBUGFS_FWSTATS_DEL(dma, rx_requested); + DEBUGFS_FWSTATS_DEL(dma, rx_errors); + DEBUGFS_FWSTATS_DEL(dma, tx_requested); + DEBUGFS_FWSTATS_DEL(dma, tx_errors); + + DEBUGFS_FWSTATS_DEL(isr, cmd_cmplt); + DEBUGFS_FWSTATS_DEL(isr, fiqs); + DEBUGFS_FWSTATS_DEL(isr, rx_headers); + DEBUGFS_FWSTATS_DEL(isr, rx_mem_overflow); + DEBUGFS_FWSTATS_DEL(isr, rx_rdys); + DEBUGFS_FWSTATS_DEL(isr, irqs); + DEBUGFS_FWSTATS_DEL(isr, tx_procs); + DEBUGFS_FWSTATS_DEL(isr, decrypt_done); + DEBUGFS_FWSTATS_DEL(isr, dma0_done); + DEBUGFS_FWSTATS_DEL(isr, dma1_done); + DEBUGFS_FWSTATS_DEL(isr, tx_exch_complete); + DEBUGFS_FWSTATS_DEL(isr, commands); + DEBUGFS_FWSTATS_DEL(isr, rx_procs); + DEBUGFS_FWSTATS_DEL(isr, hw_pm_mode_changes); + DEBUGFS_FWSTATS_DEL(isr, host_acknowledges); + DEBUGFS_FWSTATS_DEL(isr, pci_pm); + DEBUGFS_FWSTATS_DEL(isr, wakeups); + DEBUGFS_FWSTATS_DEL(isr, low_rssi); + + DEBUGFS_FWSTATS_DEL(wep, addr_key_count); + DEBUGFS_FWSTATS_DEL(wep, default_key_count); + /* skipping wep.reserved */ + DEBUGFS_FWSTATS_DEL(wep, key_not_found); + DEBUGFS_FWSTATS_DEL(wep, decrypt_fail); + DEBUGFS_FWSTATS_DEL(wep, packets); + DEBUGFS_FWSTATS_DEL(wep, interrupt); + + DEBUGFS_FWSTATS_DEL(pwr, ps_enter); + DEBUGFS_FWSTATS_DEL(pwr, elp_enter); + DEBUGFS_FWSTATS_DEL(pwr, missing_bcns); + DEBUGFS_FWSTATS_DEL(pwr, wake_on_host); + DEBUGFS_FWSTATS_DEL(pwr, wake_on_timer_exp); + DEBUGFS_FWSTATS_DEL(pwr, tx_with_ps); + DEBUGFS_FWSTATS_DEL(pwr, tx_without_ps); + DEBUGFS_FWSTATS_DEL(pwr, rcvd_beacons); + DEBUGFS_FWSTATS_DEL(pwr, power_save_off); + DEBUGFS_FWSTATS_DEL(pwr, enable_ps); + DEBUGFS_FWSTATS_DEL(pwr, disable_ps); + DEBUGFS_FWSTATS_DEL(pwr, fix_tsf_ps); + /* 
skipping cont_miss_bcns_spread for now */ + DEBUGFS_FWSTATS_DEL(pwr, rcvd_awake_beacons); + + DEBUGFS_FWSTATS_DEL(mic, rx_pkts); + DEBUGFS_FWSTATS_DEL(mic, calc_failure); + + DEBUGFS_FWSTATS_DEL(aes, encrypt_fail); + DEBUGFS_FWSTATS_DEL(aes, decrypt_fail); + DEBUGFS_FWSTATS_DEL(aes, encrypt_packets); + DEBUGFS_FWSTATS_DEL(aes, decrypt_packets); + DEBUGFS_FWSTATS_DEL(aes, encrypt_interrupt); + DEBUGFS_FWSTATS_DEL(aes, decrypt_interrupt); + + DEBUGFS_FWSTATS_DEL(event, heart_beat); + DEBUGFS_FWSTATS_DEL(event, calibration); + DEBUGFS_FWSTATS_DEL(event, rx_mismatch); + DEBUGFS_FWSTATS_DEL(event, rx_mem_empty); + DEBUGFS_FWSTATS_DEL(event, rx_pool); + DEBUGFS_FWSTATS_DEL(event, oom_late); + DEBUGFS_FWSTATS_DEL(event, phy_transmit_error); + DEBUGFS_FWSTATS_DEL(event, tx_stuck); + + DEBUGFS_FWSTATS_DEL(ps, pspoll_timeouts); + DEBUGFS_FWSTATS_DEL(ps, upsd_timeouts); + DEBUGFS_FWSTATS_DEL(ps, upsd_max_sptime); + DEBUGFS_FWSTATS_DEL(ps, upsd_max_apturn); + DEBUGFS_FWSTATS_DEL(ps, pspoll_max_apturn); + DEBUGFS_FWSTATS_DEL(ps, pspoll_utilization); + DEBUGFS_FWSTATS_DEL(ps, upsd_utilization); + + DEBUGFS_FWSTATS_DEL(rxpipe, rx_prep_beacon_drop); + DEBUGFS_FWSTATS_DEL(rxpipe, descr_host_int_trig_rx_data); + DEBUGFS_FWSTATS_DEL(rxpipe, beacon_buffer_thres_host_int_trig_rx_data); + DEBUGFS_FWSTATS_DEL(rxpipe, missed_beacon_host_int_trig_rx_data); + DEBUGFS_FWSTATS_DEL(rxpipe, tx_xfr_host_int_trig_rx_data); + + DEBUGFS_DEL(tx_queue_len); + DEBUGFS_DEL(retry_count); + DEBUGFS_DEL(excessive_retries); +} + +static int wl12xx_debugfs_add_files(struct wl12xx *wl) +{ + int ret = 0; + + DEBUGFS_FWSTATS_ADD(tx, internal_desc_overflow); + + DEBUGFS_FWSTATS_ADD(rx, out_of_mem); + DEBUGFS_FWSTATS_ADD(rx, hdr_overflow); + DEBUGFS_FWSTATS_ADD(rx, hw_stuck); + DEBUGFS_FWSTATS_ADD(rx, dropped); + DEBUGFS_FWSTATS_ADD(rx, fcs_err); + DEBUGFS_FWSTATS_ADD(rx, xfr_hint_trig); + DEBUGFS_FWSTATS_ADD(rx, path_reset); + DEBUGFS_FWSTATS_ADD(rx, reset_counter); + + DEBUGFS_FWSTATS_ADD(dma, rx_requested); + DEBUGFS_FWSTATS_ADD(dma, rx_errors); + DEBUGFS_FWSTATS_ADD(dma, tx_requested); + DEBUGFS_FWSTATS_ADD(dma, tx_errors); + + DEBUGFS_FWSTATS_ADD(isr, cmd_cmplt); + DEBUGFS_FWSTATS_ADD(isr, fiqs); + DEBUGFS_FWSTATS_ADD(isr, rx_headers); + DEBUGFS_FWSTATS_ADD(isr, rx_mem_overflow); + DEBUGFS_FWSTATS_ADD(isr, rx_rdys); + DEBUGFS_FWSTATS_ADD(isr, irqs); + DEBUGFS_FWSTATS_ADD(isr, tx_procs); + DEBUGFS_FWSTATS_ADD(isr, decrypt_done); + DEBUGFS_FWSTATS_ADD(isr, dma0_done); + DEBUGFS_FWSTATS_ADD(isr, dma1_done); + DEBUGFS_FWSTATS_ADD(isr, tx_exch_complete); + DEBUGFS_FWSTATS_ADD(isr, commands); + DEBUGFS_FWSTATS_ADD(isr, rx_procs); + DEBUGFS_FWSTATS_ADD(isr, hw_pm_mode_changes); + DEBUGFS_FWSTATS_ADD(isr, host_acknowledges); + DEBUGFS_FWSTATS_ADD(isr, pci_pm); + DEBUGFS_FWSTATS_ADD(isr, wakeups); + DEBUGFS_FWSTATS_ADD(isr, low_rssi); + + DEBUGFS_FWSTATS_ADD(wep, addr_key_count); + DEBUGFS_FWSTATS_ADD(wep, default_key_count); + /* skipping wep.reserved */ + DEBUGFS_FWSTATS_ADD(wep, key_not_found); + DEBUGFS_FWSTATS_ADD(wep, decrypt_fail); + DEBUGFS_FWSTATS_ADD(wep, packets); + DEBUGFS_FWSTATS_ADD(wep, interrupt); + + DEBUGFS_FWSTATS_ADD(pwr, ps_enter); + DEBUGFS_FWSTATS_ADD(pwr, elp_enter); + DEBUGFS_FWSTATS_ADD(pwr, missing_bcns); + DEBUGFS_FWSTATS_ADD(pwr, wake_on_host); + DEBUGFS_FWSTATS_ADD(pwr, wake_on_timer_exp); + DEBUGFS_FWSTATS_ADD(pwr, tx_with_ps); + DEBUGFS_FWSTATS_ADD(pwr, tx_without_ps); + DEBUGFS_FWSTATS_ADD(pwr, rcvd_beacons); + DEBUGFS_FWSTATS_ADD(pwr, power_save_off); + DEBUGFS_FWSTATS_ADD(pwr, enable_ps); + 
DEBUGFS_FWSTATS_ADD(pwr, disable_ps); + DEBUGFS_FWSTATS_ADD(pwr, fix_tsf_ps); + /* skipping cont_miss_bcns_spread for now */ + DEBUGFS_FWSTATS_ADD(pwr, rcvd_awake_beacons); + + DEBUGFS_FWSTATS_ADD(mic, rx_pkts); + DEBUGFS_FWSTATS_ADD(mic, calc_failure); + + DEBUGFS_FWSTATS_ADD(aes, encrypt_fail); + DEBUGFS_FWSTATS_ADD(aes, decrypt_fail); + DEBUGFS_FWSTATS_ADD(aes, encrypt_packets); + DEBUGFS_FWSTATS_ADD(aes, decrypt_packets); + DEBUGFS_FWSTATS_ADD(aes, encrypt_interrupt); + DEBUGFS_FWSTATS_ADD(aes, decrypt_interrupt); + + DEBUGFS_FWSTATS_ADD(event, heart_beat); + DEBUGFS_FWSTATS_ADD(event, calibration); + DEBUGFS_FWSTATS_ADD(event, rx_mismatch); + DEBUGFS_FWSTATS_ADD(event, rx_mem_empty); + DEBUGFS_FWSTATS_ADD(event, rx_pool); + DEBUGFS_FWSTATS_ADD(event, oom_late); + DEBUGFS_FWSTATS_ADD(event, phy_transmit_error); + DEBUGFS_FWSTATS_ADD(event, tx_stuck); + + DEBUGFS_FWSTATS_ADD(ps, pspoll_timeouts); + DEBUGFS_FWSTATS_ADD(ps, upsd_timeouts); + DEBUGFS_FWSTATS_ADD(ps, upsd_max_sptime); + DEBUGFS_FWSTATS_ADD(ps, upsd_max_apturn); + DEBUGFS_FWSTATS_ADD(ps, pspoll_max_apturn); + DEBUGFS_FWSTATS_ADD(ps, pspoll_utilization); + DEBUGFS_FWSTATS_ADD(ps, upsd_utilization); + + DEBUGFS_FWSTATS_ADD(rxpipe, rx_prep_beacon_drop); + DEBUGFS_FWSTATS_ADD(rxpipe, descr_host_int_trig_rx_data); + DEBUGFS_FWSTATS_ADD(rxpipe, beacon_buffer_thres_host_int_trig_rx_data); + DEBUGFS_FWSTATS_ADD(rxpipe, missed_beacon_host_int_trig_rx_data); + DEBUGFS_FWSTATS_ADD(rxpipe, tx_xfr_host_int_trig_rx_data); + + DEBUGFS_ADD(tx_queue_len, wl->debugfs.rootdir); + DEBUGFS_ADD(retry_count, wl->debugfs.rootdir); + DEBUGFS_ADD(excessive_retries, wl->debugfs.rootdir); + +out: + if (ret < 0) + wl12xx_debugfs_delete_files(wl); + + return ret; +} + +void wl12xx_debugfs_reset(struct wl12xx *wl) +{ + memset(wl->stats.fw_stats, 0, sizeof(*wl->stats.fw_stats)); + wl->stats.retry_count = 0; + wl->stats.excessive_retries = 0; +} + +int wl12xx_debugfs_init(struct wl12xx *wl) +{ + int ret; + + wl->debugfs.rootdir = debugfs_create_dir(KBUILD_MODNAME, NULL); + + if (IS_ERR(wl->debugfs.rootdir)) { + ret = PTR_ERR(wl->debugfs.rootdir); + wl->debugfs.rootdir = NULL; + goto err; + } + + wl->debugfs.fw_statistics = debugfs_create_dir("fw-statistics", + wl->debugfs.rootdir); + + if (IS_ERR(wl->debugfs.fw_statistics)) { + ret = PTR_ERR(wl->debugfs.fw_statistics); + wl->debugfs.fw_statistics = NULL; + goto err_root; + } + + wl->stats.fw_stats = kzalloc(sizeof(*wl->stats.fw_stats), + GFP_KERNEL); + + if (!wl->stats.fw_stats) { + ret = -ENOMEM; + goto err_fw; + } + + wl->stats.fw_stats_update = jiffies; + + ret = wl12xx_debugfs_add_files(wl); + + if (ret < 0) + goto err_file; + + return 0; + +err_file: + kfree(wl->stats.fw_stats); + wl->stats.fw_stats = NULL; + +err_fw: + debugfs_remove(wl->debugfs.fw_statistics); + wl->debugfs.fw_statistics = NULL; + +err_root: + debugfs_remove(wl->debugfs.rootdir); + wl->debugfs.rootdir = NULL; + +err: + return ret; +} + +void wl12xx_debugfs_exit(struct wl12xx *wl) +{ + wl12xx_debugfs_delete_files(wl); + + kfree(wl->stats.fw_stats); + wl->stats.fw_stats = NULL; + + debugfs_remove(wl->debugfs.fw_statistics); + wl->debugfs.fw_statistics = NULL; + + debugfs_remove(wl->debugfs.rootdir); + wl->debugfs.rootdir = NULL; + +} diff --git a/drivers/net/wireless/wl12xx/debugfs.h b/drivers/net/wireless/wl12xx/debugfs.h new file mode 100644 index 00000000000..562cdcbcc87 --- /dev/null +++ b/drivers/net/wireless/wl12xx/debugfs.h @@ -0,0 +1,33 @@ +/* + * This file is part of wl12xx + * + * Copyright (C) 2009 Nokia Corporation + * + * 
Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef WL12XX_DEBUGFS_H +#define WL12XX_DEBUGFS_H + +#include "wl12xx.h" + +int wl12xx_debugfs_init(struct wl12xx *wl); +void wl12xx_debugfs_exit(struct wl12xx *wl); +void wl12xx_debugfs_reset(struct wl12xx *wl); + +#endif /* WL12XX_DEBUGFS_H */ diff --git a/drivers/net/wireless/wl12xx/event.c b/drivers/net/wireless/wl12xx/event.c new file mode 100644 index 00000000000..99529ca89a7 --- /dev/null +++ b/drivers/net/wireless/wl12xx/event.c @@ -0,0 +1,127 @@ +/* + * This file is part of wl12xx + * + * Copyright (c) 1998-2007 Texas Instruments Incorporated + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include "wl12xx.h" +#include "reg.h" +#include "spi.h" +#include "event.h" +#include "ps.h" + +static int wl12xx_event_scan_complete(struct wl12xx *wl, + struct event_mailbox *mbox) +{ + wl12xx_debug(DEBUG_EVENT, "status: 0x%x, channels: %d", + mbox->scheduled_scan_status, + mbox->scheduled_scan_channels); + + if (wl->scanning) { + mutex_unlock(&wl->mutex); + ieee80211_scan_completed(wl->hw, false); + mutex_lock(&wl->mutex); + wl->scanning = false; + } + + return 0; +} + +static void wl12xx_event_mbox_dump(struct event_mailbox *mbox) +{ + wl12xx_debug(DEBUG_EVENT, "MBOX DUMP:"); + wl12xx_debug(DEBUG_EVENT, "\tvector: 0x%x", mbox->events_vector); + wl12xx_debug(DEBUG_EVENT, "\tmask: 0x%x", mbox->events_mask); +} + +static int wl12xx_event_process(struct wl12xx *wl, struct event_mailbox *mbox) +{ + int ret; + u32 vector; + + wl12xx_event_mbox_dump(mbox); + + vector = mbox->events_vector & ~(mbox->events_mask); + wl12xx_debug(DEBUG_EVENT, "vector: 0x%x", vector); + + if (vector & SCAN_COMPLETE_EVENT_ID) { + ret = wl12xx_event_scan_complete(wl, mbox); + if (ret < 0) + return ret; + } + + if (vector & BSS_LOSE_EVENT_ID) { + wl12xx_debug(DEBUG_EVENT, "BSS_LOSE_EVENT"); + + if (wl->psm_requested && wl->psm) { + ret = wl12xx_ps_set_mode(wl, STATION_ACTIVE_MODE); + if (ret < 0) + return ret; + } + } + + return 0; +} + +int wl12xx_event_unmask(struct wl12xx *wl) +{ + int ret; + + ret = wl12xx_acx_event_mbox_mask(wl, ~(wl->event_mask)); + if (ret < 0) + return ret; + + return 0; +} + +void wl12xx_event_mbox_config(struct wl12xx *wl) 
+{ + wl->mbox_ptr[0] = wl12xx_reg_read32(wl, REG_EVENT_MAILBOX_PTR); + wl->mbox_ptr[1] = wl->mbox_ptr[0] + sizeof(struct event_mailbox); + + wl12xx_debug(DEBUG_EVENT, "MBOX ptrs: 0x%x 0x%x", + wl->mbox_ptr[0], wl->mbox_ptr[1]); +} + +int wl12xx_event_handle(struct wl12xx *wl, u8 mbox_num) +{ + struct event_mailbox mbox; + int ret; + + wl12xx_debug(DEBUG_EVENT, "EVENT on mbox %d", mbox_num); + + if (mbox_num > 1) + return -EINVAL; + + /* first we read the mbox descriptor */ + wl12xx_spi_mem_read(wl, wl->mbox_ptr[mbox_num], &mbox, + sizeof(struct event_mailbox)); + + /* process the descriptor */ + ret = wl12xx_event_process(wl, &mbox); + if (ret < 0) + return ret; + + /* then we let the firmware know it can go on...*/ + wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_TRIG, INTR_TRIG_EVENT_ACK); + + return 0; +} diff --git a/drivers/net/wireless/wl12xx/event.h b/drivers/net/wireless/wl12xx/event.h new file mode 100644 index 00000000000..1f4c2f7438a --- /dev/null +++ b/drivers/net/wireless/wl12xx/event.h @@ -0,0 +1,121 @@ +/* + * This file is part of wl12xx + * + * Copyright (c) 1998-2007 Texas Instruments Incorporated + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __WL12XX_EVENT_H__ +#define __WL12XX_EVENT_H__ + +/* + * Mbox events + * + * The event mechanism is based on a pair of event buffers (buffers A and + * B) at fixed locations in the target's memory. The host processes one + * buffer while the other buffer continues to collect events. If the host + * is not processing events, an interrupt is issued to signal that a buffer + * is ready. Once the host is done with processing events from one buffer, + * it signals the target (with an ACK interrupt) that the event buffer is + * free. 
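+ * + * Concretely, wl12xx_event_handle() (declared below and implemented in + * event.c above) reads struct event_mailbox from wl->mbox_ptr[mbox_num], + * processes events_vector & ~events_mask, and then writes + * INTR_TRIG_EVENT_ACK to ACX_REG_INTERRUPT_TRIG to hand the buffer back + * to the firmware.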
+ */ + +enum { + RESERVED1_EVENT_ID = BIT(0), + RESERVED2_EVENT_ID = BIT(1), + MEASUREMENT_START_EVENT_ID = BIT(2), + SCAN_COMPLETE_EVENT_ID = BIT(3), + CALIBRATION_COMPLETE_EVENT_ID = BIT(4), + ROAMING_TRIGGER_LOW_RSSI_EVENT_ID = BIT(5), + PS_REPORT_EVENT_ID = BIT(6), + SYNCHRONIZATION_TIMEOUT_EVENT_ID = BIT(7), + HEALTH_REPORT_EVENT_ID = BIT(8), + ACI_DETECTION_EVENT_ID = BIT(9), + DEBUG_REPORT_EVENT_ID = BIT(10), + MAC_STATUS_EVENT_ID = BIT(11), + DISCONNECT_EVENT_COMPLETE_ID = BIT(12), + JOIN_EVENT_COMPLETE_ID = BIT(13), + CHANNEL_SWITCH_COMPLETE_EVENT_ID = BIT(14), + BSS_LOSE_EVENT_ID = BIT(15), + ROAMING_TRIGGER_MAX_TX_RETRY_EVENT_ID = BIT(16), + MEASUREMENT_COMPLETE_EVENT_ID = BIT(17), + AP_DISCOVERY_COMPLETE_EVENT_ID = BIT(18), + SCHEDULED_SCAN_COMPLETE_EVENT_ID = BIT(19), + PSPOLL_DELIVERY_FAILURE_EVENT_ID = BIT(20), + RESET_BSS_EVENT_ID = BIT(21), + REGAINED_BSS_EVENT_ID = BIT(22), + ROAMING_TRIGGER_REGAINED_RSSI_EVENT_ID = BIT(23), + ROAMING_TRIGGER_LOW_SNR_EVENT_ID = BIT(24), + ROAMING_TRIGGER_REGAINED_SNR_EVENT_ID = BIT(25), + + DBG_EVENT_ID = BIT(26), + BT_PTA_SENSE_EVENT_ID = BIT(27), + BT_PTA_PREDICTION_EVENT_ID = BIT(28), + BT_PTA_AVALANCHE_EVENT_ID = BIT(29), + + PLT_RX_CALIBRATION_COMPLETE_EVENT_ID = BIT(30), + + EVENT_MBOX_ALL_EVENT_ID = 0x7fffffff, +}; + +struct event_debug_report { + u8 debug_event_id; + u8 num_params; + u16 pad; + u32 report_1; + u32 report_2; + u32 report_3; +} __attribute__ ((packed)); + +struct event_mailbox { + u32 events_vector; + u32 events_mask; + u32 reserved_1; + u32 reserved_2; + + char average_rssi_level; + u8 ps_status; + u8 channel_switch_status; + u8 scheduled_scan_status; + + /* Channels scanned by the scheduled scan */ + u16 scheduled_scan_channels; + + /* If bit 0 is set -> target's fatal error */ + u16 health_report; + u16 bad_fft_counter; + u8 bt_pta_sense_info; + u8 bt_pta_protective_info; + u32 reserved; + u32 debug_report[2]; + + /* Number of FCS errors since last event */ + u32 fcs_err_counter; + + struct event_debug_report report; + u8 average_snr_level; + u8 padding[19]; +} __attribute__ ((packed)); + +int wl12xx_event_unmask(struct wl12xx *wl); +void wl12xx_event_mbox_config(struct wl12xx *wl); +int wl12xx_event_handle(struct wl12xx *wl, u8 mbox); + +#endif diff --git a/drivers/net/wireless/wl12xx/init.c b/drivers/net/wireless/wl12xx/init.c new file mode 100644 index 00000000000..2a573a6010b --- /dev/null +++ b/drivers/net/wireless/wl12xx/init.c @@ -0,0 +1,200 @@ +/* + * This file is part of wl12xx + * + * Copyright (C) 2009 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include +#include + +#include "init.h" +#include "wl12xx_80211.h" +#include "acx.h" +#include "cmd.h" + +int wl12xx_hw_init_hwenc_config(struct wl12xx *wl) +{ + int ret; + + ret = wl12xx_acx_feature_cfg(wl); + if (ret < 0) { + wl12xx_warning("couldn't set feature config"); + return ret; + } + + ret = wl12xx_acx_default_key(wl, wl->default_key); + if (ret < 0) { + wl12xx_warning("couldn't set default key"); + return ret; + } + + return 0; +} + +int wl12xx_hw_init_templates_config(struct wl12xx *wl) +{ + int ret; + u8 partial_vbm[PARTIAL_VBM_MAX]; + + /* send empty templates for fw memory reservation */ + ret = wl12xx_cmd_template_set(wl, CMD_PROBE_REQ, NULL, + sizeof(struct wl12xx_probe_req_template)); + if (ret < 0) + return ret; + + ret = wl12xx_cmd_template_set(wl, CMD_NULL_DATA, NULL, + sizeof(struct wl12xx_null_data_template)); + if (ret < 0) + return ret; + + ret = wl12xx_cmd_template_set(wl, CMD_PS_POLL, NULL, + sizeof(struct wl12xx_ps_poll_template)); + if (ret < 0) + return ret; + + ret = wl12xx_cmd_template_set(wl, CMD_QOS_NULL_DATA, NULL, + sizeof + (struct wl12xx_qos_null_data_template)); + if (ret < 0) + return ret; + + ret = wl12xx_cmd_template_set(wl, CMD_PROBE_RESP, NULL, + sizeof + (struct wl12xx_probe_resp_template)); + if (ret < 0) + return ret; + + ret = wl12xx_cmd_template_set(wl, CMD_BEACON, NULL, + sizeof + (struct wl12xx_beacon_template)); + if (ret < 0) + return ret; + + /* tim templates, first reserve space then allocate an empty one */ + memset(partial_vbm, 0, PARTIAL_VBM_MAX); + ret = wl12xx_cmd_vbm(wl, TIM_ELE_ID, partial_vbm, PARTIAL_VBM_MAX, 0); + if (ret < 0) + return ret; + + ret = wl12xx_cmd_vbm(wl, TIM_ELE_ID, partial_vbm, 1, 0); + if (ret < 0) + return ret; + + return 0; +} + +int wl12xx_hw_init_rx_config(struct wl12xx *wl, u32 config, u32 filter) +{ + int ret; + + ret = wl12xx_acx_rx_msdu_life_time(wl, RX_MSDU_LIFETIME_DEF); + if (ret < 0) + return ret; + + ret = wl12xx_acx_rx_config(wl, config, filter); + if (ret < 0) + return ret; + + return 0; +} + +int wl12xx_hw_init_phy_config(struct wl12xx *wl) +{ + int ret; + + ret = wl12xx_acx_pd_threshold(wl); + if (ret < 0) + return ret; + + ret = wl12xx_acx_slot(wl, DEFAULT_SLOT_TIME); + if (ret < 0) + return ret; + + ret = wl12xx_acx_group_address_tbl(wl); + if (ret < 0) + return ret; + + ret = wl12xx_acx_service_period_timeout(wl); + if (ret < 0) + return ret; + + ret = wl12xx_acx_rts_threshold(wl, RTS_THRESHOLD_DEF); + if (ret < 0) + return ret; + + return 0; +} + +int wl12xx_hw_init_beacon_filter(struct wl12xx *wl) +{ + int ret; + + ret = wl12xx_acx_beacon_filter_opt(wl); + if (ret < 0) + return ret; + + ret = wl12xx_acx_beacon_filter_table(wl); + if (ret < 0) + return ret; + + return 0; +} + +int wl12xx_hw_init_pta(struct wl12xx *wl) +{ + int ret; + + ret = wl12xx_acx_sg_enable(wl); + if (ret < 0) + return ret; + + ret = wl12xx_acx_sg_cfg(wl); + if (ret < 0) + return ret; + + return 0; +} + +int wl12xx_hw_init_energy_detection(struct wl12xx *wl) +{ + int ret; + + ret = wl12xx_acx_cca_threshold(wl); + if (ret < 0) + return ret; + + return 0; +} + +int wl12xx_hw_init_beacon_broadcast(struct wl12xx *wl) +{ + int ret; + + ret = wl12xx_acx_bcn_dtim_options(wl); + if (ret < 0) + return ret; + + return 0; +} + +int wl12xx_hw_init_power_auth(struct wl12xx *wl) +{ + return 
wl12xx_acx_sleep_auth(wl, WL12XX_PSM_CAM); +} diff --git a/drivers/net/wireless/wl12xx/init.h b/drivers/net/wireless/wl12xx/init.h new file mode 100644 index 00000000000..c8b6cd0b7c3 --- /dev/null +++ b/drivers/net/wireless/wl12xx/init.h @@ -0,0 +1,40 @@ +/* + * This file is part of wl12xx + * + * Copyright (C) 2009 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __WL12XX_INIT_H__ +#define __WL12XX_INIT_H__ + +#include "wl12xx.h" + +int wl12xx_hw_init_hwenc_config(struct wl12xx *wl); +int wl12xx_hw_init_templates_config(struct wl12xx *wl); +int wl12xx_hw_init_mem_config(struct wl12xx *wl); +int wl12xx_hw_init_rx_config(struct wl12xx *wl, u32 config, u32 filter); +int wl12xx_hw_init_phy_config(struct wl12xx *wl); +int wl12xx_hw_init_beacon_filter(struct wl12xx *wl); +int wl12xx_hw_init_pta(struct wl12xx *wl); +int wl12xx_hw_init_energy_detection(struct wl12xx *wl); +int wl12xx_hw_init_beacon_broadcast(struct wl12xx *wl); +int wl12xx_hw_init_power_auth(struct wl12xx *wl); + +#endif diff --git a/drivers/net/wireless/wl12xx/main.c b/drivers/net/wireless/wl12xx/main.c new file mode 100644 index 00000000000..744f4ce5993 --- /dev/null +++ b/drivers/net/wireless/wl12xx/main.c @@ -0,0 +1,1358 @@ +/* + * This file is part of wl12xx + * + * Copyright (C) 2008-2009 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "wl12xx.h"
+#include "wl12xx_80211.h"
+#include "reg.h"
+#include "wl1251.h"
+#include "spi.h"
+#include "event.h"
+#include "tx.h"
+#include "rx.h"
+#include "ps.h"
+#include "init.h"
+#include "debugfs.h"
+
+static void wl12xx_disable_interrupts(struct wl12xx *wl)
+{
+	disable_irq(wl->irq);
+}
+
+static void wl12xx_power_off(struct wl12xx *wl)
+{
+	wl->set_power(false);
+}
+
+static void wl12xx_power_on(struct wl12xx *wl)
+{
+	wl->set_power(true);
+}
+
+static irqreturn_t wl12xx_irq(int irq, void *cookie)
+{
+	struct wl12xx *wl;
+
+	wl12xx_debug(DEBUG_IRQ, "IRQ");
+
+	wl = cookie;
+
+	schedule_work(&wl->irq_work);
+
+	return IRQ_HANDLED;
+}
+
+static int wl12xx_fetch_firmware(struct wl12xx *wl)
+{
+	const struct firmware *fw;
+	int ret;
+
+	ret = request_firmware(&fw, wl->chip.fw_filename, &wl->spi->dev);
+
+	if (ret < 0) {
+		wl12xx_error("could not get firmware: %d", ret);
+		return ret;
+	}
+
+	if (fw->size % 4) {
+		wl12xx_error("firmware size is not multiple of 32 bits: %zu",
+			     fw->size);
+		ret = -EILSEQ;
+		goto out;
+	}
+
+	wl->fw_len = fw->size;
+	wl->fw = kmalloc(wl->fw_len, GFP_KERNEL);
+
+	if (!wl->fw) {
+		wl12xx_error("could not allocate memory for the firmware");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(wl->fw, fw->data, wl->fw_len);
+
+	ret = 0;
+
+out:
+	release_firmware(fw);
+
+	return ret;
+}
+
+static int wl12xx_fetch_nvs(struct wl12xx *wl)
+{
+	const struct firmware *fw;
+	int ret;
+
+	ret = request_firmware(&fw, wl->chip.nvs_filename, &wl->spi->dev);
+
+	if (ret < 0) {
+		wl12xx_error("could not get nvs file: %d", ret);
+		return ret;
+	}
+
+	if (fw->size % 4) {
+		wl12xx_error("nvs size is not multiple of 32 bits: %zu",
+			     fw->size);
+		ret = -EILSEQ;
+		goto out;
+	}
+
+	wl->nvs_len = fw->size;
+	wl->nvs = kmalloc(wl->nvs_len, GFP_KERNEL);
+
+	if (!wl->nvs) {
+		wl12xx_error("could not allocate memory for the nvs file");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(wl->nvs, fw->data, wl->nvs_len);
+
+	ret = 0;
+
+out:
+	release_firmware(fw);
+
+	return ret;
+}
+
+static void wl12xx_fw_wakeup(struct wl12xx *wl)
+{
+	u32 elp_reg;
+
+	elp_reg = ELPCTRL_WAKE_UP;
+	wl12xx_write32(wl, HW_ACCESS_ELP_CTRL_REG_ADDR, elp_reg);
+	elp_reg = wl12xx_read32(wl, HW_ACCESS_ELP_CTRL_REG_ADDR);
+
+	if (!(elp_reg & ELPCTRL_WLAN_READY)) {
+		wl12xx_warning("WLAN not ready");
+		elp_reg = ELPCTRL_WAKE_UP_WLAN_READY;
+		wl12xx_write32(wl, HW_ACCESS_ELP_CTRL_REG_ADDR, elp_reg);
+	}
+}
+
+static int wl12xx_chip_wakeup(struct wl12xx *wl)
+{
+	int ret = 0;
+
+	wl12xx_power_on(wl);
+	msleep(wl->chip.power_on_sleep);
+	wl12xx_spi_reset(wl);
+	wl12xx_spi_init(wl);
+
+	/* We don't need a real memory partition here, because we only want
+	 * to use the registers at this point. */
+	wl12xx_set_partition(wl,
+			     0x00000000,
+			     0x00000000,
+			     REGISTERS_BASE,
+			     REGISTERS_DOWN_SIZE);
+
+	/* ELP module wake up */
+	wl12xx_fw_wakeup(wl);
+
+	/* whal_FwCtrl_BootSm() */
+
+	/* 0. read chip id from CHIP_ID */
+	wl->chip.id = wl12xx_reg_read32(wl, CHIP_ID_B);
+
+	/* 1. check if chip id is valid */
+
+	switch (wl->chip.id) {
+	case CHIP_ID_1251_PG12:
+		wl12xx_debug(DEBUG_BOOT, "chip id 0x%x (1251 PG12)",
+			     wl->chip.id);
+
+		wl1251_setup(wl);
+
+		break;
+	case CHIP_ID_1271_PG10:
+	case CHIP_ID_1251_PG10:
+	case CHIP_ID_1251_PG11:
+	default:
+		wl12xx_error("unsupported chip id: 0x%x", wl->chip.id);
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (wl->fw == NULL) {
+		ret = wl12xx_fetch_firmware(wl);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* No NVS from netlink, try to get it from the filesystem */
+	if (wl->nvs == NULL) {
+		ret = wl12xx_fetch_nvs(wl);
+		if (ret < 0)
+			goto out;
+	}
+
+out:
+	return ret;
+}
+
+static void wl12xx_filter_work(struct work_struct *work)
+{
+	struct wl12xx *wl =
+		container_of(work, struct wl12xx, filter_work);
+	int ret;
+
+	mutex_lock(&wl->mutex);
+
+	if (wl->state == WL12XX_STATE_OFF)
+		goto out;
+
+	ret = wl12xx_cmd_join(wl, wl->bss_type, 1, 100, 0);
+	if (ret < 0)
+		goto out;
+
+out:
+	mutex_unlock(&wl->mutex);
+}
+
+int wl12xx_plt_start(struct wl12xx *wl)
+{
+	int ret;
+
+	wl12xx_notice("power up");
+
+	if (wl->state != WL12XX_STATE_OFF) {
+		wl12xx_error("cannot go into PLT state because not "
+			     "in off state: %d", wl->state);
+		return -EBUSY;
+	}
+
+	wl->state = WL12XX_STATE_PLT;
+
+	ret = wl12xx_chip_wakeup(wl);
+	if (ret < 0)
+		return ret;
+
+	ret = wl->chip.op_boot(wl);
+	if (ret < 0)
+		return ret;
+
+	wl12xx_notice("firmware booted in PLT mode (%s)", wl->chip.fw_ver);
+
+	ret = wl->chip.op_plt_init(wl);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+int wl12xx_plt_stop(struct wl12xx *wl)
+{
+	wl12xx_notice("power down");
+
+	if (wl->state != WL12XX_STATE_PLT) {
+		wl12xx_error("cannot power down because not in PLT "
+			     "state: %d", wl->state);
+		return -EBUSY;
+	}
+
+	wl12xx_disable_interrupts(wl);
+	wl12xx_power_off(wl);
+
+	wl->state = WL12XX_STATE_OFF;
+
+	return 0;
+}
+
+
+static int wl12xx_op_tx(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+	struct wl12xx *wl = hw->priv;
+
+	skb_queue_tail(&wl->tx_queue, skb);
+
+	schedule_work(&wl->tx_work);
+
+	/*
+	 * The workqueue is slow to process the tx_queue and we need to stop
+	 * the queue here, otherwise the queue will get too long.
+	 */
+	if (skb_queue_len(&wl->tx_queue) >= WL12XX_TX_QUEUE_MAX_LENGTH) {
+		ieee80211_stop_queues(wl->hw);
+
+		/*
+		 * FIXME: this is racy, the variable is not properly
+		 * protected. Maybe fix this by removing the stupid
+		 * variable altogether and checking the real queue state?
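+		 *
+		 * A sketch of that alternative (illustrative only, not
+		 * part of this patch): the TX completion path would wake
+		 * the queues based on the live queue length instead of a
+		 * cached flag, e.g.:
+		 *
+		 *	if (skb_queue_len(&wl->tx_queue) <=
+		 *	    WL12XX_TX_QUEUE_MAX_LENGTH / 2)
+		 *		ieee80211_wake_queues(wl->hw);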
+		 */
+		wl->tx_queue_stopped = true;
+	}
+
+	return NETDEV_TX_OK;
+}
+
+static int wl12xx_op_start(struct ieee80211_hw *hw)
+{
+	struct wl12xx *wl = hw->priv;
+	int ret = 0;
+
+	wl12xx_debug(DEBUG_MAC80211, "mac80211 start");
+
+	mutex_lock(&wl->mutex);
+
+	if (wl->state != WL12XX_STATE_OFF) {
+		wl12xx_error("cannot start because not in off state: %d",
+			     wl->state);
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = wl12xx_chip_wakeup(wl);
+	if (ret < 0)
+		goto out;
+
+	ret = wl->chip.op_boot(wl);
+	if (ret < 0)
+		goto out;
+
+	ret = wl->chip.op_hw_init(wl);
+	if (ret < 0)
+		goto out;
+
+	ret = wl12xx_acx_station_id(wl);
+	if (ret < 0)
+		goto out;
+
+	wl->state = WL12XX_STATE_ON;
+
+	wl12xx_info("firmware booted (%s)", wl->chip.fw_ver);
+
+out:
+	if (ret < 0)
+		wl12xx_power_off(wl);
+
+	mutex_unlock(&wl->mutex);
+
+	return ret;
+}
+
+static void wl12xx_op_stop(struct ieee80211_hw *hw)
+{
+	struct wl12xx *wl = hw->priv;
+
+	wl12xx_info("down");
+
+	wl12xx_debug(DEBUG_MAC80211, "mac80211 stop");
+
+	mutex_lock(&wl->mutex);
+
+	WARN_ON(wl->state != WL12XX_STATE_ON);
+
+	if (wl->scanning) {
+		mutex_unlock(&wl->mutex);
+		ieee80211_scan_completed(wl->hw, true);
+		mutex_lock(&wl->mutex);
+		wl->scanning = false;
+	}
+
+	wl->state = WL12XX_STATE_OFF;
+
+	wl12xx_disable_interrupts(wl);
+
+	mutex_unlock(&wl->mutex);
+
+	cancel_work_sync(&wl->irq_work);
+	cancel_work_sync(&wl->tx_work);
+	cancel_work_sync(&wl->filter_work);
+
+	mutex_lock(&wl->mutex);
+
+	/* let's notify MAC80211 about the remaining pending TX frames */
+	wl12xx_tx_flush(wl);
+
+	wl12xx_power_off(wl);
+
+	memset(wl->bssid, 0, ETH_ALEN);
+	wl->listen_int = 1;
+	wl->bss_type = MAX_BSS_TYPE;
+
+	wl->data_in_count = 0;
+	wl->rx_counter = 0;
+	wl->rx_handled = 0;
+	wl->rx_current_buffer = 0;
+	wl->rx_last_id = 0;
+	wl->next_tx_complete = 0;
+	wl->elp = false;
+	wl->psm = 0;
+	wl->tx_queue_stopped = false;
+	wl->power_level = WL12XX_DEFAULT_POWER_LEVEL;
+
+	wl12xx_debugfs_reset(wl);
+
+	mutex_unlock(&wl->mutex);
+}
+
+static int wl12xx_op_add_interface(struct ieee80211_hw *hw,
+				   struct ieee80211_if_init_conf *conf)
+{
+	struct wl12xx *wl = hw->priv;
+	DECLARE_MAC_BUF(mac);
+	int ret = 0;
+
+	wl12xx_debug(DEBUG_MAC80211, "mac80211 add interface type %d mac %s",
+		     conf->type, print_mac(mac, conf->mac_addr));
+
+	mutex_lock(&wl->mutex);
+
+	switch (conf->type) {
+	case NL80211_IFTYPE_STATION:
+		wl->bss_type = BSS_TYPE_STA_BSS;
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		wl->bss_type = BSS_TYPE_IBSS;
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (memcmp(wl->mac_addr, conf->mac_addr, ETH_ALEN)) {
+		memcpy(wl->mac_addr, conf->mac_addr, ETH_ALEN);
+		SET_IEEE80211_PERM_ADDR(wl->hw, wl->mac_addr);
+		ret = wl12xx_acx_station_id(wl);
+		if (ret < 0)
+			goto out;
+	}
+
+out:
+	mutex_unlock(&wl->mutex);
+	return ret;
+}
+
+static void wl12xx_op_remove_interface(struct ieee80211_hw *hw,
+				       struct ieee80211_if_init_conf *conf)
+{
+	wl12xx_debug(DEBUG_MAC80211, "mac80211 remove interface");
+}
+
+static int wl12xx_build_null_data(struct wl12xx *wl)
+{
+	struct wl12xx_null_data_template template;
+
+	if (!is_zero_ether_addr(wl->bssid)) {
+		memcpy(template.header.da, wl->bssid, ETH_ALEN);
+		memcpy(template.header.bssid, wl->bssid, ETH_ALEN);
+	} else {
+		memset(template.header.da, 0xff, ETH_ALEN);
+		memset(template.header.bssid, 0xff, ETH_ALEN);
+	}
+
+	memcpy(template.header.sa, wl->mac_addr, ETH_ALEN);
+	template.header.frame_ctl = cpu_to_le16(IEEE80211_FTYPE_DATA |
+						IEEE80211_STYPE_NULLFUNC);
+
+	return wl12xx_cmd_template_set(wl, CMD_NULL_DATA, &template,
+				       sizeof(template));
+
+}
+
+static int wl12xx_build_ps_poll(struct wl12xx *wl, u16 aid)
+{
+	struct wl12xx_ps_poll_template template;
+
+	memcpy(template.bssid, wl->bssid, ETH_ALEN);
+	memcpy(template.ta, wl->mac_addr, ETH_ALEN);
+	template.aid = aid;
+	template.fc = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL);
+
+	return wl12xx_cmd_template_set(wl, CMD_PS_POLL, &template,
+				       sizeof(template));
+
+}
+
+static int wl12xx_op_config(struct ieee80211_hw *hw, u32 changed)
+{
+	struct wl12xx *wl = hw->priv;
+	struct ieee80211_conf *conf = &hw->conf;
+	int channel, ret = 0;
+
+	channel = ieee80211_frequency_to_channel(conf->channel->center_freq);
+
+	wl12xx_debug(DEBUG_MAC80211, "mac80211 config ch %d psm %s power %d",
+		     channel,
+		     conf->flags & IEEE80211_CONF_PS ? "on" : "off",
+		     conf->power_level);
+
+	mutex_lock(&wl->mutex);
+
+	if (channel != wl->channel) {
+		/* FIXME: use beacon interval provided by mac80211 */
+		ret = wl12xx_cmd_join(wl, wl->bss_type, 1, 100, 0);
+		if (ret < 0)
+			goto out;
+
+		wl->channel = channel;
+	}
+
+	ret = wl12xx_build_null_data(wl);
+	if (ret < 0)
+		goto out;
+
+	if (conf->flags & IEEE80211_CONF_PS && !wl->psm_requested) {
+		wl12xx_info("psm enabled");
+
+		wl->psm_requested = true;
+
+		/*
+		 * We enter PSM only if we're already associated.
+		 * If we're not, we'll enter it when joining an SSID,
+		 * through the bss_info_changed() hook.
+		 */
+		ret = wl12xx_ps_set_mode(wl, STATION_POWER_SAVE_MODE);
+	} else if (!(conf->flags & IEEE80211_CONF_PS) &&
+		   wl->psm_requested) {
+		wl12xx_info("psm disabled");
+
+		wl->psm_requested = false;
+
+		if (wl->psm)
+			ret = wl12xx_ps_set_mode(wl, STATION_ACTIVE_MODE);
+	}
+
+	if (conf->power_level != wl->power_level) {
+		ret = wl12xx_acx_tx_power(wl, conf->power_level);
+		if (ret < 0)
+			goto out;
+
+		wl->power_level = conf->power_level;
+	}
+
+out:
+	mutex_unlock(&wl->mutex);
+	return ret;
+}
+
+#define WL12XX_SUPPORTED_FILTERS (FIF_PROMISC_IN_BSS | \
+				  FIF_ALLMULTI | \
+				  FIF_FCSFAIL | \
+				  FIF_BCN_PRBRESP_PROMISC | \
+				  FIF_CONTROL | \
+				  FIF_OTHER_BSS)
+
+static void wl12xx_op_configure_filter(struct ieee80211_hw *hw,
+				       unsigned int changed,
+				       unsigned int *total,
+				       int mc_count,
+				       struct dev_addr_list *mc_list)
+{
+	struct wl12xx *wl = hw->priv;
+
+	wl12xx_debug(DEBUG_MAC80211, "mac80211 configure filter");
+
+	*total &= WL12XX_SUPPORTED_FILTERS;
+	changed &= WL12XX_SUPPORTED_FILTERS;
+
+	if (changed == 0)
+		/* no filters which we support changed */
+		return;
+
+	/* FIXME: wl->rx_config and wl->rx_filter are not protected */
+
+	wl->rx_config = WL12XX_DEFAULT_RX_CONFIG;
+	wl->rx_filter = WL12XX_DEFAULT_RX_FILTER;
+
+	if (*total & FIF_PROMISC_IN_BSS) {
+		wl->rx_config |= CFG_BSSID_FILTER_EN;
+		wl->rx_config |= CFG_RX_ALL_GOOD;
+	}
+	if (*total & FIF_ALLMULTI)
+		/*
+		 * CFG_MC_FILTER_EN in rx_config needs to be 0 to receive
+		 * all multicast frames
+		 */
+		wl->rx_config &= ~CFG_MC_FILTER_EN;
+	if (*total & FIF_FCSFAIL)
+		wl->rx_filter |= CFG_RX_FCS_ERROR;
+	if (*total & FIF_BCN_PRBRESP_PROMISC) {
+		wl->rx_config &= ~CFG_BSSID_FILTER_EN;
+		wl->rx_config &= ~CFG_SSID_FILTER_EN;
+	}
+	if (*total & FIF_CONTROL)
+		wl->rx_filter |= CFG_RX_CTL_EN;
+	if (*total & FIF_OTHER_BSS)
+		wl->rx_filter &= ~CFG_BSSID_FILTER_EN;
+
+	/*
+	 * FIXME: workqueues need to be properly cancelled on stop(), for
+	 * now let's just disable changing the filter settings. They will
+	 * be updated anyway on config().
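+	 *
+	 * Illustration (not part of this patch): if the schedule_work()
+	 * below were re-enabled, the race against stop() is tolerable
+	 * only because wl12xx_filter_work() above re-checks the device
+	 * state under the mutex before touching the hardware:
+	 *
+	 *	mutex_lock(&wl->mutex);
+	 *	if (wl->state == WL12XX_STATE_OFF)
+	 *		goto out;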
+ */ + /* schedule_work(&wl->filter_work); */ +} + +/* HW encryption */ +static int wl12xx_set_key_type(struct wl12xx *wl, struct acx_set_key *key, + enum set_key_cmd cmd, + struct ieee80211_key_conf *mac80211_key, + const u8 *addr) +{ + switch (mac80211_key->alg) { + case ALG_WEP: + if (is_broadcast_ether_addr(addr)) + key->key_type = KEY_WEP_DEFAULT; + else + key->key_type = KEY_WEP_ADDR; + + mac80211_key->hw_key_idx = mac80211_key->keyidx; + break; + case ALG_TKIP: + if (is_broadcast_ether_addr(addr)) + key->key_type = KEY_TKIP_MIC_GROUP; + else + key->key_type = KEY_TKIP_MIC_PAIRWISE; + + mac80211_key->hw_key_idx = mac80211_key->keyidx; + break; + case ALG_CCMP: + if (is_broadcast_ether_addr(addr)) + key->key_type = KEY_AES_GROUP; + else + key->key_type = KEY_AES_PAIRWISE; + mac80211_key->flags |= IEEE80211_KEY_FLAG_GENERATE_IV; + break; + default: + wl12xx_error("Unknown key algo 0x%x", mac80211_key->alg); + return -EOPNOTSUPP; + } + + return 0; +} + +static int wl12xx_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct ieee80211_key_conf *key) +{ + struct wl12xx *wl = hw->priv; + struct acx_set_key wl_key; + const u8 *addr; + int ret; + + static const u8 bcast_addr[ETH_ALEN] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + + wl12xx_debug(DEBUG_MAC80211, "mac80211 set key"); + + memset(&wl_key, 0, sizeof(wl_key)); + + addr = sta ? sta->addr : bcast_addr; + + wl12xx_debug(DEBUG_CRYPT, "CMD: 0x%x", cmd); + wl12xx_dump(DEBUG_CRYPT, "ADDR: ", addr, ETH_ALEN); + wl12xx_debug(DEBUG_CRYPT, "Key: algo:0x%x, id:%d, len:%d flags 0x%x", + key->alg, key->keyidx, key->keylen, key->flags); + wl12xx_dump(DEBUG_CRYPT, "KEY: ", key->key, key->keylen); + + mutex_lock(&wl->mutex); + + switch (cmd) { + case SET_KEY: + wl_key.key_action = KEY_ADD_OR_REPLACE; + break; + case DISABLE_KEY: + wl_key.key_action = KEY_REMOVE; + break; + default: + wl12xx_error("Unsupported key cmd 0x%x", cmd); + break; + } + + ret = wl12xx_set_key_type(wl, &wl_key, cmd, key, addr); + if (ret < 0) { + wl12xx_error("Set KEY type failed"); + goto out; + } + + if (wl_key.key_type != KEY_WEP_DEFAULT) + memcpy(wl_key.addr, addr, ETH_ALEN); + + if ((wl_key.key_type == KEY_TKIP_MIC_GROUP) || + (wl_key.key_type == KEY_TKIP_MIC_PAIRWISE)) { + /* + * We get the key in the following form: + * TKIP (16 bytes) - TX MIC (8 bytes) - RX MIC (8 bytes) + * but the target is expecting: + * TKIP - RX MIC - TX MIC + */ + memcpy(wl_key.key, key->key, 16); + memcpy(wl_key.key + 16, key->key + 24, 8); + memcpy(wl_key.key + 24, key->key + 16, 8); + + } else { + memcpy(wl_key.key, key->key, key->keylen); + } + wl_key.key_size = key->keylen; + + wl_key.id = key->keyidx; + wl_key.ssid_profile = 0; + + wl12xx_dump(DEBUG_CRYPT, "TARGET KEY: ", &wl_key, sizeof(wl_key)); + + if (wl12xx_cmd_send(wl, CMD_SET_KEYS, &wl_key, sizeof(wl_key)) < 0) { + wl12xx_error("Set KEY failed"); + ret = -EOPNOTSUPP; + goto out; + } + +out: + mutex_unlock(&wl->mutex); + return ret; +} + +static int wl12xx_build_basic_rates(char *rates) +{ + u8 index = 0; + + rates[index++] = IEEE80211_BASIC_RATE_MASK | IEEE80211_CCK_RATE_1MB; + rates[index++] = IEEE80211_BASIC_RATE_MASK | IEEE80211_CCK_RATE_2MB; + rates[index++] = IEEE80211_BASIC_RATE_MASK | IEEE80211_CCK_RATE_5MB; + rates[index++] = IEEE80211_BASIC_RATE_MASK | IEEE80211_CCK_RATE_11MB; + + return index; +} + +static int wl12xx_build_extended_rates(char *rates) +{ + u8 index = 0; + + rates[index++] = IEEE80211_OFDM_RATE_6MB; + rates[index++] = 
IEEE80211_OFDM_RATE_9MB;
+	rates[index++] = IEEE80211_OFDM_RATE_12MB;
+	rates[index++] = IEEE80211_OFDM_RATE_18MB;
+	rates[index++] = IEEE80211_OFDM_RATE_24MB;
+	rates[index++] = IEEE80211_OFDM_RATE_36MB;
+	rates[index++] = IEEE80211_OFDM_RATE_48MB;
+	rates[index++] = IEEE80211_OFDM_RATE_54MB;
+
+	return index;
+}
+
+
+static int wl12xx_build_probe_req(struct wl12xx *wl, u8 *ssid, size_t ssid_len)
+{
+	struct wl12xx_probe_req_template template;
+	struct wl12xx_ie_rates *rates;
+	char *ptr;
+	u16 size;
+
+	ptr = (char *)&template;
+	size = sizeof(struct ieee80211_header);
+
+	memset(template.header.da, 0xff, ETH_ALEN);
+	memset(template.header.bssid, 0xff, ETH_ALEN);
+	memcpy(template.header.sa, wl->mac_addr, ETH_ALEN);
+	template.header.frame_ctl = cpu_to_le16(IEEE80211_STYPE_PROBE_REQ);
+
+	/* IEs */
+	/* SSID */
+	template.ssid.header.id = WLAN_EID_SSID;
+	template.ssid.header.len = ssid_len;
+	if (ssid_len && ssid)
+		memcpy(template.ssid.ssid, ssid, ssid_len);
+	size += sizeof(struct wl12xx_ie_header) + ssid_len;
+	ptr += size;
+
+	/* Basic Rates */
+	rates = (struct wl12xx_ie_rates *)ptr;
+	rates->header.id = WLAN_EID_SUPP_RATES;
+	rates->header.len = wl12xx_build_basic_rates(rates->rates);
+	size += sizeof(struct wl12xx_ie_header) + rates->header.len;
+	ptr += sizeof(struct wl12xx_ie_header) + rates->header.len;
+
+	/* Extended rates */
+	rates = (struct wl12xx_ie_rates *)ptr;
+	rates->header.id = WLAN_EID_EXT_SUPP_RATES;
+	rates->header.len = wl12xx_build_extended_rates(rates->rates);
+	size += sizeof(struct wl12xx_ie_header) + rates->header.len;
+
+	wl12xx_dump(DEBUG_SCAN, "PROBE REQ: ", &template, size);
+
+	return wl12xx_cmd_template_set(wl, CMD_PROBE_REQ, &template,
+				       size);
+}
+
+static int wl12xx_hw_scan(struct wl12xx *wl, u8 *ssid, size_t len,
+			  u8 active_scan, u8 high_prio, u8 num_channels,
+			  u8 probe_requests)
+{
+	int i, ret;
+	u32 split_scan = 0;
+	u16 scan_options = 0;
+	struct cmd_scan *params;
+	struct wl12xx_command *cmd_answer;
+
+	if (wl->scanning)
+		return -EINVAL;
+
+	params = kzalloc(sizeof(*params), GFP_KERNEL);
+	if (!params)
+		return -ENOMEM;
+
+	params->params.rx_config_options = cpu_to_le32(CFG_RX_ALL_GOOD);
+	params->params.rx_filter_options =
+		cpu_to_le32(CFG_RX_PRSP_EN | CFG_RX_MGMT_EN | CFG_RX_BCN_EN);
+
+	/* High priority scan */
+	if (!active_scan)
+		scan_options |= SCAN_PASSIVE;
+	if (high_prio)
+		scan_options |= SCAN_PRIORITY_HIGH;
+	params->params.scan_options = scan_options;
+
+	params->params.num_channels = num_channels;
+	params->params.num_probe_requests = probe_requests;
+	params->params.tx_rate = cpu_to_le16(1 << 1); /* 2 Mbps */
+	params->params.tid_trigger = 0;
+
+	for (i = 0; i < num_channels; i++) {
+		params->channels[i].min_duration = cpu_to_le32(30000);
+		params->channels[i].max_duration = cpu_to_le32(60000);
+		memset(&params->channels[i].bssid_lsb, 0xff, 4);
+		memset(&params->channels[i].bssid_msb, 0xff, 2);
+		params->channels[i].early_termination = 0;
+		params->channels[i].tx_power_att = 0;
+		params->channels[i].channel = i + 1;
+		memset(params->channels[i].pad, 0, 3);
+	}
+
+	for (i = num_channels; i < SCAN_MAX_NUM_OF_CHANNELS; i++)
+		memset(&params->channels[i], 0,
+		       sizeof(struct basic_scan_channel_parameters));
+
+	if (len && ssid) {
+		params->params.ssid_len = len;
+		memcpy(params->params.ssid, ssid, len);
+	} else {
+		params->params.ssid_len = 0;
+		memset(params->params.ssid, 0, 32);
+	}
+
+	ret = wl12xx_build_probe_req(wl, ssid, len);
+	if (ret < 0) {
+		wl12xx_error("PROBE request template failed");
+		goto out;
+	}
+
+	ret = wl12xx_cmd_send(wl, CMD_TRIGGER_SCAN_TO, &split_scan,
+			      sizeof(u32));
+	if (ret < 0) {
+		wl12xx_error("Split SCAN failed");
+		goto out;
+	}
+
+	wl12xx_dump(DEBUG_SCAN, "SCAN: ", params, sizeof(*params));
+
+	wl->scanning = true;
+
+	ret = wl12xx_cmd_send(wl, CMD_SCAN, params, sizeof(*params));
+	if (ret < 0)
+		wl12xx_error("SCAN failed");
+
+	wl12xx_spi_mem_read(wl, wl->cmd_box_addr, params, sizeof(*params));
+
+	cmd_answer = (struct wl12xx_command *) params;
+	if (cmd_answer->status != CMD_STATUS_SUCCESS) {
+		wl12xx_error("SCAN command answer error: %d",
+			     cmd_answer->status);
+		wl->scanning = false;
+		ret = -EIO;
+		goto out;
+	}
+
+out:
+	kfree(params);
+	return ret;
+
+}
+
+static int wl12xx_op_hw_scan(struct ieee80211_hw *hw,
+			     struct cfg80211_scan_request *req)
+{
+	struct wl12xx *wl = hw->priv;
+	int ret;
+	u8 *ssid = NULL;
+	size_t ssid_len = 0;
+
+	wl12xx_debug(DEBUG_MAC80211, "mac80211 hw scan");
+
+	if (req->n_ssids) {
+		ssid = req->ssids[0].ssid;
+		ssid_len = req->ssids[0].ssid_len;
+	}
+
+	mutex_lock(&wl->mutex);
+	ret = wl12xx_hw_scan(hw->priv, ssid, ssid_len, 1, 0, 13, 3);
+	mutex_unlock(&wl->mutex);
+
+	return ret;
+}
+
+static int wl12xx_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+{
+	struct wl12xx *wl = hw->priv;
+	int ret;
+
+	ret = wl12xx_acx_rts_threshold(wl, (u16) value);
+
+	if (ret < 0)
+		wl12xx_warning("wl12xx_op_set_rts_threshold failed: %d", ret);
+
+	return ret;
+}
+
+static void wl12xx_op_bss_info_changed(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
+				       struct ieee80211_bss_conf *bss_conf,
+				       u32 changed)
+{
+	enum acx_ps_mode mode;
+	struct wl12xx *wl = hw->priv;
+	struct sk_buff *beacon;
+	int ret;
+
+	wl12xx_debug(DEBUG_MAC80211, "mac80211 bss info changed");
+
+	mutex_lock(&wl->mutex);
+
+	if (changed & BSS_CHANGED_ASSOC) {
+		if (bss_conf->assoc) {
+			wl->aid = bss_conf->aid;
+
+			ret = wl12xx_build_ps_poll(wl, wl->aid);
+			if (ret < 0)
+				goto out;
+
+			ret = wl12xx_acx_aid(wl, wl->aid);
+			if (ret < 0)
+				goto out;
+
+			/* If we want to go in PSM but we're not there yet */
+			if (wl->psm_requested && !wl->psm) {
+				mode = STATION_POWER_SAVE_MODE;
+				ret = wl12xx_ps_set_mode(wl, mode);
+				if (ret < 0)
+					goto out;
+			}
+		}
+	}
+	if (changed & BSS_CHANGED_ERP_SLOT) {
+		if (bss_conf->use_short_slot)
+			ret = wl12xx_acx_slot(wl, SLOT_TIME_SHORT);
+		else
+			ret = wl12xx_acx_slot(wl, SLOT_TIME_LONG);
+		if (ret < 0) {
+			wl12xx_warning("Set slot time failed %d", ret);
+			goto out;
+		}
+	}
+
+	if (changed & BSS_CHANGED_ERP_PREAMBLE) {
+		if (bss_conf->use_short_preamble)
+			wl12xx_acx_set_preamble(wl, ACX_PREAMBLE_SHORT);
+		else
+			wl12xx_acx_set_preamble(wl, ACX_PREAMBLE_LONG);
+	}
+
+	if (changed & BSS_CHANGED_ERP_CTS_PROT) {
+		if (bss_conf->use_cts_prot)
+			ret = wl12xx_acx_cts_protect(wl, CTSPROTECT_ENABLE);
+		else
+			ret = wl12xx_acx_cts_protect(wl, CTSPROTECT_DISABLE);
+		if (ret < 0) {
+			wl12xx_warning("Set ctsprotect failed %d", ret);
+			goto out;
+		}
+	}
+
+	if (changed & BSS_CHANGED_BSSID) {
+		memcpy(wl->bssid, bss_conf->bssid, ETH_ALEN);
+
+		ret = wl12xx_build_null_data(wl);
+		if (ret < 0)
+			goto out;
+
+		if (wl->bss_type != BSS_TYPE_IBSS) {
+			ret = wl12xx_cmd_join(wl, wl->bss_type, 5, 100, 1);
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+	if (changed & BSS_CHANGED_BEACON) {
+		beacon = ieee80211_beacon_get(hw, vif);
+		ret = wl12xx_cmd_template_set(wl, CMD_BEACON, beacon->data,
+					      beacon->len);
+
+		if (ret < 0) {
+			dev_kfree_skb(beacon);
+			goto out;
+		}
+
+		ret = wl12xx_cmd_template_set(wl, CMD_PROBE_RESP, beacon->data,
+					      beacon->len);
+
+		dev_kfree_skb(beacon);
+
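+		/*
+		 * Note (illustration, not part of this patch): the same
+		 * beacon frame is uploaded as the probe response template
+		 * above on the assumption that the firmware rewrites the
+		 * frame control subtype when answering probe requests.
+		 */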
if (ret < 0) + goto out; + + ret = wl12xx_cmd_join(wl, wl->bss_type, 1, 100, 0); + + if (ret < 0) + goto out; + } + +out: + mutex_unlock(&wl->mutex); +} + + +/* can't be const, mac80211 writes to this */ +static struct ieee80211_rate wl12xx_rates[] = { + { .bitrate = 10, + .hw_value = 0x1, + .hw_value_short = 0x1, }, + { .bitrate = 20, + .hw_value = 0x2, + .hw_value_short = 0x2, + .flags = IEEE80211_RATE_SHORT_PREAMBLE }, + { .bitrate = 55, + .hw_value = 0x4, + .hw_value_short = 0x4, + .flags = IEEE80211_RATE_SHORT_PREAMBLE }, + { .bitrate = 110, + .hw_value = 0x20, + .hw_value_short = 0x20, + .flags = IEEE80211_RATE_SHORT_PREAMBLE }, + { .bitrate = 60, + .hw_value = 0x8, + .hw_value_short = 0x8, }, + { .bitrate = 90, + .hw_value = 0x10, + .hw_value_short = 0x10, }, + { .bitrate = 120, + .hw_value = 0x40, + .hw_value_short = 0x40, }, + { .bitrate = 180, + .hw_value = 0x80, + .hw_value_short = 0x80, }, + { .bitrate = 240, + .hw_value = 0x200, + .hw_value_short = 0x200, }, + { .bitrate = 360, + .hw_value = 0x400, + .hw_value_short = 0x400, }, + { .bitrate = 480, + .hw_value = 0x800, + .hw_value_short = 0x800, }, + { .bitrate = 540, + .hw_value = 0x1000, + .hw_value_short = 0x1000, }, +}; + +/* can't be const, mac80211 writes to this */ +static struct ieee80211_channel wl12xx_channels[] = { + { .hw_value = 1, .center_freq = 2412}, + { .hw_value = 2, .center_freq = 2417}, + { .hw_value = 3, .center_freq = 2422}, + { .hw_value = 4, .center_freq = 2427}, + { .hw_value = 5, .center_freq = 2432}, + { .hw_value = 6, .center_freq = 2437}, + { .hw_value = 7, .center_freq = 2442}, + { .hw_value = 8, .center_freq = 2447}, + { .hw_value = 9, .center_freq = 2452}, + { .hw_value = 10, .center_freq = 2457}, + { .hw_value = 11, .center_freq = 2462}, + { .hw_value = 12, .center_freq = 2467}, + { .hw_value = 13, .center_freq = 2472}, +}; + +/* can't be const, mac80211 writes to this */ +static struct ieee80211_supported_band wl12xx_band_2ghz = { + .channels = wl12xx_channels, + .n_channels = ARRAY_SIZE(wl12xx_channels), + .bitrates = wl12xx_rates, + .n_bitrates = ARRAY_SIZE(wl12xx_rates), +}; + +static const struct ieee80211_ops wl12xx_ops = { + .start = wl12xx_op_start, + .stop = wl12xx_op_stop, + .add_interface = wl12xx_op_add_interface, + .remove_interface = wl12xx_op_remove_interface, + .config = wl12xx_op_config, + .configure_filter = wl12xx_op_configure_filter, + .tx = wl12xx_op_tx, + .set_key = wl12xx_op_set_key, + .hw_scan = wl12xx_op_hw_scan, + .bss_info_changed = wl12xx_op_bss_info_changed, + .set_rts_threshold = wl12xx_op_set_rts_threshold, +}; + +static int wl12xx_register_hw(struct wl12xx *wl) +{ + int ret; + + if (wl->mac80211_registered) + return 0; + + SET_IEEE80211_PERM_ADDR(wl->hw, wl->mac_addr); + + ret = ieee80211_register_hw(wl->hw); + if (ret < 0) { + wl12xx_error("unable to register mac80211 hw: %d", ret); + return ret; + } + + wl->mac80211_registered = true; + + wl12xx_notice("loaded"); + + return 0; +} + +static int wl12xx_init_ieee80211(struct wl12xx *wl) +{ + /* The tx descriptor buffer and the TKIP space */ + wl->hw->extra_tx_headroom = sizeof(struct tx_double_buffer_desc) + + WL12XX_TKIP_IV_SPACE; + + /* unit us */ + /* FIXME: find a proper value */ + wl->hw->channel_change_time = 10000; + + wl->hw->flags = IEEE80211_HW_SIGNAL_DBM | + IEEE80211_HW_NOISE_DBM; + + wl->hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION); + wl->hw->wiphy->max_scan_ssids = 1; + wl->hw->wiphy->bands[IEEE80211_BAND_2GHZ] = &wl12xx_band_2ghz; + + SET_IEEE80211_DEV(wl->hw, &wl->spi->dev); + + return 
0;
+}
+
+#define WL12XX_DEFAULT_CHANNEL 1
+static int __devinit wl12xx_probe(struct spi_device *spi)
+{
+	struct wl12xx_platform_data *pdata;
+	struct ieee80211_hw *hw;
+	struct wl12xx *wl;
+	int ret, i;
+	static const u8 nokia_oui[3] = {0x00, 0x1f, 0xdf};
+
+	pdata = spi->dev.platform_data;
+	if (!pdata) {
+		wl12xx_error("no platform data");
+		return -ENODEV;
+	}
+
+	hw = ieee80211_alloc_hw(sizeof(*wl), &wl12xx_ops);
+	if (!hw) {
+		wl12xx_error("could not alloc ieee80211_hw");
+		return -ENOMEM;
+	}
+
+	wl = hw->priv;
+	memset(wl, 0, sizeof(*wl));
+
+	wl->hw = hw;
+	dev_set_drvdata(&spi->dev, wl);
+	wl->spi = spi;
+
+	wl->data_in_count = 0;
+
+	skb_queue_head_init(&wl->tx_queue);
+
+	INIT_WORK(&wl->tx_work, wl12xx_tx_work);
+	INIT_WORK(&wl->filter_work, wl12xx_filter_work);
+	wl->channel = WL12XX_DEFAULT_CHANNEL;
+	wl->scanning = false;
+	wl->default_key = 0;
+	wl->listen_int = 1;
+	wl->rx_counter = 0;
+	wl->rx_handled = 0;
+	wl->rx_current_buffer = 0;
+	wl->rx_last_id = 0;
+	wl->rx_config = WL12XX_DEFAULT_RX_CONFIG;
+	wl->rx_filter = WL12XX_DEFAULT_RX_FILTER;
+	wl->elp = false;
+	wl->psm = 0;
+	wl->psm_requested = false;
+	wl->tx_queue_stopped = false;
+	wl->power_level = WL12XX_DEFAULT_POWER_LEVEL;
+
+	/* We use the default power on sleep time until we know which chip
+	 * we're using */
+	wl->chip.power_on_sleep = WL12XX_DEFAULT_POWER_ON_SLEEP;
+
+	for (i = 0; i < FW_TX_CMPLT_BLOCK_SIZE; i++)
+		wl->tx_frames[i] = NULL;
+
+	wl->next_tx_complete = 0;
+
+	/*
+	 * In case our MAC address is not correctly set,
+	 * we use a random but Nokia MAC.
+	 */
+	memcpy(wl->mac_addr, nokia_oui, 3);
+	get_random_bytes(wl->mac_addr + 3, 3);
+
+	wl->state = WL12XX_STATE_OFF;
+	mutex_init(&wl->mutex);
+
+	wl->tx_mgmt_frm_rate = DEFAULT_HW_GEN_TX_RATE;
+	wl->tx_mgmt_frm_mod = DEFAULT_HW_GEN_MODULATION_TYPE;
+
+	/* This is the only SPI value that we need to set here, the rest
+	 * comes from the board-peripherals file */
+	spi->bits_per_word = 32;
+
+	ret = spi_setup(spi);
+	if (ret < 0) {
+		wl12xx_error("spi_setup failed");
+		goto out_free;
+	}
+
+	wl->set_power = pdata->set_power;
+	if (!wl->set_power) {
+		wl12xx_error("set power function missing in platform data");
+		ret = -ENODEV;
+		goto out_free;
+	}
+
+	wl->irq = spi->irq;
+	if (wl->irq < 0) {
+		wl12xx_error("irq missing in platform data");
+		ret = -ENODEV;
+		goto out_free;
+	}
+
+	ret = request_irq(wl->irq, wl12xx_irq, 0, DRIVER_NAME, wl);
+	if (ret < 0) {
+		wl12xx_error("request_irq() failed: %d", ret);
+		goto out_free;
+	}
+
+	set_irq_type(wl->irq, IRQ_TYPE_EDGE_RISING);
+
+	disable_irq(wl->irq);
+
+	ret = wl12xx_init_ieee80211(wl);
+	if (ret)
+		goto out_irq;
+
+	ret = wl12xx_register_hw(wl);
+	if (ret)
+		goto out_irq;
+
+	wl12xx_debugfs_init(wl);
+
+	wl12xx_notice("initialized");
+
+	return 0;
+
+ out_irq:
+	free_irq(wl->irq, wl);
+
+ out_free:
+	ieee80211_free_hw(hw);
+
+	return ret;
+}
+
+static int __devexit wl12xx_remove(struct spi_device *spi)
+{
+	struct wl12xx *wl = dev_get_drvdata(&spi->dev);
+
+	ieee80211_unregister_hw(wl->hw);
+
+	wl12xx_debugfs_exit(wl);
+
+	free_irq(wl->irq, wl);
+	kfree(wl->target_mem_map);
+	kfree(wl->data_path);
+	kfree(wl->fw);
+	wl->fw = NULL;
+	kfree(wl->nvs);
+	wl->nvs = NULL;
+	ieee80211_free_hw(wl->hw);
+
+	return 0;
+}
+
+
+static struct spi_driver wl12xx_spi_driver = {
+	.driver = {
+		.name		= "wl12xx",
+		.bus		= &spi_bus_type,
+		.owner		= THIS_MODULE,
+	},
+
+	.probe		= wl12xx_probe,
+	.remove		= __devexit_p(wl12xx_remove),
+};
+
+static int __init wl12xx_init(void)
+{
+	int ret;
+
+	ret = spi_register_driver(&wl12xx_spi_driver);
+	if (ret < 0) {
+		wl12xx_error("failed to register spi driver: %d", ret);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+static void __exit wl12xx_exit(void)
+{
+	spi_unregister_driver(&wl12xx_spi_driver);
+
+	wl12xx_notice("unloaded");
+}
+
+module_init(wl12xx_init);
+module_exit(wl12xx_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kalle Valo , "
+	      "Luciano Coelho ");
diff --git a/drivers/net/wireless/wl12xx/ps.c b/drivers/net/wireless/wl12xx/ps.c
new file mode 100644
index 00000000000..83a10117330
--- /dev/null
+++ b/drivers/net/wireless/wl12xx/ps.c
@@ -0,0 +1,151 @@
+/*
+ * This file is part of wl12xx
+ *
+ * Copyright (C) 2008 Nokia Corporation
+ *
+ * Contact: Kalle Valo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include "reg.h"
+#include "ps.h"
+#include "spi.h"
+
+#define WL12XX_WAKEUP_TIMEOUT 2000
+
+/* Routines to toggle sleep mode while in ELP */
+void wl12xx_ps_elp_sleep(struct wl12xx *wl)
+{
+	if (wl->elp || !wl->psm)
+		return;
+
+	wl12xx_debug(DEBUG_PSM, "chip to elp");
+
+	wl12xx_write32(wl, HW_ACCESS_ELP_CTRL_REG_ADDR, ELPCTRL_SLEEP);
+
+	wl->elp = true;
+}
+
+int wl12xx_ps_elp_wakeup(struct wl12xx *wl)
+{
+	unsigned long timeout;
+	u32 elp_reg;
+
+	if (!wl->elp)
+		return 0;
+
+	wl12xx_debug(DEBUG_PSM, "waking up chip from elp");
+
+	timeout = jiffies + msecs_to_jiffies(WL12XX_WAKEUP_TIMEOUT);
+
+	wl12xx_write32(wl, HW_ACCESS_ELP_CTRL_REG_ADDR, ELPCTRL_WAKE_UP);
+
+	elp_reg = wl12xx_read32(wl, HW_ACCESS_ELP_CTRL_REG_ADDR);
+
+	/*
+	 * FIXME: we should wait for irq from chip but, as a temporary
+	 * solution to simplify locking, let's poll instead
+	 */
+	while (!(elp_reg & ELPCTRL_WLAN_READY)) {
+		if (time_after(jiffies, timeout)) {
+			wl12xx_error("elp wakeup timeout");
+			return -ETIMEDOUT;
+		}
+		msleep(1);
+		elp_reg = wl12xx_read32(wl, HW_ACCESS_ELP_CTRL_REG_ADDR);
+	}
+
+	wl12xx_debug(DEBUG_PSM, "wakeup time: %u ms",
+		     jiffies_to_msecs(jiffies) -
+		     (jiffies_to_msecs(timeout) - WL12XX_WAKEUP_TIMEOUT));
+
+	wl->elp = false;
+
+	return 0;
+}
+
+static int wl12xx_ps_set_elp(struct wl12xx *wl, bool enable)
+{
+	int ret;
+
+	if (enable) {
+		wl12xx_debug(DEBUG_PSM, "sleep auth psm/elp");
+
+		/*
+		 * FIXME: we should use PSM_ELP, but because of firmware
+		 * wakeup problems let's use only PSM_PS
+		 */
+		ret = wl12xx_acx_sleep_auth(wl, WL12XX_PSM_PS);
+		if (ret < 0)
+			return ret;
+
+		wl12xx_ps_elp_sleep(wl);
+	} else {
+		wl12xx_debug(DEBUG_PSM, "sleep auth cam");
+
+		/*
+		 * When the target is in ELP, we can only
+		 * access the ELP control register. Thus,
+		 * we have to wake the target up before
+		 * changing the power authorization.
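+		 *
+		 * (That wake-up is exactly the ELPCTRL_WAKE_UP write plus
+		 * the ELPCTRL_WLAN_READY poll implemented in
+		 * wl12xx_ps_elp_wakeup() above.)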
+ */ + + wl12xx_ps_elp_wakeup(wl); + + ret = wl12xx_acx_sleep_auth(wl, WL12XX_PSM_CAM); + if (ret < 0) + return ret; + } + + return 0; +} + +int wl12xx_ps_set_mode(struct wl12xx *wl, enum acx_ps_mode mode) +{ + int ret; + + switch (mode) { + case STATION_POWER_SAVE_MODE: + wl12xx_debug(DEBUG_PSM, "entering psm"); + ret = wl12xx_cmd_ps_mode(wl, STATION_POWER_SAVE_MODE); + if (ret < 0) + return ret; + + ret = wl12xx_ps_set_elp(wl, true); + if (ret < 0) + return ret; + + wl->psm = 1; + break; + case STATION_ACTIVE_MODE: + default: + wl12xx_debug(DEBUG_PSM, "leaving psm"); + ret = wl12xx_ps_set_elp(wl, false); + if (ret < 0) + return ret; + + ret = wl12xx_cmd_ps_mode(wl, STATION_ACTIVE_MODE); + if (ret < 0) + return ret; + + wl->psm = 0; + break; + } + + return ret; +} + diff --git a/drivers/net/wireless/wl12xx/ps.h b/drivers/net/wireless/wl12xx/ps.h new file mode 100644 index 00000000000..5d7c5255383 --- /dev/null +++ b/drivers/net/wireless/wl12xx/ps.h @@ -0,0 +1,36 @@ +#ifndef __WL12XX_PS_H__ +#define __WL12XX_PS_H__ + +/* + * This file is part of wl12xx + * + * Copyright (c) 1998-2007 Texas Instruments Incorporated + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include "wl12xx.h" +#include "acx.h" + +int wl12xx_ps_set_mode(struct wl12xx *wl, enum acx_ps_mode mode); +void wl12xx_ps_elp_sleep(struct wl12xx *wl); +int wl12xx_ps_elp_wakeup(struct wl12xx *wl); + + +#endif /* __WL12XX_PS_H__ */ diff --git a/drivers/net/wireless/wl12xx/reg.h b/drivers/net/wireless/wl12xx/reg.h new file mode 100644 index 00000000000..e421643215c --- /dev/null +++ b/drivers/net/wireless/wl12xx/reg.h @@ -0,0 +1,745 @@ +/* + * This file is part of wl12xx + * + * Copyright (c) 1998-2007 Texas Instruments Incorporated + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#ifndef __REG_H__
+#define __REG_H__
+
+#include
+#include "wl12xx.h"
+
+#define REGISTERS_BASE 0x00300000
+#define DRPW_BASE      0x00310000
+
+#define REGISTERS_DOWN_SIZE 0x00008800
+#define REGISTERS_WORK_SIZE 0x0000b000
+
+#define HW_ACCESS_ELP_CTRL_REG_ADDR 0x1FFFC
+
+/* ELP register commands */
+#define ELPCTRL_WAKE_UP             0x1
+#define ELPCTRL_WAKE_UP_WLAN_READY  0x5
+#define ELPCTRL_SLEEP               0x0
+/* ELP WLAN_READY bit */
+#define ELPCTRL_WLAN_READY          0x2
+
+/*
+ * Interrupt registers.
+ * The 64 bit interrupt sources registers were changed: some
+ * interrupts were removed, new ones were added and the order
+ * was changed.
+ */
+#define FIQ_MASK         (REGISTERS_BASE + 0x0400)
+#define FIQ_MASK_L       (REGISTERS_BASE + 0x0400)
+#define FIQ_MASK_H       (REGISTERS_BASE + 0x0404)
+#define FIQ_MASK_SET     (REGISTERS_BASE + 0x0408)
+#define FIQ_MASK_SET_L   (REGISTERS_BASE + 0x0408)
+#define FIQ_MASK_SET_H   (REGISTERS_BASE + 0x040C)
+#define FIQ_MASK_CLR     (REGISTERS_BASE + 0x0410)
+#define FIQ_MASK_CLR_L   (REGISTERS_BASE + 0x0410)
+#define FIQ_MASK_CLR_H   (REGISTERS_BASE + 0x0414)
+#define IRQ_MASK         (REGISTERS_BASE + 0x0418)
+#define IRQ_MASK_L       (REGISTERS_BASE + 0x0418)
+#define IRQ_MASK_H       (REGISTERS_BASE + 0x041C)
+#define IRQ_MASK_SET     (REGISTERS_BASE + 0x0420)
+#define IRQ_MASK_SET_L   (REGISTERS_BASE + 0x0420)
+#define IRQ_MASK_SET_H   (REGISTERS_BASE + 0x0424)
+#define IRQ_MASK_CLR     (REGISTERS_BASE + 0x0428)
+#define IRQ_MASK_CLR_L   (REGISTERS_BASE + 0x0428)
+#define IRQ_MASK_CLR_H   (REGISTERS_BASE + 0x042C)
+#define ECPU_MASK        (REGISTERS_BASE + 0x0448)
+#define FIQ_STS_L        (REGISTERS_BASE + 0x044C)
+#define FIQ_STS_H        (REGISTERS_BASE + 0x0450)
+#define IRQ_STS_L        (REGISTERS_BASE + 0x0454)
+#define IRQ_STS_H        (REGISTERS_BASE + 0x0458)
+#define INT_STS_ND       (REGISTERS_BASE + 0x0464)
+#define INT_STS_RAW_L    (REGISTERS_BASE + 0x0464)
+#define INT_STS_RAW_H    (REGISTERS_BASE + 0x0468)
+#define INT_STS_CLR      (REGISTERS_BASE + 0x04B4)
+#define INT_STS_CLR_L    (REGISTERS_BASE + 0x04B4)
+#define INT_STS_CLR_H    (REGISTERS_BASE + 0x04B8)
+#define INT_ACK          (REGISTERS_BASE + 0x046C)
+#define INT_ACK_L        (REGISTERS_BASE + 0x046C)
+#define INT_ACK_H        (REGISTERS_BASE + 0x0470)
+#define INT_TRIG         (REGISTERS_BASE + 0x0474)
+#define INT_TRIG_L       (REGISTERS_BASE + 0x0474)
+#define INT_TRIG_H       (REGISTERS_BASE + 0x0478)
+#define HOST_STS_L       (REGISTERS_BASE + 0x045C)
+#define HOST_STS_H       (REGISTERS_BASE + 0x0460)
+#define HOST_MASK        (REGISTERS_BASE + 0x0430)
+#define HOST_MASK_L      (REGISTERS_BASE + 0x0430)
+#define HOST_MASK_H      (REGISTERS_BASE + 0x0434)
+#define HOST_MASK_SET    (REGISTERS_BASE + 0x0438)
+#define HOST_MASK_SET_L  (REGISTERS_BASE + 0x0438)
+#define HOST_MASK_SET_H  (REGISTERS_BASE + 0x043C)
+#define HOST_MASK_CLR    (REGISTERS_BASE + 0x0440)
+#define HOST_MASK_CLR_L  (REGISTERS_BASE + 0x0440)
+#define HOST_MASK_CLR_H  (REGISTERS_BASE + 0x0444)
+
+/* Host Interrupts */
+#define HINT_MASK          (REGISTERS_BASE + 0x0494)
+#define HINT_MASK_SET      (REGISTERS_BASE + 0x0498)
+#define HINT_MASK_CLR      (REGISTERS_BASE + 0x049C)
+#define HINT_STS_ND_MASKED (REGISTERS_BASE + 0x04A0)
+/* 1150 spec calls this HINT_STS_RAW */
+#define HINT_STS_ND        (REGISTERS_BASE + 0x04B0)
+#define HINT_STS_CLR       (REGISTERS_BASE + 0x04A4)
+#define HINT_ACK           (REGISTERS_BASE + 0x04A8)
+#define HINT_TRIG          (REGISTERS_BASE + 0x04AC)
+
+/* Device Configuration registers */
+#define SOR_CFG            (REGISTERS_BASE + 0x0800)
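+/*
+ * Example (a sketch, not part of this patch): all addresses in this
+ * block are absolute chip addresses, so reading the pending, unmasked
+ * host interrupts with the accessors this patch uses elsewhere is:
+ *
+ *	u32 mask = wl12xx_reg_read32(wl, HINT_MASK);
+ *	u32 sts  = wl12xx_reg_read32(wl, HINT_STS_ND) & ~mask;
+ */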
+#define ECPU_CTRL          (REGISTERS_BASE + 0x0804)
+#define HI_CFG             (REGISTERS_BASE + 0x0808)
+#define EE_START           (REGISTERS_BASE + 0x080C)
+
+#define CHIP_ID_B          (REGISTERS_BASE + 0x5674)
+
+#define CHIP_ID_1251_PG10  (0x7010101)
+#define CHIP_ID_1251_PG11  (0x7020101)
+#define CHIP_ID_1251_PG12  (0x7030101)
+
+#define ENABLE             (REGISTERS_BASE + 0x5450)
+
+/* Power Management registers */
+#define ELP_CFG_MODE       (REGISTERS_BASE + 0x5804)
+#define ELP_CMD            (REGISTERS_BASE + 0x5808)
+#define PLL_CAL_TIME       (REGISTERS_BASE + 0x5810)
+#define CLK_REQ_TIME       (REGISTERS_BASE + 0x5814)
+#define CLK_BUF_TIME       (REGISTERS_BASE + 0x5818)
+
+#define CFG_PLL_SYNC_CNT   (REGISTERS_BASE + 0x5820)
+
+/* Scratch Pad registers */
+#define SCR_PAD0           (REGISTERS_BASE + 0x5608)
+#define SCR_PAD1           (REGISTERS_BASE + 0x560C)
+#define SCR_PAD2           (REGISTERS_BASE + 0x5610)
+#define SCR_PAD3           (REGISTERS_BASE + 0x5614)
+#define SCR_PAD4           (REGISTERS_BASE + 0x5618)
+#define SCR_PAD4_SET       (REGISTERS_BASE + 0x561C)
+#define SCR_PAD4_CLR       (REGISTERS_BASE + 0x5620)
+#define SCR_PAD5           (REGISTERS_BASE + 0x5624)
+#define SCR_PAD5_SET       (REGISTERS_BASE + 0x5628)
+#define SCR_PAD5_CLR       (REGISTERS_BASE + 0x562C)
+#define SCR_PAD6           (REGISTERS_BASE + 0x5630)
+#define SCR_PAD7           (REGISTERS_BASE + 0x5634)
+#define SCR_PAD8           (REGISTERS_BASE + 0x5638)
+#define SCR_PAD9           (REGISTERS_BASE + 0x563C)
+
+/* Spare registers */
+#define SPARE_A1           (REGISTERS_BASE + 0x0994)
+#define SPARE_A2           (REGISTERS_BASE + 0x0998)
+#define SPARE_A3           (REGISTERS_BASE + 0x099C)
+#define SPARE_A4           (REGISTERS_BASE + 0x09A0)
+#define SPARE_A5           (REGISTERS_BASE + 0x09A4)
+#define SPARE_A6           (REGISTERS_BASE + 0x09A8)
+#define SPARE_A7           (REGISTERS_BASE + 0x09AC)
+#define SPARE_A8           (REGISTERS_BASE + 0x09B0)
+#define SPARE_B1           (REGISTERS_BASE + 0x5420)
+#define SPARE_B2           (REGISTERS_BASE + 0x5424)
+#define SPARE_B3           (REGISTERS_BASE + 0x5428)
+#define SPARE_B4           (REGISTERS_BASE + 0x542C)
+#define SPARE_B5           (REGISTERS_BASE + 0x5430)
+#define SPARE_B6           (REGISTERS_BASE + 0x5434)
+#define SPARE_B7           (REGISTERS_BASE + 0x5438)
+#define SPARE_B8           (REGISTERS_BASE + 0x543C)
+
+enum wl12xx_acx_int_reg {
+	ACX_REG_INTERRUPT_TRIG,
+	ACX_REG_INTERRUPT_TRIG_H,
+
+/*=============================================
+  Host Interrupt Mask Register - 32bit (RW)
+  ------------------------------------------
+  Setting a bit in this register masks the
+  corresponding interrupt to the host.
+  0 - RX0     - Rx first double buffer Data Interrupt
+  1 - TXD     - Tx Data Interrupt
+  2 - TXXFR   - Tx Transfer Interrupt
+  3 - RX1     - Rx second double buffer Data Interrupt
+  4 - RXXFR   - Rx Transfer Interrupt
+  5 - EVENT_A - Event Mailbox interrupt
+  6 - EVENT_B - Event Mailbox interrupt
+  7 - WNONHST - Wake On Host Interrupt
+  8 - TRACE_A - Debug Trace interrupt
+  9 - TRACE_B - Debug Trace interrupt
+ 10 - CDCMP   - Command Complete Interrupt
+ 11 -
+ 12 -
+ 13 -
+ 14 - ICOMP   - Initialization Complete Interrupt
+ 16 - SG SE   - Soft Gemini - Sense enable interrupt
+ 17 - SG SD   - Soft Gemini - Sense disable interrupt
+ 18 -         -
+ 19 -         -
+ 20 -         -
+ 21 -         -
+ Default: 0x0001
+*==============================================*/
+	ACX_REG_INTERRUPT_MASK,
+
+/*=============================================
+  Host Interrupt Mask Set 16bit, (Write only)
+  ------------------------------------------
+ Setting a bit in this register sets
+ the corresponding bit in ACX_HINT_MASK register
+ without affecting the mask
+ state of other bits (0 = no effect).
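+
+ For example (sketch, not part of this patch; the WL12XX_ACX_INTR_*
+ names are hypothetical), masking both event mailbox interrupts
+ without touching any other mask bit:
+
+	wl12xx_reg_write32(wl, ACX_REG_HINT_MASK_SET,
+			   WL12XX_ACX_INTR_EVENT_A |
+			   WL12XX_ACX_INTR_EVENT_B);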
+==============================================*/
+	ACX_REG_HINT_MASK_SET,
+
+/*=============================================
+ Host Interrupt Mask Clear 16bit,(Write only)
+ ------------------------------------------
+ Setting a bit in this register clears
+ the corresponding bit in ACX_HINT_MASK register
+ without affecting the mask
+ state of other bits (0 = no effect).
+=============================================*/
+	ACX_REG_HINT_MASK_CLR,
+
+/*=============================================
+ Host Interrupt Status Nondestructive Read
+ 16bit,(Read only)
+ ------------------------------------------
+ The host can read this register to determine
+ which interrupts are active.
+ Reading this register doesn't
+ affect its content.
+=============================================*/
+	ACX_REG_INTERRUPT_NO_CLEAR,
+
+/*=============================================
+ Host Interrupt Status Clear on Read Register
+ 16bit,(Read only)
+ ------------------------------------------
+ The host can read this register to determine
+ which interrupts are active.
+ Reading this register clears it,
+ thus making all interrupts inactive.
+==============================================*/
+	ACX_REG_INTERRUPT_CLEAR,
+
+/*=============================================
+ Host Interrupt Acknowledge Register
+ 16bit,(Write only)
+ ------------------------------------------
+ The host can set individual bits in this
+ register to clear (acknowledge) the corresp.
+ interrupt status bits in the HINT_STS_CLR and
+ HINT_STS_ND registers, thus making the
+ associated interrupt inactive. (0-no effect)
+==============================================*/
+	ACX_REG_INTERRUPT_ACK,
+
+/*===============================================
+ Host Software Reset - 32bit RW
+ ------------------------------------------
+ [31:1] Reserved
+ 0  SOFT_RESET Soft Reset - When this bit is set,
+ it holds the Wlan hardware in a soft reset state.
+ This reset disables all MAC and baseband processor
+ clocks except the CardBus/PCI interface clock.
+ It also initializes all MAC state machines except
+ the host interface. It does not reload the
+ contents of the EEPROM. When this bit is cleared
+ (not self-clearing), the Wlan hardware
+ exits the software reset state.
+===============================================*/
+	ACX_REG_SLV_SOFT_RESET,
+
+/*===============================================
+ EEPROM Burst Read Start - 32bit RW
+ ------------------------------------------
+ [31:1] Reserved
+ 0  ACX_EE_START - EEPROM Burst Read Start 0
+ Setting this bit starts a burst read from
+ the external EEPROM.
+ If this bit is set (after reset) before an EEPROM read/write,
+ the burst read starts at EEPROM address 0.
+ Otherwise, it starts at the address
+ following the address of the previous access.
+ The Wlan hardware clears this bit automatically.
+
+ Default: 0x00000000
+*================================================*/
+	ACX_REG_EE_START,
+
+/* Embedded ARM CPU Control */
+
+/*===============================================
+ Halt eCPU - 32bit RW
+ ------------------------------------------
+ 0 HALT_ECPU Halt Embedded CPU - This bit is the
+ complement of bit 1 (MDATA2) in the SOR_CFG register.
+ During a hardware reset, this bit holds
+ the inverse of MDATA2.
+ When downloading firmware from the host,
+ set this bit (pull down MDATA2).
+ The host clears this bit after downloading the firmware into
+ zero-wait-state SSRAM.
+ When loading firmware from Flash, clear this bit (pull up MDATA2) + so that the eCPU can run the bootloader code in Flash + HALT_ECPU eCPU State + -------------------- + 1 halt eCPU + 0 enable eCPU + ===============================================*/ + ACX_REG_ECPU_CONTROL, + + ACX_REG_TABLE_LEN +}; + +#define ACX_SLV_SOFT_RESET_BIT BIT(1) +#define ACX_REG_EEPROM_START_BIT BIT(1) + +/* Command/Information Mailbox Pointers */ + +/*=============================================== + Command Mailbox Pointer - 32bit RW + ------------------------------------------ + This register holds the start address of + the command mailbox located in the Wlan hardware memory. + The host must read this pointer after a reset to + find the location of the command mailbox. + The Wlan hardware initializes the command mailbox + pointer with the default address of the command mailbox. + The command mailbox pointer is not valid until after + the host receives the Init Complete interrupt from + the Wlan hardware. + ===============================================*/ +#define REG_COMMAND_MAILBOX_PTR (SCR_PAD0) + +/*=============================================== + Information Mailbox Pointer - 32bit RW + ------------------------------------------ + This register holds the start address of + the information mailbox located in the Wlan hardware memory. + The host must read this pointer after a reset to find + the location of the information mailbox. + The Wlan hardware initializes the information mailbox pointer + with the default address of the information mailbox. + The information mailbox pointer is not valid + until after the host receives the Init Complete interrupt from + the Wlan hardware. + ===============================================*/ +#define REG_EVENT_MAILBOX_PTR (SCR_PAD1) + + +/* Misc */ + +#define REG_ENABLE_TX_RX (ENABLE) +/* + * Rx configuration (filter) information element + * --------------------------------------------- + */ +#define REG_RX_CONFIG (RX_CFG) +#define REG_RX_FILTER (RX_FILTER_CFG) + + +#define RX_CFG_ENABLE_PHY_HEADER_PLCP 0x0002 + +/* promiscuous - receives all valid frames */ +#define RX_CFG_PROMISCUOUS 0x0008 + +/* receives frames from any BSSID */ +#define RX_CFG_BSSID 0x0020 + +/* receives frames destined to any MAC address */ +#define RX_CFG_MAC 0x0010 + +#define RX_CFG_ENABLE_ONLY_MY_DEST_MAC 0x0010 +#define RX_CFG_ENABLE_ANY_DEST_MAC 0x0000 +#define RX_CFG_ENABLE_ONLY_MY_BSSID 0x0020 +#define RX_CFG_ENABLE_ANY_BSSID 0x0000 + +/* discards all broadcast frames */ +#define RX_CFG_DISABLE_BCAST 0x0200 + +#define RX_CFG_ENABLE_ONLY_MY_SSID 0x0400 +#define RX_CFG_ENABLE_RX_CMPLT_FCS_ERROR 0x0800 +#define RX_CFG_COPY_RX_STATUS 0x2000 +#define RX_CFG_TSF 0x10000 + +#define RX_CONFIG_OPTION_ANY_DST_MY_BSS (RX_CFG_ENABLE_ANY_DEST_MAC | \ + RX_CFG_ENABLE_ONLY_MY_BSSID) + +#define RX_CONFIG_OPTION_MY_DST_ANY_BSS (RX_CFG_ENABLE_ONLY_MY_DEST_MAC\ + | RX_CFG_ENABLE_ANY_BSSID) + +#define RX_CONFIG_OPTION_ANY_DST_ANY_BSS (RX_CFG_ENABLE_ANY_DEST_MAC | \ + RX_CFG_ENABLE_ANY_BSSID) + +#define RX_CONFIG_OPTION_MY_DST_MY_BSS (RX_CFG_ENABLE_ONLY_MY_DEST_MAC\ + | RX_CFG_ENABLE_ONLY_MY_BSSID) + +#define RX_CONFIG_OPTION_FOR_SCAN (RX_CFG_ENABLE_PHY_HEADER_PLCP \ + | RX_CFG_ENABLE_RX_CMPLT_FCS_ERROR \ + | RX_CFG_COPY_RX_STATUS | RX_CFG_TSF) + +#define RX_CONFIG_OPTION_FOR_MEASUREMENT (RX_CFG_ENABLE_ANY_DEST_MAC) + +#define RX_CONFIG_OPTION_FOR_JOIN (RX_CFG_ENABLE_ONLY_MY_BSSID | \ + RX_CFG_ENABLE_ONLY_MY_DEST_MAC) + +#define RX_CONFIG_OPTION_FOR_IBSS_JOIN (RX_CFG_ENABLE_ONLY_MY_SSID | \ + 
RX_CFG_ENABLE_ONLY_MY_DEST_MAC)
+
+#define RX_FILTER_OPTION_DEF	      (CFG_RX_MGMT_EN | CFG_RX_DATA_EN\
+				       | CFG_RX_CTL_EN | CFG_RX_BCN_EN\
+				       | CFG_RX_AUTH_EN | CFG_RX_ASSOC_EN)
+
+#define RX_FILTER_OPTION_FILTER_ALL 0
+
+#define RX_FILTER_OPTION_DEF_PRSP_BCN  (CFG_RX_PRSP_EN | CFG_RX_MGMT_EN\
+					| CFG_RX_RCTS_ACK | CFG_RX_BCN_EN)
+
+#define RX_FILTER_OPTION_JOIN	     (CFG_RX_MGMT_EN | CFG_RX_DATA_EN\
+				      | CFG_RX_BCN_EN | CFG_RX_AUTH_EN\
+				      | CFG_RX_ASSOC_EN | CFG_RX_RCTS_ACK\
+				      | CFG_RX_PRSP_EN)
+
+
+/*===============================================
+  Phy regs
+ ===============================================*/
+#define ACX_PHY_ADDR_REG SBB_ADDR
+#define ACX_PHY_DATA_REG SBB_DATA
+#define ACX_PHY_CTRL_REG SBB_CTL
+#define ACX_PHY_REG_WR_MASK 0x00000001ul
+#define ACX_PHY_REG_RD_MASK 0x00000002ul
+
+
+/*===============================================
+ EEPROM Read/Write Request 32bit RW
+ ------------------------------------------
+ 1 EE_READ - EEPROM Read Request 1 - Setting this bit
+ loads a single byte of data into the EE_DATA
+ register from the EEPROM location specified in
+ the EE_ADDR register.
+ The Wlan hardware clears this bit automatically.
+ EE_DATA is valid when this bit is cleared.
+
+ 0 EE_WRITE - EEPROM Write Request - Setting this bit
+ writes a single byte of data from the EE_DATA register into the
+ EEPROM location specified in the EE_ADDR register.
+ The Wlan hardware clears this bit automatically.
+*===============================================*/
+#define ACX_EE_CTL_REG EE_CTL
+#define EE_WRITE 0x00000001ul
+#define EE_READ 0x00000002ul
+
+/*===============================================
+  EEPROM Address - 32bit RW
+  ------------------------------------------
+  This register specifies the address
+  within the EEPROM from/to which to read/write data.
+  ===============================================*/
+#define ACX_EE_ADDR_REG EE_ADDR
+
+/*===============================================
+  EEPROM Data - 32bit RW
+  ------------------------------------------
+  This register either holds the 8 bits of data
+  read from the EEPROM or the data to be
+  written to the EEPROM.
+  ===============================================*/
+#define ACX_EE_DATA_REG EE_DATA
+
+/*===============================================
+  EEPROM Base Address - 32bit RW
+  ------------------------------------------
+  This register holds the upper nine bits
+  [23:15] of the 24-bit Wlan hardware memory
+  address for burst reads from EEPROM accesses.
+  The EEPROM provides the lower 15 bits of this address.
+  The MSB of the address from the EEPROM is ignored.
+  ===============================================*/
+#define ACX_EE_CFG EE_CFG
+
+/*===============================================
+  GPIO Output Values -32bit, RW
+  ------------------------------------------
+  [31:16] Reserved
+  [15: 0] Specify the output values (at the output driver inputs) for
+  GPIO[15:0], respectively.
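+
+  For example (sketch, not part of this patch), driving GPIO line n
+  high while leaving the other lines untouched is a read-modify-write
+  of the low half-word:
+
+	u32 out = wl12xx_reg_read32(wl, ACX_GPIO_OUT_REG);
+	wl12xx_reg_write32(wl, ACX_GPIO_OUT_REG, out | BIT(n));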
+/*===============================================
+  GPIO Output Values  -32bit, RW
+  ------------------------------------------
+  [31:16]  Reserved
+  [15: 0]  Specify the output values (at the output driver inputs) for
+           GPIO[15:0], respectively.
+  ===============================================*/
+#define ACX_GPIO_OUT_REG            GPIO_OUT
+#define ACX_MAX_GPIO_LINES          15
+
+/*===============================================
+  Contention window  -32bit, RW
+  ------------------------------------------
+  [31:26]  Reserved
+  [25:16]  Max (0x3ff)
+  [15:07]  Reserved
+  [06:00]  Current contention window value - default is 0x1F
+  ===============================================*/
+#define ACX_CONT_WIND_CFG_REG    CONT_WIND_CFG
+#define ACX_CONT_WIND_MIN_MASK   0x0000007f
+#define ACX_CONT_WIND_MAX        0x03ff0000
+
+/*
+ * Indirect slave register/memory registers
+ * ----------------------------------------
+ */
+#define HW_SLAVE_REG_ADDR_REG		0x00000004
+#define HW_SLAVE_REG_DATA_REG		0x00000008
+#define HW_SLAVE_REG_CTRL_REG		0x0000000c
+
+#define SLAVE_AUTO_INC			0x00010000
+#define SLAVE_NO_AUTO_INC		0x00000000
+#define SLAVE_HOST_LITTLE_ENDIAN	0x00000000
+
+#define HW_SLAVE_MEM_ADDR_REG		SLV_MEM_ADDR
+#define HW_SLAVE_MEM_DATA_REG		SLV_MEM_DATA
+#define HW_SLAVE_MEM_CTRL_REG		SLV_MEM_CTL
+#define HW_SLAVE_MEM_ENDIAN_REG		SLV_END_CTL
+
+#define HW_FUNC_EVENT_INT_EN		0x8000
+#define HW_FUNC_EVENT_MASK_REG		0x00000034
+
+#define ACX_MAC_TIMESTAMP_REG	(MAC_TIMESTAMP)
+
+/*===============================================
+  HI_CFG Interface Configuration Register Values
+  ------------------------------------------
+  ===============================================*/
+#define HI_CFG_UART_ENABLE          0x00000004
+#define HI_CFG_RST232_ENABLE        0x00000008
+#define HI_CFG_CLOCK_REQ_SELECT     0x00000010
+#define HI_CFG_HOST_INT_ENABLE      0x00000020
+#define HI_CFG_VLYNQ_OUTPUT_ENABLE  0x00000040
+#define HI_CFG_HOST_INT_ACTIVE_LOW  0x00000080
+#define HI_CFG_UART_TX_OUT_GPIO_15  0x00000100
+#define HI_CFG_UART_TX_OUT_GPIO_14  0x00000200
+#define HI_CFG_UART_TX_OUT_GPIO_7   0x00000400
+
+/*
+ * NOTE: USE_ACTIVE_HIGH compilation flag should be defined in makefile
+ *       for platforms using active high interrupt level
+ */
+#ifdef USE_ACTIVE_HIGH
+#define HI_CFG_DEF_VAL              \
+	(HI_CFG_UART_ENABLE |        \
+	HI_CFG_RST232_ENABLE |      \
+	HI_CFG_CLOCK_REQ_SELECT |   \
+	HI_CFG_HOST_INT_ENABLE)
+#else
+#define HI_CFG_DEF_VAL              \
+	(HI_CFG_UART_ENABLE |        \
+	HI_CFG_RST232_ENABLE |      \
+	HI_CFG_CLOCK_REQ_SELECT |   \
+	HI_CFG_HOST_INT_ENABLE)
+
+#endif
+
+#define REF_FREQ_19_2               0
+#define REF_FREQ_26_0               1
+#define REF_FREQ_38_4               2
+#define REF_FREQ_40_0               3
+#define REF_FREQ_33_6               4
+#define REF_FREQ_NUM                5
+
+#define LUT_PARAM_INTEGER_DIVIDER   0
+#define LUT_PARAM_FRACTIONAL_DIVIDER 1
+#define LUT_PARAM_ATTN_BB           2
+#define LUT_PARAM_ALPHA_BB          3
+#define LUT_PARAM_STOP_TIME_BB      4
+#define LUT_PARAM_BB_PLL_LOOP_FILTER 5
+#define LUT_PARAM_NUM               6
+
+#define ACX_EEPROMLESS_IND_REG      (SCR_PAD4)
+#define USE_EEPROM                  0
+#define SOFT_RESET_MAX_TIME         1000000
+#define SOFT_RESET_STALL_TIME       1000
+#define NVS_DATA_BUNDARY_ALIGNMENT  4
+
+
+/* Firmware image load chunk size */
+#define CHUNK_SIZE          512
+
+/* Firmware image header size */
+#define FW_HDR_SIZE         8
+
+#define ECPU_CONTROL_HALT   0x00000101
+
+
+/******************************************************************************
+
+    CHANNELS, BAND & REG DOMAINS definitions
+
+******************************************************************************/
+
+
+enum {
+	RADIO_BAND_2_4GHZ = 0,  /* 2.4 GHz band */
+	RADIO_BAND_5GHZ = 1,    /* 5 GHz band */
+	RADIO_BAND_JAPAN_4_9_GHZ = 2,
+	DEFAULT_BAND = RADIO_BAND_2_4GHZ,
+	INVALID_BAND = 0xFE,
+	MAX_RADIO_BANDS = 0xFF
+};
+
+enum {
+	NO_RATE      = 0,
+	RATE_1MBPS   = 0x0A,
+	RATE_2MBPS   = 0x14,
+	RATE_5_5MBPS = 0x37,
+	RATE_6MBPS   = 0x0B,
+	RATE_9MBPS   = 0x0F,
+	RATE_11MBPS  = 0x6E,
+	RATE_12MBPS  = 0x0A,
+	RATE_18MBPS  = 0x0E,
+	RATE_22MBPS  = 0xDC,
+	RATE_24MBPS  = 0x09,
+	RATE_36MBPS  = 0x0D,
+	RATE_48MBPS  = 0x08,
+	RATE_54MBPS  = 0x0C
+};
+
+enum {
+	RATE_INDEX_1MBPS   = 0,
+	RATE_INDEX_2MBPS   = 1,
+	RATE_INDEX_5_5MBPS = 2,
+	RATE_INDEX_6MBPS   = 3,
+	RATE_INDEX_9MBPS   = 4,
+	RATE_INDEX_11MBPS  = 5,
+	RATE_INDEX_12MBPS  = 6,
+	RATE_INDEX_18MBPS  = 7,
+	RATE_INDEX_22MBPS  = 8,
+	RATE_INDEX_24MBPS  = 9,
+	RATE_INDEX_36MBPS  = 10,
+	RATE_INDEX_48MBPS  = 11,
+	RATE_INDEX_54MBPS  = 12,
+	RATE_INDEX_MAX     = RATE_INDEX_54MBPS,
+	MAX_RATE_INDEX,
+	INVALID_RATE_INDEX = MAX_RATE_INDEX,
+	RATE_INDEX_ENUM_MAX_SIZE = 0x7FFFFFFF
+};
+
+enum {
+	RATE_MASK_1MBPS   = 0x1,
+	RATE_MASK_2MBPS   = 0x2,
+	RATE_MASK_5_5MBPS = 0x4,
+	RATE_MASK_11MBPS  = 0x20,
+};
+
+#define SHORT_PREAMBLE_BIT   BIT(0) /* CCK or Barker depending on the rate */
+#define OFDM_RATE_BIT        BIT(6)
+#define PBCC_RATE_BIT        BIT(7)
+
+enum {
+	CCK_LONG = 0,
+	CCK_SHORT = SHORT_PREAMBLE_BIT,
+	PBCC_LONG = PBCC_RATE_BIT,
+	PBCC_SHORT = PBCC_RATE_BIT | SHORT_PREAMBLE_BIT,
+	OFDM = OFDM_RATE_BIT
+};
+
+/******************************************************************************
+
+Transmit-Descriptor RATE-SET field definitions...
+
+Define a new "Rate-Set" for TX path that incorporates the
+Rate & Modulation info into a single 16-bit field.
+
+TxdRateSet_t:
+b15   - Indicates Preamble type (1=SHORT, 0=LONG).
+	Notes:
+	Must be LONG (0) for 1Mbps rate.
+	Does not apply (set to 0) for RevG-OFDM rates.
+b14   - Indicates PBCC encoding (1=PBCC, 0=not).
+	Notes:
+	Does not apply (set to 0) for rates 1 and 2 Mbps.
+	Does not apply (set to 0) for RevG-OFDM rates.
+b13   - Unused (set to 0).
+b12-b0 - Supported Rate indicator bits as defined below.
+
+******************************************************************************/
+
+
+#define TNETW1251_CHIP_ID_PG1_0	 0x07010101
+#define TNETW1251_CHIP_ID_PG1_1	 0x07020101
+#define TNETW1251_CHIP_ID_PG1_2	 0x07030101
+
+/*************************************************************************
+
+    Interrupt Trigger Register (Host -> WiLink)
+
+**************************************************************************/
+
+/* Hardware to Embedded CPU Interrupts - first 32-bit register set */
+
+/*
+ * Host Command Interrupt. Setting this bit masks
+ * the interrupt that the host issues to inform
+ * the FW that it has sent a command
+ * to the Wlan hardware Command Mailbox.
+ */
+#define INTR_TRIG_CMD       BIT(0)
+
+/*
+ * Host Event Acknowledge Interrupt. The host
+ * sets this bit to acknowledge that it received
+ * the unsolicited information from the event
+ * mailbox.
+ */
+#define INTR_TRIG_EVENT_ACK BIT(1)
+
+/*
+ * The host sets this bit to inform the Wlan
+ * FW that a TX packet is in the XFER
+ * Buffer #0.
+ */
+#define INTR_TRIG_TX_PROC0  BIT(2)
+
+/*
+ * The host sets this bit to inform the FW
+ * that it read a packet from RX XFER
+ * Buffer #0.
+ */
+#define INTR_TRIG_RX_PROC0  BIT(3)
+
+#define INTR_TRIG_DEBUG_ACK BIT(4)
+
+#define INTR_TRIG_STATE_CHANGED BIT(5)
+
+
+/* Hardware to Embedded CPU Interrupts - second 32-bit register set */
+
+/*
+ * The host sets this bit to inform the FW
+ * that it read a packet from RX XFER
+ * Buffer #1.
+ */
+#define INTR_TRIG_RX_PROC1  BIT(17)
+
+/*
+ * The host sets this bit to inform the Wlan
+ * hardware that a TX packet is in the XFER
+ * Buffer #1.
+ */
+#define INTR_TRIG_TX_PROC1  BIT(18)
+
+#endif
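[Editorial example, not part of the diff: to make the trigger registers above concrete, this is how the host pokes the firmware using the 32-bit register accessors this patch declares in spi.h. A minimal sketch; the command/event mailbox code itself is not part of this excerpt.]

    /* Tell the firmware that a command waits in the command mailbox */
    wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_TRIG, INTR_TRIG_CMD);

    /* ...and, after draining the event mailbox, acknowledge the event */
    wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_TRIG, INTR_TRIG_EVENT_ACK);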
diff --git a/drivers/net/wireless/wl12xx/rx.c b/drivers/net/wireless/wl12xx/rx.c
new file mode 100644
index 00000000000..981ea259eb8
--- /dev/null
+++ b/drivers/net/wireless/wl12xx/rx.c
@@ -0,0 +1,208 @@
+/*
+ * This file is part of wl12xx
+ *
+ * Copyright (c) 1998-2007 Texas Instruments Incorporated
+ * Copyright (C) 2008 Nokia Corporation
+ *
+ * Contact: Kalle Valo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include <linux/skbuff.h>
+#include <net/mac80211.h>
+
+#include "wl12xx.h"
+#include "reg.h"
+#include "spi.h"
+#include "rx.h"
+
+static void wl12xx_rx_header(struct wl12xx *wl,
+			     struct wl12xx_rx_descriptor *desc)
+{
+	u32 rx_packet_ring_addr;
+
+	rx_packet_ring_addr = wl->data_path->rx_packet_ring_addr;
+	if (wl->rx_current_buffer)
+		rx_packet_ring_addr += wl->data_path->rx_packet_ring_chunk_size;
+
+	wl12xx_spi_mem_read(wl, rx_packet_ring_addr, desc,
+			    sizeof(struct wl12xx_rx_descriptor));
+}
+
+static void wl12xx_rx_status(struct wl12xx *wl,
+			     struct wl12xx_rx_descriptor *desc,
+			     struct ieee80211_rx_status *status,
+			     u8 beacon)
+{
+	memset(status, 0, sizeof(struct ieee80211_rx_status));
+
+	status->band = IEEE80211_BAND_2GHZ;
+	status->mactime = desc->timestamp;
+
+	/*
+	 * The rx status timestamp is a 32-bit value while the TSF is a
+	 * 64-bit one.
+	 * For IBSS merging, TSF is mandatory, so we have to get it
+	 * somehow, so we ask for ACX_TSF_INFO.
+	 * That could be moved to the get_tsf() hook, but unfortunately,
+	 * this one must be atomic, while our SPI routines can sleep.
+	 */
+	if ((wl->bss_type == BSS_TYPE_IBSS) && beacon) {
+		u64 mactime;
+		int ret;
+		struct wl12xx_command cmd;
+		struct acx_tsf_info *tsf_info;
+
+		memset(&cmd, 0, sizeof(cmd));
+
+		ret = wl12xx_cmd_interrogate(wl, ACX_TSF_INFO,
+					     sizeof(struct acx_tsf_info),
+					     &cmd);
+		if (ret < 0) {
+			wl12xx_warning("ACX_TSF_INFO interrogate failed");
+			return;
+		}
+
+		tsf_info = (struct acx_tsf_info *)&(cmd.parameters);
+
+		mactime = ((u64)tsf_info->current_tsf_msb << 32) |
+			tsf_info->current_tsf_lsb;
+
+		status->mactime = mactime;
+	}
+
+	status->signal = desc->rssi;
+	status->qual = (desc->rssi - WL12XX_RX_MIN_RSSI) * 100 /
+		(WL12XX_RX_MAX_RSSI - WL12XX_RX_MIN_RSSI);
+	status->qual = min(status->qual, 100);
+	status->qual = max(status->qual, 0);
+
+	/*
+	 * FIXME: guessing that snr needs to be divided by two, otherwise
+	 * the values don't make any sense
+	 */
+	status->noise = desc->rssi - desc->snr / 2;
+
+	status->freq = ieee80211_channel_to_frequency(desc->channel);
+
+	status->flag |= RX_FLAG_TSFT;
+
+	if (desc->flags & RX_DESC_ENCRYPTION_MASK) {
+		status->flag |= RX_FLAG_IV_STRIPPED | RX_FLAG_MMIC_STRIPPED;
+
+		if (likely(!(desc->flags & RX_DESC_DECRYPT_FAIL)))
+			status->flag |= RX_FLAG_DECRYPTED;
+
+		if (unlikely(desc->flags & RX_DESC_MIC_FAIL))
+			status->flag |= RX_FLAG_MMIC_ERROR;
+	}
+
+	if (unlikely(!(desc->flags & RX_DESC_VALID_FCS)))
+		status->flag |= RX_FLAG_FAILED_FCS_CRC;
+
+
+	/* FIXME: set status->rate_idx */
+}
+
+static void wl12xx_rx_body(struct wl12xx *wl,
+			   struct wl12xx_rx_descriptor *desc)
+{
+	struct sk_buff *skb;
+	struct ieee80211_rx_status status;
+	u8 *rx_buffer, beacon = 0;
+	u16 length, *fc;
+	u32 curr_id, last_id_inc, rx_packet_ring_addr;
+
+	length = WL12XX_RX_ALIGN(desc->length - PLCP_HEADER_LENGTH);
+	curr_id = (desc->flags & RX_DESC_SEQNUM_MASK) >> RX_DESC_PACKETID_SHIFT;
+	last_id_inc = (wl->rx_last_id + 1) % (RX_MAX_PACKET_ID + 1);
+
+	if (last_id_inc != curr_id) {
+		wl12xx_warning("curr ID:%d, last ID inc:%d",
+			       curr_id, last_id_inc);
+		wl->rx_last_id = curr_id;
+	} else {
+		wl->rx_last_id = last_id_inc;
+	}
+
+	rx_packet_ring_addr = wl->data_path->rx_packet_ring_addr +
+		sizeof(struct wl12xx_rx_descriptor) + 20;
+	if (wl->rx_current_buffer)
+		rx_packet_ring_addr += wl->data_path->rx_packet_ring_chunk_size;
+
+	skb = dev_alloc_skb(length);
+	if (!skb) {
+		wl12xx_error("Couldn't allocate RX frame");
+		return;
+	}
+
+	rx_buffer = skb_put(skb, length);
+	wl12xx_spi_mem_read(wl, rx_packet_ring_addr, rx_buffer, length);
+
+	/* The actual length doesn't include the target's alignment */
+	skb->len = desc->length - PLCP_HEADER_LENGTH;
+
+	fc = (u16 *)skb->data;
+
+	if ((*fc & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_BEACON)
+		beacon = 1;
+
+	wl12xx_rx_status(wl, desc, &status, beacon);
+
+	wl12xx_debug(DEBUG_RX, "rx skb 0x%p: %d B %s", skb, skb->len,
+		     beacon ? "beacon" : "");
"beacon" : ""); + + ieee80211_rx(wl->hw, skb, &status); +} + +static void wl12xx_rx_ack(struct wl12xx *wl) +{ + u32 data, addr; + + if (wl->rx_current_buffer) { + addr = ACX_REG_INTERRUPT_TRIG_H; + data = INTR_TRIG_RX_PROC1; + } else { + addr = ACX_REG_INTERRUPT_TRIG; + data = INTR_TRIG_RX_PROC0; + } + + wl12xx_reg_write32(wl, addr, data); + + /* Toggle buffer ring */ + wl->rx_current_buffer = !wl->rx_current_buffer; +} + + +void wl12xx_rx(struct wl12xx *wl) +{ + struct wl12xx_rx_descriptor rx_desc; + + if (wl->state != WL12XX_STATE_ON) + return; + + /* We first read the frame's header */ + wl12xx_rx_header(wl, &rx_desc); + + /* Now we can read the body */ + wl12xx_rx_body(wl, &rx_desc); + + /* Finally, we need to ACK the RX */ + wl12xx_rx_ack(wl); + + return; +} diff --git a/drivers/net/wireless/wl12xx/rx.h b/drivers/net/wireless/wl12xx/rx.h new file mode 100644 index 00000000000..8a23fdea501 --- /dev/null +++ b/drivers/net/wireless/wl12xx/rx.h @@ -0,0 +1,122 @@ +/* + * This file is part of wl12xx + * + * Copyright (c) 1998-2007 Texas Instruments Incorporated + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __WL12XX_RX_H__ +#define __WL12XX_RX_H__ + +#include + +/* + * RX PATH + * + * The Rx path uses a double buffer and an rx_contro structure, each located + * at a fixed address in the device memory. The host keeps track of which + * buffer is available and alternates between them on a per packet basis. + * The size of each of the two buffers is large enough to hold the longest + * 802.3 packet. + * The RX path goes like that: + * 1) The target generates an interrupt each time a new packet is received. + * There are 2 RX interrupts, one for each buffer. + * 2) The host reads the received packet from one of the double buffers. + * 3) The host triggers a target interrupt. + * 4) The target prepares the next RX packet. 
+ */
+
+#define WL12XX_RX_MAX_RSSI -30
+#define WL12XX_RX_MIN_RSSI -95
+
+#define WL12XX_RX_ALIGN_TO 4
+#define WL12XX_RX_ALIGN(len) (((len) + WL12XX_RX_ALIGN_TO - 1) & \
+			      ~(WL12XX_RX_ALIGN_TO - 1))
+
+#define SHORT_PREAMBLE_BIT   BIT(0)
+#define OFDM_RATE_BIT        BIT(6)
+#define PBCC_RATE_BIT        BIT(7)
+
+#define PLCP_HEADER_LENGTH 8
+#define RX_DESC_PACKETID_SHIFT 11
+#define RX_MAX_PACKET_ID 3
+
+#define RX_DESC_VALID_FCS         0x0001
+#define RX_DESC_MATCH_RXADDR1     0x0002
+#define RX_DESC_MCAST             0x0004
+#define RX_DESC_STAINTIM          0x0008
+#define RX_DESC_VIRTUAL_BM        0x0010
+#define RX_DESC_BCAST             0x0020
+#define RX_DESC_MATCH_SSID        0x0040
+#define RX_DESC_MATCH_BSSID       0x0080
+#define RX_DESC_ENCRYPTION_MASK   0x0300
+#define RX_DESC_MEASURMENT        0x0400
+#define RX_DESC_SEQNUM_MASK       0x1800
+#define RX_DESC_MIC_FAIL	  0x2000
+#define RX_DESC_DECRYPT_FAIL	  0x4000
+
+struct wl12xx_rx_descriptor {
+	u32 timestamp; /* In microseconds */
+	u16 length; /* Payload length, including headers */
+	u16 flags;
+
+	/*
+	 * 0 - 802.11
+	 * 1 - 802.3
+	 * 2 - IP
+	 * 3 - Raw Codec
+	 */
+	u8 type;
+
+	/*
+	 * Received Rate:
+	 * 0x0A - 1MBPS
+	 * 0x14 - 2MBPS
+	 * 0x37 - 5_5MBPS
+	 * 0x0B - 6MBPS
+	 * 0x0F - 9MBPS
+	 * 0x6E - 11MBPS
+	 * 0x0A - 12MBPS
+	 * 0x0E - 18MBPS
+	 * 0xDC - 22MBPS
+	 * 0x09 - 24MBPS
+	 * 0x0D - 36MBPS
+	 * 0x08 - 48MBPS
+	 * 0x0C - 54MBPS
+	 */
+	u8 rate;
+
+	u8 mod_pre; /* Modulation and preamble */
+	u8 channel;
+
+	/*
+	 * 0 - 2.4 GHz
+	 * 1 - 5 GHz
+	 */
+	u8 band;
+
+	s8 rssi; /* in dB */
+	u8 rcpi; /* in dB */
+	u8 snr; /* in dB */
+} __attribute__ ((packed));
+
+void wl12xx_rx(struct wl12xx *wl);
+
+#endif
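[Editorial example, not part of the diff: a minimal sketch of decoding the descriptor above, matching what rx.c does with it; the helper name is hypothetical.]

    static void example_decode_rx_desc(struct wl12xx_rx_descriptor *desc)
    {
            u16 read_len;
            u32 packet_id;

            /* The packet id lives in bits 12:11 of the flags word */
            packet_id = (desc->flags & RX_DESC_SEQNUM_MASK) >>
                    RX_DESC_PACKETID_SHIFT;

            /* Reads from the data path are padded to a 4-byte boundary */
            read_len = WL12XX_RX_ALIGN(desc->length - PLCP_HEADER_LENGTH);

            /* Frames with a bad FCS are flagged, not dropped, by the target */
            if (!(desc->flags & RX_DESC_VALID_FCS))
                    pr_debug("frame %u (%u bytes) failed FCS\n",
                             packet_id, read_len);
    }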
diff --git a/drivers/net/wireless/wl12xx/spi.c b/drivers/net/wireless/wl12xx/spi.c
new file mode 100644
index 00000000000..abdf171a47e
--- /dev/null
+++ b/drivers/net/wireless/wl12xx/spi.c
@@ -0,0 +1,358 @@
+/*
+ * This file is part of wl12xx
+ *
+ * Copyright (C) 2008 Nokia Corporation
+ *
+ * Contact: Kalle Valo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/crc7.h>
+#include <linux/spi/spi.h>
+
+#include "wl12xx.h"
+#include "wl12xx_80211.h"
+#include "reg.h"
+#include "spi.h"
+#include "ps.h"
+
+static int wl12xx_translate_reg_addr(struct wl12xx *wl, int addr)
+{
+	/* If the address is lower than REGISTERS_BASE, it means that this is
+	 * a chip-specific register address, so look it up in the registers
+	 * table */
+	if (addr < REGISTERS_BASE) {
+		/* Make sure we don't go over the table */
+		if (addr >= ACX_REG_TABLE_LEN) {
+			wl12xx_error("address out of range (%d)", addr);
+			return -EINVAL;
+		}
+		addr = wl->chip.acx_reg_table[addr];
+	}
+
+	return addr - wl->physical_reg_addr + wl->virtual_reg_addr;
+}
+
+static int wl12xx_translate_mem_addr(struct wl12xx *wl, int addr)
+{
+	return addr - wl->physical_mem_addr + wl->virtual_mem_addr;
+}
+
+
+void wl12xx_spi_reset(struct wl12xx *wl)
+{
+	u8 *cmd;
+	struct spi_transfer t;
+	struct spi_message m;
+
+	cmd = kzalloc(WSPI_INIT_CMD_LEN, GFP_KERNEL);
+	if (!cmd) {
+		wl12xx_error("could not allocate cmd for spi reset");
+		return;
+	}
+
+	memset(&t, 0, sizeof(t));
+	spi_message_init(&m);
+
+	memset(cmd, 0xff, WSPI_INIT_CMD_LEN);
+
+	t.tx_buf = cmd;
+	t.len = WSPI_INIT_CMD_LEN;
+	spi_message_add_tail(&t, &m);
+
+	spi_sync(wl->spi, &m);
+
+	wl12xx_dump(DEBUG_SPI, "spi reset -> ", cmd, WSPI_INIT_CMD_LEN);
+}
+
+void wl12xx_spi_init(struct wl12xx *wl)
+{
+	u8 crc[WSPI_INIT_CMD_CRC_LEN], *cmd;
+	struct spi_transfer t;
+	struct spi_message m;
+
+	cmd = kzalloc(WSPI_INIT_CMD_LEN, GFP_KERNEL);
+	if (!cmd) {
+		wl12xx_error("could not allocate cmd for spi init");
+		return;
+	}
+
+	memset(crc, 0, sizeof(crc));
+	memset(&t, 0, sizeof(t));
+	spi_message_init(&m);
+
+	/*
+	 * Set WSPI_INIT_COMMAND
+	 * the data is sent from the MSB to the LSB
+	 */
+	cmd[2] = 0xff;
+	cmd[3] = 0xff;
+	cmd[1] = WSPI_INIT_CMD_START | WSPI_INIT_CMD_TX;
+	cmd[0] = 0;
+	cmd[7] = 0;
+	cmd[6] |= HW_ACCESS_WSPI_INIT_CMD_MASK << 3;
+	cmd[6] |= HW_ACCESS_WSPI_FIXED_BUSY_LEN & WSPI_INIT_CMD_FIXEDBUSY_LEN;
+
+	if (HW_ACCESS_WSPI_FIXED_BUSY_LEN == 0)
+		cmd[5] |= WSPI_INIT_CMD_DIS_FIXEDBUSY;
+	else
+		cmd[5] |= WSPI_INIT_CMD_EN_FIXEDBUSY;
+
+	cmd[5] |= WSPI_INIT_CMD_IOD | WSPI_INIT_CMD_IP | WSPI_INIT_CMD_CS
+		| WSPI_INIT_CMD_WSPI | WSPI_INIT_CMD_WS;
+
+	crc[0] = cmd[1];
+	crc[1] = cmd[0];
+	crc[2] = cmd[7];
+	crc[3] = cmd[6];
+	crc[4] = cmd[5];
+
+	cmd[4] |= crc7(0, crc, WSPI_INIT_CMD_CRC_LEN) << 1;
+	cmd[4] |= WSPI_INIT_CMD_END;
+
+	t.tx_buf = cmd;
+	t.len = WSPI_INIT_CMD_LEN;
+	spi_message_add_tail(&t, &m);
+
+	spi_sync(wl->spi, &m);
+
+	wl12xx_dump(DEBUG_SPI, "spi init -> ", cmd, WSPI_INIT_CMD_LEN);
+}
+
+/* Set the SPI partitions to access the chip addresses
+ *
+ * There are two VIRTUAL (SPI) partitions (the memory partition and the
+ * registers partition), which are mapped to two different areas of the
+ * PHYSICAL (hardware) memory. This function also makes other checks to
+ * ensure that the partitions are not overlapping. In the diagram below, the
+ * memory partition comes before the register partition, but the opposite is
+ * also supported.
+ *
+ *                                        PHYSICAL address
+ *                                        space
+ *
+ *                                        |    |
+ *                                     ...+----+--> mem_start
+ *       VIRTUAL address            ...   |    |
+ *       space                      ...   |    |  [PART_0]
+ *                                  ...   |    |
+ * 0x00000000 <--+----+...       ...+----+--> mem_start + mem_size
+ *               |    |  ...             |    |
+ *               |MEM |  ...             |    |
+ *               |    |  ...             |    |
+ * part_size  <--+----+...               |    |  (unused area)
+ *               |    |  ...             |    |
+ *               |REG |  ...             |    |
+ * part_size     |    |  ...             |    |
+ *     +      <--+----+...            ...+----+--> reg_start
+ * reg_size                         ...   |    |
+ *                                  ...   |    |  [PART_1]
+ *                                  ...   |    |
+ *                                     ...+----+--> reg_start + reg_size
+ *                                        |    |
+ *
+ */
+void wl12xx_set_partition(struct wl12xx *wl,
+			  u32 mem_start, u32 mem_size,
+			  u32 reg_start, u32 reg_size)
+{
+	u8 tx_buf[sizeof(u32) + 2 * sizeof(struct wl12xx_partition)];
+	struct wl12xx_partition *partition;
+	struct spi_transfer t;
+	struct spi_message m;
+	u32 *cmd;
+	size_t len;
+	int addr;
+
+	spi_message_init(&m);
+	memset(&t, 0, sizeof(t));
+	memset(tx_buf, 0, sizeof(tx_buf));
+
+	cmd = (u32 *) tx_buf;
+	partition = (struct wl12xx_partition *) (tx_buf + sizeof(u32));
+	addr = HW_ACCESS_PART0_SIZE_ADDR;
+	len = 2 * sizeof(struct wl12xx_partition);
+
+	*cmd |= WSPI_CMD_WRITE;
+	*cmd |= (len << WSPI_CMD_BYTE_LENGTH_OFFSET) & WSPI_CMD_BYTE_LENGTH;
+	*cmd |= addr & WSPI_CMD_BYTE_ADDR;
+
+	wl12xx_debug(DEBUG_SPI, "mem_start %08X mem_size %08X",
+		     mem_start, mem_size);
+	wl12xx_debug(DEBUG_SPI, "reg_start %08X reg_size %08X",
+		     reg_start, reg_size);
+
+	/* Make sure that the two partitions together don't exceed the
+	 * address range */
+	if ((mem_size + reg_size) > HW_ACCESS_MEMORY_MAX_RANGE) {
+		wl12xx_debug(DEBUG_SPI, "Total size exceeds maximum virtual"
+			     " address range.  Truncating partition[0].");
+		mem_size = HW_ACCESS_MEMORY_MAX_RANGE - reg_size;
+		wl12xx_debug(DEBUG_SPI, "mem_start %08X mem_size %08X",
+			     mem_start, mem_size);
+		wl12xx_debug(DEBUG_SPI, "reg_start %08X reg_size %08X",
+			     reg_start, reg_size);
+	}
+
+	if ((mem_start < reg_start) &&
+	    ((mem_start + mem_size) > reg_start)) {
+		/* Guarantee that the memory partition doesn't overlap the
+		 * registers partition */
+		wl12xx_debug(DEBUG_SPI, "End of partition[0] is "
+			     "overlapping partition[1].  Adjusted.");
+		mem_size = reg_start - mem_start;
+		wl12xx_debug(DEBUG_SPI, "mem_start %08X mem_size %08X",
+			     mem_start, mem_size);
+		wl12xx_debug(DEBUG_SPI, "reg_start %08X reg_size %08X",
+			     reg_start, reg_size);
+	} else if ((reg_start < mem_start) &&
+		   ((reg_start + reg_size) > mem_start)) {
+		/* Guarantee that the register partition doesn't overlap the
+		 * memory partition */
+		wl12xx_debug(DEBUG_SPI, "End of partition[1] is"
+			     " overlapping partition[0].  Adjusted.");
+		reg_size = mem_start - reg_start;
+		wl12xx_debug(DEBUG_SPI, "mem_start %08X mem_size %08X",
+			     mem_start, mem_size);
+		wl12xx_debug(DEBUG_SPI, "reg_start %08X reg_size %08X",
+			     reg_start, reg_size);
+	}
+
+	partition[0].start = mem_start;
+	partition[0].size  = mem_size;
+	partition[1].start = reg_start;
+	partition[1].size  = reg_size;
+
+	wl->physical_mem_addr = mem_start;
+	wl->physical_reg_addr = reg_start;
+
+	wl->virtual_mem_addr = 0;
+	wl->virtual_reg_addr = mem_size;
+
+	t.tx_buf = tx_buf;
+	t.len = sizeof(tx_buf);
+	spi_message_add_tail(&t, &m);
+
+	spi_sync(wl->spi, &m);
+}
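[Usage note, editorial: the boot path added later in this patch (wl1251.c) selects the firmware-download partition exactly this way, with the values coming from wl1251_part_table:]

    wl12xx_set_partition(wl,
                         p_table[PART_DOWN].mem.start,
                         p_table[PART_DOWN].mem.size,
                         p_table[PART_DOWN].reg.start,
                         p_table[PART_DOWN].reg.size);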
Adjusted."); + reg_size = mem_start - reg_start; + wl12xx_debug(DEBUG_SPI, "mem_start %08X mem_size %08X", + mem_start, mem_size); + wl12xx_debug(DEBUG_SPI, "reg_start %08X reg_size %08X", + reg_start, reg_size); + } + + partition[0].start = mem_start; + partition[0].size = mem_size; + partition[1].start = reg_start; + partition[1].size = reg_size; + + wl->physical_mem_addr = mem_start; + wl->physical_reg_addr = reg_start; + + wl->virtual_mem_addr = 0; + wl->virtual_reg_addr = mem_size; + + t.tx_buf = tx_buf; + t.len = sizeof(tx_buf); + spi_message_add_tail(&t, &m); + + spi_sync(wl->spi, &m); +} + +void wl12xx_spi_read(struct wl12xx *wl, int addr, void *buf, + size_t len) +{ + struct spi_transfer t[3]; + struct spi_message m; + char busy_buf[TNETWIF_READ_OFFSET_BYTES]; + u32 cmd; + + cmd = 0; + cmd |= WSPI_CMD_READ; + cmd |= (len << WSPI_CMD_BYTE_LENGTH_OFFSET) & WSPI_CMD_BYTE_LENGTH; + cmd |= addr & WSPI_CMD_BYTE_ADDR; + + spi_message_init(&m); + memset(t, 0, sizeof(t)); + + t[0].tx_buf = &cmd; + t[0].len = 4; + spi_message_add_tail(&t[0], &m); + + /* Busy and non busy words read */ + t[1].rx_buf = busy_buf; + t[1].len = TNETWIF_READ_OFFSET_BYTES; + spi_message_add_tail(&t[1], &m); + + t[2].rx_buf = buf; + t[2].len = len; + spi_message_add_tail(&t[2], &m); + + spi_sync(wl->spi, &m); + + /* FIXME: check busy words */ + + wl12xx_dump(DEBUG_SPI, "spi_read cmd -> ", &cmd, sizeof(cmd)); + wl12xx_dump(DEBUG_SPI, "spi_read buf <- ", buf, len); +} + +void wl12xx_spi_write(struct wl12xx *wl, int addr, void *buf, + size_t len) +{ + struct spi_transfer t[2]; + struct spi_message m; + u32 cmd; + + cmd = 0; + cmd |= WSPI_CMD_WRITE; + cmd |= (len << WSPI_CMD_BYTE_LENGTH_OFFSET) & WSPI_CMD_BYTE_LENGTH; + cmd |= addr & WSPI_CMD_BYTE_ADDR; + + spi_message_init(&m); + memset(t, 0, sizeof(t)); + + t[0].tx_buf = &cmd; + t[0].len = sizeof(cmd); + spi_message_add_tail(&t[0], &m); + + t[1].tx_buf = buf; + t[1].len = len; + spi_message_add_tail(&t[1], &m); + + spi_sync(wl->spi, &m); + + wl12xx_dump(DEBUG_SPI, "spi_write cmd -> ", &cmd, sizeof(cmd)); + wl12xx_dump(DEBUG_SPI, "spi_write buf -> ", buf, len); +} + +void wl12xx_spi_mem_read(struct wl12xx *wl, int addr, void *buf, + size_t len) +{ + int physical; + + physical = wl12xx_translate_mem_addr(wl, addr); + + wl12xx_spi_read(wl, physical, buf, len); +} + +void wl12xx_spi_mem_write(struct wl12xx *wl, int addr, void *buf, + size_t len) +{ + int physical; + + physical = wl12xx_translate_mem_addr(wl, addr); + + wl12xx_spi_write(wl, physical, buf, len); +} + +u32 wl12xx_mem_read32(struct wl12xx *wl, int addr) +{ + return wl12xx_read32(wl, wl12xx_translate_mem_addr(wl, addr)); +} + +void wl12xx_mem_write32(struct wl12xx *wl, int addr, u32 val) +{ + wl12xx_write32(wl, wl12xx_translate_mem_addr(wl, addr), val); +} + +u32 wl12xx_reg_read32(struct wl12xx *wl, int addr) +{ + return wl12xx_read32(wl, wl12xx_translate_reg_addr(wl, addr)); +} + +void wl12xx_reg_write32(struct wl12xx *wl, int addr, u32 val) +{ + wl12xx_write32(wl, wl12xx_translate_reg_addr(wl, addr), val); +} diff --git a/drivers/net/wireless/wl12xx/spi.h b/drivers/net/wireless/wl12xx/spi.h new file mode 100644 index 00000000000..fd3227e904a --- /dev/null +++ b/drivers/net/wireless/wl12xx/spi.h @@ -0,0 +1,109 @@ +/* + * This file is part of wl12xx + * + * Copyright (c) 1998-2007 Texas Instruments Incorporated + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public 
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#ifndef __WL12XX_SPI_H__
+#define __WL12XX_SPI_H__
+
+#include "cmd.h"
+#include "acx.h"
+#include "reg.h"
+
+#define HW_ACCESS_MEMORY_MAX_RANGE		0x1FFC0
+
+#define HW_ACCESS_PART0_SIZE_ADDR           0x1FFC0
+#define HW_ACCESS_PART0_START_ADDR          0x1FFC4
+#define HW_ACCESS_PART1_SIZE_ADDR           0x1FFC8
+#define HW_ACCESS_PART1_START_ADDR          0x1FFCC
+
+#define HW_ACCESS_REGISTER_SIZE             4
+
+#define HW_ACCESS_PRAM_MAX_RANGE		0x3c000
+
+#define WSPI_CMD_READ                 0x40000000
+#define WSPI_CMD_WRITE                0x00000000
+#define WSPI_CMD_FIXED                0x20000000
+#define WSPI_CMD_BYTE_LENGTH          0x1FFE0000
+#define WSPI_CMD_BYTE_LENGTH_OFFSET   17
+#define WSPI_CMD_BYTE_ADDR            0x0001FFFF
+
+#define WSPI_INIT_CMD_CRC_LEN       5
+
+#define WSPI_INIT_CMD_START         0x00
+#define WSPI_INIT_CMD_TX            0x40
+/* the extra bypass bit is sampled by the TNET as '1' */
+#define WSPI_INIT_CMD_BYPASS_BIT    0x80
+#define WSPI_INIT_CMD_FIXEDBUSY_LEN 0x07
+#define WSPI_INIT_CMD_EN_FIXEDBUSY  0x80
+#define WSPI_INIT_CMD_DIS_FIXEDBUSY 0x00
+#define WSPI_INIT_CMD_IOD           0x40
+#define WSPI_INIT_CMD_IP            0x20
+#define WSPI_INIT_CMD_CS            0x10
+#define WSPI_INIT_CMD_WS            0x08
+#define WSPI_INIT_CMD_WSPI          0x01
+#define WSPI_INIT_CMD_END           0x01
+
+#define WSPI_INIT_CMD_LEN           8
+
+#define TNETWIF_READ_OFFSET_BYTES  8
+#define HW_ACCESS_WSPI_FIXED_BUSY_LEN \
+		((TNETWIF_READ_OFFSET_BYTES - 4) / sizeof(u32))
+#define HW_ACCESS_WSPI_INIT_CMD_MASK  0
+
+
+/* Raw target IO, address is not translated */
+void wl12xx_spi_read(struct wl12xx *wl, int addr, void *buf, size_t len);
+void wl12xx_spi_write(struct wl12xx *wl, int addr, void *buf, size_t len);
+
+/* Memory target IO, address is translated to partition 0 */
+void wl12xx_spi_mem_read(struct wl12xx *wl, int addr, void *buf, size_t len);
+void wl12xx_spi_mem_write(struct wl12xx *wl, int addr, void *buf, size_t len);
+u32 wl12xx_mem_read32(struct wl12xx *wl, int addr);
+void wl12xx_mem_write32(struct wl12xx *wl, int addr, u32 val);
+
+/* Registers IO */
+u32 wl12xx_reg_read32(struct wl12xx *wl, int addr);
+void wl12xx_reg_write32(struct wl12xx *wl, int addr, u32 val);
+
+/* INIT and RESET words */
+void wl12xx_spi_reset(struct wl12xx *wl);
+void wl12xx_spi_init(struct wl12xx *wl);
+void wl12xx_set_partition(struct wl12xx *wl,
+			  u32 mem_start, u32 mem_size,
+			  u32 reg_start, u32 reg_size);
+
+static inline u32 wl12xx_read32(struct wl12xx *wl, int addr)
+{
+	u32 response;
+
+	wl12xx_spi_read(wl, addr, &response, sizeof(u32));
+
+	return response;
+}
+
+static inline void wl12xx_write32(struct wl12xx *wl, int addr, u32 val)
+{
+	wl12xx_spi_write(wl, addr, &val, sizeof(u32));
+}
+
+#endif /* __WL12XX_SPI_H__ */
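[Worked example, editorial: the command word that wl12xx_spi_read()/wl12xx_spi_write() prepend encodes the direction, the byte count, and a 17-bit byte address. For a 4-byte read from address 0x1FFC0 (HW_ACCESS_PART0_SIZE_ADDR):]

    u32 cmd = 0;

    cmd |= WSPI_CMD_READ;                           /* bit 30         */
    cmd |= (4 << WSPI_CMD_BYTE_LENGTH_OFFSET)
            & WSPI_CMD_BYTE_LENGTH;                 /* 4 bytes        */
    cmd |= 0x1FFC0 & WSPI_CMD_BYTE_ADDR;            /* 17-bit address */
    /* cmd == 0x40000000 | 0x00080000 | 0x0001ffc0 == 0x4009ffc0 */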
diff --git a/drivers/net/wireless/wl12xx/tx.c b/drivers/net/wireless/wl12xx/tx.c
new file mode 100644
index 00000000000..62145e205a8
--- /dev/null
+++ b/drivers/net/wireless/wl12xx/tx.c
@@ -0,0 +1,557 @@
+/*
+ * This file is part of wl12xx
+ *
+ * Copyright (c) 1998-2007 Texas Instruments Incorporated
+ * Copyright (C) 2008 Nokia Corporation
+ *
+ * Contact: Kalle Valo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include "wl12xx.h"
+#include "reg.h"
+#include "spi.h"
+#include "tx.h"
+#include "ps.h"
+
+static bool wl12xx_tx_double_buffer_busy(struct wl12xx *wl, u32 data_out_count)
+{
+	int used, data_in_count;
+
+	data_in_count = wl->data_in_count;
+
+	if (data_in_count < data_out_count)
+		/* data_in_count has wrapped */
+		data_in_count += TX_STATUS_DATA_OUT_COUNT_MASK + 1;
+
+	used = data_in_count - data_out_count;
+
+	WARN_ON(used < 0);
+	WARN_ON(used > DP_TX_PACKET_RING_CHUNK_NUM);
+
+	if (used >= DP_TX_PACKET_RING_CHUNK_NUM)
+		return true;
+	else
+		return false;
+}
+
+static int wl12xx_tx_path_status(struct wl12xx *wl)
+{
+	u32 status, addr, data_out_count;
+	bool busy;
+
+	addr = wl->data_path->tx_control_addr;
+	status = wl12xx_mem_read32(wl, addr);
+	data_out_count = status & TX_STATUS_DATA_OUT_COUNT_MASK;
+	busy = wl12xx_tx_double_buffer_busy(wl, data_out_count);
+
+	if (busy)
+		return -EBUSY;
+
+	return 0;
+}
+
+static int wl12xx_tx_id(struct wl12xx *wl, struct sk_buff *skb)
+{
+	int i;
+
+	for (i = 0; i < FW_TX_CMPLT_BLOCK_SIZE; i++)
+		if (wl->tx_frames[i] == NULL) {
+			wl->tx_frames[i] = skb;
+			return i;
+		}
+
+	return -EBUSY;
+}
+
+static void wl12xx_tx_control(struct tx_double_buffer_desc *tx_hdr,
+			      struct ieee80211_tx_info *control, u16 fc)
+{
+	*(u16 *)&tx_hdr->control = 0;
+
+	tx_hdr->control.rate_policy = 0;
+
+	/* 802.11 packets */
+	tx_hdr->control.packet_type = 0;
+
+	if (control->flags & IEEE80211_TX_CTL_NO_ACK)
+		tx_hdr->control.ack_policy = 1;
+
+	tx_hdr->control.tx_complete = 1;
+
+	if ((fc & IEEE80211_FTYPE_DATA) &&
+	    ((fc & IEEE80211_STYPE_QOS_DATA) ||
+	     (fc & IEEE80211_STYPE_QOS_NULLFUNC)))
+		tx_hdr->control.qos = 1;
+}
+
+/* RSN + MIC = 8 + 8 = 16 bytes (worst case - AES).
*/ +#define MAX_MSDU_SECURITY_LENGTH 16 +#define MAX_MPDU_SECURITY_LENGTH 16 +#define WLAN_QOS_HDR_LEN 26 +#define MAX_MPDU_HEADER_AND_SECURITY (MAX_MPDU_SECURITY_LENGTH + \ + WLAN_QOS_HDR_LEN) +#define HW_BLOCK_SIZE 252 +static void wl12xx_tx_frag_block_num(struct tx_double_buffer_desc *tx_hdr) +{ + u16 payload_len, frag_threshold, mem_blocks; + u16 num_mpdus, mem_blocks_per_frag; + + frag_threshold = IEEE80211_MAX_FRAG_THRESHOLD; + tx_hdr->frag_threshold = cpu_to_le16(frag_threshold); + + payload_len = tx_hdr->length + MAX_MSDU_SECURITY_LENGTH; + + if (payload_len > frag_threshold) { + mem_blocks_per_frag = + ((frag_threshold + MAX_MPDU_HEADER_AND_SECURITY) / + HW_BLOCK_SIZE) + 1; + num_mpdus = payload_len / frag_threshold; + mem_blocks = num_mpdus * mem_blocks_per_frag; + payload_len -= num_mpdus * frag_threshold; + num_mpdus++; + + } else { + mem_blocks_per_frag = 0; + mem_blocks = 0; + num_mpdus = 1; + } + + mem_blocks += (payload_len / HW_BLOCK_SIZE) + 1; + + if (num_mpdus > 1) + mem_blocks += min(num_mpdus, mem_blocks_per_frag); + + tx_hdr->num_mem_blocks = mem_blocks; +} + +static int wl12xx_tx_fill_hdr(struct wl12xx *wl, struct sk_buff *skb, + struct ieee80211_tx_info *control) +{ + struct tx_double_buffer_desc *tx_hdr; + struct ieee80211_rate *rate; + int id; + u16 fc; + + if (!skb) + return -EINVAL; + + id = wl12xx_tx_id(wl, skb); + if (id < 0) + return id; + + fc = *(u16 *)skb->data; + tx_hdr = (struct tx_double_buffer_desc *) skb_push(skb, + sizeof(*tx_hdr)); + + tx_hdr->length = cpu_to_le16(skb->len - sizeof(*tx_hdr)); + rate = ieee80211_get_tx_rate(wl->hw, control); + tx_hdr->rate = cpu_to_le16(rate->hw_value); + tx_hdr->expiry_time = cpu_to_le32(1 << 16); + tx_hdr->id = id; + + /* FIXME: how to get the correct queue id? */ + tx_hdr->xmit_queue = 0; + + wl12xx_tx_control(tx_hdr, control, fc); + wl12xx_tx_frag_block_num(tx_hdr); + + return 0; +} + +/* We copy the packet to the target */ +static int wl12xx_tx_send_packet(struct wl12xx *wl, struct sk_buff *skb, + struct ieee80211_tx_info *control) +{ + struct tx_double_buffer_desc *tx_hdr; + int len; + u32 addr; + + if (!skb) + return -EINVAL; + + tx_hdr = (struct tx_double_buffer_desc *) skb->data; + + if (control->control.hw_key && + control->control.hw_key->alg == ALG_TKIP) { + int hdrlen; + u16 fc; + u8 *pos; + + fc = *(u16 *)(skb->data + sizeof(*tx_hdr)); + tx_hdr->length += WL12XX_TKIP_IV_SPACE; + + hdrlen = ieee80211_hdrlen(fc); + + pos = skb_push(skb, WL12XX_TKIP_IV_SPACE); + memmove(pos, pos + WL12XX_TKIP_IV_SPACE, + sizeof(*tx_hdr) + hdrlen); + } + + /* Revisit. This is a workaround for getting non-aligned packets. + This happens at least with EAPOL packets from the user space. + Our DMA requires packets to be aligned on a 4-byte boundary. 
+ */ + if (unlikely((long)skb->data & 0x03)) { + int offset = (4 - (long)skb->data) & 0x03; + wl12xx_debug(DEBUG_TX, "skb offset %d", offset); + + /* check whether the current skb can be used */ + if (!skb_cloned(skb) && (skb_tailroom(skb) >= offset)) { + unsigned char *src = skb->data; + + /* align the buffer on a 4-byte boundary */ + skb_reserve(skb, offset); + memmove(skb->data, src, skb->len); + } else { + wl12xx_info("No handler, fixme!"); + return -EINVAL; + } + } + + /* Our skb->data at this point includes the HW header */ + len = WL12XX_TX_ALIGN(skb->len); + + if (wl->data_in_count & 0x1) + addr = wl->data_path->tx_packet_ring_addr + + wl->data_path->tx_packet_ring_chunk_size; + else + addr = wl->data_path->tx_packet_ring_addr; + + wl12xx_spi_mem_write(wl, addr, skb->data, len); + + wl12xx_debug(DEBUG_TX, "tx id %u skb 0x%p payload %u rate 0x%x", + tx_hdr->id, skb, tx_hdr->length, tx_hdr->rate); + + return 0; +} + +static void wl12xx_tx_trigger(struct wl12xx *wl) +{ + u32 data, addr; + + if (wl->data_in_count & 0x1) { + addr = ACX_REG_INTERRUPT_TRIG_H; + data = INTR_TRIG_TX_PROC1; + } else { + addr = ACX_REG_INTERRUPT_TRIG; + data = INTR_TRIG_TX_PROC0; + } + + wl12xx_reg_write32(wl, addr, data); + + /* Bumping data in */ + wl->data_in_count = (wl->data_in_count + 1) & + TX_STATUS_DATA_OUT_COUNT_MASK; +} + +/* caller must hold wl->mutex */ +static int wl12xx_tx_frame(struct wl12xx *wl, struct sk_buff *skb) +{ + struct ieee80211_tx_info *info; + int ret = 0; + u8 idx; + + info = IEEE80211_SKB_CB(skb); + + if (info->control.hw_key) { + idx = info->control.hw_key->hw_key_idx; + if (unlikely(wl->default_key != idx)) { + ret = wl12xx_acx_default_key(wl, idx); + if (ret < 0) + return ret; + } + } + + ret = wl12xx_tx_path_status(wl); + if (ret < 0) + return ret; + + ret = wl12xx_tx_fill_hdr(wl, skb, info); + if (ret < 0) + return ret; + + ret = wl12xx_tx_send_packet(wl, skb, info); + if (ret < 0) + return ret; + + wl12xx_tx_trigger(wl); + + return ret; +} + +void wl12xx_tx_work(struct work_struct *work) +{ + struct wl12xx *wl = container_of(work, struct wl12xx, tx_work); + struct sk_buff *skb; + bool woken_up = false; + int ret; + + mutex_lock(&wl->mutex); + + if (unlikely(wl->state == WL12XX_STATE_OFF)) + goto out; + + while ((skb = skb_dequeue(&wl->tx_queue))) { + if (!woken_up) { + wl12xx_ps_elp_wakeup(wl); + woken_up = true; + } + + ret = wl12xx_tx_frame(wl, skb); + if (ret == -EBUSY) { + /* firmware buffer is full, stop queues */ + wl12xx_debug(DEBUG_TX, "tx_work: fw buffer full, " + "stop queues"); + ieee80211_stop_queues(wl->hw); + wl->tx_queue_stopped = true; + skb_queue_head(&wl->tx_queue, skb); + goto out; + } else if (ret < 0) { + dev_kfree_skb(skb); + goto out; + } + } + +out: + if (woken_up) + wl12xx_ps_elp_sleep(wl); + + mutex_unlock(&wl->mutex); +} + +static const char *wl12xx_tx_parse_status(u8 status) +{ + /* 8 bit status field, one character per bit plus null */ + static char buf[9]; + int i = 0; + + memset(buf, 0, sizeof(buf)); + + if (status & TX_DMA_ERROR) + buf[i++] = 'm'; + if (status & TX_DISABLED) + buf[i++] = 'd'; + if (status & TX_RETRY_EXCEEDED) + buf[i++] = 'r'; + if (status & TX_TIMEOUT) + buf[i++] = 't'; + if (status & TX_KEY_NOT_FOUND) + buf[i++] = 'k'; + if (status & TX_ENCRYPT_FAIL) + buf[i++] = 'e'; + if (status & TX_UNAVAILABLE_PRIORITY) + buf[i++] = 'p'; + + /* bit 0 is unused apparently */ + + return buf; +} + +static void wl12xx_tx_packet_cb(struct wl12xx *wl, + struct tx_result *result) +{ + struct ieee80211_tx_info *info; + struct sk_buff *skb; + 
int hdrlen, ret; + u8 *frame; + + skb = wl->tx_frames[result->id]; + if (skb == NULL) { + wl12xx_error("SKB for packet %d is NULL", result->id); + return; + } + + info = IEEE80211_SKB_CB(skb); + + if (!(info->flags & IEEE80211_TX_CTL_NO_ACK) && + (result->status == TX_SUCCESS)) + info->flags |= IEEE80211_TX_STAT_ACK; + + info->status.rates[0].count = result->ack_failures + 1; + wl->stats.retry_count += result->ack_failures; + + /* + * We have to remove our private TX header before pushing + * the skb back to mac80211. + */ + frame = skb_pull(skb, sizeof(struct tx_double_buffer_desc)); + if (info->control.hw_key && + info->control.hw_key->alg == ALG_TKIP) { + hdrlen = ieee80211_get_hdrlen_from_skb(skb); + memmove(frame + WL12XX_TKIP_IV_SPACE, frame, hdrlen); + skb_pull(skb, WL12XX_TKIP_IV_SPACE); + } + + wl12xx_debug(DEBUG_TX, "tx status id %u skb 0x%p failures %u rate 0x%x" + " status 0x%x (%s)", + result->id, skb, result->ack_failures, result->rate, + result->status, wl12xx_tx_parse_status(result->status)); + + + ieee80211_tx_status(wl->hw, skb); + + wl->tx_frames[result->id] = NULL; + + if (wl->tx_queue_stopped) { + wl12xx_debug(DEBUG_TX, "cb: queue was stopped"); + + skb = skb_dequeue(&wl->tx_queue); + + /* The skb can be NULL because tx_work might have been + scheduled before the queue was stopped making the + queue empty */ + + if (skb) { + ret = wl12xx_tx_frame(wl, skb); + if (ret == -EBUSY) { + /* firmware buffer is still full */ + wl12xx_debug(DEBUG_TX, "cb: fw buffer " + "still full"); + skb_queue_head(&wl->tx_queue, skb); + return; + } else if (ret < 0) { + dev_kfree_skb(skb); + return; + } + } + + wl12xx_debug(DEBUG_TX, "cb: waking queues"); + ieee80211_wake_queues(wl->hw); + wl->tx_queue_stopped = false; + } +} + +/* Called upon reception of a TX complete interrupt */ +void wl12xx_tx_complete(struct wl12xx *wl) +{ + int i, result_index, num_complete = 0; + struct tx_result result[FW_TX_CMPLT_BLOCK_SIZE], *result_ptr; + + if (unlikely(wl->state != WL12XX_STATE_ON)) + return; + + /* First we read the result */ + wl12xx_spi_mem_read(wl, wl->data_path->tx_complete_addr, + result, sizeof(result)); + + result_index = wl->next_tx_complete; + + for (i = 0; i < ARRAY_SIZE(result); i++) { + result_ptr = &result[result_index]; + + if (result_ptr->done_1 == 1 && + result_ptr->done_2 == 1) { + wl12xx_tx_packet_cb(wl, result_ptr); + + result_ptr->done_1 = 0; + result_ptr->done_2 = 0; + + result_index = (result_index + 1) & + (FW_TX_CMPLT_BLOCK_SIZE - 1); + num_complete++; + } else { + break; + } + } + + /* Every completed frame needs to be acknowledged */ + if (num_complete) { + /* + * If we've wrapped, we have to clear + * the results in 2 steps. 
+ */ + if (result_index > wl->next_tx_complete) { + /* Only 1 write is needed */ + wl12xx_spi_mem_write(wl, + wl->data_path->tx_complete_addr + + (wl->next_tx_complete * + sizeof(struct tx_result)), + &result[wl->next_tx_complete], + num_complete * + sizeof(struct tx_result)); + + + } else if (result_index < wl->next_tx_complete) { + /* 2 writes are needed */ + wl12xx_spi_mem_write(wl, + wl->data_path->tx_complete_addr + + (wl->next_tx_complete * + sizeof(struct tx_result)), + &result[wl->next_tx_complete], + (FW_TX_CMPLT_BLOCK_SIZE - + wl->next_tx_complete) * + sizeof(struct tx_result)); + + wl12xx_spi_mem_write(wl, + wl->data_path->tx_complete_addr, + result, + (num_complete - + FW_TX_CMPLT_BLOCK_SIZE + + wl->next_tx_complete) * + sizeof(struct tx_result)); + + } else { + /* We have to write the whole array */ + wl12xx_spi_mem_write(wl, + wl->data_path->tx_complete_addr, + result, + FW_TX_CMPLT_BLOCK_SIZE * + sizeof(struct tx_result)); + } + + } + + wl->next_tx_complete = result_index; +} + +/* caller must hold wl->mutex */ +void wl12xx_tx_flush(struct wl12xx *wl) +{ + int i; + struct sk_buff *skb; + struct ieee80211_tx_info *info; + + /* TX failure */ +/* control->flags = 0; FIXME */ + + while ((skb = skb_dequeue(&wl->tx_queue))) { + info = IEEE80211_SKB_CB(skb); + + wl12xx_debug(DEBUG_TX, "flushing skb 0x%p", skb); + + if (!(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS)) + continue; + + ieee80211_tx_status(wl->hw, skb); + } + + for (i = 0; i < FW_TX_CMPLT_BLOCK_SIZE; i++) + if (wl->tx_frames[i] != NULL) { + skb = wl->tx_frames[i]; + info = IEEE80211_SKB_CB(skb); + + if (!(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS)) + continue; + + ieee80211_tx_status(wl->hw, skb); + wl->tx_frames[i] = NULL; + } +} diff --git a/drivers/net/wireless/wl12xx/tx.h b/drivers/net/wireless/wl12xx/tx.h new file mode 100644 index 00000000000..dc82691f4c1 --- /dev/null +++ b/drivers/net/wireless/wl12xx/tx.h @@ -0,0 +1,215 @@ +/* + * This file is part of wl12xx + * + * Copyright (c) 1998-2007 Texas Instruments Incorporated + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __WL12XX_TX_H__ +#define __WL12XX_TX_H__ + +#include + +/* + * + * TX PATH + * + * The Tx path uses a double buffer and a tx_control structure, each located + * at a fixed address in the device's memory. On startup, the host retrieves + * the pointers to these addresses. A double buffer allows for continuous data + * flow towards the device. The host keeps track of which buffer is available + * and alternates between these two buffers on a per packet basis. + * + * The size of each of the two buffers is large enough to hold the longest + * 802.3 packet - maximum size Ethernet packet + header + descriptor. 
+ * TX complete indication will be received asynchronously in a TX done
+ * cyclic buffer which is composed of 16 tx_result descriptor structures
+ * and is used in a cyclic manner.
+ *
+ * The TX (HOST) procedure is as follows:
+ * 1. Read the Tx path status, that will give the data_out_count.
+ * 2. goto 1, if not possible, i.e. if
+ *    data_in_count - data_out_count >= HwBuffer size (2 for the double
+ *    buffer).
+ * 3. Copy the packet (preceded by double_buffer_desc), if possible, i.e. if
+ *    data_in_count - data_out_count < HwBuffer size (2 for the double
+ *    buffer).
+ * 4. increment data_in_count.
+ * 5. Inform the firmware by generating a firmware internal interrupt.
+ * 6. FW will increment data_out_count after it reads the buffer.
+ *
+ * The TX Complete procedure:
+ * 1. To get a TX complete indication the host enables the tx_complete flag
+ *    in the TX descriptor structure.
+ * 2. For each packet with a Tx Complete field set, the firmware adds the
+ *    transmit results to the cyclic buffer (txDoneRing) and sets both done_1
+ *    and done_2 to 1 to indicate driver ownership.
+ * 3. The firmware sends a Tx Complete interrupt to the host to trigger the
+ *    host to process the new data. Note: the interrupt will be sent per
+ *    packet if a TX complete indication was requested in tx_control, or upon
+ *    crossing the aggregation threshold.
+ * 4. After receiving the Tx Complete interrupt, the host reads the
+ *    TxDescriptorDone information in a cyclic manner and clears both done_1
+ *    and done_2 fields.
+ *
+ */
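[Worked example, editorial: the occupancy check in step 2 above is what wl12xx_tx_double_buffer_busy() in tx.c implements. The counters are masked to 4 bits (TX_STATUS_DATA_OUT_COUNT_MASK == 0xf), so data_in_count can wrap below data_out_count; assuming DP_TX_PACKET_RING_CHUNK_NUM is 2 for the double buffer:]

    data_in_count = 1, data_out_count = 15
    data_in_count < data_out_count   -> wrap: data_in_count += 16 -> 17
    used = 17 - 15 = 2               -> used >= 2: both buffers are in
                                        flight, the TX path is busy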
+
+#define TX_COMPLETE_REQUIRED_BIT	0x80
+#define TX_STATUS_DATA_OUT_COUNT_MASK	0xf
+#define WL12XX_TX_ALIGN_TO 4
+#define WL12XX_TX_ALIGN(len) (((len) + WL12XX_TX_ALIGN_TO - 1) & \
+			      ~(WL12XX_TX_ALIGN_TO - 1))
+#define WL12XX_TKIP_IV_SPACE 4
+
+struct tx_control {
+	/* Rate Policy (class) index */
+	unsigned rate_policy:3;
+
+	/* When set, no ack policy is expected */
+	unsigned ack_policy:1;
+
+	/*
+	 * Packet type:
+	 * 0 -> 802.11
+	 * 1 -> 802.3
+	 * 2 -> IP
+	 * 3 -> raw codec
+	 */
+	unsigned packet_type:2;
+
+	/* If set, this is a QoS-Null or QoS-Data frame */
+	unsigned qos:1;
+
+	/*
+	 * If set, the target triggers the tx complete INT
+	 * upon frame sending completion.
+	 */
+	unsigned tx_complete:1;
+
+	/* 2 bytes padding before packet header */
+	unsigned xfer_pad:1;
+
+	unsigned reserved:7;
+} __attribute__ ((packed));
+
+
+struct tx_double_buffer_desc {
+	/* Length of payload, including headers. */
+	u16 length;
+
+	/*
+	 * A bit mask that specifies the initial rate to be used
+	 * Possible values are:
+	 * 0x0001 - 1Mbits
+	 * 0x0002 - 2Mbits
+	 * 0x0004 - 5.5Mbits
+	 * 0x0008 - 6Mbits
+	 * 0x0010 - 9Mbits
+	 * 0x0020 - 11Mbits
+	 * 0x0040 - 12Mbits
+	 * 0x0080 - 18Mbits
+	 * 0x0100 - 22Mbits
+	 * 0x0200 - 24Mbits
+	 * 0x0400 - 36Mbits
+	 * 0x0800 - 48Mbits
+	 * 0x1000 - 54Mbits
+	 */
+	u16 rate;
+
+	/* Time in us that a packet can spend in the target */
+	u32 expiry_time;
+
+	/* index of the TX queue used for this packet */
+	u8 xmit_queue;
+
+	/* Used to identify a packet */
+	u8 id;
+
+	struct tx_control control;
+
+	/*
+	 * The FW should cut the packet into fragments
+	 * of this size.
+	 */
+	u16 frag_threshold;
+
+	/* Number of HW queue blocks to be allocated */
+	u8 num_mem_blocks;
+
+	u8 reserved;
+} __attribute__ ((packed));
+
+enum {
+	TX_SUCCESS              = 0,
+	TX_DMA_ERROR            = BIT(7),
+	TX_DISABLED             = BIT(6),
+	TX_RETRY_EXCEEDED       = BIT(5),
+	TX_TIMEOUT              = BIT(4),
+	TX_KEY_NOT_FOUND        = BIT(3),
+	TX_ENCRYPT_FAIL         = BIT(2),
+	TX_UNAVAILABLE_PRIORITY = BIT(1),
+};
+
+struct tx_result {
+	/*
+	 * Ownership synchronization between the host and
+	 * the firmware. If done_1 and done_2 are cleared,
+	 * owned by the FW (no info ready).
+	 */
+	u8 done_1;
+
+	/* same as double_buffer_desc->id */
+	u8 id;
+
+	/*
+	 * Total air access duration consumed by this
+	 * packet, including all retries and overheads.
+	 */
+	u16 medium_usage;
+
+	/* Total media delay (from 1st EDCA AIFS counter until TX Complete). */
+	u32 medium_delay;
+
+	/* Time between host xfer and tx complete */
+	u32 fw_handling_time;
+
+	/* The LS-byte of the last TKIP sequence number. */
+	u8 lsb_seq_num;
+
+	/* Retry count */
+	u8 ack_failures;
+
+	/* At which rate we got an ACK */
+	u16 rate;
+
+	u16 reserved;
+
+	/* TX_* */
+	u8 status;
+
+	/* See done_1 */
+	u8 done_2;
+} __attribute__ ((packed));
+
+void wl12xx_tx_work(struct work_struct *work);
+void wl12xx_tx_complete(struct wl12xx *wl);
+void wl12xx_tx_flush(struct wl12xx *wl);
+
+#endif
diff --git a/drivers/net/wireless/wl12xx/wl1251.c b/drivers/net/wireless/wl12xx/wl1251.c
new file mode 100644
index 00000000000..bd0decdcad2
--- /dev/null
+++ b/drivers/net/wireless/wl12xx/wl1251.c
@@ -0,0 +1,709 @@
+/*
+ * This file is part of wl12xx
+ *
+ * Copyright (C) 2008-2009 Nokia Corporation
+ *
+ * Contact: Kalle Valo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#include +#include + +#include "wl1251.h" +#include "reg.h" +#include "spi.h" +#include "boot.h" +#include "event.h" +#include "acx.h" +#include "tx.h" +#include "rx.h" +#include "ps.h" +#include "init.h" + +static struct wl12xx_partition_set wl1251_part_table[PART_TABLE_LEN] = { + [PART_DOWN] = { + .mem = { + .start = 0x00000000, + .size = 0x00016800 + }, + .reg = { + .start = REGISTERS_BASE, + .size = REGISTERS_DOWN_SIZE + }, + }, + + [PART_WORK] = { + .mem = { + .start = 0x00028000, + .size = 0x00014000 + }, + .reg = { + .start = REGISTERS_BASE, + .size = REGISTERS_WORK_SIZE + }, + }, + + /* WL1251 doesn't use the DRPW partition, so we don't set it here */ +}; + +static enum wl12xx_acx_int_reg wl1251_acx_reg_table[ACX_REG_TABLE_LEN] = { + [ACX_REG_INTERRUPT_TRIG] = (REGISTERS_BASE + 0x0474), + [ACX_REG_INTERRUPT_TRIG_H] = (REGISTERS_BASE + 0x0478), + [ACX_REG_INTERRUPT_MASK] = (REGISTERS_BASE + 0x0494), + [ACX_REG_HINT_MASK_SET] = (REGISTERS_BASE + 0x0498), + [ACX_REG_HINT_MASK_CLR] = (REGISTERS_BASE + 0x049C), + [ACX_REG_INTERRUPT_NO_CLEAR] = (REGISTERS_BASE + 0x04B0), + [ACX_REG_INTERRUPT_CLEAR] = (REGISTERS_BASE + 0x04A4), + [ACX_REG_INTERRUPT_ACK] = (REGISTERS_BASE + 0x04A8), + [ACX_REG_SLV_SOFT_RESET] = (REGISTERS_BASE + 0x0000), + [ACX_REG_EE_START] = (REGISTERS_BASE + 0x080C), + [ACX_REG_ECPU_CONTROL] = (REGISTERS_BASE + 0x0804) +}; + +static int wl1251_upload_firmware(struct wl12xx *wl) +{ + struct wl12xx_partition_set *p_table = wl->chip.p_table; + int addr, chunk_num, partition_limit; + size_t fw_data_len; + u8 *p; + + /* whal_FwCtrl_LoadFwImageSm() */ + + wl12xx_debug(DEBUG_BOOT, "chip id before fw upload: 0x%x", + wl12xx_reg_read32(wl, CHIP_ID_B)); + + /* 10.0 check firmware length and set partition */ + fw_data_len = (wl->fw[4] << 24) | (wl->fw[5] << 16) | + (wl->fw[6] << 8) | (wl->fw[7]); + + wl12xx_debug(DEBUG_BOOT, "fw_data_len %d chunk_size %d", fw_data_len, + CHUNK_SIZE); + + if ((fw_data_len % 4) != 0) { + wl12xx_error("firmware length not multiple of four"); + return -EIO; + } + + wl12xx_set_partition(wl, + p_table[PART_DOWN].mem.start, + p_table[PART_DOWN].mem.size, + p_table[PART_DOWN].reg.start, + p_table[PART_DOWN].reg.size); + + /* 10.1 set partition limit and chunk num */ + chunk_num = 0; + partition_limit = p_table[PART_DOWN].mem.size; + + while (chunk_num < fw_data_len / CHUNK_SIZE) { + /* 10.2 update partition, if needed */ + addr = p_table[PART_DOWN].mem.start + + (chunk_num + 2) * CHUNK_SIZE; + if (addr > partition_limit) { + addr = p_table[PART_DOWN].mem.start + + chunk_num * CHUNK_SIZE; + partition_limit = chunk_num * CHUNK_SIZE + + p_table[PART_DOWN].mem.size; + wl12xx_set_partition(wl, + addr, + p_table[PART_DOWN].mem.size, + p_table[PART_DOWN].reg.start, + p_table[PART_DOWN].reg.size); + } + + /* 10.3 upload the chunk */ + addr = p_table[PART_DOWN].mem.start + chunk_num * CHUNK_SIZE; + p = wl->fw + FW_HDR_SIZE + chunk_num * CHUNK_SIZE; + wl12xx_debug(DEBUG_BOOT, "uploading fw chunk 0x%p to 0x%x", + p, addr); + wl12xx_spi_mem_write(wl, addr, p, CHUNK_SIZE); + + chunk_num++; + } + + /* 10.4 upload the last chunk */ + addr = p_table[PART_DOWN].mem.start + chunk_num * CHUNK_SIZE; + p = wl->fw + FW_HDR_SIZE + chunk_num * CHUNK_SIZE; + wl12xx_debug(DEBUG_BOOT, "uploading fw last chunk (%d B) 0x%p to 0x%x", + fw_data_len % 
CHUNK_SIZE, p, addr);
+	wl12xx_spi_mem_write(wl, addr, p, fw_data_len % CHUNK_SIZE);
+
+	return 0;
+}
+
+static int wl1251_upload_nvs(struct wl12xx *wl)
+{
+	size_t nvs_len, nvs_bytes_written, burst_len;
+	int nvs_start, i;
+	u32 dest_addr, val;
+	u8 *nvs_ptr, *nvs;
+
+	nvs = wl->nvs;
+	if (nvs == NULL)
+		return -ENODEV;
+
+	nvs_ptr = nvs;
+
+	nvs_len = wl->nvs_len;
+	nvs_start = wl->fw_len;
+
+	/*
+	 * Layout before the actual NVS tables:
+	 * 1 byte  : burst length.
+	 * 2 bytes : destination address.
+	 * n bytes : data to burst copy.
+	 *
+	 * This is ended by a 0 length, then the NVS tables.
+	 */
+
+	while (nvs_ptr[0]) {
+		burst_len = nvs_ptr[0];
+		dest_addr = (nvs_ptr[1] & 0xfe) | ((u32)(nvs_ptr[2] << 8));
+
+		/* We move our pointer to the data */
+		nvs_ptr += 3;
+
+		for (i = 0; i < burst_len; i++) {
+			val = (nvs_ptr[0] | (nvs_ptr[1] << 8)
+			       | (nvs_ptr[2] << 16) | (nvs_ptr[3] << 24));
+
+			wl12xx_debug(DEBUG_BOOT,
+				     "nvs burst write 0x%x: 0x%x",
+				     dest_addr, val);
+			wl12xx_mem_write32(wl, dest_addr, val);
+
+			nvs_ptr += 4;
+			dest_addr += 4;
+		}
+	}
+
+	/*
+	 * We've reached the first zero length, the first NVS table
+	 * is 7 bytes further.
+	 */
+	nvs_ptr += 7;
+	nvs_len -= nvs_ptr - nvs;
+	nvs_len = ALIGN(nvs_len, 4);
+
+	/* Now we must set the partition correctly */
+	wl12xx_set_partition(wl, nvs_start,
+			     wl->chip.p_table[PART_DOWN].mem.size,
+			     wl->chip.p_table[PART_DOWN].reg.start,
+			     wl->chip.p_table[PART_DOWN].reg.size);
+
+	/* And finally we upload the NVS tables */
+	nvs_bytes_written = 0;
+	while (nvs_bytes_written < nvs_len) {
+		val = (nvs_ptr[0] | (nvs_ptr[1] << 8)
+		       | (nvs_ptr[2] << 16) | (nvs_ptr[3] << 24));
+
+		val = cpu_to_le32(val);
+
+		wl12xx_debug(DEBUG_BOOT,
+			     "nvs write table 0x%x: 0x%x",
+			     nvs_start, val);
+		wl12xx_mem_write32(wl, nvs_start, val);
+
+		nvs_ptr += 4;
+		nvs_bytes_written += 4;
+		nvs_start += 4;
+	}
+
+	return 0;
+}
+
+static int wl1251_boot(struct wl12xx *wl)
+{
+	int ret = 0, minor_minor_e2_ver;
+	u32 tmp, boot_data;
+
+	ret = wl12xx_boot_soft_reset(wl);
+	if (ret < 0)
+		goto out;
+
+	/* 2. start processing NVS file */
+	ret = wl->chip.op_upload_nvs(wl);
+	if (ret < 0)
+		goto out;
+
+	/* write firmware's last address (i.e. its length) to
+	 * ACX_EEPROMLESS_IND_REG */
+	wl12xx_reg_write32(wl, ACX_EEPROMLESS_IND_REG, wl->fw_len);
+
+	/* 6. read the EEPROM parameters */
+	tmp = wl12xx_reg_read32(wl, SCR_PAD2);
+
+	/* 7. read bootdata */
+	wl->boot_attr.radio_type = (tmp & 0x0000FF00) >> 8;
+	wl->boot_attr.major = (tmp & 0x00FF0000) >> 16;
+	tmp = wl12xx_reg_read32(wl, SCR_PAD3);
+
+	/* 8. check bootdata and call restart sequence */
+	wl->boot_attr.minor = (tmp & 0x00FF0000) >> 16;
+	minor_minor_e2_ver = (tmp & 0xFF000000) >> 24;
+
+	wl12xx_debug(DEBUG_BOOT, "radioType 0x%x majorE2Ver 0x%x "
+		     "minorE2Ver 0x%x minor_minor_e2_ver 0x%x",
+		     wl->boot_attr.radio_type, wl->boot_attr.major,
+		     wl->boot_attr.minor, minor_minor_e2_ver);
+
+	ret = wl12xx_boot_init_seq(wl);
+	if (ret < 0)
+		goto out;
+
+	/* 9. NVS processing done */
+	boot_data = wl12xx_reg_read32(wl, ACX_REG_ECPU_CONTROL);
+
+	wl12xx_debug(DEBUG_BOOT, "halt boot_data 0x%x", boot_data);
+
+	/* 10.
check that ECPU_CONTROL_HALT bits are set in + * pWhalBus->uBootData and start uploading firmware + */ + if ((boot_data & ECPU_CONTROL_HALT) == 0) { + wl12xx_error("boot failed, ECPU_CONTROL_HALT not set"); + ret = -EIO; + goto out; + } + + ret = wl->chip.op_upload_fw(wl); + if (ret < 0) + goto out; + + /* 10.5 start firmware */ + ret = wl12xx_boot_run_firmware(wl); + if (ret < 0) + goto out; + + /* Get and save the firmware version */ + wl12xx_acx_fw_version(wl, wl->chip.fw_ver, sizeof(wl->chip.fw_ver)); + +out: + return ret; +} + +static int wl1251_mem_cfg(struct wl12xx *wl) +{ + struct wl1251_acx_config_memory mem_conf; + int ret, i; + + wl12xx_debug(DEBUG_ACX, "wl1251 mem cfg"); + + /* memory config */ + mem_conf.mem_config.num_stations = cpu_to_le16(DEFAULT_NUM_STATIONS); + mem_conf.mem_config.rx_mem_block_num = 35; + mem_conf.mem_config.tx_min_mem_block_num = 64; + mem_conf.mem_config.num_tx_queues = MAX_TX_QUEUES; + mem_conf.mem_config.host_if_options = HOSTIF_PKT_RING; + mem_conf.mem_config.num_ssid_profiles = 1; + mem_conf.mem_config.debug_buffer_size = + cpu_to_le16(TRACE_BUFFER_MAX_SIZE); + + /* RX queue config */ + mem_conf.rx_queue_config.dma_address = 0; + mem_conf.rx_queue_config.num_descs = ACX_RX_DESC_DEF; + mem_conf.rx_queue_config.priority = DEFAULT_RXQ_PRIORITY; + mem_conf.rx_queue_config.type = DEFAULT_RXQ_TYPE; + + /* TX queue config */ + for (i = 0; i < MAX_TX_QUEUES; i++) { + mem_conf.tx_queue_config[i].num_descs = ACX_TX_DESC_DEF; + mem_conf.tx_queue_config[i].attributes = i; + } + + mem_conf.header.id = ACX_MEM_CFG; + mem_conf.header.len = sizeof(struct wl1251_acx_config_memory) - + sizeof(struct acx_header); + mem_conf.header.len -= + (MAX_TX_QUEUE_CONFIGS - mem_conf.mem_config.num_tx_queues) * + sizeof(struct wl1251_acx_tx_queue_config); + + ret = wl12xx_cmd_configure(wl, &mem_conf, + sizeof(struct wl1251_acx_config_memory)); + if (ret < 0) + wl12xx_warning("wl1251 mem config failed: %d", ret); + + return ret; +} + +static int wl1251_hw_init_mem_config(struct wl12xx *wl) +{ + int ret; + + ret = wl1251_mem_cfg(wl); + if (ret < 0) + return ret; + + wl->target_mem_map = kzalloc(sizeof(struct wl1251_acx_mem_map), + GFP_KERNEL); + if (!wl->target_mem_map) { + wl12xx_error("couldn't allocate target memory map"); + return -ENOMEM; + } + + /* we now ask for the firmware built memory map */ + ret = wl12xx_acx_mem_map(wl, wl->target_mem_map, + sizeof(struct wl1251_acx_mem_map)); + if (ret < 0) { + wl12xx_error("couldn't retrieve firmware memory map"); + kfree(wl->target_mem_map); + wl->target_mem_map = NULL; + return ret; + } + + return 0; +} + +static void wl1251_set_ecpu_ctrl(struct wl12xx *wl, u32 flag) +{ + u32 cpu_ctrl; + + /* 10.5.0 run the firmware (I) */ + cpu_ctrl = wl12xx_reg_read32(wl, ACX_REG_ECPU_CONTROL); + + /* 10.5.1 run the firmware (II) */ + cpu_ctrl &= ~flag; + wl12xx_reg_write32(wl, ACX_REG_ECPU_CONTROL, cpu_ctrl); +} + +static void wl1251_target_enable_interrupts(struct wl12xx *wl) +{ + /* Enable target's interrupts */ + wl->intr_mask = WL1251_ACX_INTR_RX0_DATA | + WL1251_ACX_INTR_RX1_DATA | + WL1251_ACX_INTR_TX_RESULT | + WL1251_ACX_INTR_EVENT_A | + WL1251_ACX_INTR_EVENT_B | + WL1251_ACX_INTR_INIT_COMPLETE; + wl12xx_boot_target_enable_interrupts(wl); +} + +static void wl1251_irq_work(struct work_struct *work) +{ + u32 intr; + struct wl12xx *wl = + container_of(work, struct wl12xx, irq_work); + + mutex_lock(&wl->mutex); + + wl12xx_debug(DEBUG_IRQ, "IRQ work"); + + if (wl->state == WL12XX_STATE_OFF) + goto out; + + wl12xx_ps_elp_wakeup(wl); + + 
wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_MASK, WL1251_ACX_INTR_ALL); + + intr = wl12xx_reg_read32(wl, ACX_REG_INTERRUPT_CLEAR); + wl12xx_debug(DEBUG_IRQ, "intr: 0x%x", intr); + + if (wl->data_path) { + wl12xx_spi_mem_read(wl, wl->data_path->rx_control_addr, + &wl->rx_counter, sizeof(u32)); + + /* We handle a firmware bug here */ + switch ((wl->rx_counter - wl->rx_handled) & 0xf) { + case 0: + wl12xx_debug(DEBUG_IRQ, "RX: FW and host in sync"); + intr &= ~WL1251_ACX_INTR_RX0_DATA; + intr &= ~WL1251_ACX_INTR_RX1_DATA; + break; + case 1: + wl12xx_debug(DEBUG_IRQ, "RX: FW +1"); + intr |= WL1251_ACX_INTR_RX0_DATA; + intr &= ~WL1251_ACX_INTR_RX1_DATA; + break; + case 2: + wl12xx_debug(DEBUG_IRQ, "RX: FW +2"); + intr |= WL1251_ACX_INTR_RX0_DATA; + intr |= WL1251_ACX_INTR_RX1_DATA; + break; + default: + wl12xx_warning("RX: FW and host out of sync: %d", + wl->rx_counter - wl->rx_handled); + break; + } + + wl->rx_handled = wl->rx_counter; + + + wl12xx_debug(DEBUG_IRQ, "RX counter: %d", wl->rx_counter); + } + + intr &= wl->intr_mask; + + if (intr == 0) { + wl12xx_debug(DEBUG_IRQ, "INTR is 0"); + wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_MASK, + ~(wl->intr_mask)); + + goto out_sleep; + } + + if (intr & WL1251_ACX_INTR_RX0_DATA) { + wl12xx_debug(DEBUG_IRQ, "WL1251_ACX_INTR_RX0_DATA"); + wl12xx_rx(wl); + } + + if (intr & WL1251_ACX_INTR_RX1_DATA) { + wl12xx_debug(DEBUG_IRQ, "WL1251_ACX_INTR_RX1_DATA"); + wl12xx_rx(wl); + } + + if (intr & WL1251_ACX_INTR_TX_RESULT) { + wl12xx_debug(DEBUG_IRQ, "WL1251_ACX_INTR_TX_RESULT"); + wl12xx_tx_complete(wl); + } + + if (intr & (WL1251_ACX_INTR_EVENT_A | WL1251_ACX_INTR_EVENT_B)) { + wl12xx_debug(DEBUG_IRQ, "WL1251_ACX_INTR_EVENT (0x%x)", intr); + if (intr & WL1251_ACX_INTR_EVENT_A) + wl12xx_event_handle(wl, 0); + else + wl12xx_event_handle(wl, 1); + } + + if (intr & WL1251_ACX_INTR_INIT_COMPLETE) + wl12xx_debug(DEBUG_IRQ, "WL1251_ACX_INTR_INIT_COMPLETE"); + + wl12xx_reg_write32(wl, ACX_REG_INTERRUPT_MASK, ~(wl->intr_mask)); + +out_sleep: + wl12xx_ps_elp_sleep(wl); +out: + mutex_unlock(&wl->mutex); +} + +static int wl1251_hw_init_txq_fill(u8 qid, + struct acx_tx_queue_qos_config *config, + u32 num_blocks) +{ + config->qid = qid; + + switch (qid) { + case QOS_AC_BE: + config->high_threshold = + (QOS_TX_HIGH_BE_DEF * num_blocks) / 100; + config->low_threshold = + (QOS_TX_LOW_BE_DEF * num_blocks) / 100; + break; + case QOS_AC_BK: + config->high_threshold = + (QOS_TX_HIGH_BK_DEF * num_blocks) / 100; + config->low_threshold = + (QOS_TX_LOW_BK_DEF * num_blocks) / 100; + break; + case QOS_AC_VI: + config->high_threshold = + (QOS_TX_HIGH_VI_DEF * num_blocks) / 100; + config->low_threshold = + (QOS_TX_LOW_VI_DEF * num_blocks) / 100; + break; + case QOS_AC_VO: + config->high_threshold = + (QOS_TX_HIGH_VO_DEF * num_blocks) / 100; + config->low_threshold = + (QOS_TX_LOW_VO_DEF * num_blocks) / 100; + break; + default: + wl12xx_error("Invalid TX queue id: %d", qid); + return -EINVAL; + } + + return 0; +} + +static int wl1251_hw_init_tx_queue_config(struct wl12xx *wl) +{ + struct acx_tx_queue_qos_config config; + struct wl1251_acx_mem_map *wl_mem_map = wl->target_mem_map; + int ret, i; + + wl12xx_debug(DEBUG_ACX, "acx tx queue config"); + + config.header.id = ACX_TX_QUEUE_CFG; + config.header.len = sizeof(struct acx_tx_queue_qos_config) - + sizeof(struct acx_header); + + for (i = 0; i < MAX_NUM_OF_AC; i++) { + ret = wl1251_hw_init_txq_fill(i, &config, + wl_mem_map->num_tx_mem_blocks); + if (ret < 0) + return ret; + + ret = wl12xx_cmd_configure(wl, &config, sizeof(config)); + if (ret
< 0) + return ret; + } + + return 0; +} + +static int wl1251_hw_init_data_path_config(struct wl12xx *wl) +{ + int ret; + + /* asking for the data path parameters */ + wl->data_path = kzalloc(sizeof(struct acx_data_path_params_resp), + GFP_KERNEL); + if (!wl->data_path) { + wl12xx_error("Couldn't allocate data path parameters"); + return -ENOMEM; + } + + ret = wl12xx_acx_data_path_params(wl, wl->data_path); + if (ret < 0) { + kfree(wl->data_path); + wl->data_path = NULL; + return ret; + } + + return 0; +} + +static int wl1251_hw_init(struct wl12xx *wl) +{ + struct wl1251_acx_mem_map *wl_mem_map; + int ret; + + ret = wl12xx_hw_init_hwenc_config(wl); + if (ret < 0) + return ret; + + /* Template settings */ + ret = wl12xx_hw_init_templates_config(wl); + if (ret < 0) + return ret; + + /* Default memory configuration */ + ret = wl1251_hw_init_mem_config(wl); + if (ret < 0) + return ret; + + /* Default data path configuration */ + ret = wl1251_hw_init_data_path_config(wl); + if (ret < 0) + goto out_free_memmap; + + /* RX config */ + ret = wl12xx_hw_init_rx_config(wl, + RX_CFG_PROMISCUOUS | RX_CFG_TSF, + RX_FILTER_OPTION_DEF); + /* RX_CONFIG_OPTION_ANY_DST_ANY_BSS, + RX_FILTER_OPTION_FILTER_ALL); */ + if (ret < 0) + goto out_free_data_path; + + /* TX queues config */ + ret = wl1251_hw_init_tx_queue_config(wl); + if (ret < 0) + goto out_free_data_path; + + /* PHY layer config */ + ret = wl12xx_hw_init_phy_config(wl); + if (ret < 0) + goto out_free_data_path; + + /* Beacon filtering */ + ret = wl12xx_hw_init_beacon_filter(wl); + if (ret < 0) + goto out_free_data_path; + + /* Bluetooth WLAN coexistence */ + ret = wl12xx_hw_init_pta(wl); + if (ret < 0) + goto out_free_data_path; + + /* Energy detection */ + ret = wl12xx_hw_init_energy_detection(wl); + if (ret < 0) + goto out_free_data_path; + + /* Beacons and broadcast settings */ + ret = wl12xx_hw_init_beacon_broadcast(wl); + if (ret < 0) + goto out_free_data_path; + + /* Enable data path */ + ret = wl12xx_cmd_data_path(wl, wl->channel, 1); + if (ret < 0) + goto out_free_data_path; + + /* Default power state */ + ret = wl12xx_hw_init_power_auth(wl); + if (ret < 0) + goto out_free_data_path; + + wl_mem_map = wl->target_mem_map; + wl12xx_info("%d tx blocks at 0x%x, %d rx blocks at 0x%x", + wl_mem_map->num_tx_mem_blocks, + wl->data_path->tx_control_addr, + wl_mem_map->num_rx_mem_blocks, + wl->data_path->rx_control_addr); + + return 0; + + out_free_data_path: + kfree(wl->data_path); + + out_free_memmap: + kfree(wl->target_mem_map); + + return ret; +} + +static int wl1251_plt_init(struct wl12xx *wl) +{ + int ret; + + ret = wl1251_hw_init_mem_config(wl); + if (ret < 0) + return ret; + + ret = wl12xx_cmd_data_path(wl, wl->channel, 1); + if (ret < 0) + return ret; + + return 0; +} + +void wl1251_setup(struct wl12xx *wl) +{ + /* FIXME: Is it better to use strncpy here or is this ok?
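(Plain pointer assignment is fine in this case: the names are string literals and the wl12xx_chip fields are const char pointers, so nothing needs to be copied.)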
*/ + wl->chip.fw_filename = WL1251_FW_NAME; + wl->chip.nvs_filename = WL1251_NVS_NAME; + + /* Now we know what chip we're using, so adjust the power on sleep + * time accordingly */ + wl->chip.power_on_sleep = WL1251_POWER_ON_SLEEP; + + wl->chip.intr_cmd_complete = WL1251_ACX_INTR_CMD_COMPLETE; + wl->chip.intr_init_complete = WL1251_ACX_INTR_INIT_COMPLETE; + + wl->chip.op_upload_nvs = wl1251_upload_nvs; + wl->chip.op_upload_fw = wl1251_upload_firmware; + wl->chip.op_boot = wl1251_boot; + wl->chip.op_set_ecpu_ctrl = wl1251_set_ecpu_ctrl; + wl->chip.op_target_enable_interrupts = wl1251_target_enable_interrupts; + wl->chip.op_hw_init = wl1251_hw_init; + wl->chip.op_plt_init = wl1251_plt_init; + + wl->chip.p_table = wl1251_part_table; + wl->chip.acx_reg_table = wl1251_acx_reg_table; + + INIT_WORK(&wl->irq_work, wl1251_irq_work); +} diff --git a/drivers/net/wireless/wl12xx/wl1251.h b/drivers/net/wireless/wl12xx/wl1251.h new file mode 100644 index 00000000000..1f4a4433039 --- /dev/null +++ b/drivers/net/wireless/wl12xx/wl1251.h @@ -0,0 +1,165 @@ +/* + * This file is part of wl12xx + * + * Copyright (C) 2008 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __WL1251_H__ +#define __WL1251_H__ + +#include + +#include "wl12xx.h" +#include "acx.h" + +#define WL1251_FW_NAME "wl1251-fw.bin" +#define WL1251_NVS_NAME "wl1251-nvs.bin" + +#define WL1251_POWER_ON_SLEEP 10 /* in milliseconds */ + +void wl1251_setup(struct wl12xx *wl); + + +struct wl1251_acx_memory { + __le16 num_stations; /* number of STAs to be supported. */ + u16 reserved_1; + + /* + * Number of memory buffers for the RX mem pool. + * The actual number may be less if there are + * not enough blocks left for the minimum num + * of TX ones.
+ */ + u8 rx_mem_block_num; + u8 reserved_2; + u8 num_tx_queues; /* From 1 to 16 */ + u8 host_if_options; /* HOST_IF* */ + u8 tx_min_mem_block_num; + u8 num_ssid_profiles; + __le16 debug_buffer_size; +} __attribute__ ((packed)); + + +#define ACX_RX_DESC_MIN 1 +#define ACX_RX_DESC_MAX 127 +#define ACX_RX_DESC_DEF 32 +struct wl1251_acx_rx_queue_config { + u8 num_descs; + u8 pad; + u8 type; + u8 priority; + __le32 dma_address; +} __attribute__ ((packed)); + +#define ACX_TX_DESC_MIN 1 +#define ACX_TX_DESC_MAX 127 +#define ACX_TX_DESC_DEF 16 +struct wl1251_acx_tx_queue_config { + u8 num_descs; + u8 pad[2]; + u8 attributes; +} __attribute__ ((packed)); + +#define MAX_TX_QUEUE_CONFIGS 5 +#define MAX_TX_QUEUES 4 +struct wl1251_acx_config_memory { + struct acx_header header; + + struct wl1251_acx_memory mem_config; + struct wl1251_acx_rx_queue_config rx_queue_config; + struct wl1251_acx_tx_queue_config tx_queue_config[MAX_TX_QUEUE_CONFIGS]; +} __attribute__ ((packed)); + +struct wl1251_acx_mem_map { + struct acx_header header; + + void *code_start; + void *code_end; + + void *wep_defkey_start; + void *wep_defkey_end; + + void *sta_table_start; + void *sta_table_end; + + void *packet_template_start; + void *packet_template_end; + + void *queue_memory_start; + void *queue_memory_end; + + void *packet_memory_pool_start; + void *packet_memory_pool_end; + + void *debug_buffer1_start; + void *debug_buffer1_end; + + void *debug_buffer2_start; + void *debug_buffer2_end; + + /* Number of blocks FW allocated for TX packets */ + u32 num_tx_mem_blocks; + + /* Number of blocks FW allocated for RX packets */ + u32 num_rx_mem_blocks; +} __attribute__ ((packed)); + +/************************************************************************* + + Host Interrupt Register (WiLink -> Host) + +**************************************************************************/ + +/* RX packet is ready in Xfer buffer #0 */ +#define WL1251_ACX_INTR_RX0_DATA BIT(0) + +/* TX result(s) are in the TX complete buffer */ +#define WL1251_ACX_INTR_TX_RESULT BIT(1) + +/* OBSOLETE */ +#define WL1251_ACX_INTR_TX_XFR BIT(2) + +/* RX packet is ready in Xfer buffer #1 */ +#define WL1251_ACX_INTR_RX1_DATA BIT(3) + +/* Event was entered to Event MBOX #A */ +#define WL1251_ACX_INTR_EVENT_A BIT(4) + +/* Event was entered to Event MBOX #B */ +#define WL1251_ACX_INTR_EVENT_B BIT(5) + +/* OBSOLETE */ +#define WL1251_ACX_INTR_WAKE_ON_HOST BIT(6) + +/* Trace message on MBOX #A */ +#define WL1251_ACX_INTR_TRACE_A BIT(7) + +/* Trace message on MBOX #B */ +#define WL1251_ACX_INTR_TRACE_B BIT(8) + +/* Command processing completion */ +#define WL1251_ACX_INTR_CMD_COMPLETE BIT(9) + +/* Init sequence is done */ +#define WL1251_ACX_INTR_INIT_COMPLETE BIT(14) + +#define WL1251_ACX_INTR_ALL 0xFFFFFFFF + +#endif diff --git a/drivers/net/wireless/wl12xx/wl12xx.h b/drivers/net/wireless/wl12xx/wl12xx.h new file mode 100644 index 00000000000..48641437414 --- /dev/null +++ b/drivers/net/wireless/wl12xx/wl12xx.h @@ -0,0 +1,409 @@ +/* + * This file is part of wl12xx + * + * Copyright (c) 1998-2007 Texas Instruments Incorporated + * Copyright (C) 2008-2009 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef __WL12XX_H__ +#define __WL12XX_H__ + +#include +#include +#include +#include + +#define DRIVER_NAME "wl12xx" +#define DRIVER_PREFIX DRIVER_NAME ": " + +enum { + DEBUG_NONE = 0, + DEBUG_IRQ = BIT(0), + DEBUG_SPI = BIT(1), + DEBUG_BOOT = BIT(2), + DEBUG_MAILBOX = BIT(3), + DEBUG_NETLINK = BIT(4), + DEBUG_EVENT = BIT(5), + DEBUG_TX = BIT(6), + DEBUG_RX = BIT(7), + DEBUG_SCAN = BIT(8), + DEBUG_CRYPT = BIT(9), + DEBUG_PSM = BIT(10), + DEBUG_MAC80211 = BIT(11), + DEBUG_CMD = BIT(12), + DEBUG_ACX = BIT(13), + DEBUG_ALL = ~0, +}; + +#define DEBUG_LEVEL (DEBUG_NONE) + +#define DEBUG_DUMP_LIMIT 1024 + +#define wl12xx_error(fmt, arg...) \ + printk(KERN_ERR DRIVER_PREFIX "ERROR " fmt "\n", ##arg) + +#define wl12xx_warning(fmt, arg...) \ + printk(KERN_WARNING DRIVER_PREFIX "WARNING " fmt "\n", ##arg) + +#define wl12xx_notice(fmt, arg...) \ + printk(KERN_INFO DRIVER_PREFIX fmt "\n", ##arg) + +#define wl12xx_info(fmt, arg...) \ + printk(KERN_DEBUG DRIVER_PREFIX fmt "\n", ##arg) + +#define wl12xx_debug(level, fmt, arg...) \ + do { \ + if (level & DEBUG_LEVEL) \ + printk(KERN_DEBUG DRIVER_PREFIX fmt "\n", ##arg); \ + } while (0) + +#define wl12xx_dump(level, prefix, buf, len) \ + do { \ + if (level & DEBUG_LEVEL) \ + print_hex_dump(KERN_DEBUG, DRIVER_PREFIX prefix, \ + DUMP_PREFIX_OFFSET, 16, 1, \ + buf, \ + min_t(size_t, len, DEBUG_DUMP_LIMIT), \ + 0); \ + } while (0) + +#define wl12xx_dump_ascii(level, prefix, buf, len) \ + do { \ + if (level & DEBUG_LEVEL) \ + print_hex_dump(KERN_DEBUG, DRIVER_PREFIX prefix, \ + DUMP_PREFIX_OFFSET, 16, 1, \ + buf, \ + min_t(size_t, len, DEBUG_DUMP_LIMIT), \ + true); \ + } while (0) + +#define WL12XX_DEFAULT_RX_CONFIG (CFG_UNI_FILTER_EN | \ + CFG_BSSID_FILTER_EN) + +#define WL12XX_DEFAULT_RX_FILTER (CFG_RX_PRSP_EN | \ + CFG_RX_MGMT_EN | \ + CFG_RX_DATA_EN | \ + CFG_RX_CTL_EN | \ + CFG_RX_BCN_EN | \ + CFG_RX_AUTH_EN | \ + CFG_RX_ASSOC_EN) + + +struct boot_attr { + u32 radio_type; + u8 mac_clock; + u8 arm_clock; + int firmware_debug; + u32 minor; + u32 major; + u32 bugfix; +}; + +enum wl12xx_state { + WL12XX_STATE_OFF, + WL12XX_STATE_ON, + WL12XX_STATE_PLT, +}; + +enum wl12xx_partition_type { + PART_DOWN, + PART_WORK, + PART_DRPW, + + PART_TABLE_LEN +}; + +struct wl12xx_partition { + u32 size; + u32 start; +}; + +struct wl12xx_partition_set { + struct wl12xx_partition mem; + struct wl12xx_partition reg; +}; + +struct wl12xx; + +/* FIXME: I'm not sure about this structure name */ +struct wl12xx_chip { + u32 id; + + const char *fw_filename; + const char *nvs_filename; + + char fw_ver[21]; + + unsigned int power_on_sleep; + int intr_cmd_complete; + int intr_init_complete; + + int (*op_upload_fw)(struct wl12xx *wl); + int (*op_upload_nvs)(struct wl12xx *wl); + int (*op_boot)(struct wl12xx *wl); + void (*op_set_ecpu_ctrl)(struct wl12xx *wl, u32 flag); + void (*op_target_enable_interrupts)(struct wl12xx *wl); + int (*op_hw_init)(struct wl12xx *wl); + int (*op_plt_init)(struct wl12xx *wl); + + struct wl12xx_partition_set *p_table; + enum wl12xx_acx_int_reg *acx_reg_table; +}; + +struct 
wl12xx_stats { + struct acx_statistics *fw_stats; + unsigned long fw_stats_update; + + unsigned int retry_count; + unsigned int excessive_retries; +}; + +struct wl12xx_debugfs { + struct dentry *rootdir; + struct dentry *fw_statistics; + + struct dentry *tx_internal_desc_overflow; + + struct dentry *rx_out_of_mem; + struct dentry *rx_hdr_overflow; + struct dentry *rx_hw_stuck; + struct dentry *rx_dropped; + struct dentry *rx_fcs_err; + struct dentry *rx_xfr_hint_trig; + struct dentry *rx_path_reset; + struct dentry *rx_reset_counter; + + struct dentry *dma_rx_requested; + struct dentry *dma_rx_errors; + struct dentry *dma_tx_requested; + struct dentry *dma_tx_errors; + + struct dentry *isr_cmd_cmplt; + struct dentry *isr_fiqs; + struct dentry *isr_rx_headers; + struct dentry *isr_rx_mem_overflow; + struct dentry *isr_rx_rdys; + struct dentry *isr_irqs; + struct dentry *isr_tx_procs; + struct dentry *isr_decrypt_done; + struct dentry *isr_dma0_done; + struct dentry *isr_dma1_done; + struct dentry *isr_tx_exch_complete; + struct dentry *isr_commands; + struct dentry *isr_rx_procs; + struct dentry *isr_hw_pm_mode_changes; + struct dentry *isr_host_acknowledges; + struct dentry *isr_pci_pm; + struct dentry *isr_wakeups; + struct dentry *isr_low_rssi; + + struct dentry *wep_addr_key_count; + struct dentry *wep_default_key_count; + /* skipping wep.reserved */ + struct dentry *wep_key_not_found; + struct dentry *wep_decrypt_fail; + struct dentry *wep_packets; + struct dentry *wep_interrupt; + + struct dentry *pwr_ps_enter; + struct dentry *pwr_elp_enter; + struct dentry *pwr_missing_bcns; + struct dentry *pwr_wake_on_host; + struct dentry *pwr_wake_on_timer_exp; + struct dentry *pwr_tx_with_ps; + struct dentry *pwr_tx_without_ps; + struct dentry *pwr_rcvd_beacons; + struct dentry *pwr_power_save_off; + struct dentry *pwr_enable_ps; + struct dentry *pwr_disable_ps; + struct dentry *pwr_fix_tsf_ps; + /* skipping cont_miss_bcns_spread for now */ + struct dentry *pwr_rcvd_awake_beacons; + + struct dentry *mic_rx_pkts; + struct dentry *mic_calc_failure; + + struct dentry *aes_encrypt_fail; + struct dentry *aes_decrypt_fail; + struct dentry *aes_encrypt_packets; + struct dentry *aes_decrypt_packets; + struct dentry *aes_encrypt_interrupt; + struct dentry *aes_decrypt_interrupt; + + struct dentry *event_heart_beat; + struct dentry *event_calibration; + struct dentry *event_rx_mismatch; + struct dentry *event_rx_mem_empty; + struct dentry *event_rx_pool; + struct dentry *event_oom_late; + struct dentry *event_phy_transmit_error; + struct dentry *event_tx_stuck; + + struct dentry *ps_pspoll_timeouts; + struct dentry *ps_upsd_timeouts; + struct dentry *ps_upsd_max_sptime; + struct dentry *ps_upsd_max_apturn; + struct dentry *ps_pspoll_max_apturn; + struct dentry *ps_pspoll_utilization; + struct dentry *ps_upsd_utilization; + + struct dentry *rxpipe_rx_prep_beacon_drop; + struct dentry *rxpipe_descr_host_int_trig_rx_data; + struct dentry *rxpipe_beacon_buffer_thres_host_int_trig_rx_data; + struct dentry *rxpipe_missed_beacon_host_int_trig_rx_data; + struct dentry *rxpipe_tx_xfr_host_int_trig_rx_data; + + struct dentry *tx_queue_len; + + struct dentry *retry_count; + struct dentry *excessive_retries; +}; + +struct wl12xx { + struct ieee80211_hw *hw; + bool mac80211_registered; + + struct spi_device *spi; + + void (*set_power)(bool enable); + int irq; + + enum wl12xx_state state; + struct mutex mutex; + + int physical_mem_addr; + int physical_reg_addr; + int virtual_mem_addr; + int virtual_reg_addr; + + 
struct wl12xx_chip chip; + + int cmd_box_addr; + int event_box_addr; + struct boot_attr boot_attr; + + u8 *fw; + size_t fw_len; + u8 *nvs; + size_t nvs_len; + + u8 bssid[ETH_ALEN]; + u8 mac_addr[ETH_ALEN]; + u8 bss_type; + u8 listen_int; + int channel; + + void *target_mem_map; + struct acx_data_path_params_resp *data_path; + + /* Number of TX packets transferred to the FW, modulo 16 */ + u32 data_in_count; + + /* Frames scheduled for transmission, not handled yet */ + struct sk_buff_head tx_queue; + bool tx_queue_stopped; + + struct work_struct tx_work; + struct work_struct filter_work; + + /* Pending TX frames */ + struct sk_buff *tx_frames[16]; + + /* + * Index pointing to the next TX complete entry + * in the cyclic TX complete array we get from + * the FW. + */ + u32 next_tx_complete; + + /* FW Rx counter */ + u32 rx_counter; + + /* Rx frames handled */ + u32 rx_handled; + + /* Current double buffer */ + u32 rx_current_buffer; + u32 rx_last_id; + + /* The target interrupt mask */ + u32 intr_mask; + struct work_struct irq_work; + + /* The mbox event mask */ + u32 event_mask; + + /* Mailbox pointers */ + u32 mbox_ptr[2]; + + /* Are we currently scanning */ + bool scanning; + + /* Our association ID */ + u16 aid; + + /* Default key (for WEP) */ + u32 default_key; + + unsigned int tx_mgmt_frm_rate; + unsigned int tx_mgmt_frm_mod; + + unsigned int rx_config; + unsigned int rx_filter; + + /* is firmware in elp mode */ + bool elp; + + /* we can be in psm, but not in elp, we have to differentiate */ + bool psm; + + /* PSM mode requested */ + bool psm_requested; + + /* in dBm */ + int power_level; + + struct wl12xx_stats stats; + struct wl12xx_debugfs debugfs; +}; + +int wl12xx_plt_start(struct wl12xx *wl); +int wl12xx_plt_stop(struct wl12xx *wl); + +#define DEFAULT_HW_GEN_MODULATION_TYPE CCK_LONG /* Long Preamble */ +#define DEFAULT_HW_GEN_TX_RATE RATE_2MBPS +#define JOIN_TIMEOUT 5000 /* 5000 milliseconds to join */ + +#define WL12XX_DEFAULT_POWER_LEVEL 20 + +#define WL12XX_TX_QUEUE_MAX_LENGTH 20 + +/* Different chips need different sleep times after power on. WL1271 needs + * 200ms, WL1251 needs only 10ms. By default we use 200ms, but as soon as we + * know the chip ID, we change the sleep value in the wl12xx chip structure, + * so in subsequent power ons, we don't waste more time than needed.
*/ +#define WL12XX_DEFAULT_POWER_ON_SLEEP 200 + +#define CHIP_ID_1251_PG10 (0x7010101) +#define CHIP_ID_1251_PG11 (0x7020101) +#define CHIP_ID_1251_PG12 (0x7030101) +#define CHIP_ID_1271_PG10 (0x4030101) + +#endif diff --git a/drivers/net/wireless/wl12xx/wl12xx_80211.h b/drivers/net/wireless/wl12xx/wl12xx_80211.h new file mode 100644 index 00000000000..657c2dbcb7d --- /dev/null +++ b/drivers/net/wireless/wl12xx/wl12xx_80211.h @@ -0,0 +1,156 @@ +#ifndef __WL12XX_80211_H__ +#define __WL12XX_80211_H__ + +#include /* ETH_ALEN */ + +/* RATES */ +#define IEEE80211_CCK_RATE_1MB 0x02 +#define IEEE80211_CCK_RATE_2MB 0x04 +#define IEEE80211_CCK_RATE_5MB 0x0B +#define IEEE80211_CCK_RATE_11MB 0x16 +#define IEEE80211_OFDM_RATE_6MB 0x0C +#define IEEE80211_OFDM_RATE_9MB 0x12 +#define IEEE80211_OFDM_RATE_12MB 0x18 +#define IEEE80211_OFDM_RATE_18MB 0x24 +#define IEEE80211_OFDM_RATE_24MB 0x30 +#define IEEE80211_OFDM_RATE_36MB 0x48 +#define IEEE80211_OFDM_RATE_48MB 0x60 +#define IEEE80211_OFDM_RATE_54MB 0x6C +#define IEEE80211_BASIC_RATE_MASK 0x80 + +#define IEEE80211_CCK_RATE_1MB_MASK (1<<0) +#define IEEE80211_CCK_RATE_2MB_MASK (1<<1) +#define IEEE80211_CCK_RATE_5MB_MASK (1<<2) +#define IEEE80211_CCK_RATE_11MB_MASK (1<<3) +#define IEEE80211_OFDM_RATE_6MB_MASK (1<<4) +#define IEEE80211_OFDM_RATE_9MB_MASK (1<<5) +#define IEEE80211_OFDM_RATE_12MB_MASK (1<<6) +#define IEEE80211_OFDM_RATE_18MB_MASK (1<<7) +#define IEEE80211_OFDM_RATE_24MB_MASK (1<<8) +#define IEEE80211_OFDM_RATE_36MB_MASK (1<<9) +#define IEEE80211_OFDM_RATE_48MB_MASK (1<<10) +#define IEEE80211_OFDM_RATE_54MB_MASK (1<<11) + +#define IEEE80211_CCK_RATES_MASK 0x0000000F +#define IEEE80211_CCK_BASIC_RATES_MASK (IEEE80211_CCK_RATE_1MB_MASK | \ + IEEE80211_CCK_RATE_2MB_MASK) +#define IEEE80211_CCK_DEFAULT_RATES_MASK (IEEE80211_CCK_BASIC_RATES_MASK | \ + IEEE80211_CCK_RATE_5MB_MASK | \ + IEEE80211_CCK_RATE_11MB_MASK) + +#define IEEE80211_OFDM_RATES_MASK 0x00000FF0 +#define IEEE80211_OFDM_BASIC_RATES_MASK (IEEE80211_OFDM_RATE_6MB_MASK | \ + IEEE80211_OFDM_RATE_12MB_MASK | \ + IEEE80211_OFDM_RATE_24MB_MASK) +#define IEEE80211_OFDM_DEFAULT_RATES_MASK (IEEE80211_OFDM_BASIC_RATES_MASK | \ + IEEE80211_OFDM_RATE_9MB_MASK | \ + IEEE80211_OFDM_RATE_18MB_MASK | \ + IEEE80211_OFDM_RATE_36MB_MASK | \ + IEEE80211_OFDM_RATE_48MB_MASK | \ + IEEE80211_OFDM_RATE_54MB_MASK) +#define IEEE80211_DEFAULT_RATES_MASK (IEEE80211_OFDM_DEFAULT_RATES_MASK | \ + IEEE80211_CCK_DEFAULT_RATES_MASK) + + +/* This really should be 8, but not for our firmware */ +#define MAX_SUPPORTED_RATES 32 +#define COUNTRY_STRING_LEN 3 +#define MAX_COUNTRY_TRIPLETS 32 + +/* Headers */ +struct ieee80211_header { + __le16 frame_ctl; + __le16 duration_id; + u8 da[ETH_ALEN]; + u8 sa[ETH_ALEN]; + u8 bssid[ETH_ALEN]; + __le16 seq_ctl; + u8 payload[0]; +} __attribute__ ((packed)); + +struct wl12xx_ie_header { + u8 id; + u8 len; +} __attribute__ ((packed)); + +/* IEs */ + +struct wl12xx_ie_ssid { + struct wl12xx_ie_header header; + char ssid[IW_ESSID_MAX_SIZE]; +} __attribute__ ((packed)); + +struct wl12xx_ie_rates { + struct wl12xx_ie_header header; + u8 rates[MAX_SUPPORTED_RATES]; +} __attribute__ ((packed)); + +struct wl12xx_ie_ds_params { + struct wl12xx_ie_header header; + u8 channel; +} __attribute__ ((packed)); + +struct country_triplet { + u8 channel; + u8 num_channels; + u8 max_tx_power; +} __attribute__ ((packed)); + +struct wl12xx_ie_country { + struct wl12xx_ie_header header; + u8 country_string[COUNTRY_STRING_LEN]; + struct country_triplet triplets[MAX_COUNTRY_TRIPLETS]; +} __attribute__ 
((packed)); + + +/* Templates */ + +struct wl12xx_beacon_template { + struct ieee80211_header header; + __le32 time_stamp[2]; + __le16 beacon_interval; + __le16 capability; + struct wl12xx_ie_ssid ssid; + struct wl12xx_ie_rates rates; + struct wl12xx_ie_rates ext_rates; + struct wl12xx_ie_ds_params ds_params; + struct wl12xx_ie_country country; +} __attribute__ ((packed)); + +struct wl12xx_null_data_template { + struct ieee80211_header header; +} __attribute__ ((packed)); + +struct wl12xx_ps_poll_template { + u16 fc; + u16 aid; + u8 bssid[ETH_ALEN]; + u8 ta[ETH_ALEN]; +} __attribute__ ((packed)); + +struct wl12xx_qos_null_data_template { + struct ieee80211_header header; + __le16 qos_ctl; +} __attribute__ ((packed)); + +struct wl12xx_probe_req_template { + struct ieee80211_header header; + struct wl12xx_ie_ssid ssid; + struct wl12xx_ie_rates rates; + struct wl12xx_ie_rates ext_rates; +} __attribute__ ((packed)); + + +struct wl12xx_probe_resp_template { + struct ieee80211_header header; + __le32 time_stamp[2]; + __le16 beacon_interval; + __le16 capability; + struct wl12xx_ie_ssid ssid; + struct wl12xx_ie_rates rates; + struct wl12xx_ie_rates ext_rates; + struct wl12xx_ie_ds_params ds_params; + struct wl12xx_ie_country country; +} __attribute__ ((packed)); + +#endif diff --git a/include/linux/spi/wl12xx.h b/include/linux/spi/wl12xx.h new file mode 100644 index 00000000000..11430cab2aa --- /dev/null +++ b/include/linux/spi/wl12xx.h @@ -0,0 +1,31 @@ +/* + * This file is part of wl12xx + * + * Copyright (C) 2009 Nokia Corporation + * + * Contact: Kalle Valo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#ifndef _LINUX_SPI_WL12XX_H +#define _LINUX_SPI_WL12XX_H + +struct wl12xx_platform_data { + void (*set_power)(bool enable); +}; + +#endif -- cgit v1.2.3-70-g09d2 From 9dfd6ba353b993d648dcda72480c7ce92cd27c7e Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 6 May 2009 20:34:10 +0300 Subject: mac80211: Update SA Query transaction id length IEEE 802.11w/D8.0 changed the length of the SA Query transaction identifier from 16 to 2 octets. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c52e7fba4e4..dc92359f37e 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -543,7 +543,7 @@ struct ieee80211_tim_ie { u8 virtual_map[1]; } __attribute__ ((packed)); -#define WLAN_SA_QUERY_TR_ID_LEN 16 +#define WLAN_SA_QUERY_TR_ID_LEN 2 struct ieee80211_mgmt { __le16 frame_control; -- cgit v1.2.3-70-g09d2 From 4d5b78c055c76bb563c4a43d2adb92735b3b9405 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 6 May 2009 16:52:51 -0700 Subject: net: Add missing rculist.h include to netdevice.h Otherwise list_for_each_entry_rcu() et al. 
aren't visible and we get build failures in some configurations. Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02882e2ebd4..2af89b662ca 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -39,6 +39,7 @@ #include #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 08ce4c91e44d51bb6c946f2756825a462d53c545 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 7 Apr 2009 23:40:39 +0200 Subject: dlm: Make name input parameter of {,dlm_}new_lockspace() const | fs/gfs2/lock_dlm.c:207: warning: passing argument 1 of 'dlm_new_lockspace' discards qualifiers from pointer target type Signed-off-by: Geert Uytterhoeven Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 4 ++-- include/linux/dlm.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index cd8e2df3c29..82528d9b72e 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -384,7 +384,7 @@ static void threads_stop(void) dlm_astd_stop(); } -static int new_lockspace(char *name, int namelen, void **lockspace, +static int new_lockspace(const char *name, int namelen, void **lockspace, uint32_t flags, int lvblen) { struct dlm_ls *ls; @@ -614,7 +614,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, return error; } -int dlm_new_lockspace(char *name, int namelen, void **lockspace, +int dlm_new_lockspace(const char *name, int namelen, void **lockspace, uint32_t flags, int lvblen) { int error = 0; diff --git a/include/linux/dlm.h b/include/linux/dlm.h index b9cd38603fd..0b3518c4235 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -81,8 +81,8 @@ struct dlm_lksb { * the cluster, the calling node joins it. */ -int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace, - uint32_t flags, int lvblen); +int dlm_new_lockspace(const char *name, int namelen, + dlm_lockspace_t **lockspace, uint32_t flags, int lvblen); /* * dlm_release_lockspace -- cgit v1.2.3-70-g09d2 From 9080b72819650c3a757d173a19bc930d603b79d6 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:13:58 +0000 Subject: sh-sci: remove early_sci_setup() Remove unused early_sci_setup() function from sh-sci. 
Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 14 -------------- include/linux/serial_sci.h | 2 -- 2 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index af5da110d52..5f37f7d3166 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -1082,20 +1082,6 @@ static void __init sci_init_ports(void) } } -int __init early_sci_setup(struct uart_port *port) -{ - if (unlikely(port->line > SCI_NPORTS)) - return -ENODEV; - - sci_init_ports(); - - sci_ports[port->line].port.membase = port->membase; - sci_ports[port->line].port.mapbase = port->mapbase; - sci_ports[port->line].port.type = port->type; - - return 0; -} - #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE /* * Print a string to the serial port trying not to disturb diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 893cc53486b..717059d6791 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -27,6 +27,4 @@ struct plat_sci_port { upf_t flags; /* UPF_* flags */ }; -int early_sci_setup(struct uart_port *port); - #endif /* __LINUX_SERIAL_SCI_H */ -- cgit v1.2.3-70-g09d2 From 501b825d01efb93766c87d29f299851152cf4eb0 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:14:30 +0000 Subject: sh-sci: improve clock framework support Use enable/disable hooks for clock framework integration. Make sure we control the clock for the serial console as well. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 77 +++++++++++++++++++++++++++------------------- include/linux/serial_sci.h | 1 + 2 files changed, 47 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 408624ae7fe..3daf76725ac 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -76,8 +76,10 @@ struct sci_port { int break_flag; #ifdef CONFIG_HAVE_CLK - /* Port clock */ - struct clk *clk; + /* Interface clock */ + struct clk *iclk; + /* Data clock */ + struct clk *dclk; #endif struct list_head node; }; @@ -166,12 +168,12 @@ static void h8300_sci_config(struct uart_port *port, unsigned int ctrl) *mstpcrl &= ~mask; } -static inline void h8300_sci_enable(struct uart_port *port) +static void h8300_sci_enable(struct uart_port *port) { h8300_sci_config(port, sci_enable); } -static inline void h8300_sci_disable(struct uart_port *port) +static void h8300_sci_disable(struct uart_port *port) { h8300_sci_config(port, sci_disable); } @@ -742,13 +744,34 @@ static int sci_notifier(struct notifier_block *self, (phase == CPUFREQ_RESUMECHANGE)) { spin_lock_irqsave(&priv->lock, flags); list_for_each_entry(sci_port, &priv->ports, node) - sci_port->port.uartclk = clk_get_rate(sci_port->clk); + sci_port->port.uartclk = clk_get_rate(sci_port->dclk); spin_unlock_irqrestore(&priv->lock, flags); } return NOTIFY_OK; } + +static void sci_clk_enable(struct uart_port *port) +{ + struct sci_port *sci_port = to_sci_port(port); + + clk_enable(sci_port->dclk); + sci_port->port.uartclk = clk_get_rate(sci_port->dclk); + + if (sci_port->iclk) + clk_enable(sci_port->iclk); +} + +static void sci_clk_disable(struct uart_port *port) +{ + struct sci_port *sci_port = to_sci_port(port); + + if (sci_port->iclk) + clk_disable(sci_port->iclk); + + clk_disable(sci_port->dclk); +} #endif static int sci_request_irq(struct sci_port *port) @@ -880,10 +903,6 @@ static int sci_startup(struct uart_port *port) if (s->enable) s->enable(port); -#ifdef 
CONFIG_HAVE_CLK - s->clk = clk_get(NULL, "module_clk"); -#endif - sci_request_irq(s); sci_start_tx(port); sci_start_rx(port, 1); @@ -901,11 +920,6 @@ static void sci_shutdown(struct uart_port *port) if (s->disable) s->disable(port); - -#ifdef CONFIG_HAVE_CLK - clk_put(s->clk); - s->clk = NULL; -#endif } static void sci_set_termios(struct uart_port *port, struct ktermios *termios, @@ -1048,7 +1062,8 @@ static struct uart_ops sci_uart_ops = { #endif }; -static void __devinit sci_init_single(struct sci_port *sci_port, +static void __devinit sci_init_single(struct platform_device *dev, + struct sci_port *sci_port, unsigned int index, struct plat_sci_port *p) { @@ -1064,14 +1079,10 @@ static void __devinit sci_init_single(struct sci_port *sci_port, #endif sci_port->port.uartclk = CONFIG_CPU_CLOCK; #elif defined(CONFIG_HAVE_CLK) - /* - * XXX: We should use a proper SCI/SCIF clock - */ - { - struct clk *clk = clk_get(NULL, "module_clk"); - sci_port->port.uartclk = clk_get_rate(clk); - clk_put(clk); - } + sci_port->iclk = p->clk ? clk_get(&dev->dev, p->clk) : NULL; + sci_port->dclk = clk_get(&dev->dev, "module_clk"); + sci_port->enable = sci_clk_enable; + sci_port->disable = sci_clk_disable; #else #error "Need a valid uartclk" #endif @@ -1085,9 +1096,11 @@ static void __devinit sci_init_single(struct sci_port *sci_port, sci_port->port.irq = p->irqs[SCIx_TXI_IRQ]; sci_port->port.flags = p->flags; + sci_port->port.dev = &dev->dev; sci_port->type = sci_port->port.type = p->type; memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs)); + } #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE @@ -1111,14 +1124,21 @@ static void serial_console_write(struct console *co, const char *s, unsigned count) { struct uart_port *port = co->data; + struct sci_port *sci_port = to_sci_port(port); unsigned short bits; - uart_console_write(co->data, s, count, serial_console_putchar); + if (sci_port->enable) + sci_port->enable(port); + + uart_console_write(port, s, count, serial_console_putchar); /* wait until fifo is empty and last bit has been transmitted */ bits = SCxSR_TDxE(port) | SCxSR_TEND(port); while ((sci_in(port, SCxSR) & bits) != bits) cpu_relax(); + + if (sci_port->disable) + sci_port->disable(port); } static int __init serial_console_setup(struct console *co, char *options) @@ -1152,11 +1172,6 @@ static int __init serial_console_setup(struct console *co, char *options) if (!port->type) return -ENODEV; -#ifdef CONFIG_HAVE_CLK - if (!sci_port->clk) - sci_port->clk = clk_get(NULL, "module_clk"); -#endif - sci_config_port(port, 0); if (sci_port->enable) @@ -1171,6 +1186,7 @@ static int __init serial_console_setup(struct console *co, char *options) if (ret == 0) sci_stop_rx(port); #endif + /* TODO: disable clock */ return ret; } @@ -1250,8 +1266,7 @@ static int __devinit sci_probe_single(struct platform_device *dev, return 0; } - sciport->port.dev = &dev->dev; - sci_init_single(sciport, index, p); + sci_init_single(dev, sciport, index, p); ret = uart_add_one_port(&sci_uart_driver, &sciport->port); if (ret) diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 717059d6791..1c297ddc9d5 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -25,6 +25,7 @@ struct plat_sci_port { unsigned int irqs[SCIx_NR_IRQS]; /* ERI, RXI, TXI, BRI */ unsigned int type; /* SCI / SCIF / IRDA */ upf_t flags; /* UPF_* flags */ + char *clk; /* clock string */ }; #endif /* __LINUX_SERIAL_SCI_H */ -- cgit v1.2.3-70-g09d2 From 3df5edad87a998273aa5a9a8c728c05d855ad00e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra
Date: Fri, 8 May 2009 18:52:22 +0200 Subject: perf_counter: rework ioctl()s Corey noticed that ioctl()s on grouped counters didn't work on the whole group. This extends the ioctl() interface to take a second argument that is interpreted as a flags field. We then provide PERF_IOC_FLAG_GROUP to toggle the behaviour. Having this flag gives the greatest flexibility, allowing you to individually enable/disable/reset counters in a group, or all together. [ Impact: fix group counter enable/disable semantics ] Reported-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090508170028.837558214@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++-- kernel/perf_counter.c | 104 ++++++++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 00081d84169..88f863ec274 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -157,10 +157,14 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_ENABLE _IOW('$', 0, u32) +#define PERF_COUNTER_IOC_DISABLE _IOW('$', 1, u32) #define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) -#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_RESET _IOW('$', 3, u32) + +enum perf_counter_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; /* * Structure of the page that can be mapped via mmap diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fdb0d242127..f4883f1f47e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -82,7 +82,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) * add it straight to the context's counter list, or to the group * leader's sibling list: */ - if (counter->group_leader == counter) + if (group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); @@ -385,24 +385,6 @@ static void perf_counter_disable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } -/* - * Disable a counter and all its children. - */ -static void perf_counter_disable_family(struct perf_counter *counter) -{ - struct perf_counter *child; - - perf_counter_disable(counter); - - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_disable(child); - mutex_unlock(&counter->mutex); -} - static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, @@ -753,24 +735,6 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) return 0; } -/* - * Enable a counter and all its children. 
- */ -static void perf_counter_enable_family(struct perf_counter *counter) -{ - struct perf_counter *child; - - perf_counter_enable(counter); - - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_enable(child); - mutex_unlock(&counter->mutex); -} - void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { @@ -1307,31 +1271,79 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_counter_reset(struct perf_counter *counter) { + (void)perf_counter_read(counter); atomic_set(&counter->count, 0); + perf_counter_update_userpage(counter); +} + +static void perf_counter_for_each_sibling(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *sibling; + + spin_lock_irq(&ctx->lock); + counter = counter->group_leader; + + func(counter); + list_for_each_entry(sibling, &counter->sibling_list, list_entry) + func(sibling); + spin_unlock_irq(&ctx->lock); +} + +static void perf_counter_for_each_child(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + func(counter); + list_for_each_entry(child, &counter->child_list, child_list) + func(child); + mutex_unlock(&counter->mutex); +} + +static void perf_counter_for_each(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + perf_counter_for_each_sibling(counter, func); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_for_each_sibling(child, func); + mutex_unlock(&counter->mutex); } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; - int err = 0; + void (*func)(struct perf_counter *); + u32 flags = arg; switch (cmd) { case PERF_COUNTER_IOC_ENABLE: - perf_counter_enable_family(counter); + func = perf_counter_enable; break; case PERF_COUNTER_IOC_DISABLE: - perf_counter_disable_family(counter); - break; - case PERF_COUNTER_IOC_REFRESH: - err = perf_counter_refresh(counter, arg); + func = perf_counter_disable; break; case PERF_COUNTER_IOC_RESET: - perf_counter_reset(counter); + func = perf_counter_reset; break; + + case PERF_COUNTER_IOC_REFRESH: + return perf_counter_refresh(counter, arg); default: - err = -ENOTTY; + return -ENOTTY; } - return err; + + if (flags & PERF_IOC_FLAG_GROUP) + perf_counter_for_each(counter, func); + else + perf_counter_for_each_child(counter, func); + + return 0; } /* -- cgit v1.2.3-70-g09d2 From a85f61abe11a46553c4562e74edb27ebc782aeb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:23 +0200 Subject: perf_counter: add PERF_RECORD_CONFIG Much like CONFIG_RECORD_GROUP records the hw_event.config to identify the values, allow to record this for all counters. 
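For illustration only (this snippet is not part of the patch; the field names follow the layout comment updated below, and the record bits other than PERF_RECORD_CONFIG already exist), a consumer would ask for the config value by setting the new bit in its hw_event:

	struct perf_counter_hw_event hw_event = { 0 };

	hw_event.config      = PERF_COUNT_INSTRUCTIONS;
	hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_CONFIG;

	/* each overflow sample in the mmap()ed output buffer now carries
	 * one additional u64 holding hw_event.config */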
[ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170028.923228280@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 88f863ec274..0e6303d36c6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -104,6 +104,7 @@ enum perf_counter_record_format { PERF_RECORD_ADDR = 1U << 3, PERF_RECORD_GROUP = 1U << 4, PERF_RECORD_CALLCHAIN = 1U << 5, + PERF_RECORD_CONFIG = 1U << 6, }; /* @@ -258,6 +259,7 @@ enum perf_event_type { * { u32 pid, tid; } && PERF_RECORD_TID * { u64 time; } && PERF_RECORD_TIME * { u64 addr; } && PERF_RECORD_ADDR + * { u64 config; } && PERF_RECORD_CONFIG * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f4883f1f47e..c615f52aa40 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1994,6 +1994,11 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_CONFIG) { + header.type |= PERF_RECORD_CONFIG; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -2029,6 +2034,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_ADDR) perf_output_put(&handle, addr); + if (record_type & PERF_RECORD_CONFIG) + perf_output_put(&handle, counter->hw_event.config); + /* * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. */ -- cgit v1.2.3-70-g09d2 From f370e1e2f195ec1e6420e26fc83e0319595db578 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:24 +0200 Subject: perf_counter: add PERF_RECORD_CPU Allow recording the CPU number the event was generated on. RFC: this leaves a u32 as reserved, should we fill in the node_id() there, or leave this open for future extension, as userspace can already easily do the cpu->node mapping if needed.
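A rough sketch of the reader side (illustrative only; the struct name is made up, and the authoritative layout is the comment block above enum perf_event_type): the cpu/res pair is appended after the optional config word, so a parser walks the optional fields in declaration order:

	struct sample_cpu { u32 cpu, res; };	/* mirrors the layout comment */

	if (header.type & PERF_RECORD_CPU) {
		struct sample_cpu c;
		memcpy(&c, p, sizeof(c));	/* p is a byte cursor into the sample */
		p += sizeof(c);
	}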
[ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170029.008627711@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0e6303d36c6..614f921d616 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -105,6 +105,7 @@ enum perf_counter_record_format { PERF_RECORD_GROUP = 1U << 4, PERF_RECORD_CALLCHAIN = 1U << 5, PERF_RECORD_CONFIG = 1U << 6, + PERF_RECORD_CPU = 1U << 7, }; /* @@ -260,6 +261,7 @@ enum perf_event_type { * { u64 time; } && PERF_RECORD_TIME * { u64 addr; } && PERF_RECORD_ADDR * { u64 config; } && PERF_RECORD_CONFIG + * { u32 cpu, res; } && PERF_RECORD_CPU * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c615f52aa40..d850a1fb8d4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1956,6 +1956,9 @@ static void perf_counter_output(struct perf_counter *counter, struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; + struct { + u32 cpu, reserved; + } cpu_entry; header.type = 0; header.size = sizeof(header); @@ -1999,6 +2002,13 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_CPU) { + header.type |= PERF_RECORD_CPU; + header.size += sizeof(cpu_entry); + + cpu_entry.cpu = raw_smp_processor_id(); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -2037,6 +2047,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_CONFIG) perf_output_put(&handle, counter->hw_event.config); + if (record_type & PERF_RECORD_CPU) + perf_output_put(&handle, cpu_entry); + /* * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. */ -- cgit v1.2.3-70-g09d2 From 4671c79408a3f8a5a6a45e39c4c164dada3a5678 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 16:27:41 -0400 Subject: tracing: add trace_set_clr_event to export event enabling function Other parts of the kernel may need to be able to enable or disable specific events. Especially parts that create trace events. 
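For example (illustrative only; the "mysub" names are made up), a subsystem could turn its own events on at init time and off again on teardown, using the kerneldoc below:

	/* enable one event; a NULL system would match any system */
	if (trace_set_clr_event("mysub", "mysub_event", 1))
		pr_warning("mysub: no such trace event\n");

	/* later: disable every event in the "mysub" subsystem */
	trace_set_clr_event("mysub", NULL, 0);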
[ Impact: allow enabling of trace events by those that create the event ] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace_events.c | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 662c1becf36..bae51ddfabd 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -127,6 +127,8 @@ extern int trace_define_field(struct ftrace_event_call *call, char *type, #define is_signed_type(type) (((type)(-1)) < 0) +int trace_set_clr_event(const char *system, const char *event, int set); + /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2eecb87e42d..0eec0c55dd8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -177,6 +177,23 @@ static int ftrace_set_clr_event(char *buf, int set) return __ftrace_set_clr_event(match, sub, event, set); } +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ + return __ftrace_set_clr_event(NULL, system, event, set); +} + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.3-70-g09d2 From 7e044e056a6aa0dc695db50461d7b326fde15e8b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 9 May 2009 16:08:05 -0700 Subject: Input: serio - do not use deprecated dev.power.power_state Signed-off-by: Dmitry Torokhov --- drivers/input/gameport/gameport.c | 6 +++--- drivers/input/serio/i8042.c | 17 +++++++++-------- drivers/input/serio/serio.c | 37 +++++++++++++++++++------------------ include/linux/gameport.h | 3 ++- include/linux/serio.h | 9 ++++++--- 5 files changed, 39 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index 0279d6983cc..ac11be08585 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -723,7 +723,7 @@ int __gameport_register_driver(struct gameport_driver *drv, struct module *owner * Temporarily disable automatic binding because probing * takes long time and we are better off doing it in kgameportd */ - drv->ignore = 1; + drv->ignore = true; error = driver_register(&drv->driver); if (error) { @@ -736,7 +736,7 @@ int __gameport_register_driver(struct gameport_driver *drv, struct module *owner /* * Reset ignore flag and let kgameportd bind the driver to free ports */ - drv->ignore = 0; + drv->ignore = false; error = gameport_queue_event(drv, NULL, GAMEPORT_ATTACH_DRIVER); if (error) { driver_unregister(&drv->driver); @@ -753,7 +753,7 @@ void gameport_unregister_driver(struct gameport_driver *drv) mutex_lock(&gameport_mutex); - drv->ignore = 1; /* so gameport_find_driver ignores it */ + drv->ignore = true; /* so gameport_find_driver ignores it */ gameport_remove_pending_events(drv); start_over: diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 3cffb704e37..f919bf57293 100644 --- 
a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -10,6 +10,7 @@ * the Free Software Foundation. */ +#include #include #include #include @@ -921,6 +922,9 @@ static void i8042_dritek_enable(void) #endif #ifdef CONFIG_PM + +static bool i8042_suspended; + /* * Here we try to restore the original BIOS settings. We only want to * do that once, when we really suspend, not when we taking memory @@ -930,11 +934,9 @@ static void i8042_dritek_enable(void) static int i8042_suspend(struct platform_device *dev, pm_message_t state) { - if (dev->dev.power.power_state.event != state.event) { - if (state.event == PM_EVENT_SUSPEND) - i8042_controller_reset(); - - dev->dev.power.power_state = state; + if (!i8042_suspended && state.event == PM_EVENT_SUSPEND) { + i8042_controller_reset(); + i8042_suspended = true; } return 0; @@ -952,7 +954,7 @@ static int i8042_resume(struct platform_device *dev) /* * Do not bother with restoring state if we haven't suspened yet */ - if (dev->dev.power.power_state.event == PM_EVENT_ON) + if (!i8042_suspended) return 0; error = i8042_controller_check(); @@ -998,10 +1000,9 @@ static int i8042_resume(struct platform_device *dev) if (i8042_ports[I8042_KBD_PORT_NO].serio) i8042_enable_kbd_port(); + i8042_suspended = false; i8042_interrupt(0, NULL); - dev->dev.power.power_state = PMSG_ON; - return 0; } #endif /* CONFIG_PM */ diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c index 8d2df5d2d5a..fb17573f8f2 100644 --- a/drivers/input/serio/serio.c +++ b/drivers/input/serio/serio.c @@ -495,9 +495,9 @@ static ssize_t serio_set_bind_mode(struct device *dev, struct device_attribute * retval = count; if (!strncmp(buf, "manual", count)) { - serio->manual_bind = 1; + serio->manual_bind = true; } else if (!strncmp(buf, "auto", count)) { - serio->manual_bind = 0; + serio->manual_bind = false; } else { retval = -EINVAL; } @@ -570,7 +570,7 @@ static void serio_add_port(struct serio *serio) "serio: device_add() failed for %s (%s), error: %d\n", serio->phys, serio->name, error); else { - serio->registered = 1; + serio->registered = true; error = sysfs_create_group(&serio->dev.kobj, &serio_id_attr_group); if (error) printk(KERN_ERR @@ -606,7 +606,7 @@ static void serio_destroy_port(struct serio *serio) if (serio->registered) { sysfs_remove_group(&serio->dev.kobj, &serio_id_attr_group); device_del(&serio->dev); - serio->registered = 0; + serio->registered = false; } list_del_init(&serio->node); @@ -750,9 +750,9 @@ static ssize_t serio_driver_set_bind_mode(struct device_driver *drv, const char retval = count; if (!strncmp(buf, "manual", count)) { - serio_drv->manual_bind = 1; + serio_drv->manual_bind = true; } else if (!strncmp(buf, "auto", count)) { - serio_drv->manual_bind = 0; + serio_drv->manual_bind = false; } else { retval = -EINVAL; } @@ -812,7 +812,7 @@ static void serio_attach_driver(struct serio_driver *drv) int __serio_register_driver(struct serio_driver *drv, struct module *owner, const char *mod_name) { - int manual_bind = drv->manual_bind; + bool manual_bind = drv->manual_bind; int error; drv->driver.bus = &serio_bus; @@ -823,7 +823,7 @@ int __serio_register_driver(struct serio_driver *drv, struct module *owner, cons * Temporarily disable automatic binding because probing * takes long time and we are better off doing it in kseriod */ - drv->manual_bind = 1; + drv->manual_bind = true; error = driver_register(&drv->driver); if (error) { @@ -838,7 +838,7 @@ int __serio_register_driver(struct serio_driver *drv, struct module *owner, cons * driver to 
free ports */ if (!manual_bind) { - drv->manual_bind = 0; + drv->manual_bind = false; error = serio_queue_event(drv, NULL, SERIO_ATTACH_DRIVER); if (error) { driver_unregister(&drv->driver); @@ -856,7 +856,7 @@ void serio_unregister_driver(struct serio_driver *drv) mutex_lock(&serio_mutex); - drv->manual_bind = 1; /* so serio_find_driver ignores it */ + drv->manual_bind = true; /* so serio_find_driver ignores it */ serio_remove_pending_events(drv); start_over: @@ -933,11 +933,11 @@ static int serio_uevent(struct device *dev, struct kobj_uevent_env *env) #ifdef CONFIG_PM static int serio_suspend(struct device *dev, pm_message_t state) { - if (dev->power.power_state.event != state.event) { - if (state.event == PM_EVENT_SUSPEND) - serio_cleanup(to_serio_port(dev)); + struct serio *serio = to_serio_port(dev); - dev->power.power_state = state; + if (!serio->suspended && state.event == PM_EVENT_SUSPEND) { + serio_cleanup(serio); + serio->suspended = true; } return 0; @@ -945,14 +945,15 @@ static int serio_suspend(struct device *dev, pm_message_t state) static int serio_resume(struct device *dev) { + struct serio *serio = to_serio_port(dev); + /* * Driver reconnect can take a while, so better let kseriod * deal with it. */ - if (dev->power.power_state.event != PM_EVENT_ON) { - dev->power.power_state = PMSG_ON; - serio_queue_event(to_serio_port(dev), NULL, - SERIO_RECONNECT_PORT); + if (serio->suspended) { + serio->suspended = false; + serio_queue_event(serio, NULL, SERIO_RECONNECT_PORT); } return 0; diff --git a/include/linux/gameport.h b/include/linux/gameport.h index 0cd825f7363..1bc08541c2b 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -11,6 +11,7 @@ #ifdef __KERNEL__ #include +#include #include #include #include @@ -62,7 +63,7 @@ struct gameport_driver { struct device_driver driver; - unsigned int ignore; + bool ignore; }; #define to_gameport_driver(d) container_of(d, struct gameport_driver, driver) diff --git a/include/linux/serio.h b/include/linux/serio.h index e0417e4d3f1..126d24c9eaa 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -15,6 +15,7 @@ #ifdef __KERNEL__ +#include #include #include #include @@ -28,7 +29,10 @@ struct serio { char name[32]; char phys[32]; - unsigned int manual_bind; + bool manual_bind; + bool registered; /* port has been fully registered with driver core */ + bool suspended; /* port is suspended */ + struct serio_device_id id; @@ -47,7 +51,6 @@ struct serio { struct mutex drv_mutex; /* protects serio->drv so attributes can pin driver */ struct device dev; - unsigned int registered; /* port has been fully registered with driver core */ struct list_head node; }; @@ -58,7 +61,7 @@ struct serio_driver { char *description; struct serio_device_id *id_table; - unsigned int manual_bind; + bool manual_bind; void (*write_wakeup)(struct serio *); irqreturn_t (*interrupt)(struct serio *, unsigned char, unsigned int); -- cgit v1.2.3-70-g09d2 From 4c25a2c1b90bf785fc2e2f0f0c74a80b3e070d39 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 10 May 2009 17:16:06 +0100 Subject: intel-iommu: Clean up handling of "caching mode" vs. context flushing. It really doesn't make a lot of sense to have some of the logic to handle caching vs. non-caching mode duplicated in qi_flush_context() and __iommu_flush_context(), while the return value indicates whether the caller should take other action which depends on the same thing. Especially since qi_flush_context() thought it was returning something entirely different anyway. 
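To illustrate the shape of the cleanup (see the patch below for the real call sites; "sid" here stands for the bus/devfn source-id built at the call site): a caller no longer branches on a return value, and the one site that cares about caching mode spells the check out itself, roughly:

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);

	/* only non-present -> present mapping changes need the special case */
	if (cap_caching_mode(iommu->cap))
		iommu->flush.flush_context(iommu, 0, sid, DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	else
		iommu_flush_write_buffer(iommu);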
This patch makes qi_flush_context() and __iommu_flush_context() both return void, removes the 'non_present_entry_flush' argument and makes the only call site which _set_ that argument to 1 do the right thing. Signed-off-by: David Woodhouse --- drivers/pci/dmar.c | 13 +++--------- drivers/pci/intel-iommu.c | 52 ++++++++++++++++++--------------------------- include/linux/intel-iommu.h | 8 +++---- 3 files changed, 28 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index d3d86b749ee..10a071ba323 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -723,23 +723,16 @@ void qi_global_iec(struct intel_iommu *iommu) qi_submit_sync(&desc, iommu); } -int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, - u64 type, int non_present_entry_flush) +void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, + u64 type) { struct qi_desc desc; - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) | QI_CC_GRAN(type) | QI_CC_TYPE; desc.high = 0; - return qi_submit_sync(&desc, iommu); + qi_submit_sync(&desc, iommu); } int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index d6f4ee50924..9f5d9151edc 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -857,26 +857,13 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) } /* return value determine if we need a write buffer flush */ -static int __iommu_flush_context(struct intel_iommu *iommu, - u16 did, u16 source_id, u8 function_mask, u64 type, - int non_present_entry_flush) +static void __iommu_flush_context(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, + u64 type) { u64 val = 0; unsigned long flag; - /* - * In the non-present entry flush case, if hardware doesn't cache - * non-present entry we do nothing and if hardware cache non-present - * entry, we flush entries of domain 0 (the domain id is used to cache - * any non-present entries) - */ - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - switch (type) { case DMA_CCMD_GLOBAL_INVL: val = DMA_CCMD_GLOBAL_INVL; @@ -901,9 +888,6 @@ static int __iommu_flush_context(struct intel_iommu *iommu, dmar_readq, (!(val & DMA_CCMD_ICC)), val); spin_unlock_irqrestore(&iommu->register_lock, flag); - - /* flush context entry will implicitly flush write buffer */ - return 0; } /* return value determine if we need a write buffer flush */ @@ -1428,14 +1412,21 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, context_set_present(context); domain_flush_cache(domain, context, sizeof(*context)); - /* it's a non-present to present mapping */ - if (iommu->flush.flush_context(iommu, id, - (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL, 1)) - iommu_flush_write_buffer(iommu); - else + /* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. 
If the + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0); - + } else { + iommu_flush_write_buffer(iommu); + } spin_unlock_irqrestore(&iommu->lock, flags); spin_lock_irqsave(&domain->iommu_lock, flags); @@ -1566,7 +1557,7 @@ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) clear_context_table(iommu, bus, devfn); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0); } @@ -2104,8 +2095,7 @@ static int __init init_dmars(void) iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, - 0); + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0); iommu_disable_protect_mem_regions(iommu); @@ -2721,7 +2711,7 @@ static int init_iommu_hw(void) iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0); iommu_disable_protect_mem_regions(iommu); @@ -2738,7 +2728,7 @@ static void iommu_flush_all(void) for_each_active_iommu(iommu, drhd) { iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 7246971a7fe..f2b94dafbf3 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -281,8 +281,8 @@ struct ir_table { #endif struct iommu_flush { - int (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, - u64 type, int non_present_entry_flush); + void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, + u8 fm, u64 type); int (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type, int non_present_entry_flush); }; @@ -339,8 +339,8 @@ extern void dmar_disable_qi(struct intel_iommu *iommu); extern int dmar_reenable_qi(struct intel_iommu *iommu); extern void qi_global_iec(struct intel_iommu *iommu); -extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, - u8 fm, u64 type, int non_present_entry_flush); +extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, + u8 fm, u64 type); extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type, int non_present_entry_flush); -- cgit v1.2.3-70-g09d2 From 1f0ef2aa18802a8ce7eb5a5164aaaf4d59073801 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 10 May 2009 19:58:49 +0100 Subject: intel-iommu: Clean up handling of "caching mode" vs. IOTLB flushing. As we just did for context cache flushing, clean up the logic around whether we need to flush the iotlb or just the write-buffer, depending on caching mode. Fix the same bug in qi_flush_iotlb() that qi_flush_context() had -- it isn't supposed to be returning an error; it's supposed to be returning a flag which triggers a write-buffer flush. Remove some superfluous conditional write-buffer flushes which could never have happened because they weren't for non-present-to-present mapping changes anyway. 
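For reference, the decision logic that the converted call sites share can be condensed into one sketch. This is illustrative only, not code from the patch: example_flush_after_map() is a made-up helper, while cap_caching_mode(), iommu_flush_iotlb_psi() and iommu_flush_write_buffer() are the real functions visible in the hunks below.

/*
 * After a non-present to present mapping change: hardware that does
 * not cache non-present entries only needs a write-buffer flush;
 * hardware in caching mode caches non-present entries under domain
 * ID 0, so that domain must be invalidated instead.
 */
static void example_flush_after_map(struct intel_iommu *iommu,
				    u64 addr, unsigned int pages)
{
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, 0, addr, pages);
	else
		iommu_flush_write_buffer(iommu);
}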
Signed-off-by: David Woodhouse --- drivers/pci/dmar.c | 14 ++------ drivers/pci/intel-iommu.c | 78 +++++++++++++++++---------------------------- include/linux/intel-iommu.h | 9 +++--- 3 files changed, 37 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 10a071ba323..df6af0d4ec0 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -735,22 +735,14 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, qi_submit_sync(&desc, iommu); } -int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, - unsigned int size_order, u64 type, - int non_present_entry_flush) +void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type) { u8 dw = 0, dr = 0; struct qi_desc desc; int ih = 0; - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - if (cap_write_drain(iommu->cap)) dw = 1; @@ -762,7 +754,7 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) | QI_IOTLB_AM(size_order); - return qi_submit_sync(&desc, iommu); + qi_submit_sync(&desc, iommu); } /* diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 9f5d9151edc..f47d04aced8 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -891,27 +891,13 @@ static void __iommu_flush_context(struct intel_iommu *iommu, } /* return value determine if we need a write buffer flush */ -static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int size_order, u64 type, - int non_present_entry_flush) +static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; unsigned long flag; - /* - * In the non-present entry flush case, if hardware doesn't cache - * non-present entry we do nothing and if hardware cache non-present - * entry, we flush entries of domain 0 (the domain id is used to cache - * any non-present entries) - */ - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - switch (type) { case DMA_TLB_GLOBAL_FLUSH: /* global flush doesn't need set IVA_REG */ @@ -959,12 +945,10 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", (unsigned long long)DMA_TLB_IIRG(type), (unsigned long long)DMA_TLB_IAIG(val)); - /* flush iotlb entry will implicitly flush write buffer */ - return 0; } -static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int pages, int non_present_entry_flush) +static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int pages) { unsigned int mask; @@ -974,8 +958,7 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, /* Fallback to domain selective flush if no PSI support */ if (!cap_pgsel_inv(iommu->cap)) return iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH, - non_present_entry_flush); + DMA_TLB_DSI_FLUSH); /* * PSI requires page size to be 2 ^ x, and the base address is naturally @@ -985,11 +968,10 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, /* Fallback to domain selective flush if size is too big */ if (mask > cap_max_amask_val(iommu->cap)) return iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH, non_present_entry_flush); + 
DMA_TLB_DSI_FLUSH); return iommu->flush.flush_iotlb(iommu, did, addr, mask, - DMA_TLB_PSI_FLUSH, - non_present_entry_flush); + DMA_TLB_PSI_FLUSH); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -1423,7 +1405,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH); } else { iommu_flush_write_buffer(iommu); } @@ -1558,8 +1540,7 @@ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) clear_context_table(iommu, bus, devfn); iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } static void domain_remove_dev_info(struct dmar_domain *domain) @@ -2096,8 +2077,7 @@ static int __init init_dmars(void) iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, - 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); iommu_disable_protect_mem_regions(iommu); ret = iommu_enable_translation(iommu); @@ -2244,10 +2224,11 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, if (ret) goto error; - /* it's a non-present to present mapping */ - ret = iommu_flush_iotlb_psi(iommu, domain->id, - start_paddr, size >> VTD_PAGE_SHIFT, 1); - if (ret) + /* it's a non-present to present mapping. Only flush if caching mode */ + if (cap_caching_mode(iommu->cap)) + iommu_flush_iotlb_psi(iommu, 0, start_paddr, + size >> VTD_PAGE_SHIFT); + else iommu_flush_write_buffer(iommu); return start_paddr + ((u64)paddr & (~PAGE_MASK)); @@ -2283,7 +2264,7 @@ static void flush_unmaps(void) if (deferred_flush[i].next) { iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); for (j = 0; j < deferred_flush[i].next; j++) { __free_iova(&deferred_flush[i].domain[j]->iovad, deferred_flush[i].iova[j]); @@ -2362,9 +2343,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, /* free page tables */ dma_pte_free_pagetable(domain, start_addr, start_addr + size); if (intel_iommu_strict) { - if (iommu_flush_iotlb_psi(iommu, - domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0)) - iommu_flush_write_buffer(iommu); + iommu_flush_iotlb_psi(iommu, domain->id, start_addr, + size >> VTD_PAGE_SHIFT); /* free iova */ __free_iova(&domain->iovad, iova); } else { @@ -2455,9 +2435,8 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, /* free page tables */ dma_pte_free_pagetable(domain, start_addr, start_addr + size); - if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr, - size >> VTD_PAGE_SHIFT, 0)) - iommu_flush_write_buffer(iommu); + iommu_flush_iotlb_psi(iommu, domain->id, start_addr, + size >> VTD_PAGE_SHIFT); /* free iova */ __free_iova(&domain->iovad, iova); @@ -2549,10 +2528,13 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne offset += size; } - /* it's a non-present to present mapping */ - if (iommu_flush_iotlb_psi(iommu, domain->id, - start_addr, offset >> VTD_PAGE_SHIFT, 1)) + /* it's a non-present to present mapping. 
Only flush if caching mode */ + if (cap_caching_mode(iommu->cap)) + iommu_flush_iotlb_psi(iommu, 0, start_addr, + offset >> VTD_PAGE_SHIFT); + else iommu_flush_write_buffer(iommu); + return nelems; } @@ -2711,9 +2693,9 @@ static int init_iommu_hw(void) iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); iommu_disable_protect_mem_regions(iommu); iommu_enable_translation(iommu); } @@ -2728,9 +2710,9 @@ static void iommu_flush_all(void) for_each_active_iommu(iommu, drhd) { iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); } } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index f2b94dafbf3..29e05a034c0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -283,8 +283,8 @@ struct ir_table { struct iommu_flush { void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); - int (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr, - unsigned int size_order, u64 type, int non_present_entry_flush); + void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); }; enum { @@ -341,9 +341,8 @@ extern void qi_global_iec(struct intel_iommu *iommu); extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); -extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, - unsigned int size_order, u64 type, - int non_present_entry_flush); +extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); -- cgit v1.2.3-70-g09d2 From 5e751e992f3fb08ba35e1ca8095ec8fbf9eda523 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 May 2009 13:55:22 +0100 Subject: CRED: Rename cred_exec_mutex to reflect that it's a guard against ptrace Rename cred_exec_mutex to reflect that it's a guard against foreign intervention on a process's credential state, such as is made by ptrace(). The attachment of a debugger to a process affects execve()'s calculation of the new credential state - _and_ also setprocattr()'s calculation of that state. 
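The locking pattern behind the rename can be sketched as follows. This is a hypothetical illustration, not code from the patch; the mutex calls mirror the real do_execve() and ptrace_attach() paths in the hunks below.

/*
 * Anyone computing a task's new credentials takes the guard mutex
 * interruptibly first, so execve(), setprocattr() and ptrace_attach()
 * serialize their credential calculations against each other.
 */
static int example_guarded_cred_update(struct task_struct *task)
{
	int retval;

	retval = mutex_lock_interruptible(&task->cred_guard_mutex);
	if (retval < 0)
		return retval;	/* interrupted by a signal */

	/* ... calculate and commit the new credentials here ... */

	mutex_unlock(&task->cred_guard_mutex);
	return 0;
}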
Signed-off-by: David Howells Signed-off-by: James Morris --- fs/compat.c | 6 +++--- fs/exec.c | 10 +++++----- include/linux/init_task.h | 4 ++-- include/linux/sched.h | 4 +++- kernel/cred.c | 4 ++-- kernel/ptrace.c | 9 +++++---- 6 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 681ed81e6be..bb2a9b2e817 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1488,7 +1488,7 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(¤t->cred_guard_mutex); if (retval < 0) goto out_free; current->in_execve = 1; @@ -1550,7 +1550,7 @@ int compat_do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1573,7 +1573,7 @@ out_unmark: out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/exec.c b/fs/exec.c index 639177b0eea..998e856c307 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1045,7 +1045,7 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; - /* cred_exec_mutex must be held at least to this point to prevent + /* cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked */ @@ -1055,7 +1055,7 @@ EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program - * - the caller must hold current->cred_exec_mutex to protect against + * - the caller must hold current->cred_guard_mutex to protect against * PTRACE_ATTACH */ int check_unsafe_exec(struct linux_binprm *bprm) @@ -1297,7 +1297,7 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(¤t->cred_guard_mutex); if (retval < 0) goto out_free; current->in_execve = 1; @@ -1360,7 +1360,7 @@ int do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1383,7 +1383,7 @@ out_unmark: out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d87247d2641..7f54ba94242 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -145,8 +145,8 @@ extern struct cred init_cred; .group_leader = &tsk, \ .real_cred = &init_cred, \ .cred = &init_cred, \ - .cred_exec_mutex = \ - __MUTEX_INITIALIZER(tsk.cred_exec_mutex), \ + .cred_guard_mutex = \ + __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3fa82b353c9..5932ace2240 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1247,7 +1247,9 @@ struct task_struct { * credentials (COW) */ const struct cred *cred; /* effective (overridable) subjective task * credentials (COW) */ - struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ + struct mutex cred_guard_mutex; /* guard 
against foreign influences on + * credential calculations + * (notably. ptrace) */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock diff --git a/kernel/cred.c b/kernel/cred.c index 3a039189d70..1bb4d7e5d61 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_exec_mutex + * - The caller must hold current->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { @@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - mutex_init(&p->cred_exec_mutex); + mutex_init(&p->cred_guard_mutex); if ( #ifdef CONFIG_KEYS diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0692ab5a0d6..27ac80298bf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -185,10 +185,11 @@ int ptrace_attach(struct task_struct *task) if (same_thread_group(task, current)) goto out; - /* Protect exec's credential calculations against our interference; - * SUID, SGID and LSM creds get determined differently under ptrace. + /* Protect the target's credential calculations against our + * interference; SUID, SGID and LSM creds get determined differently + * under ptrace. */ - retval = mutex_lock_interruptible(&task->cred_exec_mutex); + retval = mutex_lock_interruptible(&task->cred_guard_mutex); if (retval < 0) goto out; @@ -232,7 +233,7 @@ repeat: bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); - mutex_unlock(&task->cred_exec_mutex); + mutex_unlock(&task->cred_guard_mutex); out: return retval; } -- cgit v1.2.3-70-g09d2 From 7303f240981888884412a97ac742772527356880 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 11 May 2009 09:59:34 +0300 Subject: slob: use PG_slab for identifying SLOB pages For the sake of consistency. 
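In other words, SLOB pages are now identified the same way SLAB and SLUB pages are, via the generic PageSlab() test. A condensed before/after view of one helper this touches (real names from mm/slob.c, repeated here only for illustration):

/* before: SLOB aliased a private PG_slob_page bit onto PG_active */
static inline int is_slob_page(struct slob_page *sp)
{
	return PageSlobPage((struct page *)sp);
}

/* after: the generic PG_slab bit identifies SLOB pages as well */
static inline int is_slob_page(struct slob_page *sp)
{
	return PageSlab((struct page *)sp);
}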
Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Acked-by: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Pekka Enberg --- include/linux/page-flags.h | 2 -- mm/slob.c | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 62214c7d2d9..f036dfbdad5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -120,7 +120,6 @@ enum pageflags { PG_savepinned = PG_dirty, /* SLOB */ - PG_slob_page = PG_active, PG_slob_free = PG_private, /* SLUB */ @@ -203,7 +202,6 @@ PAGEFLAG(SavePinned, savepinned); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) -__PAGEFLAG(SlobPage, slob_page) __PAGEFLAG(SlobFree, slob_free) __PAGEFLAG(SlubFrozen, slub_frozen) diff --git a/mm/slob.c b/mm/slob.c index a2d4ab32198..aad9dad2e82 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -131,17 +131,17 @@ static LIST_HEAD(free_slob_large); */ static inline int is_slob_page(struct slob_page *sp) { - return PageSlobPage((struct page *)sp); + return PageSlab((struct page *)sp); } static inline void set_slob_page(struct slob_page *sp) { - __SetPageSlobPage((struct page *)sp); + __SetPageSlab((struct page *)sp); } static inline void clear_slob_page(struct slob_page *sp) { - __ClearPageSlobPage((struct page *)sp); + __ClearPageSlab((struct page *)sp); } static inline struct slob_page *slob_page(const void *addr) -- cgit v1.2.3-70-g09d2 From c3a4d78c580de4edc9ef0f7c59812fb02ceb037f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:37 +0900 Subject: block: add rq->resid_len rq->data_len served two purposes - the length of the data buffer on issue and the residual count on completion. This duality creates some headaches. First of all, the block layer and low level drivers can't really determine what rq->data_len contains while a request is executing. It could be the total request length or it could be anything else one of the lower layers is using to keep track of the residual count. This complicates things because blk_rq_bytes() and thus [__]blk_end_request_all() rely on rq->data_len for PC commands. Drivers which want to report residual count should first cache the total request length, update rq->data_len and then complete the request with the cached data length. Secondly, it makes requests default to reporting the full residual count, ie. reporting that no data transfer occurred. The residual count is an exception, not the norm; however, the driver has to clear rq->data_len to zero to signify the normal case, while leaving it alone means no data transfer occurred at all. This reverse default behavior complicates code unnecessarily and renders block PC on some drivers (ide-tape/floppy) unusable. This patch adds rq->resid_len which is used only for the residual count. While at it, remove the now unnecessary blk_rq_bytes() caching in ide_pc_intr() as rq->data_len is not changed anymore. Boaz : spotted missing conversion in osd Sergei : spotted too early conversion to blk_rq_bytes() in ide-tape [ Impact: cleanup residual count handling, report 0 resid by default ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Cc: Mike Miller Cc: Eric Moore Cc: Alan Stern Cc: FUJITA Tomonori Cc: Doug Gilbert Cc: Mike Miller Cc: Eric Moore Cc: Darrick J. 
Wong Cc: Pete Zaitcev Cc: Boaz Harrosh Signed-off-by: Jens Axboe --- block/bsg.c | 8 +++---- block/scsi_ioctl.c | 2 +- drivers/block/cciss.c | 13 ++++------- drivers/block/ub.c | 6 ++--- drivers/ide/ide-atapi.c | 9 +------- drivers/ide/ide-cd.c | 13 +++++------ drivers/ide/ide-tape.c | 4 ++-- drivers/message/fusion/mptsas.c | 3 +-- drivers/scsi/libsas/sas_expander.c | 6 +---- drivers/scsi/libsas/sas_host_smp.c | 38 +++++++++++++++++--------------- drivers/scsi/mpt2sas/mpt2sas_transport.c | 4 +--- drivers/scsi/scsi_lib.c | 24 ++++++++++---------- drivers/scsi/sg.c | 2 +- drivers/scsi/st.c | 2 +- fs/exofs/osd.c | 4 ++-- include/linux/blkdev.h | 1 + 16 files changed, 59 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/block/bsg.c b/block/bsg.c index 206060e795d..2d746e34f4c 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -445,14 +445,14 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, } if (rq->next_rq) { - hdr->dout_resid = rq->data_len; - hdr->din_resid = rq->next_rq->data_len; + hdr->dout_resid = rq->resid_len; + hdr->din_resid = rq->next_rq->resid_len; blk_rq_unmap_user(bidi_bio); blk_put_request(rq->next_rq); } else if (rq_data_dir(rq) == READ) - hdr->din_resid = rq->data_len; + hdr->din_resid = rq->resid_len; else - hdr->dout_resid = rq->data_len; + hdr->dout_resid = rq->resid_len; /* * If the request generated a negative error number, return it diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 58cf4560f74..a9670dd4b5d 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -230,7 +230,7 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, hdr->info = 0; if (hdr->masked_status || hdr->host_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; - hdr->resid = rq->data_len; + hdr->resid = rq->resid_len; hdr->sb_len_wr = 0; if (rq->sense_len && hdr->sbp) { diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 4d4d5e0d3fa..f22d4932433 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1299,7 +1299,6 @@ static void cciss_softirq_done(struct request *rq) { CommandList_struct *cmd = rq->completion_data; ctlr_info_t *h = hba[cmd->ctlr]; - unsigned int nr_bytes; unsigned long flags; u64bit temp64; int i, ddir; @@ -1321,15 +1320,11 @@ static void cciss_softirq_done(struct request *rq) printk("Done with %p\n", rq); #endif /* CCISS_DEBUG */ - /* - * Store the full size and set the residual count for pc requests - */ - nr_bytes = blk_rq_bytes(rq); + /* set the residual count for pc requests */ if (blk_pc_request(rq)) - rq->data_len = cmd->err_info->ResidualCnt; + rq->resid_len = cmd->err_info->ResidualCnt; - if (blk_end_request(rq, (rq->errors == 0) ? 0 : -EIO, nr_bytes)) - BUG(); + blk_end_request_all(rq, (rq->errors == 0) ? 
0 : -EIO); spin_lock_irqsave(&h->lock, flags); cmd_free(h, cmd, 1); @@ -2691,7 +2686,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, printk(KERN_WARNING "cciss: cmd %p has" " completed with data underrun " "reported\n", cmd); - cmd->rq->data_len = cmd->err_info->ResidualCnt; + cmd->rq->resid_len = cmd->err_info->ResidualCnt; } break; case CMD_DATA_OVERRUN: diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 689cd27ac89..8c2cc71327e 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -783,10 +783,8 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd) if (cmd->error == 0) { if (blk_pc_request(rq)) { - if (cmd->act_len >= rq->data_len) - rq->data_len = 0; - else - rq->data_len -= cmd->act_len; + if (cmd->act_len < rq->data_len) + rq->resid_len = rq->data_len - cmd->act_len; scsi_status = 0; } else { if (cmd->act_len != cmd->len) { diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index afe5a432387..e4a02a05fc8 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -367,7 +367,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) /* No more interrupts */ if ((stat & ATA_DRQ) == 0) { int uptodate, error; - unsigned int done; debug_log("Packet command completed, %d bytes transferred\n", pc->xferred); @@ -406,12 +405,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0) dsc = 1; - /* - * ->pc_callback() might change rq->data_len for - * residual count, cache total length. - */ - done = blk_rq_bytes(rq); - /* Command finished - Call the callback function */ uptodate = drive->pc_callback(drive, dsc); @@ -431,7 +424,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) error = uptodate ? 0 : -EIO; } - ide_complete_rq(drive, error, done); + ide_complete_rq(drive, error, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 673628790f1..8bbe222c5e4 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -519,7 +519,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, error = blk_execute_rq(drive->queue, info->disk, rq, 0); if (buffer) - *bufflen = rq->data_len; + *bufflen = rq->resid_len; flags = rq->cmd_flags; blk_put_request(rq); @@ -707,11 +707,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) out_end: if (blk_pc_request(rq) && rc == 0) { - unsigned int dlen = rq->data_len; - - rq->data_len = 0; - - if (blk_end_request(rq, 0, dlen)) + if (blk_end_request(rq, 0, rq->data_len)) BUG(); hwif->rq = NULL; @@ -740,9 +736,10 @@ out_end: nsectors = 1; if (blk_fs_request(rq) == 0) { - rq->data_len -= (cmd->nbytes - cmd->nleft); + rq->resid_len = rq->data_len - + (cmd->nbytes - cmd->nleft); if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE)) - rq->data_len += cmd->last_xfer_len; + rq->resid_len += cmd->last_xfer_len; } ide_complete_rq(drive, uptodate ? 
0 : -EIO, nsectors << 9); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 7149224d1fe..65c5b705883 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) } tape->first_frame += blocks; - rq->data_len -= blocks * tape->blk_size; + rq->resid_len = rq->data_len - blocks * tape->blk_size; if (pc->error) { uptodate = 0; @@ -903,7 +903,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) blk_execute_rq(drive->queue, tape->disk, rq, 0); /* calculate the number of transferred bytes and update buffer state */ - size -= rq->data_len; + size -= rq->resid_len; tape->cur = tape->buf; if (cmd == REQ_IDETAPE_READ) tape->valid = size; diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index a9019f081b9..5d5f34715de 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -1357,8 +1357,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply; memcpy(req->sense, smprep, sizeof(*smprep)); req->sense_len = sizeof(*smprep); - req->data_len = 0; - rsp->data_len -= smprep->ResponseDataLength; + rsp->resid_len = rsp->data_len - smprep->ResponseDataLength; } else { printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n", ioc->name, __func__); diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 3da02e43678..6605ec905cc 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -1936,12 +1936,8 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, bio_data(rsp->bio), rsp->data_len); if (ret > 0) { /* positive number is the untransferred residual */ - rsp->data_len = ret; - req->data_len = 0; + rsp->resid_len = ret; ret = 0; - } else if (ret == 0) { - rsp->data_len = 0; - req->data_len = 0; } return ret; diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c index d110a366c48..89952edd0be 100644 --- a/drivers/scsi/libsas/sas_host_smp.c +++ b/drivers/scsi/libsas/sas_host_smp.c @@ -134,7 +134,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, { u8 *req_data = NULL, *resp_data = NULL, *buf; struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost); - int error = -EINVAL, resp_data_len = rsp->data_len; + int error = -EINVAL; /* eight is the minimum size for request and response frames */ if (req->data_len < 8 || rsp->data_len < 8) @@ -176,17 +176,20 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, resp_data[1] = req_data[1]; resp_data[2] = SMP_RESP_FUNC_UNK; + req->resid_len = req->data_len; + rsp->resid_len = rsp->data_len; + switch (req_data[1]) { case SMP_REPORT_GENERAL: - req->data_len -= 8; - resp_data_len -= 32; + req->resid_len -= 8; + rsp->resid_len -= 32; resp_data[2] = SMP_RESP_FUNC_ACC; resp_data[9] = sas_ha->num_phys; break; case SMP_REPORT_MANUF_INFO: - req->data_len -= 8; - resp_data_len -= 64; + req->resid_len -= 8; + rsp->resid_len -= 64; resp_data[2] = SMP_RESP_FUNC_ACC; memcpy(resp_data + 12, shost->hostt->name, SAS_EXPANDER_VENDOR_ID_LEN); @@ -199,13 +202,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; case SMP_DISCOVER: - req->data_len -= 16; - if ((int)req->data_len < 0) { - req->data_len = 0; + req->resid_len -= 16; + if ((int)req->resid_len < 0) { + req->resid_len = 0; error = -EINVAL; goto out; } - resp_data_len -= 56; + 
rsp->resid_len -= 56; sas_host_smp_discover(sas_ha, resp_data, req_data[9]); break; @@ -215,13 +218,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; case SMP_REPORT_PHY_SATA: - req->data_len -= 16; - if ((int)req->data_len < 0) { - req->data_len = 0; + req->resid_len -= 16; + if ((int)req->resid_len < 0) { + req->resid_len = 0; error = -EINVAL; goto out; } - resp_data_len -= 60; + rsp->resid_len -= 60; sas_report_phy_sata(sas_ha, resp_data, req_data[9]); break; @@ -238,13 +241,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; case SMP_PHY_CONTROL: - req->data_len -= 44; - if ((int)req->data_len < 0) { - req->data_len = 0; + req->resid_len -= 44; + if ((int)req->resid_len < 0) { + req->resid_len = 0; error = -EINVAL; goto out; } - resp_data_len -= 8; + rsp->resid_len -= 8; sas_phy_control(sas_ha, req_data[9], req_data[10], req_data[32] >> 4, req_data[33] >> 4, resp_data); @@ -265,7 +268,6 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, flush_kernel_dcache_page(bio_page(rsp->bio)); kunmap_atomic(buf - bio_offset(rsp->bio), KM_USER0); local_irq_enable(); - rsp->data_len = resp_data_len; out: kfree(req_data); diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c index e03dc0b1e1a..53759c566bf 100644 --- a/drivers/scsi/mpt2sas/mpt2sas_transport.c +++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c @@ -1170,9 +1170,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, memcpy(req->sense, mpi_reply, sizeof(*mpi_reply)); req->sense_len = sizeof(*mpi_reply); - req->data_len = 0; - rsp->data_len -= mpi_reply->ResponseDataLength; - + rsp->resid_len = rsp->data_len - mpi_reply->ResponseDataLength; } else { dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT "%s - no reply\n", ioc->name, __func__)); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index aa9fc572e45..7d49ef589f3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -240,11 +240,11 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, * is invalid. Prevent the garbage from being misinterpreted * and prevent security leaks by zeroing out the excess data. */ - if (unlikely(req->data_len > 0 && req->data_len <= bufflen)) - memset(buffer + (bufflen - req->data_len), 0, req->data_len); + if (unlikely(req->resid_len > 0 && req->resid_len <= bufflen)) + memset(buffer + (bufflen - req->resid_len), 0, req->resid_len); if (resid) - *resid = req->data_len; + *resid = req->resid_len; ret = req->errors; out: blk_put_request(req); @@ -549,7 +549,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, int leftover = (req->hard_nr_sectors << 9); if (blk_pc_request(req)) - leftover = req->data_len; + leftover = req->resid_len; /* kill remainder if no retrys */ if (error && scsi_noretry_cmd(cmd)) @@ -673,11 +673,11 @@ void scsi_release_buffers(struct scsi_cmnd *cmd) EXPORT_SYMBOL(scsi_release_buffers); /* - * Bidi commands Must be complete as a whole, both sides at once. - * If part of the bytes were written and lld returned - * scsi_in()->resid and/or scsi_out()->resid this information will be left - * in req->data_len and req->next_rq->data_len. The upper-layer driver can - * decide what to do with this information. + * Bidi commands Must be complete as a whole, both sides at once. 
If + * part of the bytes were written and lld returned scsi_in()->resid + * and/or scsi_out()->resid this information will be left in + * req->resid_len and req->next_rq->resid_len. The upper-layer driver + * can decide what to do with this information. */ static void scsi_end_bidi_request(struct scsi_cmnd *cmd) { @@ -685,8 +685,8 @@ static void scsi_end_bidi_request(struct scsi_cmnd *cmd) unsigned int dlen = req->data_len; unsigned int next_dlen = req->next_rq->data_len; - req->data_len = scsi_out(cmd)->resid; - req->next_rq->data_len = scsi_in(cmd)->resid; + req->resid_len = scsi_out(cmd)->resid; + req->next_rq->resid_len = scsi_in(cmd)->resid; /* The req and req->next_rq have not been completed */ BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen)); @@ -778,7 +778,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_end_bidi_request(cmd); return; } - req->data_len = scsi_get_resid(cmd); + req->resid_len = scsi_get_resid(cmd); } BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */ diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 82312df9b0b..dec4c70677d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1260,7 +1260,7 @@ static void sg_rq_end_io(struct request *rq, int uptodate) sense = rq->sense; result = rq->errors; - resid = rq->data_len; + resid = rq->resid_len; SCSI_LOG_TIMEOUT(4, printk("sg_cmd_done: %s, pack_id=%d, res=0x%x\n", sdp->disk->disk_name, srp->header.pack_id, result)); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index eb24efea8f1..8681b708344 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -463,7 +463,7 @@ static void st_scsi_execute_end(struct request *req, int uptodate) struct scsi_tape *STp = SRpnt->stp; STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors; - STp->buffer->cmdstat.residual = req->data_len; + STp->buffer->cmdstat.residual = req->resid_len; if (SRpnt->waiting) complete(SRpnt->waiting); diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c index b249ae97fb1..06ca92672eb 100644 --- a/fs/exofs/osd.c +++ b/fs/exofs/osd.c @@ -50,10 +50,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) /* FIXME: should be include in osd_sense_info */ if (in_resid) - *in_resid = or->in.req ? or->in.req->data_len : 0; + *in_resid = or->in.req ? or->in.req->resid_len : 0; if (out_resid) - *out_resid = or->out.req ? or->out.req->data_len : 0; + *out_resid = or->out.req ? or->out.req->resid_len : 0; return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3a5b1bd6582..6a967cad89f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -229,6 +229,7 @@ struct request { unsigned int data_len; unsigned int extra_len; /* length of alignment and padding */ unsigned int sense_len; + unsigned int resid_len; /* residual count */ void *sense; unsigned long deadline; -- cgit v1.2.3-70-g09d2 From 5b93629b4509c03ffa87a9316412fedf6f58cb37 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:38 +0900 Subject: block: implement blk_rq_pos/[cur_]sectors() and convert obvious ones Implement accessors - blk_rq_pos(), blk_rq_sectors() and blk_rq_cur_sectors() which return rq->hard_sector, rq->hard_nr_sectors and rq->hard_cur_sectors respectively and convert direct references of the said fields to the accessors. This is in preparation of request data length handling cleanup. 
Geert : suggested adding const to struct request * parameter to accessors Sergei : spotted error in patch description [ Impact: cleanup ] Signed-off-by: Tejun Heo Acked-by: Geert Uytterhoeven Acked-by: Stephen Rothwell Tested-by: Grant Likely Acked-by: Grant Likely Ackec-by: Sergei Shtylyov Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: James Bottomley Signed-off-by: Jens Axboe --- block/blk-barrier.c | 2 +- block/blk-core.c | 2 +- block/cfq-iosched.c | 2 +- drivers/block/ps3disk.c | 2 +- drivers/block/viodasd.c | 6 +++--- drivers/block/xsysace.c | 10 +++++----- drivers/ide/ide-cd.c | 8 ++++---- drivers/ide/ide-io.c | 4 ++-- drivers/message/i2o/i2o_block.c | 2 +- drivers/scsi/scsi_lib.c | 2 +- include/linux/blkdev.h | 23 ++++++++++++++++++++--- kernel/trace/blktrace.c | 4 ++-- 12 files changed, 42 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index c8d087655ef..c167de5b9ef 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -163,7 +163,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) * For an empty barrier, there's no actual BAR request, which * in turn makes POSTFLUSH unnecessary. Mask them off. */ - if (!rq->hard_nr_sectors) { + if (!blk_rq_sectors(rq)) { q->ordered &= ~(QUEUE_ORDERED_DO_BAR | QUEUE_ORDERED_DO_POSTFLUSH); /* diff --git a/block/blk-core.c b/block/blk-core.c index 394c5bd8127..895e55b74a4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1683,7 +1683,7 @@ static void blk_account_io_done(struct request *req) unsigned int blk_rq_bytes(struct request *rq) { if (blk_fs_request(rq)) - return rq->hard_nr_sectors << 9; + return blk_rq_sectors(rq) << 9; return rq->data_len; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index def0c698a4b..575083a9ffe 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -760,7 +760,7 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", cfqd->rq_in_driver); - cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors; + cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } static void cfq_deactivate_request(struct request_queue *q, struct request *rq) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index f6586e4d351..c2388673684 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -136,7 +136,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, dev_dbg(&dev->sbd.core, "%s:%u: %s req has %u bvecs for %lu sectors %lu hard sectors\n", __func__, __LINE__, op, n, req->nr_sectors, - req->hard_nr_sectors); + blk_rq_sectors(req)); #endif start_sector = req->sector * priv->blocking_factor; diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index ecccf65dce2..e821eed7132 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -368,12 +368,12 @@ static void do_viodasd_request(struct request_queue *q) blkdev_dequeue_request(req); /* check that request contains a valid command */ if (!blk_fs_request(req)) { - viodasd_end_request(req, -EIO, req->hard_nr_sectors); + viodasd_end_request(req, -EIO, blk_rq_sectors(req)); continue; } /* Try sending the request */ if (send_request(req) != 0) - viodasd_end_request(req, -EIO, req->hard_nr_sectors); + viodasd_end_request(req, -EIO, blk_rq_sectors(req)); } } @@ -590,7 +590,7 @@ static int viodasd_handle_read_write(struct vioblocklpevent *bevent) err = vio_lookup_rc(viodasd_err_table, bevent->sub_result); 
printk(VIOD_KERN_WARNING "read/write error %d:0x%04x (%s)\n", event->xRc, bevent->sub_result, err->msg); - num_sect = req->hard_nr_sectors; + num_sect = blk_rq_sectors(req); } qlock = req->q->queue_lock; spin_lock_irqsave(qlock, irq_flags); diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index b1e1d7e5ab1..5722931d14c 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -645,8 +645,8 @@ static void ace_fsm_dostate(struct ace_device *ace) /* Okay, it's a data request, set it up for transfer */ dev_dbg(ace->dev, - "request: sec=%llx hcnt=%lx, ccnt=%x, dir=%i\n", - (unsigned long long) req->sector, req->hard_nr_sectors, + "request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n", + (unsigned long long) req->sector, blk_rq_sectors(req), req->current_nr_sectors, rq_data_dir(req)); ace->req = req; @@ -654,7 +654,7 @@ static void ace_fsm_dostate(struct ace_device *ace) ace->data_count = req->current_nr_sectors * ACE_BUF_PER_SECTOR; ace_out32(ace, ACE_MPULBA, req->sector & 0x0FFFFFFF); - count = req->hard_nr_sectors; + count = blk_rq_sectors(req); if (rq_data_dir(req)) { /* Kick off write request */ dev_dbg(ace->dev, "write data\n"); @@ -719,8 +719,8 @@ static void ace_fsm_dostate(struct ace_device *ace) /* bio finished; is there another one? */ if (__blk_end_request(ace->req, 0, blk_rq_cur_bytes(ace->req))) { - /* dev_dbg(ace->dev, "next block; h=%li c=%i\n", - * ace->req->hard_nr_sectors, + /* dev_dbg(ace->dev, "next block; h=%u c=%u\n", + * blk_rq_sectors(ace->req), * ace->req->current_nr_sectors); */ ace->data_ptr = ace->req->buffer; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 8bbe222c5e4..182320dd6ea 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -730,7 +730,7 @@ out_end: if (blk_pc_request(rq)) nsectors = (rq->data_len + 511) >> 9; else - nsectors = rq->hard_nr_sectors; + nsectors = blk_rq_sectors(rq); if (nsectors == 0) nsectors = 1; @@ -875,7 +875,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, return ide_issue_pc(drive, &cmd); out_end: - nsectors = rq->hard_nr_sectors; + nsectors = blk_rq_sectors(rq); if (nsectors == 0) nsectors = 1; @@ -1359,8 +1359,8 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive) static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) { int hard_sect = queue_hardsect_size(q); - long block = (long)rq->hard_sector / (hard_sect >> 9); - unsigned long blocks = rq->hard_nr_sectors / (hard_sect >> 9); + long block = (long)blk_rq_pos(rq) / (hard_sect >> 9); + unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9); memset(rq->cmd, 0, BLK_MAX_CDB); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index a0309ea661a..df23bcbd94b 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -118,7 +118,7 @@ unsigned int ide_rq_bytes(struct request *rq) if (blk_pc_request(rq)) return rq->data_len; else - return rq->hard_cur_sectors << 9; + return blk_rq_cur_sectors(rq) << 9; } EXPORT_SYMBOL_GPL(ide_rq_bytes); @@ -133,7 +133,7 @@ int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes) * and complete the whole request right now */ if (blk_noretry_request(rq) && error <= 0) - nr_bytes = rq->hard_nr_sectors << 9; + nr_bytes = blk_rq_sectors(rq) << 9; rc = ide_end_rq(drive, rq, error, nr_bytes); if (rc == 0) diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 221317e6a00..56e60f0d531 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -427,7 +427,7 @@ static 
void i2o_block_end_request(struct request *req, int error, unsigned long flags; if (blk_end_request(req, error, nr_bytes)) { - int leftover = (req->hard_nr_sectors << KERNEL_SECTOR_SHIFT); + int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT); if (blk_pc_request(req)) leftover = req->data_len; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 7d49ef589f3..9ff0ca9988a 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -546,7 +546,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, * to queue the remainder of them. */ if (blk_end_request(req, error, bytes)) { - int leftover = (req->hard_nr_sectors << 9); + int leftover = blk_rq_sectors(req) << 9; if (blk_pc_request(req)) leftover = req->resid_len; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a967cad89f..4e5f8559872 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -832,13 +832,30 @@ static inline void blk_run_address_space(struct address_space *mapping) extern void blkdev_dequeue_request(struct request *req); /* - * blk_end_request() takes bytes instead of sectors as a complete size. - * blk_rq_bytes() returns bytes left to complete in the entire request. - * blk_rq_cur_bytes() returns bytes left to complete in the current segment. + * blk_rq_pos() : the current sector + * blk_rq_bytes() : bytes left in the entire request + * blk_rq_cur_bytes() : bytes left in the current segment + * blk_rq_sectors() : sectors left in the entire request + * blk_rq_cur_sectors() : sectors left in the current segment */ +static inline sector_t blk_rq_pos(const struct request *rq) +{ + return rq->hard_sector; +} + extern unsigned int blk_rq_bytes(struct request *rq); extern unsigned int blk_rq_cur_bytes(struct request *rq); +static inline unsigned int blk_rq_sectors(const struct request *rq) +{ + return rq->hard_nr_sectors; +} + +static inline unsigned int blk_rq_cur_sectors(const struct request *rq) +{ + return rq->hard_cur_sectors; +} + /* * Request completion related functions. * diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d1f0b..42f1c11e754 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -646,7 +646,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, rw, what, rq->errors, 0, NULL); } } @@ -857,7 +857,7 @@ void blk_add_driver_data(struct request_queue *q, __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, 0, BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); -- cgit v1.2.3-70-g09d2 From 2e46e8b27aa57c6bd34b3102b40ee4d0144b4fab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:41 +0900 Subject: block: drop request->hard_* and *nr_sectors struct request has had a few different ways to represent some properties of a request. ->hard_* represent block layer's view of the request progress (completion cursor) and the ones without the prefix are supposed to represent the issue cursor and allowed to be updated as necessary by the low level drivers. The thing is that as block layer supports partial completion, the two cursors really aren't necessary and only cause confusion. 
In addition, manual management of request detail from low level drivers is cumbersome and error-prone at the very least. Other interesting duplicate fields are rq->[hard_]nr_sectors and rq->{hard_cur|current}_nr_sectors against rq->data_len and rq->bio->bi_size. This is more convoluted than the hard_ case. rq->[hard_]nr_sectors are initialized for requests with a bio but blk_rq_bytes() uses them only for !pc requests. rq->data_len is initialized for all requests but blk_rq_bytes() uses it only for pc requests. This causes a good amount of confusion throughout the block layer and its drivers, and determining the request length has been a bit of black magic which may or may not work depending on circumstances and what the specific LLD is actually doing. rq->{hard_cur|current}_nr_sectors represent the number of sectors in the contiguous data area at the front. This is mainly used by drivers which transfer data by walking the request segment-by-segment. This value always equals rq->bio->bi_size >> 9. However, the data length for pc requests may not be a multiple of 512 bytes and using this field becomes a bit confusing. In general, having multiple fields to represent the same property leads only to confusion and subtle bugs. With recent block low level driver cleanups, no driver is accessing or manipulating these duplicate fields directly. Drop all the duplicates. Now rq->sector means the current sector, rq->data_len the current total length and rq->bio->bi_size the current segment length. Everything else is defined in terms of these three and available only through accessors. * blk_recalc_rq_sectors() is collapsed into blk_update_request() and now handles pc and fs requests equally other than the rq->sector update. This means that pc requests can now use partial completion too (no in-kernel user yet, though). * bio_cur_sectors() is replaced with bio_cur_bytes() as the block layer now uses byte count as the primary data length. * blk_rq_pos() is now guaranteed to be always correct. In-block users converted. * blk_rq_bytes() is now guaranteed to be always valid as is blk_rq_sectors(). In-block users converted. * blk_rq_sectors() is now guaranteed to equal blk_rq_bytes() >> 9. The more convenient one is used. * blk_rq_bytes() and blk_rq_cur_bytes() are now inlined and take a const pointer to the request. 
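A compact, illustrative summary of the resulting model (assumed from the description above and the blkdev.h hunk below; the function is made up, the accessors are real):

/*
 * Post-cleanup view of a request: three canonical fields, everything
 * else derived through accessors.
 *
 *   blk_rq_pos(rq)       -> rq->sector       (current sector)
 *   blk_rq_bytes(rq)     -> rq->data_len     (total bytes left)
 *   blk_rq_cur_bytes(rq) -> bytes left in the current segment
 *   blk_rq_sectors(rq)   == blk_rq_bytes(rq) >> 9, always
 */
static void example_request_view(const struct request *rq)
{
	pr_debug("pos=%llu left=%u seg=%d\n",
		 (unsigned long long)blk_rq_pos(rq),
		 blk_rq_bytes(rq), blk_rq_cur_bytes(rq));
}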
[ Impact: API cleanup, single way to represent one property of a request ] Signed-off-by: Tejun Heo Cc: Boaz Harrosh Signed-off-by: Jens Axboe --- block/blk-core.c | 81 ++++++++++++++++++------------------------------ block/blk-merge.c | 36 +++------------------ block/blk.h | 1 - block/cfq-iosched.c | 10 +++--- include/linux/bio.h | 6 ++-- include/linux/blkdev.h | 37 ++++++++++------------ include/linux/elevator.h | 2 +- kernel/trace/blktrace.c | 16 +++++----- 8 files changed, 67 insertions(+), 122 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 82dc20621c0..3596ca71909 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -127,7 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->timeout_list); rq->cpu = -1; rq->q = q; - rq->sector = rq->hard_sector = (sector_t) -1; + rq->sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->cmd = rq->__cmd; @@ -189,8 +189,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg) (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", - rq->bio, rq->biotail, - rq->buffer, rq->data_len); + rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); if (blk_pc_request(rq)) { printk(KERN_INFO " cdb: "); @@ -1096,7 +1095,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_NOIDLE; req->errors = 0; - req->hard_sector = req->sector = bio->bi_sector; + req->sector = bio->bi_sector; req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } @@ -1113,14 +1112,13 @@ static inline bool queue_should_plug(struct request_queue *q) static int __make_request(struct request_queue *q, struct bio *bio) { struct request *req; - int el_ret, nr_sectors; + int el_ret; + unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); const int sync = bio_sync(bio); const int unplug = bio_unplug(bio); int rw_flags; - nr_sectors = bio_sectors(bio); - /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1145,7 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) req->biotail->bi_next = bio; req->biotail = bio; - req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; @@ -1171,10 +1169,8 @@ static int __make_request(struct request_queue *q, struct bio *bio) * not touch req->buffer either... 
*/ req->buffer = bio_data(bio); - req->current_nr_sectors = bio_cur_sectors(bio); - req->hard_cur_sectors = req->current_nr_sectors; - req->sector = req->hard_sector = bio->bi_sector; - req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->sector = bio->bi_sector; + req->data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; @@ -1557,7 +1553,7 @@ EXPORT_SYMBOL(submit_bio); int blk_rq_check_limits(struct request_queue *q, struct request *rq) { if (blk_rq_sectors(rq) > q->max_sectors || - rq->data_len > q->max_hw_sectors << 9) { + blk_rq_bytes(rq) > q->max_hw_sectors << 9) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } @@ -1675,35 +1671,6 @@ static void blk_account_io_done(struct request *req) } } -/** - * blk_rq_bytes - Returns bytes left to complete in the entire request - * @rq: the request being processed - **/ -unsigned int blk_rq_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return blk_rq_sectors(rq) << 9; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_bytes); - -/** - * blk_rq_cur_bytes - Returns bytes left to complete in the current segment - * @rq: the request being processed - **/ -unsigned int blk_rq_cur_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return rq->current_nr_sectors << 9; - - if (rq->bio) - return rq->bio->bi_size; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); - struct request *elv_next_request(struct request_queue *q) { struct request *rq; @@ -1736,7 +1703,7 @@ struct request *elv_next_request(struct request_queue *q) if (rq->cmd_flags & REQ_DONTPREP) break; - if (q->dma_drain_size && rq->data_len) { + if (q->dma_drain_size && blk_rq_bytes(rq)) { /* * make sure space for the drain appears we * know we can do this because max_hw_segments @@ -1759,7 +1726,7 @@ struct request *elv_next_request(struct request_queue *q) * avoid resource deadlock. REQ_STARTED will * prevent other fs requests from passing this one. */ - if (q->dma_drain_size && rq->data_len && + if (q->dma_drain_size && blk_rq_bytes(rq) && !(rq->cmd_flags & REQ_DONTPREP)) { /* * remove the space for the drain we added @@ -1911,8 +1878,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * can find how many bytes remain in the request * later. */ - req->nr_sectors = req->hard_nr_sectors = 0; - req->current_nr_sectors = req->hard_cur_sectors = 0; + req->data_len = 0; return false; } @@ -1926,8 +1892,25 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) bio_iovec(bio)->bv_len -= nr_bytes; } - blk_recalc_rq_sectors(req, total_bytes >> 9); + req->data_len -= total_bytes; + req->buffer = bio_data(req->bio); + + /* update sector only for requests with clear definition of sector */ + if (blk_fs_request(req) || blk_discard_rq(req)) + req->sector += total_bytes >> 9; + + /* + * If total number of sectors is less than the first segment + * size, something has gone terribly wrong. 
+ */ + if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { + printk(KERN_ERR "blk: request botched\n"); + req->data_len = blk_rq_cur_bytes(req); + } + + /* recalculate the number of segments */ blk_recalc_rq_segments(req); + return true; } EXPORT_SYMBOL_GPL(blk_update_request); @@ -2049,11 +2032,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->nr_phys_segments = bio_phys_segments(q, bio); rq->buffer = bio_data(bio); } - rq->current_nr_sectors = bio_cur_sectors(bio); - rq->hard_cur_sectors = rq->current_nr_sectors; - rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); rq->data_len = bio->bi_size; - rq->bio = rq->biotail = bio; if (bio->bi_bdev) diff --git a/block/blk-merge.c b/block/blk-merge.c index bf62a87a9da..b8df66aef0f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -9,35 +9,6 @@ #include "blk.h" -void blk_recalc_rq_sectors(struct request *rq, int nsect) -{ - if (blk_fs_request(rq) || blk_discard_rq(rq)) { - rq->hard_sector += nsect; - rq->hard_nr_sectors -= nsect; - - /* - * Move the I/O submission pointers ahead if required. - */ - if ((rq->nr_sectors >= rq->hard_nr_sectors) && - (rq->sector <= rq->hard_sector)) { - rq->sector = rq->hard_sector; - rq->nr_sectors = rq->hard_nr_sectors; - rq->hard_cur_sectors = bio_cur_sectors(rq->bio); - rq->current_nr_sectors = rq->hard_cur_sectors; - rq->buffer = bio_data(rq->bio); - } - - /* - * if total number of sectors is less than the first segment - * size, something has gone terribly wrong - */ - if (rq->nr_sectors < rq->current_nr_sectors) { - printk(KERN_ERR "blk: request botched\n"); - rq->nr_sectors = rq->current_nr_sectors; - } - } -} - static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio) { @@ -199,8 +170,9 @@ new_segment: if (unlikely(rq->cmd_flags & REQ_COPY_USER) && - (rq->data_len & q->dma_pad_mask)) { - unsigned int pad_len = (q->dma_pad_mask & ~rq->data_len) + 1; + (blk_rq_bytes(rq) & q->dma_pad_mask)) { + unsigned int pad_len = + (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; sg->length += pad_len; rq->extra_len += pad_len; @@ -398,7 +370,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = next->bio; req->biotail = next->biotail; - req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; + req->data_len += blk_rq_bytes(next); elv_merge_requests(q, req, next); diff --git a/block/blk.h b/block/blk.h index 51115599df9..ab54529103c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -101,7 +101,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, int attempt_back_merge(struct request_queue *q, struct request *rq); int attempt_front_merge(struct request_queue *q, struct request *rq); void blk_recalc_rq_segments(struct request *rq); -void blk_recalc_rq_sectors(struct request *rq, int nsect); void blk_queue_congestion_threshold(struct request_queue *q); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index db4d990a66b..99ac4304d71 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -579,9 +579,9 @@ cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, * Sort strictly based on sector. Smallest to the left, * largest to the right. 
*/ - if (sector > cfqq->next_rq->sector) + if (sector > blk_rq_pos(cfqq->next_rq)) n = &(*p)->rb_right; - else if (sector < cfqq->next_rq->sector) + else if (sector < blk_rq_pos(cfqq->next_rq)) n = &(*p)->rb_left; else break; @@ -611,8 +611,8 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) return; cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; - __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, cfqq->next_rq->sector, - &parent, &p); + __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, + blk_rq_pos(cfqq->next_rq), &parent, &p); if (!__cfqq) { rb_link_node(&cfqq->p_node, parent, p); rb_insert_color(&cfqq->p_node, cfqq->p_root); @@ -996,7 +996,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, if (cfq_rq_close(cfqd, __cfqq->next_rq)) return __cfqq; - if (__cfqq->next_rq->sector < sector) + if (blk_rq_pos(__cfqq->next_rq) < sector) node = rb_next(&__cfqq->p_node); else node = rb_prev(&__cfqq->p_node); diff --git a/include/linux/bio.h b/include/linux/bio.h index f37ca8c726b..d30ec6f30dd 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -218,12 +218,12 @@ struct bio { #define bio_sectors(bio) ((bio)->bi_size >> 9) #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) -static inline unsigned int bio_cur_sectors(struct bio *bio) +static inline unsigned int bio_cur_bytes(struct bio *bio) { if (bio->bi_vcnt) - return bio_iovec(bio)->bv_len >> 9; + return bio_iovec(bio)->bv_len; else /* dataless requests such as discard */ - return bio->bi_size >> 9; + return bio->bi_size; } static inline void *bio_data(struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4e5f8559872..ce2bf5efa9b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -166,19 +166,8 @@ struct request { enum rq_cmd_type_bits cmd_type; unsigned long atomic_flags; - /* Maintain bio traversal state for part by part I/O submission. - * hard_* are block layer internals, no driver should touch them! - */ - - sector_t sector; /* next sector to submit */ - sector_t hard_sector; /* next sector to complete */ - unsigned long nr_sectors; /* no. of sectors left to submit */ - unsigned long hard_nr_sectors; /* no. of sectors left to complete */ - /* no. of sectors left to submit in the current segment */ - unsigned int current_nr_sectors; - - /* no. of sectors left to complete in the current segment */ - unsigned int hard_cur_sectors; + sector_t sector; /* sector cursor */ + unsigned int data_len; /* total data len, don't access directly */ struct bio *bio; struct bio *biotail; @@ -226,7 +215,6 @@ struct request { unsigned char __cmd[BLK_MAX_CDB]; unsigned char *cmd; - unsigned int data_len; unsigned int extra_len; /* length of alignment and padding */ unsigned int sense_len; unsigned int resid_len; /* residual count */ @@ -840,20 +828,27 @@ extern void blkdev_dequeue_request(struct request *req); */ static inline sector_t blk_rq_pos(const struct request *rq) { - return rq->hard_sector; + return rq->sector; +} + +static inline unsigned int blk_rq_bytes(const struct request *rq) +{ + return rq->data_len; } -extern unsigned int blk_rq_bytes(struct request *rq); -extern unsigned int blk_rq_cur_bytes(struct request *rq); +static inline int blk_rq_cur_bytes(const struct request *rq) +{ + return rq->bio ? 
bio_cur_bytes(rq->bio) : 0; +} static inline unsigned int blk_rq_sectors(const struct request *rq) { - return rq->hard_nr_sectors; + return blk_rq_bytes(rq) >> 9; } static inline unsigned int blk_rq_cur_sectors(const struct request *rq) { - return rq->hard_cur_sectors; + return blk_rq_cur_bytes(rq) >> 9; } /* @@ -928,7 +923,7 @@ static inline void blk_end_request_all(struct request *rq, int error) */ static inline bool blk_end_request_cur(struct request *rq, int error) { - return blk_end_request(rq, error, rq->hard_cur_sectors << 9); + return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } /** @@ -981,7 +976,7 @@ static inline void __blk_end_request_all(struct request *rq, int error) */ static inline bool __blk_end_request_cur(struct request *rq, int error) { - return __blk_end_request(rq, error, rq->hard_cur_sectors << 9); + return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } extern void blk_complete_request(struct request *); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index c59b769f62b..4e462878c9c 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -171,7 +171,7 @@ enum { ELV_MQUEUE_MUST, }; -#define rq_end_sector(rq) ((rq)->sector + (rq)->nr_sectors) +#define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 42f1c11e754..5708a14bee5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -642,12 +642,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (blk_pc_request(rq)) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - rq->cmd_len, rq->cmd); + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, - rw, what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, + what, rq->errors, 0, NULL); } } @@ -854,11 +854,11 @@ void blk_add_driver_data(struct request_queue *q, return; if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); + __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); -- cgit v1.2.3-70-g09d2 From a2dec7b36364a5cc564c4d76cf16d2e7d33f5c05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:44 +0900 Subject: block: hide request sector and data_len Block low level drivers for some reason have been pretty good at abusing the block layer API. Especially struct request's fields tend to get violated in all possible ways. Make it clear that low level drivers MUST NOT access or manipulate rq->sector and rq->data_len directly by prefixing them with double underscores. This change is also necessary to break the build of out-of-tree code which assumes the previous block API, where internal fields can be manipulated and rq->data_len carries the residual count on completion.
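As an aside (not part of the patch), a minimal sketch of what well-behaved driver code looks like after this change; the helper name is hypothetical, and only the public accessors are used:

	#include <linux/blkdev.h>

	/*
	 * Hypothetical LLD helper: inspect a request's geometry strictly
	 * through the accessors.  rq->__sector and rq->__data_len are
	 * block layer internals and must never be read or written here.
	 */
	static void lld_dump_rq(struct request *rq)
	{
		printk(KERN_DEBUG "rq: pos=%llu bytes=%u cur_bytes=%d\n",
		       (unsigned long long)blk_rq_pos(rq), /* sector cursor */
		       blk_rq_bytes(rq),                   /* total length  */
		       blk_rq_cur_bytes(rq));              /* first segment */
	}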
[ Impact: hide internal fields, block API change ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 20 ++++++++++---------- block/blk-map.c | 2 +- block/blk-merge.c | 2 +- include/linux/blkdev.h | 9 +++++---- 4 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 3596ca71909..6226a380fb6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -127,7 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->timeout_list); rq->cpu = -1; rq->q = q; - rq->sector = (sector_t) -1; + rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->cmd = rq->__cmd; @@ -1095,7 +1095,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_NOIDLE; req->errors = 0; - req->sector = bio->bi_sector; + req->__sector = bio->bi_sector; req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } @@ -1143,7 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) req->biotail->bi_next = bio; req->biotail = bio; - req->data_len += bytes; + req->__data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; @@ -1169,8 +1169,8 @@ static int __make_request(struct request_queue *q, struct bio *bio) * not touch req->buffer either... */ req->buffer = bio_data(bio); - req->sector = bio->bi_sector; - req->data_len += bytes; + req->__sector = bio->bi_sector; + req->__data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; @@ -1878,7 +1878,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * can find how many bytes remain in the request * later. 
*/ - req->data_len = 0; + req->__data_len = 0; return false; } @@ -1892,12 +1892,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) bio_iovec(bio)->bv_len -= nr_bytes; } - req->data_len -= total_bytes; + req->__data_len -= total_bytes; req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ if (blk_fs_request(req) || blk_discard_rq(req)) - req->sector += total_bytes >> 9; + req->__sector += total_bytes >> 9; /* * If total number of sectors is less than the first segment @@ -1905,7 +1905,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { printk(KERN_ERR "blk: request botched\n"); - req->data_len = blk_rq_cur_bytes(req); + req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ @@ -2032,7 +2032,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->nr_phys_segments = bio_phys_segments(q, bio); rq->buffer = bio_data(bio); } - rq->data_len = bio->bi_size; + rq->__data_len = bio->bi_size; rq->bio = rq->biotail = bio; if (bio->bi_bdev) diff --git a/block/blk-map.c b/block/blk-map.c index 694fefad34e..56082bea450 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -20,7 +20,7 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, rq->biotail->bi_next = bio; rq->biotail = bio; - rq->data_len += bio->bi_size; + rq->__data_len += bio->bi_size; } return 0; } diff --git a/block/blk-merge.c b/block/blk-merge.c index b8df66aef0f..4974dd5767e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -370,7 +370,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = next->bio; req->biotail = next->biotail; - req->data_len += blk_rq_bytes(next); + req->__data_len += blk_rq_bytes(next); elv_merge_requests(q, req, next); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ce2bf5efa9b..c7558034570 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -166,8 +166,9 @@ struct request { enum rq_cmd_type_bits cmd_type; unsigned long atomic_flags; - sector_t sector; /* sector cursor */ - unsigned int data_len; /* total data len, don't access directly */ + /* the following two fields are internal, NEVER access directly */ + sector_t __sector; /* sector cursor */ + unsigned int __data_len; /* total data len */ struct bio *bio; struct bio *biotail; @@ -828,12 +829,12 @@ extern void blkdev_dequeue_request(struct request *req); */ static inline sector_t blk_rq_pos(const struct request *rq) { - return rq->sector; + return rq->__sector; } static inline unsigned int blk_rq_bytes(const struct request *rq) { - return rq->data_len; + return rq->__data_len; } static inline int blk_rq_cur_bytes(const struct request *rq) -- cgit v1.2.3-70-g09d2 From 9934c8c04561413609d2bc38c6b9f268cba774a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:16 +0900 Subject: block: implement and enforce request peek/start/fetch Till now, the block layer has allowed two separate modes of request execution. A request is always acquired from the request queue via elv_next_request(). After that, drivers are free to either dequeue it or process it without dequeueing. Dequeue allows elv_next_request() to return the next request so that multiple requests can be in flight.
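A sketch of the two modes described above, in driver-side pseudocode built from the old API (lld_issue() and lld_do_segment() are hypothetical driver routines; locking and error handling are elided):

	/* Mode 1: dequeueing - take requests off the queue so that
	 * several of them can be in flight at once. */
	while ((rq = elv_next_request(q)) != NULL) {
		blkdev_dequeue_request(rq);	/* hand off to the hardware */
		lld_issue(rq);			/* hypothetical issue routine */
	}

	/* Mode 2: in-place - the head request stays on the queue and is
	 * completed segment by segment; the next request only becomes
	 * visible once the current one is fully finished. */
	rq = elv_next_request(q);
	while (rq) {
		int err = lld_do_segment(rq);		/* hypothetical */
		if (!__blk_end_request_cur(rq, err))	/* rq done, get next */
			rq = elv_next_request(q);
	}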
Executing requests without dequeueing has its merits, mostly in allowing drivers for simpler devices which can't do scatter/gather to deal with segments one at a time without considering request boundaries. However, the benefit this brings is dubious and declining, while the cost of the API ambiguity is increasing. Segment-based drivers are usually for very old or limited devices, and as converting to the dequeueing model isn't difficult, it doesn't justify the API overhead it puts on the block layer and its more modern users. Previous patches converted all block low level drivers to the dequeueing model. This patch completes the API transition by... * renaming elv_next_request() to blk_peek_request() * renaming blkdev_dequeue_request() to blk_start_request() * adding blk_fetch_request() which is a combination of peek and start * disallowing completion of queued (not started) requests * applying the new API to all LLDs The renamings are for consistency and to break out-of-tree code so that it's apparent that out-of-tree drivers need updating. [ Impact: block request issue API cleanup, no functional change ] Signed-off-by: Tejun Heo Cc: Rusty Russell Cc: James Bottomley Cc: Mike Miller Cc: unsik Kim Cc: Paul Clements Cc: Tim Waugh Cc: Geert Uytterhoeven Cc: David S. Miller Cc: Laurent Vivier Cc: Jeff Garzik Cc: Jeremy Fitzhardinge Cc: Grant Likely Cc: Adrian McMenamin Cc: Stephen Rothwell Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Cc: Alex Dubov Cc: Pierre Ossman Cc: David Woodhouse Cc: Markus Lidel Cc: Stefan Weinhuber Cc: Martin Schwidefsky Cc: Pete Zaitcev Cc: FUJITA Tomonori Signed-off-by: Jens Axboe --- arch/arm/plat-omap/mailbox.c | 12 ++--- arch/um/drivers/ubd_kern.c | 3 +- block/blk-barrier.c | 4 +- block/blk-core.c | 105 ++++++++++++++++++++++++++---------- block/blk-tag.c | 2 +- block/blk.h | 1 + drivers/block/DAC960.c | 4 +- drivers/block/amiflop.c | 3 +- drivers/block/ataflop.c | 3 +- drivers/block/cciss.c | 4 +- drivers/block/cpqarray.c | 4 +- drivers/block/floppy.c | 6 +-- drivers/block/hd.c | 3 +- drivers/block/mg_disk.c | 12 ++--- drivers/block/nbd.c | 4 +- drivers/block/paride/pcd.c | 3 +- drivers/block/paride/pd.c | 7 +-- drivers/block/paride/pf.c | 3 +- drivers/block/ps3disk.c | 4 +- drivers/block/sunvdc.c | 3 +- drivers/block/swim.c | 12 ++--- drivers/block/swim3.c | 3 +- drivers/block/sx8.c | 8 ++- drivers/block/ub.c | 8 +-- drivers/block/viodasd.c | 4 +- drivers/block/virtio_blk.c | 4 +- drivers/block/xd.c | 12 ++--- drivers/block/xen-blkfront.c | 4 +- drivers/block/xsysace.c | 10 ++-- drivers/block/z2ram.c | 12 ++--- drivers/cdrom/gdrom.c | 4 +- drivers/cdrom/viocd.c | 4 +- drivers/ide/ide-atapi.c | 2 +- drivers/ide/ide-io.c | 9 ++-- drivers/memstick/core/mspro_block.c | 9 ++-- drivers/message/i2o/i2o_block.c | 6 +-- drivers/mmc/card/queue.c | 11 ++-- drivers/mtd/mtd_blkdevs.c | 7 +-- drivers/s390/block/dasd.c | 16 ++---- drivers/s390/char/tape_block.c | 7 +-- drivers/sbus/char/jsflash.c | 12 ++--- drivers/scsi/scsi_lib.c | 10 ++-- drivers/scsi/scsi_transport_sas.c | 4 +- include/linux/blkdev.h | 9 +++- include/linux/elevator.h | 2 - 45 files changed, 172 insertions(+), 207 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c index 7a1f5c25fd1..40424edae93 100644 --- a/arch/arm/plat-omap/mailbox.c +++ b/arch/arm/plat-omap/mailbox.c @@ -197,9 +197,7 @@ static void mbox_tx_work(struct work_struct *work) struct omap_msg_tx_data *tx_data; spin_lock(q->queue_lock); - rq = elv_next_request(q); - if (rq) - blkdev_dequeue_request(rq);
+ rq = blk_fetch_request(q); spin_unlock(q->queue_lock); if (!rq) @@ -242,9 +240,7 @@ static void mbox_rx_work(struct work_struct *work) while (1) { spin_lock_irqsave(q->queue_lock, flags); - rq = elv_next_request(q); - if (rq) - blkdev_dequeue_request(rq); + rq = blk_fetch_request(q); spin_unlock_irqrestore(q->queue_lock, flags); if (!rq) break; @@ -351,9 +347,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf) while (1) { spin_lock_irqsave(q->queue_lock, flags); - rq = elv_next_request(q); - if (rq) - blkdev_dequeue_request(rq); + rq = blk_fetch_request(q); spin_unlock_irqrestore(q->queue_lock, flags); if (!rq) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 402ba8f70fc..aa9e926e13d 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1228,12 +1228,11 @@ static void do_ubd_request(struct request_queue *q) while(1){ struct ubd *dev = q->queuedata; if(dev->end_sg == 0){ - struct request *req = elv_next_request(q); + struct request *req = blk_fetch_request(q); if(req == NULL) return; dev->request = req; - blkdev_dequeue_request(req); dev->start_sg = 0; dev->end_sg = blk_rq_map_sg(q, req, dev->sg); } diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8713c2fbc4f..0ab81a0a750 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -180,7 +180,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) } /* stash away the original request */ - elv_dequeue_request(q, rq); + blk_dequeue_request(rq); q->orig_bar_rq = rq; rq = NULL; @@ -248,7 +248,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) * Queue ordering not supported. Terminate * with prejudice. */ - elv_dequeue_request(q, rq); + blk_dequeue_request(rq); __blk_end_request_all(rq, -EOPNOTSUPP); *rqp = NULL; return false; diff --git a/block/blk-core.c b/block/blk-core.c index 6226a380fb6..93691d2ac5a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -902,6 +902,8 @@ EXPORT_SYMBOL(blk_get_request); */ void blk_requeue_request(struct request_queue *q, struct request *rq) { + BUG_ON(blk_queued_rq(rq)); + blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); @@ -1610,28 +1612,6 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); -/** - * blkdev_dequeue_request - dequeue request and start timeout timer - * @req: request to dequeue - * - * Dequeue @req and start timeout timer on it. This hands off the - * request to the driver. - * - * Block internal functions which don't want to start timer should - * call elv_dequeue_request(). - */ -void blkdev_dequeue_request(struct request *req) -{ - elv_dequeue_request(req->q, req); - - /* - * We are now handing the request to the hardware, add the - * timeout handler. - */ - blk_add_timer(req); -} -EXPORT_SYMBOL(blkdev_dequeue_request); - static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { @@ -1671,7 +1651,23 @@ static void blk_account_io_done(struct request *req) } } -struct request *elv_next_request(struct request_queue *q) +/** + * blk_peek_request - peek at the top of a request queue + * @q: request queue to peek at + * + * Description: + * Return the request at the top of @q. The returned request + * should be started using blk_start_request() before LLD starts + * processing it. + * + * Return: + * Pointer to the request at the top of @q if available. Null + * otherwise. 
+ * + * Context: + * queue_lock must be held. + */ +struct request *blk_peek_request(struct request_queue *q) { struct request *rq; int ret; @@ -1748,10 +1744,12 @@ struct request *elv_next_request(struct request_queue *q) return rq; } -EXPORT_SYMBOL(elv_next_request); +EXPORT_SYMBOL(blk_peek_request); -void elv_dequeue_request(struct request_queue *q, struct request *rq) +void blk_dequeue_request(struct request *rq) { + struct request_queue *q = rq->q; + BUG_ON(list_empty(&rq->queuelist)); BUG_ON(ELV_ON_HASH(rq)); @@ -1766,6 +1764,58 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq) q->in_flight++; } +/** + * blk_start_request - start request processing on the driver + * @req: request to dequeue + * + * Description: + * Dequeue @req and start timeout timer on it. This hands off the + * request to the driver. + * + * Block internal functions which don't want to start timer should + * call blk_dequeue_request(). + * + * Context: + * queue_lock must be held. + */ +void blk_start_request(struct request *req) +{ + blk_dequeue_request(req); + + /* + * We are now handing the request to the hardware, add the + * timeout handler. + */ + blk_add_timer(req); +} +EXPORT_SYMBOL(blk_start_request); + +/** + * blk_fetch_request - fetch a request from a request queue + * @q: request queue to fetch a request from + * + * Description: + * Return the request at the top of @q. The request is started on + * return and LLD can start processing it immediately. + * + * Return: + * Pointer to the request at the top of @q if available. Null + * otherwise. + * + * Context: + * queue_lock must be held. + */ +struct request *blk_fetch_request(struct request_queue *q) +{ + struct request *rq; + + rq = blk_peek_request(q); + if (rq) + blk_start_request(rq); + return rq; +} +EXPORT_SYMBOL(blk_fetch_request); + /** * blk_update_request - Special helper function for request stacking drivers * @rq: the request being processed @@ -1937,12 +1987,11 @@ static bool blk_update_bidi_request(struct request *rq, int error, */ static void blk_finish_request(struct request *req, int error) { + BUG_ON(blk_queued_rq(req)); + if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); - if (blk_queued_rq(req)) - elv_dequeue_request(req->q, req); - if (unlikely(laptop_mode) && blk_fs_request(req)) laptop_io_completion(); diff --git a/block/blk-tag.c b/block/blk-tag.c index 3c518e3303a..c260f7c30dd 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -374,7 +374,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) rq->cmd_flags |= REQ_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; - blkdev_dequeue_request(rq); + blk_start_request(rq); list_add(&rq->queuelist, &q->tag_busy_list); return 0; } diff --git a/block/blk.h b/block/blk.h index ab54529103c..9e0042ca949 100644 --- a/block/blk.h +++ b/block/blk.h @@ -13,6 +13,7 @@ extern struct kobj_type blk_queue_ktype; void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); +void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); void blk_unplug_work(struct work_struct *work); diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 774ab05973a..668dc234b8e 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -3321,7 +3321,7 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_ DAC960_Command_T *Command; while(1) { - Request = elv_next_request(req_q); + Request = 
blk_peek_request(req_q); if (!Request) return 1; @@ -3341,7 +3341,7 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_ Command->BlockNumber = blk_rq_pos(Request); Command->BlockCount = blk_rq_sectors(Request); Command->Request = Request; - blkdev_dequeue_request(Request); + blk_start_request(Request); Command->SegmentCount = blk_rq_map_sg(req_q, Command->Request, Command->cmd_sglist); /* pci_map_sg MAY change the value of SegCount */ diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 80a68b2e045..9c6e5b0fe89 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1342,12 +1342,11 @@ static void redo_fd_request(void) int err; next_req: - rq = elv_next_request(floppy_queue); + rq = blk_fetch_request(floppy_queue); if (!rq) { /* Nothing left to do */ return; } - blkdev_dequeue_request(rq); floppy = rq->rq_disk->private_data; drive = floppy - unit; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 89a591d9c83..f5e7180d7f4 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1404,10 +1404,9 @@ static void redo_fd_request(void) repeat: if (!fd_request) { - fd_request = elv_next_request(floppy_queue); + fd_request = blk_fetch_request(floppy_queue); if (!fd_request) goto the_end; - blkdev_dequeue_request(fd_request); } floppy = fd_request->rq_disk->private_data; diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index ab7b04c0db7..e714e7cce6f 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -2801,7 +2801,7 @@ static void do_cciss_request(struct request_queue *q) goto startio; queue: - creq = elv_next_request(q); + creq = blk_peek_request(q); if (!creq) goto startio; @@ -2810,7 +2810,7 @@ static void do_cciss_request(struct request_queue *q) if ((c = cmd_alloc(h, 1)) == NULL) goto full; - blkdev_dequeue_request(creq); + blk_start_request(creq); spin_unlock_irq(q->queue_lock); diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index a5caeff4718..a02dcfc00f1 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -903,7 +903,7 @@ static void do_ida_request(struct request_queue *q) goto startio; queue_next: - creq = elv_next_request(q); + creq = blk_peek_request(q); if (!creq) goto startio; @@ -912,7 +912,7 @@ queue_next: if ((c = cmd_alloc(h,1)) == NULL) goto startio; - blkdev_dequeue_request(creq); + blk_start_request(creq); c->ctlr = h->ctlr; c->hdr.unit = (drv_info_t *)(creq->rq_disk->private_data) - h->drv; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index e2c70d2085a..90877fee0ee 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -931,7 +931,7 @@ static inline void unlock_fdc(void) del_timer(&fd_timeout); cont = NULL; clear_bit(0, &fdc_busy); - if (current_req || elv_next_request(floppy_queue)) + if (current_req || blk_peek_request(floppy_queue)) do_fd_request(floppy_queue); spin_unlock_irqrestore(&floppy_lock, flags); wake_up(&fdc_wait); @@ -2912,9 +2912,7 @@ static void redo_fd_request(void) struct request *req; spin_lock_irq(floppy_queue->queue_lock); - req = elv_next_request(floppy_queue); - if (req) - blkdev_dequeue_request(req); + req = blk_fetch_request(floppy_queue); spin_unlock_irq(floppy_queue->queue_lock); if (!req) { do_floppy = NULL; diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 288ab63c102..961de56d00a 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -592,12 +592,11 @@ repeat: del_timer(&device_timer); if (!hd_req) { - hd_req = elv_next_request(hd_queue); + hd_req = 
blk_fetch_request(hd_queue); if (!hd_req) { do_hd = NULL; return; } - blkdev_dequeue_request(hd_req); } req = hd_req; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 1ca5d1423fa..c0cd0a03f69 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -671,10 +671,8 @@ static void mg_request_poll(struct request_queue *q) while (1) { if (!host->req) { - host->req = elv_next_request(q); - if (host->req) - blkdev_dequeue_request(host->req); - else + host->req = blk_fetch_request(q); + if (!host->req) break; } @@ -744,10 +742,8 @@ static void mg_request(struct request_queue *q) while (1) { if (!host->req) { - host->req = elv_next_request(q); - if (host->req) - blkdev_dequeue_request(host->req); - else + host->req = blk_fetch_request(q); + if (!host->req) break; } req = host->req; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index fad167de23b..5d23ffad7c7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -533,11 +533,9 @@ static void do_nbd_request(struct request_queue *q) { struct request *req; - while ((req = elv_next_request(q)) != NULL) { + while ((req = blk_fetch_request(q)) != NULL) { struct nbd_device *lo; - blkdev_dequeue_request(req); - spin_unlock_irq(q->queue_lock); dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n", diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 425f81586a3..911dfd98d81 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -720,10 +720,9 @@ static void do_pcd_request(struct request_queue * q) return; while (1) { if (!pcd_req) { - pcd_req = elv_next_request(q); + pcd_req = blk_fetch_request(q); if (!pcd_req) return; - blkdev_dequeue_request(pcd_req); } if (rq_data_dir(pcd_req) == READ) { diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index d2ca3f55206..bf5955b3d87 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -412,11 +412,9 @@ static void run_fsm(void) spin_lock_irqsave(&pd_lock, saved_flags); if (!__blk_end_request_cur(pd_req, res == Ok ? 
0 : -EIO)) { - pd_req = elv_next_request(pd_queue); + pd_req = blk_fetch_request(pd_queue); if (!pd_req) stop = 1; - else - blkdev_dequeue_request(pd_req); } spin_unlock_irqrestore(&pd_lock, saved_flags); if (stop) @@ -706,10 +704,9 @@ static void do_pd_request(struct request_queue * q) { if (pd_req) return; - pd_req = elv_next_request(q); + pd_req = blk_fetch_request(q); if (!pd_req) return; - blkdev_dequeue_request(pd_req); schedule_fsm(); } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index d6f7bd84ed3..68a90834e99 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -762,10 +762,9 @@ static void do_pf_request(struct request_queue * q) return; repeat: if (!pf_req) { - pf_req = elv_next_request(q); + pf_req = blk_fetch_request(q); if (!pf_req) return; - blkdev_dequeue_request(pf_req); } pf_current = pf_req->rq_disk->private_data; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index f4d8db944e7..338cee4cc0b 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -194,9 +194,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__); - while ((req = elv_next_request(q))) { - blkdev_dequeue_request(req); - + while ((req = blk_fetch_request(q))) { if (blk_fs_request(req)) { if (ps3disk_submit_request_sg(dev, req)) break; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 9f351bfa15e..cbfd9c0aef0 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -441,12 +441,11 @@ out: static void do_vdc_request(struct request_queue *q) { while (1) { - struct request *req = elv_next_request(q); + struct request *req = blk_fetch_request(q); if (!req) break; - blkdev_dequeue_request(req); if (__send_request(req) < 0) __blk_end_request_all(req, -EIO); } diff --git a/drivers/block/swim.c b/drivers/block/swim.c index dedd4893f5e..cf7877fb8a7 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -528,10 +528,7 @@ static void redo_fd_request(struct request_queue *q) struct request *req; struct floppy_state *fs; - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - + req = blk_fetch_request(q); while (req) { int err = -EIO; @@ -554,11 +551,8 @@ static void redo_fd_request(struct request_queue *q) break; } done: - if (!__blk_end_request_cur(req, err)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!__blk_end_request_cur(req, err)) + req = blk_fetch_request(q); } } diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index f48c6dd47e0..80df93e3cdd 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -326,10 +326,9 @@ static void start_request(struct floppy_state *fs) } while (fs->state == idle) { if (!fd_req) { - fd_req = elv_next_request(swim3_queue); + fd_req = blk_fetch_request(swim3_queue); if (!fd_req) break; - blkdev_dequeue_request(fd_req); } req = fd_req; #if 0 diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 087c94c8b2d..da403b6a7f4 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -810,12 +810,10 @@ static void carm_oob_rq_fn(struct request_queue *q) while (1) { DPRINTK("get req\n"); - rq = elv_next_request(q); + rq = blk_fetch_request(q); if (!rq) break; - blkdev_dequeue_request(rq); - crq = rq->special; assert(crq != NULL); assert(crq->rq == rq); @@ -846,7 +844,7 @@ static void carm_rq_fn(struct request_queue *q) queue_one_request: VPRINTK("get req\n"); - rq = elv_next_request(q); + rq = blk_peek_request(q); if (!rq) return; @@ 
-857,7 +855,7 @@ queue_one_request: } crq->rq = rq; - blkdev_dequeue_request(rq); + blk_start_request(rq); if (rq_data_dir(rq) == WRITE) { writing = 1; diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 40d03cf63f2..178f459a50e 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -627,7 +627,7 @@ static void ub_request_fn(struct request_queue *q) struct ub_lun *lun = q->queuedata; struct request *rq; - while ((rq = elv_next_request(q)) != NULL) { + while ((rq = blk_peek_request(q)) != NULL) { if (ub_request_fn_1(lun, rq) != 0) { blk_stop_queue(q); break; @@ -643,13 +643,13 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq) int n_elem; if (atomic_read(&sc->poison)) { - blkdev_dequeue_request(rq); + blk_start_request(rq); ub_end_rq(rq, DID_NO_CONNECT << 16, blk_rq_bytes(rq)); return 0; } if (lun->changed && !blk_pc_request(rq)) { - blkdev_dequeue_request(rq); + blk_start_request(rq); ub_end_rq(rq, SAM_STAT_CHECK_CONDITION, blk_rq_bytes(rq)); return 0; } @@ -660,7 +660,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq) return -1; memset(cmd, 0, sizeof(struct ub_scsi_cmd)); - blkdev_dequeue_request(rq); + blk_start_request(rq); urq = &lun->urq; memset(urq, 0, sizeof(struct ub_request)); diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index 2086cb12d3e..390d69bb7c4 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -361,11 +361,9 @@ static void do_viodasd_request(struct request_queue *q) * back later. */ while (num_req_outstanding < VIOMAXREQ) { - req = elv_next_request(q); + req = blk_fetch_request(q); if (req == NULL) return; - /* dequeue the current request from the queue */ - blkdev_dequeue_request(req); /* check that request contains a valid command */ if (!blk_fs_request(req)) { viodasd_end_request(req, -EIO, blk_rq_sectors(req)); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1980ab45635..29a9daf4862 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -128,7 +128,7 @@ static void do_virtblk_request(struct request_queue *q) struct request *req; unsigned int issued = 0; - while ((req = elv_next_request(q)) != NULL) { + while ((req = blk_peek_request(q)) != NULL) { vblk = req->rq_disk->private_data; BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); @@ -138,7 +138,7 @@ static void do_virtblk_request(struct request_queue *q) blk_stop_queue(q); break; } - blkdev_dequeue_request(req); + blk_start_request(req); issued++; } diff --git a/drivers/block/xd.c b/drivers/block/xd.c index d4c4352354b..ce242921992 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -305,10 +305,7 @@ static void do_xd_request (struct request_queue * q) if (xdc_busy) return; - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - + req = blk_fetch_request(q); while (req) { unsigned block = blk_rq_pos(req); unsigned count = blk_rq_cur_sectors(req); @@ -325,11 +322,8 @@ static void do_xd_request (struct request_queue * q) block, count); done: /* wrap up, 0 = success, -errno = fail */ - if (!__blk_end_request_cur(req, res)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!__blk_end_request_cur(req, res)) + req = blk_fetch_request(q); } } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 66f834571b8..6d4ac76c280 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -299,13 +299,13 @@ static void do_blkif_request(struct request_queue *rq) queued = 0; - while ((req = 
elv_next_request(rq)) != NULL) { + while ((req = blk_peek_request(rq)) != NULL) { info = req->rq_disk->private_data; if (RING_FULL(&info->ring)) goto wait; - blkdev_dequeue_request(req); + blk_start_request(req); if (!blk_fs_request(req)) { __blk_end_request_all(req, -EIO); diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index edf137b6c37..3a4397edab7 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -463,10 +463,10 @@ struct request *ace_get_next_request(struct request_queue * q) { struct request *req; - while ((req = elv_next_request(q)) != NULL) { + while ((req = blk_peek_request(q)) != NULL) { if (blk_fs_request(req)) break; - blkdev_dequeue_request(req); + blk_start_request(req); __blk_end_request_all(req, -EIO); } return req; @@ -498,10 +498,8 @@ static void ace_fsm_dostate(struct ace_device *ace) __blk_end_request_all(ace->req, -EIO); ace->req = NULL; } - while ((req = elv_next_request(ace->queue)) != NULL) { - blkdev_dequeue_request(req); + while ((req = blk_fetch_request(ace->queue)) != NULL) __blk_end_request_all(req, -EIO); - } /* Drop back to IDLE state and notify waiters */ ace->fsm_state = ACE_FSM_STATE_IDLE; @@ -649,7 +647,7 @@ static void ace_fsm_dostate(struct ace_device *ace) ace->fsm_state = ACE_FSM_STATE_IDLE; break; } - blkdev_dequeue_request(req); + blk_start_request(req); /* Okay, it's a data request, set it up for transfer */ dev_dbg(ace->dev, diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index c909c1a3f65..4575171e5be 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -71,10 +71,7 @@ static void do_z2_request(struct request_queue *q) { struct request *req; - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - + req = blk_fetch_request(q); while (req) { unsigned long start = blk_rq_pos(req) << 9; unsigned long len = blk_rq_cur_bytes(req); @@ -100,11 +97,8 @@ static void do_z2_request(struct request_queue *q) len -= size; } done: - if (!__blk_end_request_cur(req, err)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!__blk_end_request_cur(req, err)) + req = blk_fetch_request(q); } } diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 3cc02bfe828..1e366ad8f68 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -642,9 +642,7 @@ static void gdrom_request(struct request_queue *rq) { struct request *req; - while ((req = elv_next_request(rq)) != NULL) { - blkdev_dequeue_request(req); - + while ((req = blk_fetch_request(rq)) != NULL) { if (!blk_fs_request(req)) { printk(KERN_DEBUG "GDROM: Non-fs request ignored\n"); __blk_end_request_all(req, -EIO); diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index bbe9f086734..ca741c21e4a 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -297,9 +297,7 @@ static void do_viocd_request(struct request_queue *q) { struct request *req; - while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) { - blkdev_dequeue_request(req); - + while ((rwreq == 0) && ((req = blk_fetch_request(q)) != NULL)) { if (!blk_fs_request(req)) __blk_end_request_all(req, -EIO); else if (send_request(req) < 0) { diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2874c3d703a..8a894fa37b5 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -269,7 +269,7 @@ void ide_retry_pc(ide_drive_t *drive) blk_requeue_request(failed_rq->q, failed_rq); drive->hwif->rq = NULL; if (ide_queue_sense_rq(drive, pc)) { - blkdev_dequeue_request(failed_rq); + 
blk_start_request(failed_rq); ide_complete_rq(drive, -EIO, blk_rq_bytes(failed_rq)); } } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index abda7337b3f..e4e3a0e3201 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -519,11 +519,8 @@ repeat: * we know that the queue isn't empty, but this can happen * if the q->prep_rq_fn() decides to kill a request */ - if (!rq) { - rq = elv_next_request(drive->queue); - if (rq) - blkdev_dequeue_request(rq); - } + if (!rq) + rq = blk_fetch_request(drive->queue); spin_unlock_irq(q->queue_lock); spin_lock_irq(&hwif->lock); @@ -536,7 +533,7 @@ repeat: /* * Sanity: don't accept a request that isn't a PM request * if we are currently power managed. This is very important as - * blk_stop_queue() doesn't prevent the elv_next_request() + * blk_stop_queue() doesn't prevent the blk_fetch_request() * above to return us whatever is in the queue. Since we call * ide_do_request() ourselves, we end up taking requests while * the queue is blocked... diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 58f5be8cd69..c0bebc6a2f2 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -704,13 +704,12 @@ try_again: return 0; } - dev_dbg(&card->dev, "elv_next\n"); - msb->block_req = elv_next_request(msb->queue); + dev_dbg(&card->dev, "blk_fetch\n"); + msb->block_req = blk_fetch_request(msb->queue); if (!msb->block_req) { dev_dbg(&card->dev, "issue end\n"); return -EAGAIN; } - blkdev_dequeue_request(msb->block_req); dev_dbg(&card->dev, "trying again\n"); chunk = 1; @@ -825,10 +824,8 @@ static void mspro_block_submit_req(struct request_queue *q) return; if (msb->eject) { - while ((req = elv_next_request(q)) != NULL) { - blkdev_dequeue_request(req); + while ((req = blk_fetch_request(q)) != NULL) __blk_end_request_all(req, -ENODEV); - } return; } diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 8b5cbfc3ba9..6573ef4408f 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -877,7 +877,7 @@ static void i2o_block_request_fn(struct request_queue *q) struct request *req; while (!blk_queue_plugged(q)) { - req = elv_next_request(q); + req = blk_peek_request(q); if (!req) break; @@ -890,7 +890,7 @@ static void i2o_block_request_fn(struct request_queue *q) if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS) { if (!i2o_block_transfer(req)) { - blkdev_dequeue_request(req); + blk_start_request(req); continue; } else osm_info("transfer error\n"); @@ -917,7 +917,7 @@ static void i2o_block_request_fn(struct request_queue *q) break; } } else { - blkdev_dequeue_request(req); + blk_start_request(req); __blk_end_request_all(req, -EIO); } } diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index 4b70f1e2834..49e582356c6 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -54,11 +54,8 @@ static int mmc_queue_thread(void *d) spin_lock_irq(q->queue_lock); set_current_state(TASK_INTERRUPTIBLE); - if (!blk_queue_plugged(q)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!blk_queue_plugged(q)) + req = blk_fetch_request(q); mq->req = req; spin_unlock_irq(q->queue_lock); @@ -94,10 +91,8 @@ static void mmc_request(struct request_queue *q) if (!mq) { printk(KERN_ERR "MMC: killing requests for dead queue\n"); - while ((req = elv_next_request(q)) != NULL) { - blkdev_dequeue_request(req); + while ((req = blk_fetch_request(q)) != NULL) __blk_end_request_all(req, -EIO); - } return; } 
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 3e10442615d..502622f628b 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -100,12 +100,7 @@ static int mtd_blktrans_thread(void *arg) struct mtd_blktrans_dev *dev; int res; - if (!req) { - req = elv_next_request(rq); - if (req) - blkdev_dequeue_request(req); - } - if (!req) { + if (!req && !(req = blk_fetch_request(rq))) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(rq->queue_lock); schedule(); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 7df03c7aea0..e64f62d5e0f 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1656,17 +1656,13 @@ static void __dasd_process_request_queue(struct dasd_block *block) if (basedev->state < DASD_STATE_READY) return; /* Now we try to fetch requests from the request queue */ - while (!blk_queue_plugged(queue) && - elv_next_request(queue)) { - - req = elv_next_request(queue); - + while (!blk_queue_plugged(queue) && (req = blk_peek_request(queue))) { if (basedev->features & DASD_FEATURE_READONLY && rq_data_dir(req) == WRITE) { DBF_DEV_EVENT(DBF_ERR, basedev, "Rejecting write request %p", req); - blkdev_dequeue_request(req); + blk_start_request(req); __blk_end_request_all(req, -EIO); continue; } @@ -1695,7 +1691,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "CCW creation failed (rc=%ld) " "on request %p", PTR_ERR(cqr), req); - blkdev_dequeue_request(req); + blk_start_request(req); __blk_end_request_all(req, -EIO); continue; } @@ -1705,7 +1701,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) */ cqr->callback_data = (void *) req; cqr->status = DASD_CQR_FILLED; - blkdev_dequeue_request(req); + blk_start_request(req); list_add_tail(&cqr->blocklist, &block->ccw_queue); dasd_profile_start(block, cqr, req); } @@ -2029,10 +2025,8 @@ static void dasd_flush_request_queue(struct dasd_block *block) return; spin_lock_irq(&block->request_queue_lock); - while ((req = elv_next_request(block->request_queue))) { - blkdev_dequeue_request(req); + while ((req = blk_fetch_request(block->request_queue))) __blk_end_request_all(req, -EIO); - } spin_unlock_irq(&block->request_queue_lock); } diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 5d035e4939d..1e796767598 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -93,7 +93,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data) device->blk_data.block_position = -1; device->discipline->free_bread(ccw_req); if (!list_empty(&device->req_queue) || - elv_next_request(device->blk_data.request_queue)) + blk_peek_request(device->blk_data.request_queue)) tapeblock_trigger_requeue(device); } @@ -162,19 +162,16 @@ tapeblock_requeue(struct work_struct *work) { spin_lock_irq(&device->blk_data.request_queue_lock); while ( !blk_queue_plugged(queue) && - elv_next_request(queue) && + (req = blk_fetch_request(queue)) && nr_queued < TAPEBLOCK_MIN_REQUEUE ) { - req = elv_next_request(queue); if (rq_data_dir(req) == WRITE) { DBF_EVENT(1, "TBLOCK: Rejecting write request\n"); - blkdev_dequeue_request(req); spin_unlock_irq(&device->blk_data.request_queue_lock); blk_end_request_all(req, -EIO); spin_lock_irq(&device->blk_data.request_queue_lock); continue; } - blkdev_dequeue_request(req); nr_queued++; spin_unlock_irq(&device->blk_data.request_queue_lock); rc = tapeblock_start_request(device, req); diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index 
f572a4a1d14..6d465168468 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -186,10 +186,7 @@ static void jsfd_do_request(struct request_queue *q) { struct request *req; - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - + req = blk_fetch_request(q); while (req) { struct jsfd_part *jdp = req->rq_disk->private_data; unsigned long offset = blk_rq_pos(req) << 9; @@ -212,11 +209,8 @@ static void jsfd_do_request(struct request_queue *q) jsfd_read(req->buffer, jdp->dbase + offset, len); err = 0; end: - if (!__blk_end_request_cur(req, err)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!__blk_end_request_cur(req, err)) + req = blk_fetch_request(q); } } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ee308f6f798..b12750f8216 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1207,7 +1207,7 @@ int scsi_prep_return(struct request_queue *q, struct request *req, int ret) break; case BLKPREP_DEFER: /* - * If we defer, the elv_next_request() returns NULL, but the + * If we defer, the blk_peek_request() returns NULL, but the * queue must be restarted, so we plug here if no returning * command will automatically do that. */ @@ -1385,7 +1385,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q) struct scsi_target *starget = scsi_target(sdev); struct Scsi_Host *shost = sdev->host; - blkdev_dequeue_request(req); + blk_start_request(req); if (unlikely(cmd == NULL)) { printk(KERN_CRIT "impossible request in %s.\n", @@ -1477,7 +1477,7 @@ static void scsi_request_fn(struct request_queue *q) if (!sdev) { printk("scsi: killing requests for dead queue\n"); - while ((req = elv_next_request(q)) != NULL) + while ((req = blk_peek_request(q)) != NULL) scsi_kill_request(req, q); return; } @@ -1498,7 +1498,7 @@ static void scsi_request_fn(struct request_queue *q) * that the request is fully prepared even if we cannot * accept it. */ - req = elv_next_request(q); + req = blk_peek_request(q); if (!req || !scsi_dev_queue_ready(q, sdev)) break; @@ -1514,7 +1514,7 @@ static void scsi_request_fn(struct request_queue *q) * Remove the request from the request list. 
*/ if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req))) - blkdev_dequeue_request(req); + blk_start_request(req); sdev->device_busy++; spin_unlock(q->queue_lock); diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index 50988cbf7b2..d606452297c 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -163,12 +163,10 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost, int (*handler)(struct Scsi_Host *, struct sas_rphy *, struct request *); while (!blk_queue_plugged(q)) { - req = elv_next_request(q); + req = blk_fetch_request(q); if (!req) break; - blkdev_dequeue_request(req); - spin_unlock_irq(q->queue_lock); handler = to_sas_internal(shost->transportt)->f->smp_handler; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c7558034570..6e59d3b92ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -818,8 +818,6 @@ static inline void blk_run_address_space(struct address_space *mapping) blk_run_backing_dev(mapping->backing_dev_info, NULL); } -extern void blkdev_dequeue_request(struct request *req); - /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request @@ -852,6 +850,13 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +/* + * Request issue related functions. + */ +extern struct request *blk_peek_request(struct request_queue *q); +extern void blk_start_request(struct request *rq); +extern struct request *blk_fetch_request(struct request_queue *q); + /* * Request completion related functions. * diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 4e462878c9c..1cb3372e65d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -103,10 +103,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, int); -extern void elv_dequeue_request(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern int elv_queue_empty(struct request_queue *); -extern struct request *elv_next_request(struct request_queue *q); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); -- cgit v1.2.3-70-g09d2 From 1822952ba2b9f22f79019d07ebbeca31dc14b718 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 11 May 2009 17:56:07 +0900 Subject: block: let blk_end_request_all handle bidi requests blk_end_request_all() and __blk_end_request_all() should finish all bytes including bidi, by definition. That's what all bidi users need; bidi requests must be completed as a whole (partial completion is impossible).
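For driver code the practical effect is that a single call now finishes both halves of a bidi pair; a sketch (the surrounding driver and its completion hook are hypothetical, the helper is the one changed here):

	/* completion path of a hypothetical bidi-capable LLD */
	static void lld_complete_rq(struct request *rq, int error)
	{
		/*
		 * Finishes blk_rq_bytes(rq) bytes on @rq and, if @rq is
		 * bidi, blk_rq_bytes(rq->next_rq) bytes on the paired
		 * reply request - no manual bidi bookkeeping needed.
		 */
		blk_end_request_all(rq, error);
	}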
Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6e59d3b92ff..1069f4483c6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -910,8 +910,12 @@ static inline bool blk_end_request(struct request *rq, int error, static inline void blk_end_request_all(struct request *rq, int error) { bool pending; + unsigned int bidi_bytes = 0; - pending = blk_end_request(rq, error, blk_rq_bytes(rq)); + if (unlikely(blk_bidi_rq(rq))) + bidi_bytes = blk_rq_bytes(rq->next_rq); + + pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } @@ -962,8 +966,12 @@ static inline bool __blk_end_request(struct request *rq, int error, static inline void __blk_end_request_all(struct request *rq, int error) { bool pending; + unsigned int bidi_bytes = 0; + + if (unlikely(blk_bidi_rq(rq))) + bidi_bytes = blk_rq_bytes(rq->next_rq); - pending = __blk_end_request(rq, error, blk_rq_bytes(rq)); + pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } -- cgit v1.2.3-70-g09d2 From b1f744937f1be3e6d3009382a755679133cf782d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 11 May 2009 17:56:09 +0900 Subject: block: move completion related functions back to blk-core.c Let's put the completion related functions back to block/blk-core.c, where they used to live. We can also unexport blk_end_bidi_request() and __blk_end_bidi_request(), which nobody uses. Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- block/blk-core.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/blkdev.h | 128 ++++--------------------------------------------- 2 files changed, 130 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 93691d2ac5a..a2d97de1a12 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2026,8 +2026,8 @@ static void blk_finish_request(struct request *req, int error) * %false - we are done with this request * %true - still buffers pending for this request **/ -bool blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) +static bool blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) { struct request_queue *q = rq->q; unsigned long flags; @@ -2041,7 +2041,6 @@ bool blk_end_bidi_request(struct request *rq, int error, return false; } -EXPORT_SYMBOL_GPL(blk_end_bidi_request); /** * __blk_end_bidi_request - Complete a bidi request with queue lock held * @@ -2058,8 +2057,8 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request); -bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) +static bool __blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; @@ -2068,7 +2067,124 @@ bool __blk_end_bidi_request(struct request *rq, int error, return false; } -EXPORT_SYMBOL_GPL(__blk_end_bidi_request); + +/** + * blk_end_request - Helper function for drivers to complete the request.
+ * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Ends I/O on a number of bytes attached to @rq. + * If @rq has leftover, sets it up for the next range of segments. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + **/ +bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +{ + return blk_end_bidi_request(rq, error, nr_bytes, 0); +} +EXPORT_SYMBOL_GPL(blk_end_request); + +/** + * blk_end_request_all - Helper function for drives to finish the request. + * @rq: the request to finish + * @err: %0 for success, < %0 for error + * + * Description: + * Completely finish @rq. + */ +void blk_end_request_all(struct request *rq, int error) +{ + bool pending; + unsigned int bidi_bytes = 0; + + if (unlikely(blk_bidi_rq(rq))) + bidi_bytes = blk_rq_bytes(rq->next_rq); + + pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); + BUG_ON(pending); +} +EXPORT_SYMBOL_GPL(blk_end_request_all); + +/** + * blk_end_request_cur - Helper function to finish the current request chunk. + * @rq: the request to finish the current chunk for + * @err: %0 for success, < %0 for error + * + * Description: + * Complete the current consecutively mapped chunk from @rq. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool blk_end_request_cur(struct request *rq, int error) +{ + return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); +} +EXPORT_SYMBOL_GPL(blk_end_request_cur); + +/** + * __blk_end_request - Helper function for drivers to complete the request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Must be called with queue lock held unlike blk_end_request(). + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + **/ +bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +{ + return __blk_end_bidi_request(rq, error, nr_bytes, 0); +} +EXPORT_SYMBOL_GPL(__blk_end_request); + +/** + * __blk_end_request_all - Helper function for drives to finish the request. + * @rq: the request to finish + * @err: %0 for success, < %0 for error + * + * Description: + * Completely finish @rq. Must be called with queue lock held. + */ +void __blk_end_request_all(struct request *rq, int error) +{ + bool pending; + unsigned int bidi_bytes = 0; + + if (unlikely(blk_bidi_rq(rq))) + bidi_bytes = blk_rq_bytes(rq->next_rq); + + pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); + BUG_ON(pending); +} +EXPORT_SYMBOL_GPL(__blk_end_request_all); + +/** + * __blk_end_request_cur - Helper function to finish the current request chunk. + * @rq: the request to finish the current chunk for + * @err: %0 for success, < %0 for error + * + * Description: + * Complete the current consecutively mapped chunk from @rq. Must + * be called with queue lock held. 
+ * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool __blk_end_request_cur(struct request *rq, int error) +{ + return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); +} +EXPORT_SYMBOL_GPL(__blk_end_request_cur); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1069f4483c6..f9d60a78c08 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -872,126 +872,14 @@ extern struct request *blk_fetch_request(struct request_queue *q); */ extern bool blk_update_request(struct request *rq, int error, unsigned int nr_bytes); -extern bool blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, - unsigned int bidi_bytes); -extern bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, - unsigned int bidi_bytes); - -/** - * blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -static inline bool blk_end_request(struct request *rq, int error, - unsigned int nr_bytes) -{ - return blk_end_bidi_request(rq, error, nr_bytes, 0); -} - -/** - * blk_end_request_all - Helper function for drives to finish the request. - * @rq: the request to finish - * @err: %0 for success, < %0 for error - * - * Description: - * Completely finish @rq. - */ -static inline void blk_end_request_all(struct request *rq, int error) -{ - bool pending; - unsigned int bidi_bytes = 0; - - if (unlikely(blk_bidi_rq(rq))) - bidi_bytes = blk_rq_bytes(rq->next_rq); - - pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); - BUG_ON(pending); -} - -/** - * blk_end_request_cur - Helper function to finish the current request chunk. - * @rq: the request to finish the current chunk for - * @err: %0 for success, < %0 for error - * - * Description: - * Complete the current consecutively mapped chunk from @rq. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - */ -static inline bool blk_end_request_cur(struct request *rq, int error) -{ - return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); -} - -/** - * __blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Must be called with queue lock held unlike blk_end_request(). - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -static inline bool __blk_end_request(struct request *rq, int error, - unsigned int nr_bytes) -{ - return __blk_end_bidi_request(rq, error, nr_bytes, 0); -} - -/** - * __blk_end_request_all - Helper function for drives to finish the request. - * @rq: the request to finish - * @err: %0 for success, < %0 for error - * - * Description: - * Completely finish @rq. Must be called with queue lock held. 
- */ -static inline void __blk_end_request_all(struct request *rq, int error) -{ - bool pending; - unsigned int bidi_bytes = 0; - - if (unlikely(blk_bidi_rq(rq))) - bidi_bytes = blk_rq_bytes(rq->next_rq); - - pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); - BUG_ON(pending); -} - -/** - * __blk_end_request_cur - Helper function to finish the current request chunk. - * @rq: the request to finish the current chunk for - * @err: %0 for success, < %0 for error - * - * Description: - * Complete the current consecutively mapped chunk from @rq. Must - * be called with queue lock held. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - */ -static inline bool __blk_end_request_cur(struct request *rq, int error) -{ - return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); -} +extern bool blk_end_request(struct request *rq, int error, + unsigned int nr_bytes); +extern void blk_end_request_all(struct request *rq, int error); +extern bool blk_end_request_cur(struct request *rq, int error); +extern bool __blk_end_request(struct request *rq, int error, + unsigned int nr_bytes); +extern void __blk_end_request_all(struct request *rq, int error); +extern bool __blk_end_request_cur(struct request *rq, int error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); -- cgit v1.2.3-70-g09d2 From 6818173bd658439b83896a2a7586f64ab51bf29c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:36 +0200 Subject: splice: implement default splice_read method If f_op->splice_read() is not implemented, fall back to a plain read. Use vfs_readv() to read into previously allocated pages. This will allow splice and functions using splice, such as the loop device, to work on all filesystems. This includes "direct_io" files in fuse which bypass the page cache. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- drivers/block/loop.c | 11 +---- fs/coda/file.c | 9 ++-- fs/pipe.c | 14 ++++++ fs/read_write.c | 7 +-- fs/splice.c | 120 ++++++++++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 2 + include/linux/pipe_fs_i.h | 1 + 7 files changed, 140 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 9ca4bb01465..801f4ab8330 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -709,10 +709,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) goto out_putf; - /* new backing store needs to support loop (eg splice_read) */ - if (!inode->i_fop->splice_read) - goto out_putf; - /* size of the new backing store needs to be the same */ if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) goto out_putf; @@ -788,12 +784,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, error = -EINVAL; if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) { const struct address_space_operations *aops = mapping->a_ops; - /* - * If we can't read - sorry. If we only can't write - well, - * it's going to be read-only. 
- */ - if (!file->f_op->splice_read) - goto out_putf; + if (aops->write_begin) lo_flags |= LO_FLAGS_USE_AOPS; if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write) diff --git a/fs/coda/file.c b/fs/coda/file.c index 6a347fbc998..ffd42815fda 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, unsigned int flags) { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); struct coda_file_info *cfi; struct file *host_file; @@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (!host_file->f_op || !host_file->f_op->splice_read) - return -EINVAL; + splice_read = host_file->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; - return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags); + return splice_read(host_file, ppos, pipe, count, flags); } static ssize_t diff --git a/fs/pipe.c b/fs/pipe.c index 13414ec45b8..f7dd21ad85a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info, return 0; } +/** + * generic_pipe_buf_release - put a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to put a reference to + * + * Description: + * This function releases a reference to @buf. + */ +void generic_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); +} + static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = generic_pipe_buf_map, diff --git a/fs/read_write.c b/fs/read_write.c index 9d1e76bb9ee..6c8c55dec2b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto out; if (!(in_file->f_mode & FMODE_READ)) goto fput_in; - retval = -EINVAL; - in_inode = in_file->f_path.dentry->d_inode; - if (!in_inode) - goto fput_in; - if (!in_file->f_op || !in_file->f_op->splice_read) - goto fput_in; retval = -ESPIPE; if (!ppos) ppos = &in_file->f_pos; @@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EINVAL; if (!out_file->f_op || !out_file->f_op->sendpage) goto fput_out; + in_inode = in_file->f_path.dentry->d_inode; out_inode = out_file->f_path.dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); if (retval < 0) diff --git a/fs/splice.c b/fs/splice.c index e405cf552f5..3bd9cb21b38 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -507,9 +507,116 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, return ret; } - EXPORT_SYMBOL(generic_file_splice_read); +static const struct pipe_buf_operations default_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) +{ + mm_segment_t old_fs; + loff_t pos = offset; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); + + return res; +} + +ssize_t 
default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int nr_pages; + unsigned int nr_freed; + size_t offset; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct iovec vec[PIPE_BUFFERS]; + pgoff_t index; + ssize_t res; + size_t this_len; + int error; + int i; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + struct page *page; + + page = alloc_page(GFP_HIGHUSER); + error = -ENOMEM; + if (!page) + goto err; + + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + vec[i].iov_base = (void __user *) kmap(page); + vec[i].iov_len = this_len; + pages[i] = page; + spd.nr_pages++; + len -= this_len; + offset = 0; + } + + res = kernel_readv(in, vec, spd.nr_pages, *ppos); + if (res < 0) + goto err; + + error = 0; + if (!res) + goto err; + + nr_freed = 0; + for (i = 0; i < spd.nr_pages; i++) { + kunmap(pages[i]); + this_len = min_t(size_t, vec[i].iov_len, res); + partial[i].offset = 0; + partial[i].len = this_len; + if (!this_len) { + __free_page(pages[i]); + pages[i] = NULL; + nr_freed++; + } + res -= this_len; + } + spd.nr_pages -= nr_freed; + + res = splice_to_pipe(pipe, &spd); + if (res > 0) + *ppos += res; + + return res; + +err: + for (i = 0; i < spd.nr_pages; i++) { + kunmap(pages[i]); + __free_page(pages[i]); + } + return error; +} +EXPORT_SYMBOL(default_file_splice_read); + /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. 
@@ -933,11 +1040,10 @@ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); int ret; - if (unlikely(!in->f_op || !in->f_op->splice_read)) - return -EINVAL; - if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; @@ -945,7 +1051,11 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - return in->f_op->splice_read(in, ppos, pipe, len, flags); + splice_read = in->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; + + return splice_read(in, ppos, pipe, len, flags); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 5bed436f435..d926c2bea16 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2204,6 +2204,8 @@ extern int generic_segment_checks(const struct iovec *iov, /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); +extern ssize_t default_file_splice_read(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index c8f038554e8..b43a9e03905 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -152,5 +152,6 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); +void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); #endif -- cgit v1.2.3-70-g09d2 From dc6382ced07d6bad61d0b591fb12ab5da7ca632c Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 6 May 2009 22:09:37 +0300 Subject: nl80211: Add support for configuring MFP NL80211_CMD_ASSOCIATE request must be able to indicate whether management frame protection (IEEE 802.11w) is being used. mac80211 was able to use MFP in client mode only with WEXT, but the new NL80211_ATTR_USE_MFP attribute will allow this to be done with nl80211, too. Since we are currently using nl80211 for MFP only with drivers that use user space SME, only MFP disabled and required values are used. However, the NL80211_ATTR_USE_MFP attribute is an enum that can be extended with MFP optional in the future, if that is needed with some drivers (e.g., if the RSN IE is generated by the driver). Signed-off-by: Jouni Malinen Signed-off-by: John W.
Linville --- include/linux/nl80211.h | 17 +++++++++++++++++ include/net/cfg80211.h | 2 ++ net/mac80211/cfg.c | 8 ++++++++ net/wireless/nl80211.c | 12 ++++++++++++ 4 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index e9fd13aa79f..58c4ee1822d 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -494,6 +494,11 @@ enum nl80211_commands { * @NL80211_ATTR_TIMED_OUT: a flag indicating than an operation timed out; this * is used, e.g., with %NL80211_CMD_AUTHENTICATE event * + * @NL80211_ATTR_USE_MFP: Whether management frame protection (IEEE 802.11w) is + * used for the association (&enum nl80211_mfp, represented as a u32); + * this attribute can be used + * with %NL80211_CMD_ASSOCIATE request + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -596,6 +601,8 @@ enum nl80211_attrs { NL80211_ATTR_TIMED_OUT, + NL80211_ATTR_USE_MFP, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -1179,4 +1186,14 @@ enum nl80211_key_type { NL80211_KEYTYPE_PEERKEY, }; +/** + * enum nl80211_mfp - Management frame protection state + * @NL80211_MFP_NO: Management frame protection not used + * @NL80211_MFP_REQUIRED: Management frame protection required + */ +enum nl80211_mfp { + NL80211_MFP_NO, + NL80211_MFP_REQUIRED, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b8a76764e1c..47e30e1d91f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -672,6 +672,7 @@ struct cfg80211_auth_request { * @ssid_len: Length of ssid in octets * @ie: Extra IEs to add to (Re)Association Request frame or %NULL * @ie_len: Length of ie buffer in octets + * @use_mfp: Use management frame protection (IEEE 802.11w) in this association */ struct cfg80211_assoc_request { struct ieee80211_channel *chan; @@ -680,6 +681,7 @@ struct cfg80211_assoc_request { size_t ssid_len; const u8 *ie; size_t ie_len; + bool use_mfp; }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d0ca6da33ca..4e627cf2b8c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1253,6 +1253,14 @@ static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev, if (ret) return ret; + if (req->use_mfp) { + sdata->u.mgd.mfp = IEEE80211_MFP_REQUIRED; + sdata->u.mgd.flags |= IEEE80211_STA_MFP_ENABLED; + } else { + sdata->u.mgd.mfp = IEEE80211_MFP_DISABLED; + sdata->u.mgd.flags &= ~IEEE80211_STA_MFP_ENABLED; + } + sdata->u.mgd.flags |= IEEE80211_STA_EXT_SME; sdata->u.mgd.state = IEEE80211_STA_MLME_ASSOCIATE; ieee80211_sta_req_auth(sdata); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3c53c5cbc3a..79927706937 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -122,6 +122,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 }, [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG }, [NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG }, + [NL80211_ATTR_USE_MFP] = { .type = NLA_U32 }, }; /* IE validation */ @@ -3012,6 +3013,17 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); } + if (info->attrs[NL80211_ATTR_USE_MFP]) { + enum nl80211_mfp use_mfp = + nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]); + if (use_mfp == NL80211_MFP_REQUIRED) + req.use_mfp = true; + else if (use_mfp != NL80211_MFP_NO) { + err = -EINVAL; + goto 
out; + } + } + err = drv->ops->assoc(&drv->wiphy, dev, &req); out: -- cgit v1.2.3-70-g09d2 From 528769cf1e422d932052be1487459262f3d75333 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Mon, 11 May 2009 10:20:35 +0300 Subject: mac80211: Robust Action frame categories for MFP IEEE 802.11w/D9.0 introduces a mechanism for Action field Category values to be used to select which Action frames are Robust. Public and Vendor-specific categories are marked as not Robust in IEEE 802.11w; HT will be marked not Robust in IEEE 802.11n. A new Vendor-specific Protected category is allocated for Robust vendor-specific Action frames. Another new category, Protected Dual of Action, is introduced for protecting some existing Public Action frames (e.g., IEEE 802.11y protected enablement). Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index dc92359f37e..05c29c01174 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1068,8 +1068,12 @@ enum ieee80211_category { WLAN_CATEGORY_DLS = 2, WLAN_CATEGORY_BACK = 3, WLAN_CATEGORY_PUBLIC = 4, + WLAN_CATEGORY_HT = 7, WLAN_CATEGORY_SA_QUERY = 8, + WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION = 9, WLAN_CATEGORY_WMM = 17, + WLAN_CATEGORY_VENDOR_SPECIFIC_PROTECTED = 126, + WLAN_CATEGORY_VENDOR_SPECIFIC = 127, }; /* SPECTRUM_MGMT action code */ @@ -1261,7 +1265,9 @@ static inline bool ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr) if (ieee80211_has_protected(hdr->frame_control)) return true; category = ((u8 *) hdr) + 24; - return *category != WLAN_CATEGORY_PUBLIC; + return *category != WLAN_CATEGORY_PUBLIC && + *category != WLAN_CATEGORY_HT && + *category != WLAN_CATEGORY_VENDOR_SPECIFIC; } return false; -- cgit v1.2.3-70-g09d2 From e13cf6e04ebc231653e03ebde4799dc55bf62849 Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 11 May 2009 18:13:13 -0700 Subject: ntp: fix comment typos Bernhard Schiffner noticed I had a few comment typos in this patch, (note: to save embarrassment, when making typos, avoid copying and pasting them) so this patch corrects them. [ Impact: cleanup ] Reported-by: Bernhard Schiffner Signed-off-by: John Stultz Cc: riel@redhat.com Cc: akpm@linux-foundation.org LKML-Reference: <1242090794.7214.131.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- include/linux/timex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 0daf9611ef4..9910e3bd5b3 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -179,7 +179,7 @@ struct timex { * NTP convergence time. A higher value makes it stiffer, increasing * convergence time, but making the clock more stable. * - * In David Mills' nanokenrel reference implmentation SHIFT_PLL is 4. + * In David Mills' nanokernel reference implementation SHIFT_PLL is 4. * However this seems to increase convergence time much too long. * * https://lists.ntp.org/pipermail/hackers/2008-January/003487.html @@ -196,7 +196,7 @@ struct timex { * * SHIFT_FLL is used as a dampening factor to define how much we * adjust the frequency correction for a given offset in FLL mode. - * In David Mills' nanokenrel reference implmentation SHIFT_PLL is 2. + * In David Mills' nanokernel reference implementation SHIFT_FLL is 2. * * MAXTC establishes the maximum time constant of the PLL. 
*/ -- cgit v1.2.3-70-g09d2 From 2b1ccc0ee918a653d0483fdad9dd6112ce8e9043 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 May 2009 11:11:48 +0200 Subject: splice: fix misleading comment Splice is tied to pipes by design; that will not change. And now that the splice stuff is in splice.h (and not pipe.h), the rest of the comment is out-of-date as well. Signed-off-by: Jens Axboe --- include/linux/splice.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/splice.h b/include/linux/splice.h index 5f3faa9d15a..18e7c7c0cae 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -11,8 +11,7 @@ #include /* - * splice is tied to pipes as a transport (at least for now), so we'll just - * add the splice flags here. + * Flags passed in from splice/tee/vmsplice */ #define SPLICE_F_MOVE (0x01) /* move pages instead of copying */ #define SPLICE_F_NONBLOCK (0x02) /* don't block on the pipe splicing (but */ -- cgit v1.2.3-70-g09d2 From 597d0275736dad9c3bda6f0a00a1c477dc0f37b1 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:13:26 +0530 Subject: timers: Framework for identifying pinned timers * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch creates a new framework for identifying cpu-pinned timers and hrtimers. This framework is needed because pinned timers are expected to fire on the same CPU on which they are queued. So it is essential to identify these and not migrate them, in case there are any. For regular timers, the currently existing add_timer_on() can be used to queue pinned timers and subsequently mod_timer_pinned() can be used to modify the 'expires' field. For hrtimers, new modes HRTIMER_ABS_PINNED and HRTIMER_REL_PINNED are added to queue cpu-pinned hrtimers.
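A minimal usage sketch (my_timer, my_hrtimer and target_cpu are hypothetical names; the interfaces are the ones this patch adds or builds on):

	/* Regular timer: queue it on a specific CPU, then retrigger it
	 * without allowing migration to another CPU. */
	add_timer_on(&my_timer, target_cpu);
	mod_timer_pinned(&my_timer, jiffies + HZ);

	/* hrtimer: the new *_PINNED modes keep the timer on the CPU on
	 * which it is queued. */
	hrtimer_start(&my_hrtimer, ktime_set(1, 0), HRTIMER_MODE_REL_PINNED);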
[ tglx: use .._PINNED mode argument instead of creating tons of new functions ] Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 7 +++++-- include/linux/timer.h | 3 +++ kernel/hrtimer.c | 7 ++++--- kernel/timer.c | 31 +++++++++++++++++++++++++++---- 4 files changed, 39 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0d2f7c8a33d..7400900de94 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -30,8 +30,11 @@ struct hrtimer_cpu_base; * Mode arguments of xxx_hrtimer functions: */ enum hrtimer_mode { - HRTIMER_MODE_ABS, /* Time value is absolute */ - HRTIMER_MODE_REL, /* Time value is relative to now */ + HRTIMER_MODE_ABS = 0x0, /* Time value is absolute */ + HRTIMER_MODE_REL = 0x1, /* Time value is relative to now */ + HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */ + HRTIMER_MODE_ABS_PINNED = 0x02, + HRTIMER_MODE_REL_PINNED = 0x03, }; /* diff --git a/include/linux/timer.h b/include/linux/timer.h index 6cdb6f3331f..ccf882eed8f 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -163,7 +163,10 @@ extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); +extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); +#define TIMER_NOT_PINNED 0 +#define TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cb8a15c1958..c71bcd54924 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -193,7 +193,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, * Switch the timer base to the current CPU when possible. 
*/ static inline struct hrtimer_clock_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + int pinned) { struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; @@ -907,9 +908,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, ret = remove_hrtimer(timer, base); /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base); + new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - if (mode == HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures diff --git a/kernel/timer.c b/kernel/timer.c index 5c1e84beaf4..3424dfd11d5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -604,7 +604,8 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) +__mod_timer(struct timer_list *timer, unsigned long expires, + bool pending_only, int pinned) { struct tvec_base *base, *new_base; unsigned long flags; @@ -668,7 +669,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true); + return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer_pending); @@ -702,10 +703,32 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer->expires == expires && timer_pending(timer)) return 1; - return __mod_timer(timer, expires, false); + return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); +/** + * mod_timer_pinned - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pinned() is a way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * and not allow the timer to be migrated to a different CPU. + * + * mod_timer_pinned(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + */ +int mod_timer_pinned(struct timer_list *timer, unsigned long expires) +{ + if (timer->expires == expires && timer_pending(timer)) + return 1; + + return __mod_timer(timer, expires, false, TIMER_PINNED); +} +EXPORT_SYMBOL(mod_timer_pinned); + /** * add_timer - start a timer * @timer: the timer to be added @@ -1356,7 +1379,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false); + __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.2.3-70-g09d2 From cd1bb94b4a0531e8211a3774f17de831f8285f76 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:15:34 +0530 Subject: timers: /proc/sys sysctl hook to enable timer migration * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch creates the /proc/sys sysctl interface at /proc/sys/kernel/timer_migration Timer migration is enabled by default. 
To disable timer migration, when CONFIG_SCHED_DEBUG = y, echo 0 > /proc/sys/kernel/timer_migration Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 + kernel/sched.c | 2 ++ kernel/sysctl.c | 8 ++++++++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..61850401040 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1766,6 +1766,7 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_timer_migration; int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index 9c5b4d3f97a..7f1dd56af86 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8731,6 +8731,8 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ +const_debug unsigned int sysctl_timer_migration = 1; + int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7dd59b..b3ce5813730 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -324,6 +324,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3-70-g09d2 From eea08f32adb3f97553d49a4f79a119833036000a Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:16:41 +0530 Subject: timers: Logic to move non pinned timers * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch migrates all non pinned timers and hrtimers to the current idle load balancer, from all the idle CPUs. Timers firing on busy CPUs are not migrated. While migrating hrtimers, care should be taken to check if migrating a hrtimer would result in a latency or not. So we compare the expiry of the hrtimer with the next timer interrupt on the target cpu and migrate the hrtimer only if it expires *after* the next interrupt on the target cpu. So, added a clockevents_get_next_event() helper function to return the next_event on the target cpu's clock_event_device. 
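The migration decision reduces to the following sketch (expires is the hrtimer's expiry; keep_local() and migrate_to() stand in for the actual base-switching code, while get_nohz_load_balancer() and clockevents_get_next_event() are introduced by this series):

	int cpu = get_nohz_load_balancer();	/* candidate target CPU */
	ktime_t next = clockevents_get_next_event(cpu);
	ktime_t delta = ktime_sub(expires, next);

	if (delta.tv64 < 0)
		keep_local();	/* would expire before the target's next event */
	else
		migrate_to(cpu);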
[ tglx: cleanups and simplifications ] Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 9 ++++++++ include/linux/sched.h | 12 +++++++++++ kernel/hrtimer.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched.c | 5 +++++ kernel/time/clockevents.c | 12 +++++++++++ kernel/timer.c | 17 +++++++++++++--- 6 files changed, 101 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 3a1dbba4d3a..20a100fe2b4 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned long reason, void *arg); #endif #endif + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +extern ktime_t clockevents_get_next_event(int cpu); +#else +static inline ktime_t clockevents_get_next_event(int cpu) +{ + return (ktime_t) { .tv64 = KTIME_MAX }; +} +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 61850401040..311dec12397 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -257,6 +257,7 @@ extern void task_rq_unlock_wait(struct task_struct *p); extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); +extern int get_nohz_load_balancer(void); #else static inline int select_nohz_load_balancer(int cpu) { @@ -1772,6 +1773,17 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_DEBUG +static inline unsigned int get_sysctl_timer_migration(void) +{ + return sysctl_timer_migration; +} +#else +static inline unsigned int get_sysctl_timer_migration(void) +{ + return 1; +} +#endif extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c71bcd54924..b675a67c9ac 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include @@ -198,8 +200,19 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, { struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; + int cpu, preferred_cpu = -1; + + cpu = smp_processor_id(); +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { + preferred_cpu = get_nohz_load_balancer(); + if (preferred_cpu >= 0) + cpu = preferred_cpu; + } +#endif - new_cpu_base = &__get_cpu_var(hrtimer_bases); +again: + new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[base->index]; if (base != new_base) { @@ -219,6 +232,40 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, timer->base = NULL; spin_unlock(&base->cpu_base->lock); spin_lock(&new_base->cpu_base->lock); + + /* Optimized away for NOHZ=n SMP=n */ + if (cpu == preferred_cpu) { + /* Calculate clock monotonic expiry time */ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), + new_base->offset); +#else + ktime_t expires = hrtimer_get_expires(timer); +#endif + + /* + * Get the next event on target cpu from the + * clock events layer. + * This covers the highres=off nohz=on case as well. 
+ */ + ktime_t next = clockevents_get_next_event(cpu); + + ktime_t delta = ktime_sub(expires, next); + + /* + * We do not migrate the timer when it is expiring + * before the next event on the target cpu because + * we cannot reprogram the target cpu hardware and + * we would cause it to fire late. + */ + if (delta.tv64 < 0) { + cpu = smp_processor_id(); + spin_unlock(&new_base->cpu_base->lock); + spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; + } + } timer->base = new_base; } return new_base; @@ -236,7 +283,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) return base; } -# define switch_hrtimer_base(t, b) (b) +# define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ diff --git a/kernel/sched.c b/kernel/sched.c index 7f1dd56af86..9fe3774a0fd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4244,6 +4244,11 @@ static struct { .load_balancer = ATOMIC_INIT(-1), }; +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d13be216a79..ab20ded013b 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,6 +18,7 @@ #include #include #include +#include /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); @@ -251,4 +252,15 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); + +ktime_t clockevents_get_next_event(int cpu) +{ + struct tick_device *td; + struct clock_event_device *dev; + + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + + return dev->next_event; +} #endif diff --git a/kernel/timer.c b/kernel/timer.c index 3424dfd11d5..3f841db5edf 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -609,9 +610,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, { struct tvec_base *base, *new_base; unsigned long flags; - int ret; - - ret = 0; + int ret = 0 , cpu; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -630,6 +629,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, new_base = __get_cpu_var(tvec_bases); + cpu = smp_processor_id(); + +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { + int preferred_cpu = get_nohz_load_balancer(); + + if (preferred_cpu >= 0) + cpu = preferred_cpu; + } +#endif + new_base = per_cpu(tvec_bases, cpu); + if (base != new_base) { /* * We are trying to schedule the timer on the local CPU. -- cgit v1.2.3-70-g09d2 From eccb8e8f0c3af47aeb6dbe4012eb8d4fc888767a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 11 May 2009 21:57:56 +0300 Subject: nl80211: improve station flags handling It is currently not possible to modify station flags, but that capability would be very useful. This patch introduces a new nl80211 attribute that contains a set/mask for station flags, and updates the internal API (and mac80211) to mirror that. The new attribute is parsed before falling back to the old so that userspace can specify both (if it can) to work on all kernels. Signed-off-by: Johannes Berg Signed-off-by: Jouni Malinen Signed-off-by: John W. 
Linville --- include/linux/nl80211.h | 21 ++++++++++++++++++++- include/net/cfg80211.h | 28 +++++----------------------- net/mac80211/cfg.c | 28 ++++++++++++++++------------ net/wireless/nl80211.c | 38 ++++++++++++++++++++++++++++++-------- 4 files changed, 71 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 58c4ee1822d..aeefccfac0e 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -25,6 +25,8 @@ * */ +#include + /** * DOC: Station handling * @@ -380,7 +382,7 @@ enum nl80211_commands { * * @NL80211_ATTR_STA_AID: Association ID for the station (u16) * @NL80211_ATTR_STA_FLAGS: flags, nested element with NLA_FLAG attributes of - * &enum nl80211_sta_flags. + * &enum nl80211_sta_flags (deprecated, use %NL80211_ATTR_STA_FLAGS2) * @NL80211_ATTR_STA_LISTEN_INTERVAL: listen interval as defined by * IEEE 802.11 7.3.1.6 (u16). * @NL80211_ATTR_STA_SUPPORTED_RATES: supported rates, array of supported @@ -499,6 +501,9 @@ enum nl80211_commands { * this attribute can be used * with %NL80211_CMD_ASSOCIATE request * + * @NL80211_ATTR_STA_FLAGS2: Attribute containing a + * &struct nl80211_sta_flag_update. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -603,6 +608,8 @@ enum nl80211_attrs { NL80211_ATTR_USE_MFP, + NL80211_ATTR_STA_FLAGS2, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -691,6 +698,18 @@ enum nl80211_sta_flags { NL80211_STA_FLAG_MAX = __NL80211_STA_FLAG_AFTER_LAST - 1 }; +/** + * struct nl80211_sta_flag_update - station flags mask/set + * @mask: mask of station flags to set + * @set: which values to set them to + * + * Both mask and set contain bits as per &enum nl80211_sta_flags. + */ +struct nl80211_sta_flag_update { + __u32 mask; + __u32 set; +} __attribute__((packed)); + /** * enum nl80211_rate_info - bitrate information * diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e69e6c66dd1..0dae6b38294 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -251,27 +251,6 @@ struct beacon_parameters { int head_len, tail_len; }; -/** - * enum station_flags - station flags - * - * Station capability flags. Note that these must be the bits - * according to the nl80211 flags. - * - * @STATION_FLAG_CHANGED: station flags were changed - * @STATION_FLAG_AUTHORIZED: station is authorized to send frames (802.1X) - * @STATION_FLAG_SHORT_PREAMBLE: station is capable of receiving frames - * with short preambles - * @STATION_FLAG_WME: station is WME/QoS capable - * @STATION_FLAG_MFP: station uses management frame protection - */ -enum station_flags { - STATION_FLAG_CHANGED = 1<<0, - STATION_FLAG_AUTHORIZED = 1<sdata; + u32 mask, set; sband = local->hw.wiphy->bands[local->oper_channel->band]; - /* - * FIXME: updating the flags is racy when this function is - * called from ieee80211_change_station(), this will - * be resolved in a future patch. 
- */ + spin_lock_bh(&sta->lock); + mask = params->sta_flags_mask; + set = params->sta_flags_set; - if (params->station_flags & STATION_FLAG_CHANGED) { - spin_lock_bh(&sta->lock); + if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { sta->flags &= ~WLAN_STA_AUTHORIZED; - if (params->station_flags & STATION_FLAG_AUTHORIZED) + if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) sta->flags |= WLAN_STA_AUTHORIZED; + } + if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) { sta->flags &= ~WLAN_STA_SHORT_PREAMBLE; - if (params->station_flags & STATION_FLAG_SHORT_PREAMBLE) + if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) sta->flags |= WLAN_STA_SHORT_PREAMBLE; + } + if (mask & BIT(NL80211_STA_FLAG_WME)) { sta->flags &= ~WLAN_STA_WME; - if (params->station_flags & STATION_FLAG_WME) + if (set & BIT(NL80211_STA_FLAG_WME)) sta->flags |= WLAN_STA_WME; + } + if (mask & BIT(NL80211_STA_FLAG_MFP)) { sta->flags &= ~WLAN_STA_MFP; - if (params->station_flags & STATION_FLAG_MFP) + if (set & BIT(NL80211_STA_FLAG_MFP)) sta->flags |= WLAN_STA_MFP; - spin_unlock_bh(&sta->lock); } + spin_unlock_bh(&sta->lock); /* * FIXME: updating the following information is racy when this diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2353ddbf493..66024ef57ba 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -123,6 +123,9 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG }, [NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG }, [NL80211_ATTR_USE_MFP] = { .type = NLA_U32 }, + [NL80211_ATTR_STA_FLAGS2] = { + .len = sizeof(struct nl80211_sta_flag_update), + }, }; /* IE validation */ @@ -1334,13 +1337,33 @@ static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = { [NL80211_STA_FLAG_MFP] = { .type = NLA_FLAG }, }; -static int parse_station_flags(struct nlattr *nla, u32 *staflags) +static int parse_station_flags(struct genl_info *info, + struct station_parameters *params) { struct nlattr *flags[NL80211_STA_FLAG_MAX + 1]; + struct nlattr *nla; int flag; - *staflags = 0; + /* + * Try parsing the new attribute first so userspace + * can specify both for older kernels. 
+ */ + nla = info->attrs[NL80211_ATTR_STA_FLAGS2]; + if (nla) { + struct nl80211_sta_flag_update *sta_flags; + + sta_flags = nla_data(nla); + params->sta_flags_mask = sta_flags->mask; + params->sta_flags_set = sta_flags->set; + if ((params->sta_flags_mask | + params->sta_flags_set) & BIT(__NL80211_STA_FLAG_INVALID)) + return -EINVAL; + return 0; + } + + /* if present, parse the old attribute */ + nla = info->attrs[NL80211_ATTR_STA_FLAGS]; if (!nla) return 0; @@ -1348,11 +1371,12 @@ static int parse_station_flags(struct nlattr *nla, u32 *staflags) nla, sta_flags_policy)) return -EINVAL; - *staflags = STATION_FLAG_CHANGED; + params->sta_flags_mask = (1 << __NL80211_STA_FLAG_AFTER_LAST) - 1; + params->sta_flags_mask &= ~1; for (flag = 1; flag <= NL80211_STA_FLAG_MAX; flag++) if (flags[flag]) - *staflags |= (1<sta_flags_set |= (1<attrs[NL80211_ATTR_HT_CAPABILITY]); - if (parse_station_flags(info->attrs[NL80211_ATTR_STA_FLAGS], - ¶ms.station_flags)) + if (parse_station_flags(info, ¶ms)) return -EINVAL; if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) @@ -1718,8 +1741,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.ht_capa = nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); - if (parse_station_flags(info->attrs[NL80211_ATTR_STA_FLAGS], - ¶ms.station_flags)) + if (parse_station_flags(info, ¶ms)) return -EINVAL; rtnl_lock(); -- cgit v1.2.3-70-g09d2 From 3f77316c6b99f596bfbf72c0542f47f7230b702e Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Mon, 11 May 2009 21:57:57 +0300 Subject: nl80211: Add IEEE 802.1X PAE control for station mode Add a new NL80211_ATTR_CONTROL_PORT flag for NL80211_CMD_ASSOCIATE to allow user space to indicate that it will control the IEEE 802.1X port in station mode. Previously, mac80211 was always marking the port authorized in station mode. This was enough when drop_unencrypted flag was set. However, drop_unencrypted can currently be controlled only with WEXT and the current nl80211 design does not allow fully secure configuration. Fix this by providing a mechanism for user space to control the IEEE 802.1X port in station mode (i.e., do the same that we are already doing in AP mode). Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/nl80211.h | 9 +++++++++ include/net/cfg80211.h | 5 +++++ net/mac80211/cfg.c | 5 +++++ net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mlme.c | 5 +++-- net/mac80211/wext.c | 3 +++ net/wireless/nl80211.c | 3 +++ 7 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index aeefccfac0e..2781525b03d 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -504,6 +504,13 @@ enum nl80211_commands { * @NL80211_ATTR_STA_FLAGS2: Attribute containing a * &struct nl80211_sta_flag_update. * + * @NL80211_ATTR_CONTROL_PORT: A flag indicating whether user space controls + * IEEE 802.1X port, i.e., sets/clears %NL80211_STA_FLAG_AUTHORIZED, in + * station mode. If the flag is included in %NL80211_CMD_ASSOCIATE + * request, the driver will assume that the port is unauthorized until + * authorized by user space. Otherwise, port is marked authorized by + * default in station mode. 
+ * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -610,6 +617,8 @@ enum nl80211_attrs { NL80211_ATTR_STA_FLAGS2, + NL80211_ATTR_CONTROL_PORT, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0dae6b38294..9e17a83d343 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -655,6 +655,10 @@ struct cfg80211_auth_request { * @ie: Extra IEs to add to (Re)Association Request frame or %NULL * @ie_len: Length of ie buffer in octets * @use_mfp: Use management frame protection (IEEE 802.11w) in this association + * @control_port: Whether user space controls IEEE 802.1X port, i.e., + * sets/clears %NL80211_STA_FLAG_AUTHORIZED. If true, the driver is + * required to assume that the port is unauthorized until authorized by + * user space. Otherwise, port is marked authorized by default. */ struct cfg80211_assoc_request { struct ieee80211_channel *chan; @@ -664,6 +668,7 @@ struct cfg80211_assoc_request { const u8 *ie; size_t ie_len; bool use_mfp; + bool control_port; }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d591a936f5c..6464bfd232c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1265,6 +1265,11 @@ static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev, sdata->u.mgd.flags &= ~IEEE80211_STA_MFP_ENABLED; } + if (req->control_port) + sdata->u.mgd.flags |= IEEE80211_STA_CONTROL_PORT; + else + sdata->u.mgd.flags &= ~IEEE80211_STA_CONTROL_PORT; + sdata->u.mgd.flags |= IEEE80211_STA_EXT_SME; sdata->u.mgd.state = IEEE80211_STA_MLME_ASSOCIATE; ieee80211_sta_req_auth(sdata); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 081c5742730..56a49ef446c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -235,7 +235,7 @@ struct mesh_preq_queue { #define IEEE80211_STA_ASSOCIATED BIT(4) #define IEEE80211_STA_PROBEREQ_POLL BIT(5) #define IEEE80211_STA_CREATE_IBSS BIT(6) -/* hole at 7, please re-use */ +#define IEEE80211_STA_CONTROL_PORT BIT(7) #define IEEE80211_STA_WMM_ENABLED BIT(8) /* hole at 9, please re-use */ #define IEEE80211_STA_AUTO_SSID_SEL BIT(10) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6d00e3f738c..2806f6af7ae 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1581,8 +1581,9 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, * to between the sta_info_alloc() and sta_info_insert() above. 
*/ - set_sta_flags(sta, WLAN_STA_AUTH | WLAN_STA_ASSOC | WLAN_STA_ASSOC_AP | - WLAN_STA_AUTHORIZED); + set_sta_flags(sta, WLAN_STA_AUTH | WLAN_STA_ASSOC | WLAN_STA_ASSOC_AP); + if (!(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) + set_sta_flags(sta, WLAN_STA_AUTHORIZED); rates = 0; basic_rates = 0; diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index d8450264468..c14394744a9 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -41,6 +41,7 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev, return ret; sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; sdata->u.mgd.flags &= ~IEEE80211_STA_EXT_SME; + sdata->u.mgd.flags &= ~IEEE80211_STA_CONTROL_PORT; ieee80211_sta_req_auth(sdata); return 0; } @@ -124,6 +125,7 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev, return ret; sdata->u.mgd.flags &= ~IEEE80211_STA_EXT_SME; + sdata->u.mgd.flags &= ~IEEE80211_STA_CONTROL_PORT; ieee80211_sta_req_auth(sdata); return 0; } @@ -181,6 +183,7 @@ static int ieee80211_ioctl_siwap(struct net_device *dev, if (ret) return ret; sdata->u.mgd.flags &= ~IEEE80211_STA_EXT_SME; + sdata->u.mgd.flags &= ~IEEE80211_STA_CONTROL_PORT; ieee80211_sta_req_auth(sdata); return 0; } else if (sdata->vif.type == NL80211_IFTYPE_WDS) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 66024ef57ba..cad281390cf 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -126,6 +126,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_STA_FLAGS2] = { .len = sizeof(struct nl80211_sta_flag_update), }, + [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG }, }; /* IE validation */ @@ -3040,6 +3041,8 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) } } + req.control_port = info->attrs[NL80211_ATTR_CONTROL_PORT]; + err = drv->ops->assoc(&drv->wiphy, dev, &req); out: -- cgit v1.2.3-70-g09d2 From faa8fdc85347cc76d87b43ea718785661c54f656 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Mon, 11 May 2009 21:57:58 +0300 Subject: nl80211: Add RSC configuration for new keys When setting a key with NL80211_CMD_NEW_KEY, we should allow the key sequence number (RSC) to be set in order to allow replay protection to work correctly for group keys. This patch documents this use for nl80211 and adds the couple of missing pieces in nl80211/cfg80211 and mac80211 to support this. In addition, WEXT SIOCSIWENCODEEXT compat processing in cfg80211 is extended to handle the RSC (this was already specified in WEXT, but just not implemented in cfg80211/mac80211). Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/nl80211.h | 4 ++-- net/mac80211/cfg.c | 3 ++- net/mac80211/key.c | 21 ++++++++++++++++++++- net/mac80211/key.h | 3 ++- net/wireless/nl80211.c | 5 +++++ net/wireless/wext-compat.c | 5 +++++ 6 files changed, 36 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 2781525b03d..dbea93b694e 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -79,8 +79,8 @@ * @NL80211_CMD_SET_KEY: Set key attributes %NL80211_ATTR_KEY_DEFAULT, * %NL80211_ATTR_KEY_DEFAULT_MGMT, or %NL80211_ATTR_KEY_THRESHOLD. * @NL80211_CMD_NEW_KEY: add a key with given %NL80211_ATTR_KEY_DATA, - * %NL80211_ATTR_KEY_IDX, %NL80211_ATTR_MAC and %NL80211_ATTR_KEY_CIPHER - * attributes. + * %NL80211_ATTR_KEY_IDX, %NL80211_ATTR_MAC, %NL80211_ATTR_KEY_CIPHER, + * and %NL80211_ATTR_KEY_SEQ attributes. 
* @NL80211_CMD_DEL_KEY: delete a key identified by %NL80211_ATTR_KEY_IDX * or %NL80211_ATTR_MAC. * diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 6464bfd232c..77e9ff5ec4f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -141,7 +141,8 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, return -EINVAL; } - key = ieee80211_key_alloc(alg, key_idx, params->key_len, params->key); + key = ieee80211_key_alloc(alg, key_idx, params->key_len, params->key, + params->seq_len, params->seq); if (!key) return -ENOMEM; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index b7e1350273b..827ea8e6ee0 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -290,9 +290,11 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, int idx, size_t key_len, - const u8 *key_data) + const u8 *key_data, + size_t seq_len, const u8 *seq) { struct ieee80211_key *key; + int i, j; BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS); @@ -318,14 +320,31 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, case ALG_TKIP: key->conf.iv_len = TKIP_IV_LEN; key->conf.icv_len = TKIP_ICV_LEN; + if (seq && seq_len == 6) { + for (i = 0; i < NUM_RX_DATA_QUEUES; i++) { + key->u.tkip.rx[i].iv32 = + get_unaligned_le32(&seq[2]); + key->u.tkip.rx[i].iv16 = + get_unaligned_le16(seq); + } + } break; case ALG_CCMP: key->conf.iv_len = CCMP_HDR_LEN; key->conf.icv_len = CCMP_MIC_LEN; + if (seq && seq_len == CCMP_PN_LEN) { + for (i = 0; i < NUM_RX_DATA_QUEUES; i++) + for (j = 0; j < CCMP_PN_LEN; j++) + key->u.ccmp.rx_pn[i][j] = + seq[CCMP_PN_LEN - j - 1]; + } break; case ALG_AES_CMAC: key->conf.iv_len = 0; key->conf.icv_len = sizeof(struct ieee80211_mmie); + if (seq && seq_len == 6) + for (j = 0; j < 6; j++) + key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1]; break; } memcpy(key->conf.key, key_data, key_len); diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 215d3ef42a4..9572e00f532 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -144,7 +144,8 @@ struct ieee80211_key { struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, int idx, size_t key_len, - const u8 *key_data); + const u8 *key_data, + size_t seq_len, const u8 *seq); /* * Insert a key into data structures (sdata, sta if necessary) * to make it used, free old key. 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cad281390cf..f0fec2f4982 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1115,6 +1115,11 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) params.key_len = nla_len(info->attrs[NL80211_ATTR_KEY_DATA]); } + if (info->attrs[NL80211_ATTR_KEY_SEQ]) { + params.seq = nla_data(info->attrs[NL80211_ATTR_KEY_SEQ]); + params.seq_len = nla_len(info->attrs[NL80211_ATTR_KEY_SEQ]); + } + if (info->attrs[NL80211_ATTR_KEY_IDX]) key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index ffc98a8d6e5..f98090b90fb 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -663,6 +663,11 @@ int cfg80211_wext_siwencodeext(struct net_device *dev, params.key_len = ext->key_len; params.cipher = cipher; + if (ext->ext_flags & IW_ENCODE_EXT_RX_SEQ_VALID) { + params.seq = ext->rx_seq; + params.seq_len = 6; + } + return cfg80211_set_encryption( rdev, dev, addr, remove, ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, -- cgit v1.2.3-70-g09d2 From 9fc20f030ba457d20eb994e1def7e2ce7d5ae1a8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 14 May 2009 15:14:18 +0200 Subject: ALSA: ctxfi - Move PCI ID definitions to linux/pci_ids.h Signed-off-by: Takashi Iwai --- include/linux/pci_ids.h | 7 +++++++ sound/pci/ctxfi/ctatc.c | 17 +++++++++-------- sound/pci/ctxfi/ctdrv.h | 30 ------------------------------ sound/pci/ctxfi/xfi.c | 6 +++--- 4 files changed, 19 insertions(+), 41 deletions(-) delete mode 100644 sound/pci/ctxfi/ctdrv.h (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 06ba90c211a..61915313898 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1314,6 +1314,13 @@ #define PCI_VENDOR_ID_CREATIVE 0x1102 /* duplicate: ECTIVA */ #define PCI_DEVICE_ID_CREATIVE_EMU10K1 0x0002 +#define PCI_DEVICE_ID_CREATIVE_20K1 0x0005 +#define PCI_DEVICE_ID_CREATIVE_20K2 0x000b +#define PCI_SUBDEVICE_ID_CREATIVE_SB0760 0x0024 +#define PCI_SUBDEVICE_ID_CREATIVE_SB08801 0x0041 +#define PCI_SUBDEVICE_ID_CREATIVE_SB08802 0x0042 +#define PCI_SUBDEVICE_ID_CREATIVE_SB08803 0x0043 +#define PCI_SUBDEVICE_ID_CREATIVE_HENDRIX 0x6000 #define PCI_VENDOR_ID_ECTIVA 0x1102 /* duplicate: CREATIVE */ #define PCI_DEVICE_ID_ECTIVA_EV1938 0x8938 diff --git a/sound/pci/ctxfi/ctatc.c b/sound/pci/ctxfi/ctatc.c index 5f35374863f..073fe2a59da 100644 --- a/sound/pci/ctxfi/ctatc.c +++ b/sound/pci/ctxfi/ctatc.c @@ -18,7 +18,6 @@ #include "ctatc.h" #include "ctpcm.h" #include "ctmixer.h" -#include "ctdrv.h" #include "cthardware.h" #include "ctsrc.h" #include "ctamixer.h" @@ -40,23 +39,25 @@ | ((IEC958_AES3_CON_FS_48000) << 24)) static const struct ct_atc_chip_sub_details atc_sub_details[NUM_CTCARDS] = { - [CTSB0760] = {.subsys = PCI_SUBSYS_CREATIVE_SB0760, + [CTSB0760] = {.subsys = PCI_SUBDEVICE_ID_CREATIVE_SB0760, .nm_model = "SB076x"}, - [CTHENDRIX] = {.subsys = PCI_SUBSYS_CREATIVE_HENDRIX, + [CTHENDRIX] = {.subsys = PCI_SUBDEVICE_ID_CREATIVE_HENDRIX, .nm_model = "Hendrix"}, - [CTSB08801] = {.subsys = PCI_SUBSYS_CREATIVE_SB08801, + [CTSB08801] = {.subsys = PCI_SUBDEVICE_ID_CREATIVE_SB08801, .nm_model = "SB0880"}, - [CTSB08802] = {.subsys = PCI_SUBSYS_CREATIVE_SB08802, + [CTSB08802] = {.subsys = PCI_SUBDEVICE_ID_CREATIVE_SB08802, .nm_model = "SB0880"}, - [CTSB08803] = {.subsys = PCI_SUBSYS_CREATIVE_SB08803, + [CTSB08803] = {.subsys = PCI_SUBDEVICE_ID_CREATIVE_SB08803, .nm_model = "SB0880"} }; static 
struct ct_atc_chip_details atc_chip_details[] = { - {.vendor = PCI_VENDOR_CREATIVE, .device = PCI_DEVICE_CREATIVE_20K1, + {.vendor = PCI_VENDOR_ID_CREATIVE, + .device = PCI_DEVICE_ID_CREATIVE_20K1, .sub_details = NULL, .nm_card = "X-Fi 20k1"}, - {.vendor = PCI_VENDOR_CREATIVE, .device = PCI_DEVICE_CREATIVE_20K2, + {.vendor = PCI_VENDOR_ID_CREATIVE, + .device = PCI_DEVICE_ID_CREATIVE_20K2, .sub_details = atc_sub_details, .nm_card = "X-Fi 20k2"}, {} /* terminator */ diff --git a/sound/pci/ctxfi/ctdrv.h b/sound/pci/ctxfi/ctdrv.h deleted file mode 100644 index f776a44f52c..00000000000 --- a/sound/pci/ctxfi/ctdrv.h +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) 2008, Creative Technology Ltd. All Rights Reserved. - * - * This source file is released under GPL v2 license (no other versions). - * See the COPYING file included in the main directory of this source - * distribution for the license terms and conditions. - * - * @file ctdrv.h - * - * @breaf - * This file contains the definition of card IDs supported by this driver. - * - * @author Liu Chun - * - */ - -#ifndef CTDRV_H -#define CTDRV_H - -#define PCI_VENDOR_CREATIVE 0x1102 -#define PCI_DEVICE_CREATIVE_20K1 0x0005 -#define PCI_DEVICE_CREATIVE_20K2 0x000B -#define PCI_SUBVENDOR_CREATIVE 0x1102 -#define PCI_SUBSYS_CREATIVE_SB0760 0x0024 -#define PCI_SUBSYS_CREATIVE_SB08801 0x0041 -#define PCI_SUBSYS_CREATIVE_SB08802 0x0042 -#define PCI_SUBSYS_CREATIVE_SB08803 0x0043 -#define PCI_SUBSYS_CREATIVE_HENDRIX 0x6000 - -#endif /* CTDRV_H */ diff --git a/sound/pci/ctxfi/xfi.c b/sound/pci/ctxfi/xfi.c index f9114403651..b7368fded53 100644 --- a/sound/pci/ctxfi/xfi.c +++ b/sound/pci/ctxfi/xfi.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include #include "ctatc.h" -#include "ctdrv.h" MODULE_AUTHOR("Creative Technology Ltd"); MODULE_DESCRIPTION("X-Fi driver version 1.03"); @@ -32,8 +32,8 @@ static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP; static struct pci_device_id ct_pci_dev_ids[] = { /* only X-Fi is supported, so... */ - { PCI_DEVICE(PCI_VENDOR_CREATIVE, PCI_DEVICE_CREATIVE_20K1) }, - { PCI_DEVICE(PCI_VENDOR_CREATIVE, PCI_DEVICE_CREATIVE_20K2) }, + { PCI_DEVICE(PCI_VENDOR_ID_CREATIVE, PCI_DEVICE_ID_CREATIVE_20K1) }, + { PCI_DEVICE(PCI_VENDOR_ID_CREATIVE, PCI_DEVICE_ID_CREATIVE_20K2) }, { 0, } }; MODULE_DEVICE_TABLE(pci, ct_pci_dev_ids); -- cgit v1.2.3-70-g09d2 From 077e6dba20e74a455a0452379d2a965c7e1b01ad Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 1 May 2009 21:21:02 +0200 Subject: ide-atapi: switch to rq->resid_len Now that we have rq->resid_len, use it to account partial completion amount during the lifetime of an rq, decrementing it on each successful transfer. As a result, get rid of now unused pc->xferred. While at it, remove noisy debug call in ide_prep_sense. Signed-off-by: Borislav Petkov --- drivers/ide/ide-atapi.c | 19 ++++++++----------- drivers/ide/ide-tape.c | 12 +++++------- include/linux/ide.h | 2 -- 3 files changed, 13 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 7129495b3e4..1022e421abd 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -172,8 +172,6 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) unsigned int cmd_len, sense_len; int err; - debug_log("%s: enter\n", __func__); - switch (drive->media) { case ide_floppy: cmd_len = 255; @@ -370,7 +368,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) ? 
"write" : "read"); pc->flags |= PC_FLAG_DMA_ERROR; } else - pc->xferred = blk_rq_bytes(rq); + rq->resid_len = 0; debug_log("%s: DMA finished\n", drive->name); } @@ -379,7 +377,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) int uptodate, error; debug_log("Packet command completed, %d bytes transferred\n", - pc->xferred); + blk_rq_bytes(rq)); pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; @@ -467,15 +465,15 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) ide_pio_bytes(drive, cmd, write, done); /* Update transferred byte count */ - pc->xferred += done; + rq->resid_len -= done; bcount -= done; if (bcount) ide_pad_transfer(drive, write, bcount); - debug_log("[cmd %x] transferred %d bytes, padded %d bytes\n", - rq->cmd[0], done, bcount); + debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n", + rq->cmd[0], done, bcount, rq->resid_len); /* And set the interrupt handler again */ ide_set_handler(drive, ide_pc_intr, timeout); @@ -643,16 +641,15 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) } else { pc = drive->pc; - /* We haven't transferred any data yet */ - pc->xferred = 0; - valid_tf = IDE_VALID_DEVICE; bytes = blk_rq_bytes(rq); - bcount = ((drive->media == ide_tape) ? bytes : min_t(unsigned int, bytes, 63 * 1024)); + /* We haven't transferred any data yet */ + rq->resid_len = bcount; + if (pc->flags & PC_FLAG_DMA_ERROR) { pc->flags &= ~PC_FLAG_DMA_ERROR; ide_dma_off(drive); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index aaeef12f80a..c9337099797 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -301,11 +301,9 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) debug_log(DBG_ERR, "pc = %x, sense key = %x, asc = %x, ascq = %x\n", pc->c[0], tape->sense_key, tape->asc, tape->ascq); - /* Correct pc->xferred by asking the tape. 
*/ + /* correct remaining bytes to transfer */ if (pc->flags & PC_FLAG_DMA_ERROR) - pc->xferred = blk_rq_bytes(rq) - - tape->blk_size * - get_unaligned_be32(&sense[3]); + rq->resid_len = tape->blk_size * get_unaligned_be32(&sense[3]); /* * If error was the result of a zero-length read or write command, @@ -339,7 +337,7 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) pc->flags |= PC_FLAG_ABORT; } if (!(pc->flags & PC_FLAG_ABORT) && - pc->xferred) + (blk_rq_bytes(rq) - rq->resid_len)) pc->retries = IDETAPE_MAX_PC_RETRIES + 1; } } @@ -369,7 +367,8 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) printk(KERN_ERR "ide-tape: Error in REQUEST SENSE " "itself - Aborting request!\n"); } else if (pc->c[0] == READ_6 || pc->c[0] == WRITE_6) { - int blocks = pc->xferred / tape->blk_size; + unsigned int blocks = + (blk_rq_bytes(rq) - rq->resid_len) / tape->blk_size; tape->avg_size += blocks * tape->blk_size; @@ -381,7 +380,6 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) } tape->first_frame += blocks; - rq->resid_len = blk_rq_bytes(rq) - blocks * tape->blk_size; if (pc->error) { uptodate = 0; diff --git a/include/linux/ide.h b/include/linux/ide.h index 34c128f0a33..745a393af27 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -357,8 +357,6 @@ struct ide_atapi_pc { /* bytes to transfer */ int req_xfer; - /* bytes actually transferred */ - int xferred; /* data buffer */ u8 *buf; -- cgit v1.2.3-70-g09d2 From 5a0e43b5e2ee9a295f864c38f0e853b1a4fc3892 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 1 May 2009 23:27:11 +0200 Subject: ide-atapi: add a len-parameter to ide_queue_pc_tail This is in preparation for removing ide_atapi_pc. There should be no functional change resulting from this patch. Signed-off-by: Borislav Petkov --- drivers/ide/ide-atapi.c | 12 ++++++------ drivers/ide/ide-floppy.c | 4 ++-- drivers/ide/ide-floppy_ioctl.c | 8 ++++---- drivers/ide/ide-tape.c | 26 +++++++++++++------------- include/linux/ide.h | 3 ++- 5 files changed, 27 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 1022e421abd..0d4da2c1adc 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(ide_init_pc); * and wait for it to be serviced. 
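 *
 * Editor's illustration, not part of this patch: callers now state the
 * number of bytes to map explicitly, for example
 *
 *	ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer);
 *
 * and pass 0 for commands that transfer no data besides the cdb.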
*/ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, - struct ide_atapi_pc *pc) + struct ide_atapi_pc *pc, unsigned int bufflen) { struct request *rq; int error; @@ -93,8 +93,8 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq->cmd_type = REQ_TYPE_SPECIAL; rq->special = (char *)pc; - if (pc->req_xfer) { - error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer, + if (bufflen) { + error = blk_rq_map_kern(drive->queue, rq, pc->buf, bufflen, GFP_NOIO); if (error) goto put_req; @@ -117,7 +117,7 @@ int ide_do_test_unit_ready(ide_drive_t *drive, struct gendisk *disk) ide_init_pc(&pc); pc.c[0] = TEST_UNIT_READY; - return ide_queue_pc_tail(drive, disk, &pc); + return ide_queue_pc_tail(drive, disk, &pc, 0); } EXPORT_SYMBOL_GPL(ide_do_test_unit_ready); @@ -132,7 +132,7 @@ int ide_do_start_stop(ide_drive_t *drive, struct gendisk *disk, int start) if (drive->media == ide_tape) pc.flags |= PC_FLAG_WAIT_FOR_DSC; - return ide_queue_pc_tail(drive, disk, &pc); + return ide_queue_pc_tail(drive, disk, &pc, 0); } EXPORT_SYMBOL_GPL(ide_do_start_stop); @@ -147,7 +147,7 @@ int ide_set_media_lock(ide_drive_t *drive, struct gendisk *disk, int on) pc.c[0] = ALLOW_MEDIUM_REMOVAL; pc.c[4] = on; - return ide_queue_pc_tail(drive, disk, &pc); + return ide_queue_pc_tail(drive, disk, &pc, 0); } EXPORT_SYMBOL_GPL(ide_set_media_lock); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index a1c55985d4a..5df00d4ab8d 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -318,7 +318,7 @@ static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive, ide_floppy_create_mode_sense_cmd(pc, IDEFLOPPY_FLEXIBLE_DISK_PAGE); - if (ide_queue_pc_tail(drive, disk, pc)) { + if (ide_queue_pc_tail(drive, disk, pc, pc->req_xfer)) { printk(KERN_ERR PFX "Can't get flexible disk page params\n"); return 1; } @@ -390,7 +390,7 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) pc.buf = &pc_buf[0]; pc.buf_size = sizeof(pc_buf); - if (ide_queue_pc_tail(drive, disk, &pc)) { + if (ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer)) { printk(KERN_ERR PFX "Can't get floppy parameters\n"); return 1; } diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c index cd8a42027ed..75f1d50276a 100644 --- a/drivers/ide/ide-floppy_ioctl.c +++ b/drivers/ide/ide-floppy_ioctl.c @@ -50,7 +50,7 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, pc->buf = &pc_buf[0]; pc->buf_size = sizeof(pc_buf); - if (ide_queue_pc_tail(drive, floppy->disk, pc)) { + if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->req_xfer)) { printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n"); return -EIO; } @@ -124,7 +124,7 @@ static int ide_floppy_get_sfrp_bit(ide_drive_t *drive, struct ide_atapi_pc *pc) ide_floppy_create_mode_sense_cmd(pc, IDEFLOPPY_CAPABILITIES_PAGE); pc->flags |= PC_FLAG_SUPPRESS_ERROR; - if (ide_queue_pc_tail(drive, floppy->disk, pc)) + if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->req_xfer)) return 1; if (pc->buf[8 + 2] & 0x40) @@ -172,7 +172,7 @@ static int ide_floppy_format_unit(ide_drive_t *drive, struct ide_atapi_pc *pc, ide_floppy_get_sfrp_bit(drive, pc); ide_floppy_create_format_unit_cmd(pc, blocks, length, flags); - if (ide_queue_pc_tail(drive, floppy->disk, pc)) + if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->req_xfer)) err = -EIO; out: @@ -200,7 +200,7 @@ static int ide_floppy_get_format_progress(ide_drive_t *drive, if (drive->atapi_flags & IDE_AFLAG_SRFP) { ide_create_request_sense_cmd(drive, pc); - if (ide_queue_pc_tail(drive, 
floppy->disk, pc)) + if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->req_xfer)) return -EIO; if (floppy->sense_key == 2 && diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index c9337099797..f09a263b72f 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -770,7 +770,7 @@ static int idetape_flush_tape_buffers(ide_drive_t *drive) int rc; idetape_create_write_filemark_cmd(drive, &pc, 0); - rc = ide_queue_pc_tail(drive, tape->disk, &pc); + rc = ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer); if (rc) return rc; idetape_wait_ready(drive, 60 * 5 * HZ); @@ -793,7 +793,7 @@ static int idetape_read_position(ide_drive_t *drive) debug_log(DBG_PROCS, "Enter %s\n", __func__); idetape_create_read_position_cmd(&pc); - if (ide_queue_pc_tail(drive, tape->disk, &pc)) + if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) return -1; position = tape->first_frame; return position; @@ -846,12 +846,12 @@ static int idetape_position_tape(ide_drive_t *drive, unsigned int block, __ide_tape_discard_merge_buffer(drive); idetape_wait_ready(drive, 60 * 5 * HZ); idetape_create_locate_cmd(drive, &pc, block, partition, skip); - retval = ide_queue_pc_tail(drive, disk, &pc); + retval = ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); if (retval) return (retval); idetape_create_read_position_cmd(&pc); - return ide_queue_pc_tail(drive, disk, &pc); + return ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); } static void ide_tape_discard_merge_buffer(ide_drive_t *drive, @@ -1047,12 +1047,12 @@ static int idetape_rewind_tape(ide_drive_t *drive) debug_log(DBG_SENSE, "Enter %s\n", __func__); idetape_create_rewind_cmd(drive, &pc); - retval = ide_queue_pc_tail(drive, disk, &pc); + retval = ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); if (retval) return retval; idetape_create_read_position_cmd(&pc); - retval = ide_queue_pc_tail(drive, disk, &pc); + retval = ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); if (retval) return retval; return 0; @@ -1120,7 +1120,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op, case MTBSF: idetape_create_space_cmd(&pc, mt_count - count, IDETAPE_SPACE_OVER_FILEMARK); - return ide_queue_pc_tail(drive, disk, &pc); + return ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); case MTFSFM: case MTBSFM: if (!sprev) @@ -1259,7 +1259,7 @@ static int idetape_write_filemark(ide_drive_t *drive) /* Write a filemark */ idetape_create_write_filemark_cmd(drive, &pc, 1); - if (ide_queue_pc_tail(drive, tape->disk, &pc)) { + if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) { printk(KERN_ERR "ide-tape: Couldn't write a filemark\n"); return -EIO; } @@ -1344,11 +1344,11 @@ static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count) IDETAPE_LU_RETENSION_MASK | IDETAPE_LU_LOAD_MASK); case MTEOM: idetape_create_space_cmd(&pc, 0, IDETAPE_SPACE_TO_EOD); - return ide_queue_pc_tail(drive, disk, &pc); + return ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); case MTERASE: (void)idetape_rewind_tape(drive); idetape_create_erase_cmd(&pc); - return ide_queue_pc_tail(drive, disk, &pc); + return ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); case MTSETBLK: if (mt_count) { if (mt_count < tape->blk_size || @@ -1457,7 +1457,7 @@ static void ide_tape_get_bsize_from_bdesc(ide_drive_t *drive) struct ide_atapi_pc pc; idetape_create_mode_sense_cmd(&pc, IDETAPE_BLOCK_DESCRIPTOR); - if (ide_queue_pc_tail(drive, tape->disk, &pc)) { + if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) { printk(KERN_ERR "ide-tape: Can't get block 
descriptor\n"); if (tape->blk_size == 0) { printk(KERN_WARNING "ide-tape: Cannot deal with zero " @@ -1613,7 +1613,7 @@ static void idetape_get_inquiry_results(ide_drive_t *drive) pc.buf = &pc_buf[0]; pc.buf_size = sizeof(pc_buf); - if (ide_queue_pc_tail(drive, tape->disk, &pc)) { + if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) { printk(KERN_ERR "ide-tape: %s: can't get INQUIRY results\n", tape->name); return; @@ -1642,7 +1642,7 @@ static void idetape_get_mode_sense_results(ide_drive_t *drive) u8 speed, max_speed; idetape_create_mode_sense_cmd(&pc, IDETAPE_CAPABILITIES_PAGE); - if (ide_queue_pc_tail(drive, tape->disk, &pc)) { + if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) { printk(KERN_ERR "ide-tape: Can't get tape parameters - assuming" " some default values\n"); tape->blk_size = 512; diff --git a/include/linux/ide.h b/include/linux/ide.h index 745a393af27..7e15bd1eaae 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1160,7 +1160,8 @@ enum { REQ_IDETAPE_WRITE = (1 << 3), }; -int ide_queue_pc_tail(ide_drive_t *, struct gendisk *, struct ide_atapi_pc *); +int ide_queue_pc_tail(ide_drive_t *, struct gendisk *, struct ide_atapi_pc *, + unsigned int); int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *); int ide_do_start_stop(ide_drive_t *, struct gendisk *, int); -- cgit v1.2.3-70-g09d2 From b13345f39dadbabdabaf8819cf6df26913da9e8d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 2 May 2009 10:26:12 +0200 Subject: ide-atapi: add a buffer-arg to ide_queue_pc_tail This is in preparation of removing ide_atapi_pc. Expose the buffer as an argument to ide_queue_pc_tail with later replacing it with local buffer or even kmalloc'ed one if needed due to stack usage constraints. Also, add the possibility of passing a NULL-ptr buffer for cmds which don't transfer data besides the cdb. While at it, switch to local buffer in idetape_get_mode_sense_results(). There should be no functional change resulting from this patch. Signed-off-by: Borislav Petkov --- drivers/ide/ide-atapi.c | 12 ++++++------ drivers/ide/ide-floppy.c | 17 ++++++++--------- drivers/ide/ide-floppy_ioctl.c | 16 ++++++++-------- drivers/ide/ide-tape.c | 37 ++++++++++++++++++------------------- include/linux/ide.h | 2 +- 5 files changed, 41 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 0d4da2c1adc..b12be1f17f1 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(ide_init_pc); * and wait for it to be serviced. 
*/ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, - struct ide_atapi_pc *pc, unsigned int bufflen) + struct ide_atapi_pc *pc, void *buf, unsigned int bufflen) { struct request *rq; int error; @@ -93,8 +93,8 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq->cmd_type = REQ_TYPE_SPECIAL; rq->special = (char *)pc; - if (bufflen) { - error = blk_rq_map_kern(drive->queue, rq, pc->buf, bufflen, + if (buf && bufflen) { + error = blk_rq_map_kern(drive->queue, rq, buf, bufflen, GFP_NOIO); if (error) goto put_req; @@ -117,7 +117,7 @@ int ide_do_test_unit_ready(ide_drive_t *drive, struct gendisk *disk) ide_init_pc(&pc); pc.c[0] = TEST_UNIT_READY; - return ide_queue_pc_tail(drive, disk, &pc, 0); + return ide_queue_pc_tail(drive, disk, &pc, NULL, 0); } EXPORT_SYMBOL_GPL(ide_do_test_unit_ready); @@ -132,7 +132,7 @@ int ide_do_start_stop(ide_drive_t *drive, struct gendisk *disk, int start) if (drive->media == ide_tape) pc.flags |= PC_FLAG_WAIT_FOR_DSC; - return ide_queue_pc_tail(drive, disk, &pc, 0); + return ide_queue_pc_tail(drive, disk, &pc, NULL, 0); } EXPORT_SYMBOL_GPL(ide_do_start_stop); @@ -147,7 +147,7 @@ int ide_set_media_lock(ide_drive_t *drive, struct gendisk *disk, int on) pc.c[0] = ALLOW_MEDIUM_REMOVAL; pc.c[4] = on; - return ide_queue_pc_tail(drive, disk, &pc, 0); + return ide_queue_pc_tail(drive, disk, &pc, NULL, 0); } EXPORT_SYMBOL_GPL(ide_set_media_lock); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 5df00d4ab8d..be21cf23f8c 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -318,7 +318,7 @@ static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive, ide_floppy_create_mode_sense_cmd(pc, IDEFLOPPY_FLEXIBLE_DISK_PAGE); - if (ide_queue_pc_tail(drive, disk, pc, pc->req_xfer)) { + if (ide_queue_pc_tail(drive, disk, pc, pc->buf, pc->req_xfer)) { printk(KERN_ERR PFX "Can't get flexible disk page params\n"); return 1; } @@ -387,22 +387,21 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) drive->capacity64 = 0; ide_floppy_create_read_capacity_cmd(&pc); - pc.buf = &pc_buf[0]; pc.buf_size = sizeof(pc_buf); - if (ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer)) { + if (ide_queue_pc_tail(drive, disk, &pc, pc_buf, pc.req_xfer)) { printk(KERN_ERR PFX "Can't get floppy parameters\n"); return 1; } - header_len = pc.buf[3]; - cap_desc = &pc.buf[4]; + header_len = pc_buf[3]; + cap_desc = &pc_buf[4]; desc_cnt = header_len / 8; /* capacity descriptor of 8 bytes */ for (i = 0; i < desc_cnt; i++) { unsigned int desc_start = 4 + i*8; - blocks = be32_to_cpup((__be32 *)&pc.buf[desc_start]); - length = be16_to_cpup((__be16 *)&pc.buf[desc_start + 6]); + blocks = be32_to_cpup((__be32 *)&pc_buf[desc_start]); + length = be16_to_cpup((__be16 *)&pc_buf[desc_start + 6]); ide_debug_log(IDE_DBG_PROBE, "Descriptor %d: %dkB, %d blocks, " "%d sector size", @@ -415,7 +414,7 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) * the code below is valid only for the 1st descriptor, ie i=0 */ - switch (pc.buf[desc_start + 4] & 0x03) { + switch (pc_buf[desc_start + 4] & 0x03) { /* Clik! drive returns this instead of CAPACITY_CURRENT */ case CAPACITY_UNFORMATTED: if (!(drive->atapi_flags & IDE_AFLAG_CLIK_DRIVE)) @@ -464,7 +463,7 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) break; } ide_debug_log(IDE_DBG_PROBE, "Descriptor 0 Code: %d", - pc.buf[desc_start + 4] & 0x03); + pc_buf[desc_start + 4] & 0x03); } /* Clik! 
disk does not support get_flexible_disk_page */ diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c index 75f1d50276a..9c2518d7514 100644 --- a/drivers/ide/ide-floppy_ioctl.c +++ b/drivers/ide/ide-floppy_ioctl.c @@ -47,15 +47,14 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, return -EINVAL; ide_floppy_create_read_capacity_cmd(pc); - pc->buf = &pc_buf[0]; pc->buf_size = sizeof(pc_buf); - if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->req_xfer)) { + if (ide_queue_pc_tail(drive, floppy->disk, pc, pc_buf, pc->req_xfer)) { printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n"); return -EIO; } - header_len = pc->buf[3]; + header_len = pc_buf[3]; desc_cnt = header_len / 8; /* capacity descriptor of 8 bytes */ u_index = 0; @@ -72,8 +71,8 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, if (u_index >= u_array_size) break; /* User-supplied buffer too small */ - blocks = be32_to_cpup((__be32 *)&pc->buf[desc_start]); - length = be16_to_cpup((__be16 *)&pc->buf[desc_start + 6]); + blocks = be32_to_cpup((__be32 *)&pc_buf[desc_start]); + length = be16_to_cpup((__be16 *)&pc_buf[desc_start + 6]); if (put_user(blocks, argp)) return -EFAULT; @@ -124,7 +123,7 @@ static int ide_floppy_get_sfrp_bit(ide_drive_t *drive, struct ide_atapi_pc *pc) ide_floppy_create_mode_sense_cmd(pc, IDEFLOPPY_CAPABILITIES_PAGE); pc->flags |= PC_FLAG_SUPPRESS_ERROR; - if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->req_xfer)) + if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->buf, pc->req_xfer)) return 1; if (pc->buf[8 + 2] & 0x40) @@ -172,7 +171,7 @@ static int ide_floppy_format_unit(ide_drive_t *drive, struct ide_atapi_pc *pc, ide_floppy_get_sfrp_bit(drive, pc); ide_floppy_create_format_unit_cmd(pc, blocks, length, flags); - if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->req_xfer)) + if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->buf, pc->req_xfer)) err = -EIO; out: @@ -200,7 +199,8 @@ static int ide_floppy_get_format_progress(ide_drive_t *drive, if (drive->atapi_flags & IDE_AFLAG_SRFP) { ide_create_request_sense_cmd(drive, pc); - if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->req_xfer)) + if (ide_queue_pc_tail(drive, floppy->disk, pc, pc->buf, + pc->req_xfer)) return -EIO; if (floppy->sense_key == 2 && diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index f09a263b72f..1f7f50473a4 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -770,7 +770,7 @@ static int idetape_flush_tape_buffers(ide_drive_t *drive) int rc; idetape_create_write_filemark_cmd(drive, &pc, 0); - rc = ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer); + rc = ide_queue_pc_tail(drive, tape->disk, &pc, NULL, 0); if (rc) return rc; idetape_wait_ready(drive, 60 * 5 * HZ); @@ -793,7 +793,7 @@ static int idetape_read_position(ide_drive_t *drive) debug_log(DBG_PROCS, "Enter %s\n", __func__); idetape_create_read_position_cmd(&pc); - if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) + if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.buf, pc.req_xfer)) return -1; position = tape->first_frame; return position; @@ -846,12 +846,12 @@ static int idetape_position_tape(ide_drive_t *drive, unsigned int block, __ide_tape_discard_merge_buffer(drive); idetape_wait_ready(drive, 60 * 5 * HZ); idetape_create_locate_cmd(drive, &pc, block, partition, skip); - retval = ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); + retval = ide_queue_pc_tail(drive, disk, &pc, NULL, 0); if (retval) return (retval); idetape_create_read_position_cmd(&pc); - return 
ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); + return ide_queue_pc_tail(drive, disk, &pc, pc.buf, pc.req_xfer); } static void ide_tape_discard_merge_buffer(ide_drive_t *drive, @@ -1047,12 +1047,12 @@ static int idetape_rewind_tape(ide_drive_t *drive) debug_log(DBG_SENSE, "Enter %s\n", __func__); idetape_create_rewind_cmd(drive, &pc); - retval = ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); + retval = ide_queue_pc_tail(drive, disk, &pc, NULL, 0); if (retval) return retval; idetape_create_read_position_cmd(&pc); - retval = ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); + retval = ide_queue_pc_tail(drive, disk, &pc, pc.buf, pc.req_xfer); if (retval) return retval; return 0; @@ -1120,7 +1120,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op, case MTBSF: idetape_create_space_cmd(&pc, mt_count - count, IDETAPE_SPACE_OVER_FILEMARK); - return ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); + return ide_queue_pc_tail(drive, disk, &pc, NULL, 0); case MTFSFM: case MTBSFM: if (!sprev) @@ -1259,7 +1259,7 @@ static int idetape_write_filemark(ide_drive_t *drive) /* Write a filemark */ idetape_create_write_filemark_cmd(drive, &pc, 1); - if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) { + if (ide_queue_pc_tail(drive, tape->disk, &pc, NULL, 0)) { printk(KERN_ERR "ide-tape: Couldn't write a filemark\n"); return -EIO; } @@ -1344,11 +1344,11 @@ static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count) IDETAPE_LU_RETENSION_MASK | IDETAPE_LU_LOAD_MASK); case MTEOM: idetape_create_space_cmd(&pc, 0, IDETAPE_SPACE_TO_EOD); - return ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); + return ide_queue_pc_tail(drive, disk, &pc, NULL, 0); case MTERASE: (void)idetape_rewind_tape(drive); idetape_create_erase_cmd(&pc); - return ide_queue_pc_tail(drive, disk, &pc, pc.req_xfer); + return ide_queue_pc_tail(drive, disk, &pc, NULL, 0); case MTSETBLK: if (mt_count) { if (mt_count < tape->blk_size || @@ -1457,7 +1457,7 @@ static void ide_tape_get_bsize_from_bdesc(ide_drive_t *drive) struct ide_atapi_pc pc; idetape_create_mode_sense_cmd(&pc, IDETAPE_BLOCK_DESCRIPTOR); - if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) { + if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.buf, pc.req_xfer)) { printk(KERN_ERR "ide-tape: Can't get block descriptor\n"); if (tape->blk_size == 0) { printk(KERN_WARNING "ide-tape: Cannot deal with zero " @@ -1610,17 +1610,16 @@ static void idetape_get_inquiry_results(ide_drive_t *drive) char fw_rev[4], vendor_id[8], product_id[16]; idetape_create_inquiry_cmd(&pc); - pc.buf = &pc_buf[0]; pc.buf_size = sizeof(pc_buf); - if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) { + if (ide_queue_pc_tail(drive, tape->disk, &pc, pc_buf, pc.req_xfer)) { printk(KERN_ERR "ide-tape: %s: can't get INQUIRY results\n", tape->name); return; } - memcpy(vendor_id, &pc.buf[8], 8); - memcpy(product_id, &pc.buf[16], 16); - memcpy(fw_rev, &pc.buf[32], 4); + memcpy(vendor_id, &pc_buf[8], 8); + memcpy(product_id, &pc_buf[16], 16); + memcpy(fw_rev, &pc_buf[32], 4); ide_fixstring(vendor_id, 8, 0); ide_fixstring(product_id, 16, 0); @@ -1638,11 +1637,11 @@ static void idetape_get_mode_sense_results(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; struct ide_atapi_pc pc; - u8 *caps; + u8 buf[24], *caps; u8 speed, max_speed; idetape_create_mode_sense_cmd(&pc, IDETAPE_CAPABILITIES_PAGE); - if (ide_queue_pc_tail(drive, tape->disk, &pc, pc.req_xfer)) { + if (ide_queue_pc_tail(drive, tape->disk, &pc, buf, pc.req_xfer)) { printk(KERN_ERR 
"ide-tape: Can't get tape parameters - assuming" " some default values\n"); tape->blk_size = 512; @@ -1651,7 +1650,7 @@ static void idetape_get_mode_sense_results(ide_drive_t *drive) put_unaligned(6*52, (u16 *)&tape->caps[16]); return; } - caps = pc.buf + 4 + pc.buf[3]; + caps = buf + 4 + buf[3]; /* convert to host order and save for later use */ speed = be16_to_cpup((__be16 *)&caps[14]); diff --git a/include/linux/ide.h b/include/linux/ide.h index 7e15bd1eaae..4cd7157a403 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1161,7 +1161,7 @@ enum { }; int ide_queue_pc_tail(ide_drive_t *, struct gendisk *, struct ide_atapi_pc *, - unsigned int); + void *, unsigned int); int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *); int ide_do_start_stop(ide_drive_t *, struct gendisk *, int); -- cgit v1.2.3-70-g09d2 From 19f52a784f7ecb5b51cd73cc4514614b600b995a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 May 2009 09:53:03 +0200 Subject: ide-atapi: remove pc->buf Now after all users of pc->buf have been converted, remove the 64B buffer embedded in each packet command. There should be no functional change resulting from this patch. Signed-off-by: Borislav Petkov --- drivers/ide/ide-atapi.c | 6 ------ drivers/ide/ide-floppy.c | 8 +------- drivers/ide/ide-floppy_ioctl.c | 1 - drivers/ide/ide-tape.c | 7 ++----- include/linux/ide.h | 11 ----------- 5 files changed, 3 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 66ea1e7774f..3075b041466 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -74,8 +74,6 @@ EXPORT_SYMBOL_GPL(ide_check_atapi_device); void ide_init_pc(struct ide_atapi_pc *pc) { memset(pc, 0, sizeof(*pc)); - pc->buf = pc->pc_buf; - pc->buf_size = IDE_PC_BUFFER_SIZE; } EXPORT_SYMBOL_GPL(ide_init_pc); @@ -254,10 +252,6 @@ void ide_retry_pc(ide_drive_t *drive) ide_init_pc(pc); memcpy(pc->c, sense_rq->cmd, 12); - /* pointer to mapped address */ - pc->buf = bio_data(sense_rq->bio); - pc->req_xfer = blk_rq_bytes(sense_rq); - if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 14e5e9ca2ad..800c83a9db8 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -210,8 +210,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive, pc->rq = rq; if (rq->cmd_flags & REQ_RW) pc->flags |= PC_FLAG_WRITING; - pc->buf = NULL; - pc->buf_size = blk_rq_bytes(rq); + pc->flags |= PC_FLAG_DMA_OK; } @@ -226,9 +225,6 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy, if (rq_data_dir(rq) == WRITE) pc->flags |= PC_FLAG_WRITING; } - /* pio will be performed by ide_pio_bytes() which handles sg fine */ - pc->buf = NULL; - pc->buf_size = blk_rq_bytes(rq); } static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, @@ -388,8 +384,6 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) drive->capacity64 = 0; ide_floppy_create_read_capacity_cmd(&pc); - pc.buf_size = sizeof(pc_buf); - if (ide_queue_pc_tail(drive, disk, &pc, pc_buf, pc.req_xfer)) { printk(KERN_ERR PFX "Can't get floppy parameters\n"); return 1; diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c index 3a1f9b50b3e..9c2288234de 100644 --- a/drivers/ide/ide-floppy_ioctl.c +++ b/drivers/ide/ide-floppy_ioctl.c @@ -47,7 +47,6 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, return -EINVAL; ide_floppy_create_read_capacity_cmd(pc); - pc->buf_size = 
sizeof(pc_buf); if (ide_queue_pc_tail(drive, floppy->disk, pc, pc_buf, pc->req_xfer)) { printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n"); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index ead2734bc71..9ca2665faf3 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -568,9 +568,8 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); pc->c[1] = 1; - pc->buf = NULL; - pc->buf_size = blk_rq_bytes(rq); - if (pc->buf_size == tape->buffer_size) + + if (blk_rq_bytes(rq) == tape->buffer_size) pc->flags |= PC_FLAG_DMA_OK; if (opcode == READ_6) @@ -1608,8 +1607,6 @@ static void idetape_get_inquiry_results(ide_drive_t *drive) char fw_rev[4], vendor_id[8], product_id[16]; idetape_create_inquiry_cmd(&pc); - pc.buf_size = sizeof(pc_buf); - if (ide_queue_pc_tail(drive, tape->disk, &pc, pc_buf, pc.req_xfer)) { printk(KERN_ERR "ide-tape: %s: can't get INQUIRY results\n", tape->name); diff --git a/include/linux/ide.h b/include/linux/ide.h index 4cd7157a403..59aedcd7fae 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -341,11 +341,6 @@ enum { PC_FLAG_WRITING = (1 << 6), }; -/* - * With each packet command, we allocate a buffer of IDE_PC_BUFFER_SIZE bytes. - * This is used for several packet commands (not for READ/WRITE commands). - */ -#define IDE_PC_BUFFER_SIZE 64 #define ATAPI_WAIT_PC (60 * HZ) struct ide_atapi_pc { @@ -358,10 +353,6 @@ struct ide_atapi_pc { /* bytes to transfer */ int req_xfer; - /* data buffer */ - u8 *buf; - int buf_size; - /* the corresponding request */ struct request *rq; @@ -371,8 +362,6 @@ struct ide_atapi_pc { * those are more or less driver-specific and some of them are subject * to change/removal later. */ - u8 pc_buf[IDE_PC_BUFFER_SIZE]; - unsigned long timeout; }; -- cgit v1.2.3-70-g09d2 From 103f7033bd0f7b65ff3e0a5ea72449d08010b031 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 26 Apr 2009 10:39:07 +0200 Subject: ide: unify interrupt reason checking Add ide_check_ireason() function that handles all ATAPI devices. Reorganize all unlikely cases in ireason checking further down in the code path. In addition, add PFX for printks originating from ide-atapi. Finally, remove ide_cd_check_ireason. Signed-off-by: Borislav Petkov --- drivers/ide/ide-atapi.c | 95 +++++++++++++++++++++++++++++++++++-------------- drivers/ide/ide-cd.c | 47 +----------------------- include/linux/ide.h | 2 ++ 3 files changed, 71 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 3075b041466..1125ce29809 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -10,6 +10,9 @@ #include +#define DRV_NAME "ide-atapi" +#define PFX DRV_NAME ": " + #ifdef DEBUG #define debug_log(fmt, args...) 
\ printk(KERN_INFO "ide: " fmt, ## args) @@ -197,8 +200,8 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) GFP_NOIO); if (unlikely(err)) { if (printk_ratelimit()) - printk(KERN_WARNING "%s: failed to map sense buffer\n", - drive->name); + printk(KERN_WARNING PFX "%s: failed to map sense " + "buffer\n", drive->name); return; } @@ -219,7 +222,7 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special) { /* deferred failure from ide_prep_sense() */ if (!drive->sense_rq_armed) { - printk(KERN_WARNING "%s: failed queue sense request\n", + printk(KERN_WARNING PFX "%s: error queuing a sense request\n", drive->name); return -ENOMEM; } @@ -292,7 +295,7 @@ int ide_cd_expiry(ide_drive_t *drive) break; default: if (!(rq->cmd_flags & REQ_QUIET)) - printk(KERN_INFO "cmd 0x%x timed out\n", + printk(KERN_INFO PFX "cmd 0x%x timed out\n", rq->cmd[0]); wait = 0; break; @@ -325,6 +328,55 @@ void ide_read_bcount_and_ireason(ide_drive_t *drive, u16 *bcount, u8 *ireason) } EXPORT_SYMBOL_GPL(ide_read_bcount_and_ireason); +/* + * Check the contents of the interrupt reason register and attempt to recover if + * there are problems. + * + * Returns: + * - 0 if everything's ok + * - 1 if the request has to be terminated. + */ +int ide_check_ireason(ide_drive_t *drive, struct request *rq, int len, + int ireason, int rw) +{ + ide_hwif_t *hwif = drive->hwif; + + debug_log("ireason: 0x%x, rw: 0x%x\n", ireason, rw); + + if (ireason == (!rw << 1)) + return 0; + else if (ireason == (rw << 1)) { + printk(KERN_ERR PFX "%s: %s: wrong transfer direction!\n", + drive->name, __func__); + + if (dev_is_idecd(drive)) + ide_pad_transfer(drive, rw, len); + } else if (!rw && ireason == ATAPI_COD) { + if (dev_is_idecd(drive)) { + /* + * Some drives (ASUS) seem to tell us that status info + * is available. Just get it and ignore. + */ + (void)hwif->tp_ops->read_status(hwif); + return 0; + } + } else { + if (ireason & ATAPI_COD) + printk(KERN_ERR PFX "%s: CoD != 0 in %s\n", drive->name, + __func__); + + /* drive wants a command packet, or invalid ireason... */ + printk(KERN_ERR PFX "%s: %s: bad interrupt reason 0x%02x\n", + drive->name, __func__, ireason); + } + + if (dev_is_idecd(drive) && rq->cmd_type == REQ_TYPE_ATA_PC) + rq->cmd_flags |= REQ_FAILED; + + return 1; +} +EXPORT_SYMBOL_GPL(ide_check_ireason); + /* * This is the usual interrupt handler which will be called during a packet * command. We will transfer some of the data (as requested by the drive) @@ -359,7 +411,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) { if (drive->media == ide_floppy) - printk(KERN_ERR "%s: DMA %s error\n", + printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name, rq_data_dir(pc->rq) ? 
"write" : "read"); pc->flags |= PC_FLAG_DMA_ERROR; @@ -391,8 +443,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) pc->rq->errors++; if (rq->cmd[0] == REQUEST_SENSE) { - printk(KERN_ERR "%s: I/O error in request sense" - " command\n", drive->name); + printk(KERN_ERR PFX "%s: I/O error in request " + "sense command\n", drive->name); return ide_do_reset(drive); } @@ -434,8 +486,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - printk(KERN_ERR "%s: The device wants to issue more interrupts " - "in DMA mode\n", drive->name); + printk(KERN_ERR PFX "%s: The device wants to issue more " + "interrupts in DMA mode\n", drive->name); ide_dma_off(drive); return ide_do_reset(drive); } @@ -443,19 +495,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) /* Get the number of bytes to transfer on this interrupt. */ ide_read_bcount_and_ireason(drive, &bcount, &ireason); - if (ireason & ATAPI_COD) { - printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__); + if (ide_check_ireason(drive, rq, bcount, ireason, write)) return ide_do_reset(drive); - } - - if (((ireason & ATAPI_IO) == ATAPI_IO) == write) { - /* Hopefully, we will never get here */ - printk(KERN_ERR "%s: We wanted to %s, but the device wants us " - "to %s!\n", drive->name, - (ireason & ATAPI_IO) ? "Write" : "Read", - (ireason & ATAPI_IO) ? "Read" : "Write"); - return ide_do_reset(drive); - } done = min_t(unsigned int, bcount, cmd->nleft); ide_pio_bytes(drive, cmd, write, done); @@ -503,13 +544,13 @@ static u8 ide_wait_ireason(ide_drive_t *drive, u8 ireason) while (retries-- && ((ireason & ATAPI_COD) == 0 || (ireason & ATAPI_IO))) { - printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " + printk(KERN_ERR PFX "%s: (IO,CoD != (0,1) while issuing " "a packet command, retrying\n", drive->name); udelay(100); ireason = ide_read_ireason(drive); if (retries == 0) { - printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " - "a packet command, ignoring\n", + printk(KERN_ERR PFX "%s: (IO,CoD != (0,1) while issuing" + " a packet command, ignoring\n", drive->name); ireason |= ATAPI_COD; ireason &= ~ATAPI_IO; @@ -540,7 +581,7 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive) u8 ireason; if (ide_wait_stat(&startstop, drive, ATA_DRQ, ATA_BUSY, WAIT_READY)) { - printk(KERN_ERR "%s: Strange, packet command initiated yet " + printk(KERN_ERR PFX "%s: Strange, packet command initiated yet " "DRQ isn't asserted\n", drive->name); return startstop; } @@ -582,8 +623,8 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive) ireason = ide_wait_ireason(drive, ireason); if ((ireason & ATAPI_COD) == 0 || (ireason & ATAPI_IO)) { - printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing " - "a packet command\n", drive->name); + printk(KERN_ERR PFX "%s: (IO,CoD) != (0,1) while " + "issuing a packet command\n", drive->name); return ide_do_reset(drive); } diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index dca41ae0d04..d299713bfdc 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -410,50 +410,6 @@ end_request: return 2; } -/* - * Check the contents of the interrupt reason register from the cdrom - * and attempt to recover if there are problems. Returns 0 if everything's - * ok; nonzero if the request has been terminated. 
- */ -static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq, - int len, int ireason, int rw) -{ - ide_hwif_t *hwif = drive->hwif; - - ide_debug_log(IDE_DBG_FUNC, "ireason: 0x%x, rw: 0x%x", ireason, rw); - - /* - * ireason == 0: the drive wants to receive data from us - * ireason == 2: the drive is expecting to transfer data to us - */ - if (ireason == (!rw << 1)) - return 0; - else if (ireason == (rw << 1)) { - - /* whoops... */ - printk(KERN_ERR PFX "%s: %s: wrong transfer direction!\n", - drive->name, __func__); - - ide_pad_transfer(drive, rw, len); - } else if (rw == 0 && ireason == 1) { - /* - * Some drives (ASUS) seem to tell us that status info is - * available. Just get it and ignore. - */ - (void)hwif->tp_ops->read_status(hwif); - return 0; - } else { - /* drive wants a command packet, or invalid ireason... */ - printk(KERN_ERR PFX "%s: %s: bad interrupt reason 0x%02x\n", - drive->name, __func__, ireason); - } - - if (rq->cmd_type == REQ_TYPE_ATA_PC) - rq->cmd_flags |= REQ_FAILED; - - return -1; -} - static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd) { struct request *rq = cmd->rq; @@ -645,8 +601,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) goto out_end; } - /* check which way to transfer data */ - rc = ide_cd_check_ireason(drive, rq, len, ireason, write); + rc = ide_check_ireason(drive, rq, len, ireason, write); if (rc) goto out_end; diff --git a/include/linux/ide.h b/include/linux/ide.h index 59aedcd7fae..70a2c94d668 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1125,6 +1125,8 @@ void SELECT_MASK(ide_drive_t *, int); u8 ide_read_error(ide_drive_t *); void ide_read_bcount_and_ireason(ide_drive_t *, u16 *, u8 *); +int ide_check_ireason(ide_drive_t *, struct request *, int, int, int); + int ide_check_atapi_device(ide_drive_t *, const char *); void ide_init_pc(struct ide_atapi_pc *); -- cgit v1.2.3-70-g09d2 From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. [ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 24 ++++---- arch/x86/kernel/cpu/perf_counter.c | 113 ++++++++++++++----------------------- drivers/acpi/processor_idle.c | 6 +- include/linux/perf_counter.h | 10 ++-- kernel/perf_counter.c | 76 +++++++++++++++---------- 5 files changed, 109 insertions(+), 120 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 15cdc8e6722..bb1b463c136 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -386,7 +386,7 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) * Disable all counters to prevent PMU interrupts and to allow * counters to be added or removed. 
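 *
 * Editor's note, illustrative only: hw_perf_disable() is driven by the
 * new reference-counted perf_disable()/perf_enable() pair, so calls may
 * nest even when uses do not pair up strictly:
 *
 *	perf_disable();		per-cpu count = 1, hardware goes off
 *	perf_disable();		count = 2, hardware already off
 *	perf_enable();		count = 1, hardware stays off
 *	perf_enable();		count = 0, hardware re-enabled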
*/ -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { struct cpu_hw_counters *cpuhw; unsigned long ret; @@ -428,7 +428,6 @@ u64 hw_perf_save_disable(void) mb(); } local_irq_restore(flags); - return ret; } /* @@ -436,7 +435,7 @@ u64 hw_perf_save_disable(void) * If we were previously disabled and counters were added, then * put the new config on the PMU. */ -void hw_perf_restore(u64 disable) +void hw_perf_enable(void) { struct perf_counter *counter; struct cpu_hw_counters *cpuhw; @@ -448,9 +447,12 @@ void hw_perf_restore(u64 disable) int n_lim; int idx; - if (disable) - return; local_irq_save(flags); + if (!cpuhw->disabled) { + local_irq_restore(flags); + return; + } + cpuhw = &__get_cpu_var(cpu_hw_counters); cpuhw->disabled = 0; @@ -649,19 +651,18 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, /* * Add a counter to the PMU. * If all counters are not already frozen, then we disable and - * re-enable the PMU in order to get hw_perf_restore to do the + * re-enable the PMU in order to get hw_perf_enable to do the * actual work of reconfiguring the PMU. */ static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; - u64 pmudis; int n0; int ret = -EAGAIN; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); /* * Add the counter to the list (if there is room) @@ -685,7 +686,7 @@ static int power_pmu_enable(struct perf_counter *counter) ret = 0; out: - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); return ret; } @@ -697,11 +698,10 @@ static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; - u64 pmudis; unsigned long flags; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); power_pmu_read(counter); @@ -735,7 +735,7 @@ static void power_pmu_disable(struct perf_counter *counter) cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); } - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7601c014f8f..313638cecbb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -31,7 +31,6 @@ struct cpu_hw_counters { unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 throttle_ctrl; int enabled; }; @@ -42,8 +41,8 @@ struct x86_pmu { const char *name; int version; int (*handle_irq)(struct pt_regs *, int); - u64 (*save_disable_all)(void); - void (*restore_all)(u64); + void (*disable_all)(void); + void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; @@ -56,6 +55,7 @@ struct x86_pmu { int counter_bits; u64 counter_mask; u64 max_period; + u64 intel_ctrl; }; static struct x86_pmu x86_pmu __read_mostly; @@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 intel_pmu_save_disable_all(void) +static void intel_pmu_disable_all(void) { - u64 ctrl; - - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - return ctrl; } -static u64 amd_pmu_save_disable_all(void) +static void amd_pmu_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int enabled, idx; + int idx; + + if (!cpuc->enabled) + return; - enabled = cpuc->enabled; cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the 
@@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void) * right thing. */ barrier(); - if (!enabled) - goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void) val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } - -out: - return enabled; } -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { if (!x86_pmu_initialized()) - return 0; - return x86_pmu.save_disable_all(); + return; + return x86_pmu.disable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void intel_pmu_restore_all(u64 ctrl) +static void intel_pmu_enable_all(void) { - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); } -static void amd_pmu_restore_all(u64 ctrl) +static void amd_pmu_enable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; - cpuc->enabled = ctrl; - barrier(); - if (!ctrl) + if (cpuc->enabled) return; + cpuc->enabled = 1; + barrier(); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl) } } -void hw_perf_restore(u64 ctrl) +void hw_perf_enable(void) { if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); + x86_pmu.enable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_restore); static inline u64 intel_pmu_get_status(void) { @@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - int ret = 0; - - cpuc->throttle_ctrl = intel_pmu_save_disable_all(); + perf_disable(); status = intel_pmu_get_status(); - if (!status) - goto out; + if (!status) { + perf_enable(); + return 0; + } - ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -767,19 +751,11 @@ again: status = intel_pmu_get_status(); if (status) goto again; -out: - /* - * Restore - do not reenable when global enable is off or throttled: - */ - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { - intel_pmu_restore_all(cpuc->throttle_ctrl); - } else { - pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! 
Throttle on.\n", smp_processor_id()); - } - } - return ret; + if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) + perf_enable(); + + return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) @@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) struct hw_perf_counter *hwc; int idx, throttle = 0; - cpuc->throttle_ctrl = cpuc->enabled; - cpuc->enabled = 0; - barrier(); - - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - throttle = 1; + if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { + throttle = 1; + __perf_disable(); + cpuc->enabled = 0; + barrier(); } for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -824,9 +798,6 @@ next: amd_pmu_disable_counter(hwc, idx); } - if (cpuc->throttle_ctrl && !throttle) - cpuc->enabled = 1; - return handled; } @@ -839,13 +810,11 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); - /* * Clear them before re-enabling irqs/NMIs again: */ cpuc->interrupts = 0; - hw_perf_restore(cpuc->throttle_ctrl); + perf_enable(); } else { cpuc->interrupts = 0; } @@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, - .save_disable_all = intel_pmu_save_disable_all, - .restore_all = intel_pmu_restore_all, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = { static struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, - .save_disable_all = amd_pmu_save_disable_all, - .restore_all = amd_pmu_restore_all, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, @@ -1003,6 +972,8 @@ static int intel_pmu_init(void) x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + return 0; } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index d2830f39d46..9645758c047 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -763,11 +763,9 @@ static int acpi_idle_bm_check(void) */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { - u64 perf_flags; - /* Don't trace irqs off for idle */ stop_critical_timings(); - perf_flags = hw_perf_save_disable(); + perf_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -782,7 +780,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. 
*/ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore(perf_flags); + perf_enable(); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 614f921d616..e543ecc129f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -544,8 +544,10 @@ extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); -extern u64 hw_perf_save_disable(void); -extern void hw_perf_restore(u64 ctrl); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); extern int hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -600,8 +602,8 @@ static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } -static inline void hw_perf_restore(u64 ctrl) { } -static inline u64 hw_perf_save_disable(void) { return 0; } +static inline void perf_disable(void) { } +static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 985be0b662a..e814ff04d7c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -60,8 +60,9 @@ extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counte return NULL; } -u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { barrier(); } +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + void __weak hw_perf_counter_setup(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, @@ -72,6 +73,32 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } +static DEFINE_PER_CPU(int, disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} +EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */ + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} +EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */ + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -170,7 +197,6 @@ static void __perf_counter_remove_from_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -191,9 +217,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. 
*/ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_del_counter(counter, ctx); - hw_perf_restore(perf_flags); + perf_enable(); if (!ctx->task) { /* @@ -538,7 +564,6 @@ static void __perf_install_in_context(void *info) struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; - u64 perf_flags; int err; /* @@ -556,7 +581,7 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_save_disable(); + perf_disable(); add_counter_to_ctx(counter, ctx); @@ -596,7 +621,7 @@ static void __perf_install_in_context(void *info) cpuctx->max_pertask--; unlock: - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -663,7 +688,6 @@ static void __perf_counter_enable(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; - unsigned long pmuflags; unsigned long flags; int err; @@ -693,14 +717,14 @@ static void __perf_counter_enable(void *info) if (!group_can_go_on(counter, cpuctx, 1)) { err = -EEXIST; } else { - pmuflags = hw_perf_save_disable(); + perf_disable(); if (counter == leader) err = group_sched_in(counter, cpuctx, ctx, smp_processor_id()); else err = counter_sched_in(counter, cpuctx, ctx, smp_processor_id()); - hw_perf_restore(pmuflags); + perf_enable(); } if (err) { @@ -795,7 +819,6 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; - u64 flags; spin_lock(&ctx->lock); ctx->is_active = 0; @@ -803,12 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, goto out; update_context_time(ctx); - flags = hw_perf_save_disable(); + perf_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -860,7 +883,6 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; - u64 flags; int can_add_hw = 1; spin_lock(&ctx->lock); @@ -870,7 +892,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, ctx->timestamp = perf_clock(); - flags = hw_perf_save_disable(); + perf_disable(); /* * First go through the list and put on any pinned groups @@ -917,7 +939,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, can_add_hw = 0; } } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -955,7 +977,6 @@ int perf_counter_task_disable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; if (likely(!ctx->nr_counters)) return 0; @@ -969,7 +990,7 @@ int perf_counter_task_disable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state != PERF_COUNTER_STATE_ERROR) { @@ -978,7 +999,7 @@ int perf_counter_task_disable(void) } } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); @@ -991,7 +1012,6 @@ int perf_counter_task_enable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; int cpu; 
if (likely(!ctx->nr_counters)) @@ -1007,7 +1027,7 @@ int perf_counter_task_enable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state > PERF_COUNTER_STATE_OFF) @@ -1017,7 +1037,7 @@ int perf_counter_task_enable(void) ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); @@ -1034,7 +1054,6 @@ int perf_counter_task_enable(void) static void rotate_ctx(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 perf_flags; if (!ctx->nr_counters) return; @@ -1043,12 +1062,12 @@ static void rotate_ctx(struct perf_counter_context *ctx) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_move_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); } @@ -3194,7 +3213,6 @@ __perf_counter_exit_task(struct task_struct *child, } else { struct perf_cpu_context *cpuctx; unsigned long flags; - u64 perf_flags; /* * Disable and unlink this counter. @@ -3203,7 +3221,7 @@ __perf_counter_exit_task(struct task_struct *child, * could still be processing it: */ local_irq_save(flags); - perf_flags = hw_perf_save_disable(); + perf_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); @@ -3214,7 +3232,7 @@ __perf_counter_exit_task(struct task_struct *child, child_ctx->nr_counters--; - hw_perf_restore(perf_flags); + perf_enable(); local_irq_restore(flags); } -- cgit v1.2.3-70-g09d2 From 789f90fcf6b0b54e655740e9396c954378542c79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:27 +0200 Subject: perf_counter: per user mlock gift Instead of a per-process mlock gift for perf-counters, use a per-user gift so that there is less of a DoS potential. 
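Concretely, each user gets a fixed gift of lockable memory (sysctl_perf_counter_mlock, in KB) shared by all of that user's counters; only pages beyond the gift count against the task's RLIMIT_MEMLOCK. A condensed sketch of the resulting check (the helper name is illustrative; in the patch below the logic sits inline in perf_mmap()):

	/* Condensed from the perf_mmap() hunk below; locking and the
	 * success path (charging user->locked_vm) omitted: */
	static int perf_mmap_check_limits(struct user_struct *user,
					  struct vm_area_struct *vma,
					  unsigned long nr_pages)
	{
		long user_extra = nr_pages + 1;	/* data pages + control page */
		long extra = 0;
		unsigned long user_lock_limit, user_locked, lock_limit;

		/* the sysctl is in KB; convert it to pages */
		user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
		user_locked = atomic_long_read(&user->locked_vm) + user_extra;

		/* only the part exceeding the per-user gift ... */
		if (user_locked > user_lock_limit)
			extra = user_locked - user_lock_limit;

		/* ... is charged against the task's RLIMIT_MEMLOCK */
		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
		lock_limit >>= PAGE_SHIFT;

		if ((vma->vm_mm->locked_vm + extra > lock_limit) &&
		    !capable(CAP_IPC_LOCK))
			return -EPERM;

		return 0;
	}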
[ Impact: allow less worst-case unprivileged memory consumption ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.496182835@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/perf_counter.c | 22 +++++++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d1857580a13..ff59d123151 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -674,6 +674,10 @@ struct user_struct { struct work_struct work; #endif #endif + +#ifdef CONFIG_PERF_COUNTERS + atomic_long_t locked_vm; +#endif }; extern int uids_sysfs_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0173738dd54..93f4a0e4b87 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -45,7 +45,7 @@ static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ -int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */ +int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1522,6 +1522,9 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); vma->vm_mm->locked_vm -= counter->data->nr_locked; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); @@ -1537,11 +1540,13 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; + struct user_struct *user = current_user(); unsigned long vma_size; unsigned long nr_pages; + unsigned long user_locked, user_lock_limit; unsigned long locked, lock_limit; + long user_extra, extra; int ret = 0; - long extra; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; @@ -1569,15 +1574,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - extra = nr_pages /* + 1 only account the data pages */; - extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - if (extra < 0) - extra = 0; + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + user_locked = atomic_long_read(&user->locked_vm) + user_extra; - locked = vma->vm_mm->locked_vm + extra; + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { ret = -EPERM; @@ -1590,6 +1597,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; atomic_set(&counter->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; counter->data->nr_locked = extra; unlock: -- cgit v1.2.3-70-g09d2 From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to 
match this frequency. [ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 13 ++++---- arch/x86/kernel/cpu/perf_counter.c | 9 ++---- include/linux/perf_counter.h | 10 ++++-- kernel/perf_counter.c | 63 ++++++++++++++++++++++++++++++-------- 4 files changed, 68 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bb1b463c136..db8d5cafc15 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -534,7 +534,7 @@ void hw_perf_enable(void) continue; } val = 0; - if (counter->hw_event.irq_period) { + if (counter->hw.irq_period) { left = atomic64_read(&counter->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; @@ -829,8 +829,6 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if ((s64)counter->hw_event.irq_period < 0) - return ERR_PTR(-EINVAL); if (!perf_event_raw(&counter->hw_event)) { ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -901,7 +899,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); + atomic64_set(&counter->hw.period_left, counter->hw.irq_period); /* * See if we need to reserve the PMU. @@ -934,6 +932,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) static void record_and_restart(struct perf_counter *counter, long val, struct pt_regs *regs, int nmi) { + u64 period = counter->hw.irq_period; s64 prev, delta, left; int record = 0; @@ -948,11 +947,11 @@ static void record_and_restart(struct perf_counter *counter, long val, */ val = 0; left = atomic64_read(&counter->hw.period_left) - delta; - if (counter->hw_event.irq_period) { + if (period) { if (left <= 0) { - left += counter->hw_event.irq_period; + left += period; if (left <= 0) - left = counter->hw_event.irq_period; + left = period; record = 1; } if (left < 0x80000000L) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a7f718eb1e..886dcf334bc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->irq_period = hw_event->irq_period; - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) - hwc->irq_period = x86_pmu.max_period; - - atomic64_set(&hwc->period_left, hwc->irq_period); + atomic64_set(&hwc->period_left, + min(x86_pmu.max_period, hwc->irq_period)); /* * Raw event type provide the config in the event structure @@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = min(x86_pmu.max_period, hwc->irq_period); int err; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e543ecc129f..004b6e162b9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -130,7 +130,11 @@ struct perf_counter_hw_event { */ __u64 config; - __u64 irq_period; + union { + __u64 irq_period; + __u64 irq_freq; 
+ }; + __u32 record_type; __u32 read_format; @@ -146,8 +150,9 @@ struct perf_counter_hw_event { mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ @@ -337,6 +342,7 @@ struct hw_perf_counter { atomic64_t prev_count; u64 irq_period; atomic64_t period_left; + u64 interrupts; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 93f4a0e4b87..0ad1db4f3d6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1046,6 +1046,38 @@ int perf_counter_task_enable(void) return 0; } +void perf_adjust_freq(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + u64 irq_period; + u64 events, period; + s64 delta; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + continue; + + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) + continue; + + events = HZ * counter->hw.interrupts * counter->hw.irq_period; + period = div64_u64(events, counter->hw_event.irq_freq); + + delta = (s64)(1 + period - counter->hw.irq_period); + delta >>= 1; + + irq_period = counter->hw.irq_period + delta; + + if (!irq_period) + irq_period = 1; + + counter->hw.irq_period = irq_period; + counter->hw.interrupts = 0; + } + spin_unlock(&ctx->lock); +} + /* * Round-robin a context's counters: */ @@ -1081,6 +1113,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &curr->perf_counter_ctx; + perf_adjust_freq(&cpuctx->ctx); + perf_adjust_freq(ctx); + perf_counter_cpu_sched_out(cpuctx); __perf_counter_task_sched_out(ctx); @@ -2382,6 +2417,8 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + counter->hw.interrupts++; + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -2450,6 +2487,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; + u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); @@ -2468,7 +2506,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ret = HRTIMER_NORESTART; } - hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + period = max_t(u64, 10000, counter->hw.irq_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } @@ -2629,8 +2668,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2679,8 +2719,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2811,9 +2852,7 @@ static const struct pmu 
*tp_perf_counter_init(struct perf_counter *counter) static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; const struct pmu *pmu = NULL; - struct hw_perf_counter *hwc = &counter->hw; /* * Software counters (currently) can't in general distinguish @@ -2826,8 +2865,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: /* @@ -2839,8 +2876,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) else pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: @@ -2854,9 +2889,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) break; } - if (pmu) - hwc->irq_period = hw_event->irq_period; - return pmu; } @@ -2872,6 +2904,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, { const struct pmu *pmu; struct perf_counter *counter; + struct hw_perf_counter *hwc; long err; counter = kzalloc(sizeof(*counter), gfpflags); @@ -2907,6 +2940,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; + hwc = &counter->hw; + if (hw_event->freq && hw_event->irq_freq) + hwc->irq_period = TICK_NSEC / hw_event->irq_freq; + else + hwc->irq_period = hw_event->irq_period; + /* * we currently do not support PERF_RECORD_GROUP on inherited counters */ -- cgit v1.2.3-70-g09d2 From dce48a84adf1806676319f6f480e30a6daa012f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 11 Apr 2009 10:43:41 +0200 Subject: sched, timers: move calc_load() to scheduler Dimitri Sivanich noticed that xtime_lock is held write locked across calc_load(), which iterates over all online CPUs. That can cause long latencies for xtime_lock readers on large SMP systems. The load average calculation is a rough estimate anyway, so there is no real need to protect the readers vs. the update. It's not a problem when the avenrun array is updated while a reader copies the values. Instead of iterating over all online CPUs, let the scheduler_tick code update the number of active tasks shortly before the avenrun update happens. The avenrun update itself is handled by the CPU which calls do_timer().
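The avenrun[] update itself keeps the classic fixed-point exponential average; the helper introduced in kernel/sched.c below boils down to:

	/*
	 * One update step: decay the old average and blend in the current
	 * task count. FIXED_1 is 1.0 in 11-bit fixed point; exp is
	 * EXP_1/EXP_5/EXP_15 for the 1/5/15 minute averages.
	 */
	static unsigned long
	calc_load(unsigned long load, unsigned long exp, unsigned long active)
	{
		load *= exp;				/* decay old average */
		load += active * (FIXED_1 - exp);	/* blend in new load */
		return load >> FSHIFT;			/* back to fixed point */
	}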
[ Impact: reduce xtime_lock write locked section ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- include/linux/sched.h | 2 +- kernel/sched.c | 84 +++++++++++++++++++++++++++++++++++++++++------ kernel/sched_idletask.c | 3 +- kernel/time/timekeeping.c | 2 +- kernel/timer.c | 54 ++---------------------------- 5 files changed, 80 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..6eb4892efe4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -135,8 +135,8 @@ DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); -extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern void calc_global_load(void); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched.c b/kernel/sched.c index 8908d190a34..f4eb88153bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -630,6 +630,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) { - unsigned long i, running = 0, uninterruptible = 0; + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; + + if (time_before(jiffies, upd)) + return; - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? 
active * FIXED_1 : 0; - return running + uninterruptible; + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); + } } /* @@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); + rq->calc_load_update = calc_load_update; + rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -9059,6 +9118,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9166,6 +9227,9 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e28c1..499672c10cb 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - + /* adjust the active tasks as we might go into a long sleep */ + calc_load_account_active(rq); return rq->idle; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..52a8bf8931f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,7 +22,7 @@ /* * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. + * playing with xtime. 
*/ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c3..6a21d7af962 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1122,47 +1122,6 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } -/* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - /* * This function runs timers and the timer-tq in bottom half context. */ @@ -1186,16 +1145,6 @@ void run_local_timers(void) softlockup_tick(); } -/* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - /* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. @@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + update_wall_time(); + calc_global_load(); } #ifdef __ARCH_WANT_SYS_ALARM -- cgit v1.2.3-70-g09d2 From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:08:52 +0200 Subject: sched, timers: cleanup avenrun users avenrun is a rough estimate, so we don't have to worry about consistency of the three avenrun values. Remove the xtime lock dependency and provide a function to scale the values. Clean up the users.
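Readers now take a snapshot through the new helper instead of sampling under xtime_lock; for instance, the /proc/loadavg conversion in the patch below reduces to:

	unsigned long avnrun[3];

	/* the FIXED_1/200 offset rounds the 11-bit fraction to two decimals */
	get_avenrun(avnrun, FIXED_1/200, 0);

	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu\n",
		   LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
		   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
		   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));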
[ Impact: cleanup ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- fs/proc/loadavg.c | 18 ++++++------------ include/linux/sched.h | 1 + kernel/sched.c | 15 +++++++++++++++ kernel/timer.c | 32 ++++++-------------------------- 4 files changed, 28 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bca39cf99e..1afa4dd4cae 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -12,20 +12,14 @@ static int loadavg_proc_show(struct seq_file *m, void *v) { - int a, b, c; - unsigned long seq; + unsigned long avnrun[3]; - do { - seq = read_seqbegin(&xtime_lock); - a = avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); - } while (read_seqretry(&xtime_lock, seq)); + get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", - LOAD_INT(a), LOAD_FRAC(a), - LOAD_INT(b), LOAD_FRAC(b), - LOAD_INT(c), LOAD_FRAC(c), + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, task_active_pid_ns(current)->last_pid); return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index 6eb4892efe4..de7b3b21777 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -116,6 +116,7 @@ struct fs_struct; * 11 bit fractions. */ extern unsigned long avenrun[]; /* Load averages */ +extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); -- cgit v1.2.3-70-g09d2 From 9d23a90a67261e73b2fcac04d8ca963c6b496afb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2009 21:48:08 +1000 Subject: perf_counter: allow arch to supply event misc flags and instruction pointer At present the values we put in overflow events for the misc flags indicating processor mode and the instruction pointer are obtained using the standard user_mode() and instruction_pointer() functions. Those functions tell you where the performance monitor interrupt was taken, which might not be exactly where the counter overflow occurred, for example because interrupts were disabled at the point where the overflow occurred, or because the processor had many instructions in flight and chose to complete some more instructions beyond the one that caused the counter overflow. Some architectures (e.g. powerpc) can supply more precise information about where the counter overflow occurred and the processor mode at that point. This introduces new functions, perf_misc_flags() and perf_instruction_pointer(), which arch code can override to provide more precise information if available. They have default implementations which are identical to the existing code. 
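An architecture opts in by providing the macros before the generic fallbacks (which are guarded by #ifndef perf_misc_flags) are seen; a hypothetical sketch of such an arch header (the names mirror the generic ones, but this exact layout is an illustration, not part of this patch):

	/* <asm/perf_counter.h> (sketch) */
	extern unsigned long perf_misc_flags(struct pt_regs *regs);
	extern unsigned long perf_instruction_pointer(struct pt_regs *regs);

	/*
	 * Function-like macros do not expand recursively, so mapping each
	 * name to itself simply suppresses the generic #ifndef defaults:
	 */
	#define perf_misc_flags(regs)		perf_misc_flags(regs)
	#define perf_instruction_pointer(regs)	perf_instruction_pointer(regs)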
This also adds a new misc flag value, PERF_EVENT_MISC_HYPERVISOR, for the case where a counter overflow occurred in the hypervisor. We encode the processor mode in the 2 bits previously used to indicate user or kernel mode; the values for user and kernel mode are unchanged and hypervisor mode is indicated by both bits being set. [ Impact: generalize perfcounter core facilities ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <18956.1272.818511.561835@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 ++++++++++- kernel/perf_counter.c | 5 ++--- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 004b6e162b9..c8c1dfc22c9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -215,8 +215,11 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) #define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { @@ -596,6 +599,12 @@ extern int sysctl_perf_counter_mlock; extern void perf_counter_init(void); +#ifndef perf_misc_flags +#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ + PERF_EVENT_MISC_KERNEL) +#define perf_instruction_pointer(regs) instruction_pointer(regs) +#endif + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 728a595399b..57840a94b16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2042,11 +2042,10 @@ static void perf_counter_output(struct perf_counter *counter, header.size = sizeof(header); header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= user_mode(regs) ? - PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; + header.misc |= perf_misc_flags(regs); if (record_type & PERF_RECORD_IP) { - ip = instruction_pointer(regs); + ip = perf_instruction_pointer(regs); header.type |= PERF_RECORD_IP; header.size += sizeof(ip); } -- cgit v1.2.3-70-g09d2 From a48b2d4a0091904b4cf57d667adc2faf689750d3 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 15 May 2009 20:12:47 -0700 Subject: Input: introduce lm8323 keypad driver lm8323 is the keypad driver used in n810 device. [akpm@linux-foundation.org: coding-style fixes] [dtor@mail.ru: various cleanups] Signed-off-by: Felipe Balbi Reviewed-by: Trilok Soni Signed-off-by: Andrew Morton Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 13 +- drivers/input/keyboard/Makefile | 1 + drivers/input/keyboard/lm8323.c | 878 ++++++++++++++++++++++++++++++++++++++++ include/linux/i2c/lm8323.h | 46 +++ 4 files changed, 937 insertions(+), 1 deletion(-) create mode 100644 drivers/input/keyboard/lm8323.c create mode 100644 include/linux/i2c/lm8323.h (limited to 'include/linux') diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 54775aaa7be..9d8f796c674 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -250,6 +250,17 @@ config KEYBOARD_HP7XX To compile this driver as a module, choose M here: the module will be called jornada720_kbd. 
+config KEYBOARD_LM8323 + tristate "LM8323 keypad chip" + depends on I2C + depends on LEDS_CLASS + help + If you say yes here you get support for the National Semiconductor + LM8323 keypad controller. + + To compile this driver as a module, choose M here: the + module will be called lm8323. + config KEYBOARD_OMAP tristate "TI OMAP keypad support" depends on (ARCH_OMAP1 || ARCH_OMAP2) @@ -332,7 +343,7 @@ config KEYBOARD_SH_KEYSC To compile this driver as a module, choose M here: the module will be called sh_keysc. -+ + config KEYBOARD_EP93XX tristate "EP93xx Matrix Keypad support" depends on ARCH_EP93XX diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile index 13ba9c95493..156b647a259 100644 --- a/drivers/input/keyboard/Makefile +++ b/drivers/input/keyboard/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_KEYBOARD_SPITZ) += spitzkbd.o obj-$(CONFIG_KEYBOARD_TOSA) += tosakbd.o obj-$(CONFIG_KEYBOARD_HIL) += hil_kbd.o obj-$(CONFIG_KEYBOARD_HIL_OLD) += hilkbd.o +obj-$(CONFIG_KEYBOARD_LM8323) += lm8323.o obj-$(CONFIG_KEYBOARD_OMAP) += omap-keypad.o obj-$(CONFIG_KEYBOARD_PXA27x) += pxa27x_keypad.o obj-$(CONFIG_KEYBOARD_PXA930_ROTARY) += pxa930_rotary.o diff --git a/drivers/input/keyboard/lm8323.c b/drivers/input/keyboard/lm8323.c new file mode 100644 index 00000000000..574eda2a495 --- /dev/null +++ b/drivers/input/keyboard/lm8323.c @@ -0,0 +1,878 @@ +/* + * drivers/i2c/chips/lm8323.c + * + * Copyright (C) 2007-2009 Nokia Corporation + * + * Written by Daniel Stone + * Timo O. Karjalainen + * + * Updated by Felipe Balbi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation (version 2 of the License only). + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Commands to send to the chip. */ +#define LM8323_CMD_READ_ID 0x80 /* Read chip ID. */ +#define LM8323_CMD_WRITE_CFG 0x81 /* Set configuration item. */ +#define LM8323_CMD_READ_INT 0x82 /* Get interrupt status. */ +#define LM8323_CMD_RESET 0x83 /* Reset, same as external one */ +#define LM8323_CMD_WRITE_PORT_SEL 0x85 /* Set GPIO in/out. */ +#define LM8323_CMD_WRITE_PORT_STATE 0x86 /* Set GPIO pullup. */ +#define LM8323_CMD_READ_PORT_SEL 0x87 /* Get GPIO in/out. */ +#define LM8323_CMD_READ_PORT_STATE 0x88 /* Get GPIO pullup. */ +#define LM8323_CMD_READ_FIFO 0x89 /* Read byte from FIFO. */ +#define LM8323_CMD_RPT_READ_FIFO 0x8a /* Read FIFO (no increment). */ +#define LM8323_CMD_SET_ACTIVE 0x8b /* Set active time. */ +#define LM8323_CMD_READ_ERR 0x8c /* Get error status. */ +#define LM8323_CMD_READ_ROTATOR 0x8e /* Read rotator status. */ +#define LM8323_CMD_SET_DEBOUNCE 0x8f /* Set debouncing time. */ +#define LM8323_CMD_SET_KEY_SIZE 0x90 /* Set keypad size. */ +#define LM8323_CMD_READ_KEY_SIZE 0x91 /* Get keypad size. */ +#define LM8323_CMD_READ_CFG 0x92 /* Get configuration item. */ +#define LM8323_CMD_WRITE_CLOCK 0x93 /* Set clock config. 
*/ +#define LM8323_CMD_READ_CLOCK 0x94 /* Get clock config. */ +#define LM8323_CMD_PWM_WRITE 0x95 /* Write PWM script. */ +#define LM8323_CMD_START_PWM 0x96 /* Start PWM engine. */ +#define LM8323_CMD_STOP_PWM 0x97 /* Stop PWM engine. */ + +/* Interrupt status. */ +#define INT_KEYPAD 0x01 /* Key event. */ +#define INT_ROTATOR 0x02 /* Rotator event. */ +#define INT_ERROR 0x08 /* Error: use CMD_READ_ERR. */ +#define INT_NOINIT 0x10 /* Lost configuration. */ +#define INT_PWM1 0x20 /* PWM1 stopped. */ +#define INT_PWM2 0x40 /* PWM2 stopped. */ +#define INT_PWM3 0x80 /* PWM3 stopped. */ + +/* Errors (signalled by INT_ERROR, read with CMD_READ_ERR). */ +#define ERR_BADPAR 0x01 /* Bad parameter. */ +#define ERR_CMDUNK 0x02 /* Unknown command. */ +#define ERR_KEYOVR 0x04 /* Too many keys pressed. */ +#define ERR_FIFOOVER 0x40 /* FIFO overflow. */ + +/* Configuration keys (CMD_{WRITE,READ}_CFG). */ +#define CFG_MUX1SEL 0x01 /* Select MUX1_OUT input. */ +#define CFG_MUX1EN 0x02 /* Enable MUX1_OUT. */ +#define CFG_MUX2SEL 0x04 /* Select MUX2_OUT input. */ +#define CFG_MUX2EN 0x08 /* Enable MUX2_OUT. */ +#define CFG_PSIZE 0x20 /* Package size (must be 0). */ +#define CFG_ROTEN 0x40 /* Enable rotator. */ + +/* Clock settings (CMD_{WRITE,READ}_CLOCK). */ +#define CLK_RCPWM_INTERNAL 0x00 +#define CLK_RCPWM_EXTERNAL 0x03 +#define CLK_SLOWCLKEN 0x08 /* Enable 32.768kHz clock. */ +#define CLK_SLOWCLKOUT 0x40 /* Enable slow pulse output. */ + +/* The possible addresses corresponding to CONFIG1 and CONFIG2 pin wirings. */ +#define LM8323_I2C_ADDR00 (0x84 >> 1) /* 1000 010x */ +#define LM8323_I2C_ADDR01 (0x86 >> 1) /* 1000 011x */ +#define LM8323_I2C_ADDR10 (0x88 >> 1) /* 1000 100x */ +#define LM8323_I2C_ADDR11 (0x8A >> 1) /* 1000 101x */ + +/* Key event fifo length */ +#define LM8323_FIFO_LEN 15 + +/* Commands for PWM engine; feed in with PWM_WRITE. */ +/* Load ramp counter from duty cycle field (range 0 - 0xff). */ +#define PWM_SET(v) (0x4000 | ((v) & 0xff)) +/* Go to start of script. */ +#define PWM_GOTOSTART 0x0000 +/* + * Stop engine (generates interrupt). If reset is 1, clear the program + * counter, else leave it. + */ +#define PWM_END(reset) (0xc000 | (!!(reset) << 11)) +/* + * Ramp. If s is 1, divide clock by 512, else divide clock by 16. + * Take t clock scales (up to 63) per step, for n steps (up to 126). + * If u is set, ramp up, else ramp down. + */ +#define PWM_RAMP(s, t, n, u) ((!!(s) << 14) | ((t) & 0x3f) << 8 | \ + ((n) & 0x7f) | ((u) ? 0 : 0x80)) +/* + * Loop (i.e. jump back to pos) for a given number of iterations (up to 63). + * If cnt is zero, execute until PWM_END is encountered. + */ +#define PWM_LOOP(cnt, pos) (0xa000 | (((cnt) & 0x3f) << 7) | \ + ((pos) & 0x3f)) +/* + * Wait for trigger. Argument is a mask of channels, shifted by the channel + * number, e.g. 0xa for channels 3 and 1. Note that channels are numbered + * from 1, not 0. + */ +#define PWM_WAIT_TRIG(chans) (0xe000 | (((chans) & 0x7) << 6)) +/* Send trigger. Argument is same as PWM_WAIT_TRIG. 
*/ +#define PWM_SEND_TRIG(chans) (0xe000 | ((chans) & 0x7)) + +struct lm8323_pwm { + int id; + int fade_time; + int brightness; + int desired_brightness; + bool enabled; + bool running; + /* pwm lock */ + struct mutex lock; + struct work_struct work; + struct led_classdev cdev; + struct lm8323_chip *chip; +}; + +struct lm8323_chip { + /* device lock */ + struct mutex lock; + struct i2c_client *client; + struct work_struct work; + struct input_dev *idev; + bool kp_enabled; + bool pm_suspend; + unsigned keys_down; + char phys[32]; + unsigned short keymap[LM8323_KEYMAP_SIZE]; + int size_x; + int size_y; + int debounce_time; + int active_time; + struct lm8323_pwm pwm[LM8323_NUM_PWMS]; +}; + +#define client_to_lm8323(c) container_of(c, struct lm8323_chip, client) +#define dev_to_lm8323(d) container_of(d, struct lm8323_chip, client->dev) +#define work_to_lm8323(w) container_of(w, struct lm8323_chip, work) +#define cdev_to_pwm(c) container_of(c, struct lm8323_pwm, cdev) +#define work_to_pwm(w) container_of(w, struct lm8323_pwm, work) + +#define LM8323_MAX_DATA 8 + +/* + * To write, we just access the chip's address in write mode, and dump the + * command and data out on the bus. The command byte and data are taken as + * sequential u8s out of varargs, to a maximum of LM8323_MAX_DATA. + */ +static int lm8323_write(struct lm8323_chip *lm, int len, ...) +{ + int ret, i; + va_list ap; + u8 data[LM8323_MAX_DATA]; + + va_start(ap, len); + + if (unlikely(len > LM8323_MAX_DATA)) { + dev_err(&lm->client->dev, "tried to send %d bytes\n", len); + va_end(ap); + return 0; + } + + for (i = 0; i < len; i++) + data[i] = va_arg(ap, int); + + va_end(ap); + + /* + * If the host is asleep while we send the data, we can get a NACK + * back while it wakes up, so try again, once. + */ + ret = i2c_master_send(lm->client, data, len); + if (unlikely(ret == -EREMOTEIO)) + ret = i2c_master_send(lm->client, data, len); + if (unlikely(ret != len)) + dev_err(&lm->client->dev, "sent %d bytes of %d total\n", + len, ret); + + return ret; +} + +/* + * To read, we first send the command byte to the chip and end the transaction, + * then access the chip in read mode, at which point it will send the data. + */ +static int lm8323_read(struct lm8323_chip *lm, u8 cmd, u8 *buf, int len) +{ + int ret; + + /* + * If the host is asleep while we send the byte, we can get a NACK + * back while it wakes up, so try again, once. + */ + ret = i2c_master_send(lm->client, &cmd, 1); + if (unlikely(ret == -EREMOTEIO)) + ret = i2c_master_send(lm->client, &cmd, 1); + if (unlikely(ret != 1)) { + dev_err(&lm->client->dev, "sending read cmd 0x%02x failed\n", + cmd); + return 0; + } + + ret = i2c_master_recv(lm->client, buf, len); + if (unlikely(ret != len)) + dev_err(&lm->client->dev, "wanted %d bytes, got %d\n", + len, ret); + + return ret; +} + +/* + * Set the chip active time (idle time before it enters halt). + */ +static void lm8323_set_active_time(struct lm8323_chip *lm, int time) +{ + lm8323_write(lm, 2, LM8323_CMD_SET_ACTIVE, time >> 2); +} + +/* + * The signals are AT-style: the low 7 bits are the keycode, and the top + * bit indicates the state (1 for down, 0 for up). + */ +static inline u8 lm8323_whichkey(u8 event) +{ + return event & 0x7f; +} + +static inline int lm8323_ispress(u8 event) +{ + return (event & 0x80) ? 1 : 0; +} + +static void process_keys(struct lm8323_chip *lm) +{ + u8 event; + u8 key_fifo[LM8323_FIFO_LEN + 1]; + int old_keys_down = lm->keys_down; + int ret; + int i = 0; + + /* + * Read all key events from the FIFO at once. 
Next READ_FIFO clears the + * FIFO even if we didn't read all events previously. + */ + ret = lm8323_read(lm, LM8323_CMD_READ_FIFO, key_fifo, LM8323_FIFO_LEN); + + if (ret < 0) { + dev_err(&lm->client->dev, "Failed reading fifo \n"); + return; + } + key_fifo[ret] = 0; + + while ((event = key_fifo[i++])) { + u8 key = lm8323_whichkey(event); + int isdown = lm8323_ispress(event); + unsigned short keycode = lm->keymap[key]; + + dev_vdbg(&lm->client->dev, "key 0x%02x %s\n", + key, isdown ? "down" : "up"); + + if (lm->kp_enabled) { + input_event(lm->idev, EV_MSC, MSC_SCAN, key); + input_report_key(lm->idev, keycode, isdown); + input_sync(lm->idev); + } + + if (isdown) + lm->keys_down++; + else + lm->keys_down--; + } + + /* + * Errata: We need to ensure that the chip never enters halt mode + * during a keypress, so set active time to 0. When it's released, + * we can enter halt again, so set the active time back to normal. + */ + if (!old_keys_down && lm->keys_down) + lm8323_set_active_time(lm, 0); + if (old_keys_down && !lm->keys_down) + lm8323_set_active_time(lm, lm->active_time); +} + +static void lm8323_process_error(struct lm8323_chip *lm) +{ + u8 error; + + if (lm8323_read(lm, LM8323_CMD_READ_ERR, &error, 1) == 1) { + if (error & ERR_FIFOOVER) + dev_vdbg(&lm->client->dev, "fifo overflow!\n"); + if (error & ERR_KEYOVR) + dev_vdbg(&lm->client->dev, + "more than two keys pressed\n"); + if (error & ERR_CMDUNK) + dev_vdbg(&lm->client->dev, + "unknown command submitted\n"); + if (error & ERR_BADPAR) + dev_vdbg(&lm->client->dev, "bad command parameter\n"); + } +} + +static void lm8323_reset(struct lm8323_chip *lm) +{ + /* The docs say we must pass 0xAA as the data byte. */ + lm8323_write(lm, 2, LM8323_CMD_RESET, 0xAA); +} + +static int lm8323_configure(struct lm8323_chip *lm) +{ + int keysize = (lm->size_x << 4) | lm->size_y; + int clock = (CLK_SLOWCLKEN | CLK_RCPWM_EXTERNAL); + int debounce = lm->debounce_time >> 2; + int active = lm->active_time >> 2; + + /* + * Active time must be greater than the debounce time: if it's + * a close-run thing, give ourselves a 12ms buffer. + */ + if (debounce >= active) + active = debounce + 3; + + lm8323_write(lm, 2, LM8323_CMD_WRITE_CFG, 0); + lm8323_write(lm, 2, LM8323_CMD_WRITE_CLOCK, clock); + lm8323_write(lm, 2, LM8323_CMD_SET_KEY_SIZE, keysize); + lm8323_set_active_time(lm, lm->active_time); + lm8323_write(lm, 2, LM8323_CMD_SET_DEBOUNCE, debounce); + lm8323_write(lm, 3, LM8323_CMD_WRITE_PORT_STATE, 0xff, 0xff); + lm8323_write(lm, 3, LM8323_CMD_WRITE_PORT_SEL, 0, 0); + + /* + * Not much we can do about errors at this point, so just hope + * for the best. + */ + + return 0; +} + +static void pwm_done(struct lm8323_pwm *pwm) +{ + mutex_lock(&pwm->lock); + pwm->running = false; + if (pwm->desired_brightness != pwm->brightness) + schedule_work(&pwm->work); + mutex_unlock(&pwm->lock); +} + +/* + * Bottom half: handle the interrupt by posting key events, or dealing with + * errors appropriately. + */ +static void lm8323_work(struct work_struct *work) +{ + struct lm8323_chip *lm = work_to_lm8323(work); + u8 ints; + int i; + + mutex_lock(&lm->lock); + + while ((lm8323_read(lm, LM8323_CMD_READ_INT, &ints, 1) == 1) && ints) { + if (likely(ints & INT_KEYPAD)) + process_keys(lm); + if (ints & INT_ROTATOR) { + /* We don't currently support the rotator. 
*/ + dev_vdbg(&lm->client->dev, "rotator fired\n"); + } + if (ints & INT_ERROR) { + dev_vdbg(&lm->client->dev, "error!\n"); + lm8323_process_error(lm); + } + if (ints & INT_NOINIT) { + dev_err(&lm->client->dev, "chip lost config; " + "reinitialising\n"); + lm8323_configure(lm); + } + for (i = 0; i < LM8323_NUM_PWMS; i++) { + if (ints & (1 << (INT_PWM1 + i))) { + dev_vdbg(&lm->client->dev, + "pwm%d engine completed\n", i); + pwm_done(&lm->pwm[i]); + } + } + } + + mutex_unlock(&lm->lock); +} + +/* + * We cannot use I2C in interrupt context, so we just schedule work. + */ +static irqreturn_t lm8323_irq(int irq, void *data) +{ + struct lm8323_chip *lm = data; + + schedule_work(&lm->work); + + return IRQ_HANDLED; +} + +/* + * Read the chip ID. + */ +static int lm8323_read_id(struct lm8323_chip *lm, u8 *buf) +{ + int bytes; + + bytes = lm8323_read(lm, LM8323_CMD_READ_ID, buf, 2); + if (unlikely(bytes != 2)) + return -EIO; + + return 0; +} + +static void lm8323_write_pwm_one(struct lm8323_pwm *pwm, int pos, u16 cmd) +{ + lm8323_write(pwm->chip, 4, LM8323_CMD_PWM_WRITE, (pos << 2) | pwm->id, + (cmd & 0xff00) >> 8, cmd & 0x00ff); +} + +/* + * Write a script into a given PWM engine, concluding with PWM_END. + * If 'kill' is nonzero, the engine will be shut down at the end + * of the script, producing a zero output. Otherwise the engine + * will be kept running at the final PWM level indefinitely. + */ +static void lm8323_write_pwm(struct lm8323_pwm *pwm, int kill, + int len, const u16 *cmds) +{ + int i; + + for (i = 0; i < len; i++) + lm8323_write_pwm_one(pwm, i, cmds[i]); + + lm8323_write_pwm_one(pwm, i++, PWM_END(kill)); + lm8323_write(pwm->chip, 2, LM8323_CMD_START_PWM, pwm->id); + pwm->running = true; +} + +static void lm8323_pwm_work(struct work_struct *work) +{ + struct lm8323_pwm *pwm = work_to_pwm(work); + int div512, perstep, steps, hz, up, kill; + u16 pwm_cmds[3]; + int num_cmds = 0; + + mutex_lock(&pwm->lock); + + /* + * Do nothing if we're already at the requested level, + * or previous setting is not yet complete. In the latter + * case we will be called again when the previous PWM script + * finishes. + */ + if (pwm->running || pwm->desired_brightness == pwm->brightness) + goto out; + + kill = (pwm->desired_brightness == 0); + up = (pwm->desired_brightness > pwm->brightness); + steps = abs(pwm->desired_brightness - pwm->brightness); + + /* + * Convert time (in ms) into a divisor (512 or 16 on a refclk of + * 32768Hz), and number of ticks per step. 
+ */ + if ((pwm->fade_time / steps) > (32768 / 512)) { + div512 = 1; + hz = 32768 / 512; + } else { + div512 = 0; + hz = 32768 / 16; + } + + perstep = (hz * pwm->fade_time) / (steps * 1000); + + if (perstep == 0) + perstep = 1; + else if (perstep > 63) + perstep = 63; + + while (steps) { + int s; + + s = min(126, steps); + pwm_cmds[num_cmds++] = PWM_RAMP(div512, perstep, s, up); + steps -= s; + } + + lm8323_write_pwm(pwm, kill, num_cmds, pwm_cmds); + pwm->brightness = pwm->desired_brightness; + + out: + mutex_unlock(&pwm->lock); +} + +static void lm8323_pwm_set_brightness(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct lm8323_pwm *pwm = cdev_to_pwm(led_cdev); + struct lm8323_chip *lm = pwm->chip; + + mutex_lock(&pwm->lock); + pwm->desired_brightness = brightness; + mutex_unlock(&pwm->lock); + + if (in_interrupt()) { + schedule_work(&pwm->work); + } else { + /* + * Schedule PWM work as usual unless we are going into suspend + */ + mutex_lock(&lm->lock); + if (likely(!lm->pm_suspend)) + schedule_work(&pwm->work); + else + lm8323_pwm_work(&pwm->work); + mutex_unlock(&lm->lock); + } +} + +static ssize_t lm8323_pwm_show_time(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct lm8323_pwm *pwm = cdev_to_pwm(led_cdev); + + return sprintf(buf, "%d\n", pwm->fade_time); +} + +static ssize_t lm8323_pwm_store_time(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + struct lm8323_pwm *pwm = cdev_to_pwm(led_cdev); + int ret; + unsigned long time; + + ret = strict_strtoul(buf, 10, &time); + /* Numbers only, please. */ + if (ret) + return -EINVAL; + + pwm->fade_time = time; + + return strlen(buf); +} +static DEVICE_ATTR(time, 0644, lm8323_pwm_show_time, lm8323_pwm_store_time); + +static int init_pwm(struct lm8323_chip *lm, int id, struct device *dev, + const char *name) +{ + struct lm8323_pwm *pwm; + + BUG_ON(id > 3); + + pwm = &lm->pwm[id - 1]; + + pwm->id = id; + pwm->fade_time = 0; + pwm->brightness = 0; + pwm->desired_brightness = 0; + pwm->running = false; + pwm->enabled = false; + INIT_WORK(&pwm->work, lm8323_pwm_work); + mutex_init(&pwm->lock); + pwm->chip = lm; + + if (name) { + pwm->cdev.name = name; + pwm->cdev.brightness_set = lm8323_pwm_set_brightness; + if (led_classdev_register(dev, &pwm->cdev) < 0) { + dev_err(dev, "couldn't register PWM %d\n", id); + return -1; + } + if (device_create_file(pwm->cdev.dev, + &dev_attr_time) < 0) { + dev_err(dev, "couldn't register time attribute\n"); + led_classdev_unregister(&pwm->cdev); + return -1; + } + pwm->enabled = true; + } + + return 0; +} + +static struct i2c_driver lm8323_i2c_driver; + +static ssize_t lm8323_show_disable(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct lm8323_chip *lm = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", !lm->kp_enabled); +} + +static ssize_t lm8323_set_disable(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct lm8323_chip *lm = dev_get_drvdata(dev); + int ret; + unsigned long i; + + ret = strict_strtoul(buf, 10, &i); + + mutex_lock(&lm->lock); + lm->kp_enabled = !i; + mutex_unlock(&lm->lock); + + return count; +} +static DEVICE_ATTR(disable_kp, 0644, lm8323_show_disable, lm8323_set_disable); + +static int __devinit lm8323_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct lm8323_platform_data *pdata = 
client->dev.platform_data; + struct input_dev *idev; + struct lm8323_chip *lm; + int i, err; + unsigned long tmo; + u8 data[2]; + + if (!pdata || !pdata->size_x || !pdata->size_y) { + dev_err(&client->dev, "missing platform_data\n"); + return -EINVAL; + } + + if (pdata->size_x > 8) { + dev_err(&client->dev, "invalid x size %d specified\n", + pdata->size_x); + return -EINVAL; + } + + if (pdata->size_y > 12) { + dev_err(&client->dev, "invalid y size %d specified\n", + pdata->size_y); + return -EINVAL; + } + + lm = kzalloc(sizeof *lm, GFP_KERNEL); + idev = input_allocate_device(); + if (!lm || !idev) { + err = -ENOMEM; + goto fail1; + } + + i2c_set_clientdata(client, lm); + + lm->client = client; + lm->idev = idev; + mutex_init(&lm->lock); + INIT_WORK(&lm->work, lm8323_work); + + lm->size_x = pdata->size_x; + lm->size_y = pdata->size_y; + dev_vdbg(&client->dev, "Keypad size: %d x %d\n", + lm->size_x, lm->size_y); + + lm->debounce_time = pdata->debounce_time; + lm->active_time = pdata->active_time; + + lm8323_reset(lm); + + /* Nothing's set up to service the IRQ yet, so just spin for max. + * 100ms until we can configure. */ + tmo = jiffies + msecs_to_jiffies(100); + while (lm8323_read(lm, LM8323_CMD_READ_INT, data, 1) == 1) { + if (data[0] & INT_NOINIT) + break; + + if (time_after(jiffies, tmo)) { + dev_err(&client->dev, + "timeout waiting for initialisation\n"); + break; + } + + msleep(1); + } + + lm8323_configure(lm); + + /* If a true probe check the device */ + if (lm8323_read_id(lm, data) != 0) { + dev_err(&client->dev, "device not found\n"); + err = -ENODEV; + goto fail1; + } + + for (i = 0; i < LM8323_NUM_PWMS; i++) { + err = init_pwm(lm, i + 1, &client->dev, pdata->pwm_names[i]); + if (err < 0) + goto fail2; + } + + lm->kp_enabled = true; + err = device_create_file(&client->dev, &dev_attr_disable_kp); + if (err < 0) + goto fail2; + + idev->name = pdata->name ? 
: "LM8323 keypad"; + snprintf(lm->phys, sizeof(lm->phys), + "%s/input-kp", dev_name(&client->dev)); + idev->phys = lm->phys; + + idev->evbit[0] = BIT(EV_KEY) | BIT(EV_MSC); + __set_bit(MSC_SCAN, idev->mscbit); + for (i = 0; i < LM8323_KEYMAP_SIZE; i++) { + __set_bit(pdata->keymap[i], idev->keybit); + lm->keymap[i] = pdata->keymap[i]; + } + __clear_bit(KEY_RESERVED, idev->keybit); + + if (pdata->repeat) + __set_bit(EV_REP, idev->evbit); + + err = input_register_device(idev); + if (err) { + dev_dbg(&client->dev, "error registering input device\n"); + goto fail3; + } + + err = request_irq(client->irq, lm8323_irq, + IRQF_TRIGGER_FALLING | IRQF_DISABLED, + "lm8323", lm); + if (err) { + dev_err(&client->dev, "could not get IRQ %d\n", client->irq); + goto fail4; + } + + device_init_wakeup(&client->dev, 1); + enable_irq_wake(client->irq); + + return 0; + +fail4: + input_unregister_device(idev); + idev = NULL; +fail3: + device_remove_file(&client->dev, &dev_attr_disable_kp); +fail2: + while (--i >= 0) + if (lm->pwm[i].enabled) + led_classdev_unregister(&lm->pwm[i].cdev); +fail1: + input_free_device(idev); + kfree(lm); + return err; +} + +static int __devexit lm8323_remove(struct i2c_client *client) +{ + struct lm8323_chip *lm = i2c_get_clientdata(client); + int i; + + disable_irq_wake(client->irq); + free_irq(client->irq, lm); + cancel_work_sync(&lm->work); + + input_unregister_device(lm->idev); + + device_remove_file(&lm->client->dev, &dev_attr_disable_kp); + + for (i = 0; i < 3; i++) + if (lm->pwm[i].enabled) + led_classdev_unregister(&lm->pwm[i].cdev); + + kfree(lm); + + return 0; +} + +#ifdef CONFIG_PM +/* + * We don't need to explicitly suspend the chip, as it already switches off + * when there's no activity. + */ +static int lm8323_suspend(struct i2c_client *client, pm_message_t mesg) +{ + struct lm8323_chip *lm = i2c_get_clientdata(client); + int i; + + set_irq_wake(client->irq, 0); + disable_irq(client->irq); + + mutex_lock(&lm->lock); + lm->pm_suspend = true; + mutex_unlock(&lm->lock); + + for (i = 0; i < 3; i++) + if (lm->pwm[i].enabled) + led_classdev_suspend(&lm->pwm[i].cdev); + + return 0; +} + +static int lm8323_resume(struct i2c_client *client) +{ + struct lm8323_chip *lm = i2c_get_clientdata(client); + int i; + + mutex_lock(&lm->lock); + lm->pm_suspend = false; + mutex_unlock(&lm->lock); + + for (i = 0; i < 3; i++) + if (lm->pwm[i].enabled) + led_classdev_resume(&lm->pwm[i].cdev); + + enable_irq(client->irq); + set_irq_wake(client->irq, 1); + + return 0; +} +#else +#define lm8323_suspend NULL +#define lm8323_resume NULL +#endif + +static const struct i2c_device_id lm8323_id[] = { + { "lm8323", 0 }, + { } +}; + +static struct i2c_driver lm8323_i2c_driver = { + .driver = { + .name = "lm8323", + }, + .probe = lm8323_probe, + .remove = __devexit_p(lm8323_remove), + .suspend = lm8323_suspend, + .resume = lm8323_resume, + .id_table = lm8323_id, +}; +MODULE_DEVICE_TABLE(i2c, lm8323_id); + +static int __init lm8323_init(void) +{ + return i2c_add_driver(&lm8323_i2c_driver); +} +module_init(lm8323_init); + +static void __exit lm8323_exit(void) +{ + i2c_del_driver(&lm8323_i2c_driver); +} +module_exit(lm8323_exit); + +MODULE_AUTHOR("Timo O. 
Karjalainen "); +MODULE_AUTHOR("Daniel Stone"); +MODULE_AUTHOR("Felipe Balbi "); +MODULE_DESCRIPTION("LM8323 keypad driver"); +MODULE_LICENSE("GPL"); + diff --git a/include/linux/i2c/lm8323.h b/include/linux/i2c/lm8323.h new file mode 100644 index 00000000000..478d668bc59 --- /dev/null +++ b/include/linux/i2c/lm8323.h @@ -0,0 +1,46 @@ +/* + * lm8323.h - Configuration for LM8323 keypad driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation (version 2 of the License only). + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __LINUX_LM8323_H +#define __LINUX_LM8323_H + +#include + +/* + * Largest keycode that the chip can send, plus one, + * so keys can be mapped directly at the index of the + * LM8323 keycode instead of subtracting one. + */ +#define LM8323_KEYMAP_SIZE (0x7f + 1) + +#define LM8323_NUM_PWMS 3 + +struct lm8323_platform_data { + int debounce_time; /* Time to watch for key bouncing, in ms. */ + int active_time; /* Idle time until sleep, in ms. */ + + int size_x; + int size_y; + bool repeat; + const unsigned short *keymap; + + const char *pwm_names[LM8323_NUM_PWMS]; + + const char *name; /* Device name. */ +}; + +#endif /* __LINUX_LM8323_H */ -- cgit v1.2.3-70-g09d2 From ca1b96e00ab5d1b0838965834469a0284c81a517 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 17 May 2009 19:12:21 +0200 Subject: ide: replace special_t typedef by IDE_SFLAG_* flags Replace: - special_t typedef by IDE_SFLAG_* flags - 'special_t special' ide_drive_t's field by 'u8 special_flags' one There should be no functional changes caused by this patch. 
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-disk.c | 4 ++-- drivers/ide/ide-eh.c | 9 ++++----- drivers/ide/ide-io.c | 21 +++++++++++---------- drivers/ide/ide-probe.c | 6 +++--- drivers/ide/ide-taskfile.c | 2 +- drivers/ide/siimage.c | 4 ++-- include/linux/ide.h | 21 ++++++--------------- 7 files changed, 29 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index c2438804d3c..d345f5f23f0 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -428,14 +428,14 @@ static int set_multcount(ide_drive_t *drive, int arg) if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff)) return -EINVAL; - if (drive->special.b.set_multmode) + if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) return -EBUSY; rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; drive->mult_req = arg; - drive->special.b.set_multmode = 1; + drive->special_flags |= IDE_SFLAG_SET_MULTMODE; error = blk_execute_rq(drive->queue, NULL, rq, 0); blk_put_request(rq); diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 5d5fb961b5c..39d589254d4 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -52,7 +52,7 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq, } if ((rq->errors & ERROR_RECAL) == ERROR_RECAL) - drive->special.b.recalibrate = 1; + drive->special_flags |= IDE_SFLAG_RECALIBRATE; ++rq->errors; @@ -268,9 +268,8 @@ static void ide_disk_pre_reset(ide_drive_t *drive) { int legacy = (drive->id[ATA_ID_CFS_ENABLE_2] & 0x0400) ? 0 : 1; - drive->special.all = 0; - drive->special.b.set_geometry = legacy; - drive->special.b.recalibrate = legacy; + drive->special_flags = + legacy ? (IDE_SFLAG_SET_GEOMETRY | IDE_SFLAG_RECALIBRATE) : 0; drive->mult_count = 0; drive->dev_flags &= ~IDE_DFLAG_PARKED; @@ -280,7 +279,7 @@ static void ide_disk_pre_reset(ide_drive_t *drive) drive->mult_req = 0; if (drive->mult_req != drive->mult_count) - drive->special.b.set_multmode = 1; + drive->special_flags |= IDE_SFLAG_SET_MULTMODE; } static void pre_reset(ide_drive_t *drive) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 18557683ed5..644d7b4454a 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -194,14 +194,14 @@ static void ide_tf_set_setmult_cmd(ide_drive_t *drive, struct ide_taskfile *tf) static ide_startstop_t do_special(ide_drive_t *drive) { - special_t *s = &drive->special; struct ide_cmd cmd; #ifdef DEBUG - printk(KERN_DEBUG "%s: %s: 0x%02x\n", drive->name, __func__, s->all); + printk(KERN_DEBUG "%s: %s: 0x%02x\n", drive->name, __func__, + drive->special_flags); #endif if (drive->media != ide_disk) { - s->all = 0; + drive->special_flags = 0; drive->mult_req = 0; return ide_stopped; } @@ -209,14 +209,14 @@ static ide_startstop_t do_special(ide_drive_t *drive) memset(&cmd, 0, sizeof(cmd)); cmd.protocol = ATA_PROT_NODATA; - if (s->b.set_geometry) { - s->b.set_geometry = 0; + if (drive->special_flags & IDE_SFLAG_SET_GEOMETRY) { + drive->special_flags &= ~IDE_SFLAG_SET_GEOMETRY; ide_tf_set_specify_cmd(drive, &cmd.tf); - } else if (s->b.recalibrate) { - s->b.recalibrate = 0; + } else if (drive->special_flags & IDE_SFLAG_RECALIBRATE) { + drive->special_flags &= ~IDE_SFLAG_RECALIBRATE; ide_tf_set_restore_cmd(drive, &cmd.tf); - } else if (s->b.set_multmode) { - s->b.set_multmode = 0; + } else if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) { + drive->special_flags &= ~IDE_SFLAG_SET_MULTMODE; 
ide_tf_set_setmult_cmd(drive, &cmd.tf); } else BUG(); @@ -339,7 +339,8 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) printk(KERN_ERR "%s: drive not ready for command\n", drive->name); return startstop; } - if (!drive->special.all) { + + if (drive->special_flags == 0) { struct ide_driver *drv; /* diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index b609a581df4..727a67109ff 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -97,7 +97,7 @@ static void ide_disk_init_mult_count(ide_drive_t *drive) drive->mult_req = id[ATA_ID_MULTSECT] & 0xff; if (drive->mult_req) - drive->special.b.set_multmode = 1; + drive->special_flags |= IDE_SFLAG_SET_MULTMODE; } } @@ -1138,8 +1138,8 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif) drive->hwif = hwif; drive->ready_stat = ATA_DRDY; drive->bad_wstat = BAD_W_STAT; - drive->special.b.recalibrate = 1; - drive->special.b.set_geometry = 1; + drive->special_flags = IDE_SFLAG_RECALIBRATE | + IDE_SFLAG_SET_GEOMETRY; drive->name[0] = 'h'; drive->name[1] = 'd'; drive->name[2] = 'a' + j; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index f400eb4d4af..8cab3c26acd 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -166,7 +166,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) if (!OK_STAT(stat, ATA_DRDY, BAD_STAT)) { if (custom && tf->command == ATA_CMD_SET_MULTI) { drive->mult_req = drive->mult_count = 0; - drive->special.b.recalibrate = 1; + drive->special_flags |= IDE_SFLAG_RECALIBRATE; (void)ide_dump_status(drive, __func__, stat); return ide_stopped; } else if (custom && tf->command == ATA_CMD_INIT_DEV_PARAMS) { diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index e4973cd1fba..bd82d228608 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -451,8 +451,8 @@ static int sil_sata_reset_poll(ide_drive_t *drive) static void sil_sata_pre_reset(ide_drive_t *drive) { if (drive->media == ide_disk) { - drive->special.b.set_geometry = 0; - drive->special.b.recalibrate = 0; + drive->special_flags &= + ~(IDE_SFLAG_SET_GEOMETRY | IDE_SFLAG_RECALIBRATE); } } diff --git a/include/linux/ide.h b/include/linux/ide.h index 34c128f0a33..fc61328a4cd 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -218,21 +218,12 @@ static inline void ide_std_init_ports(hw_regs_t *hw, /* * Special Driver Flags - * - * set_geometry : respecify drive geometry - * recalibrate : seek to cyl 0 - * set_multmode : set multmode count - * reserved : unused */ -typedef union { - unsigned all : 8; - struct { - unsigned set_geometry : 1; - unsigned recalibrate : 1; - unsigned set_multmode : 1; - unsigned reserved : 5; - } b; -} special_t; +enum { + IDE_SFLAG_SET_GEOMETRY = (1 << 0), + IDE_SFLAG_RECALIBRATE = (1 << 1), + IDE_SFLAG_SET_MULTMODE = (1 << 2), +}; /* * Status returned from various ide_ functions @@ -530,7 +521,7 @@ struct ide_drive_s { unsigned long sleep; /* sleep until this time */ unsigned long timeout; /* max time to wait for irq */ - special_t special; /* special action flags */ + u8 special_flags; /* special action flags */ u8 select; /* basic drive/head select reg value */ u8 retry_pio; /* retrying dma capable host in pio */ -- cgit v1.2.3-70-g09d2 From 29e52cf793ded6bece50de50e738596f94f07d9f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 17 May 2009 19:12:22 +0200 Subject: ide: remove chipset field from hw_regs_t * Convert host drivers that still use hw_regs_t's chipset field to use the one in 
struct ide_port_info instead. * Move special handling of ide_pci chipset type from ide_hw_configure() to ide_init_port(). * Remove chipset field from hw_regs_t. While at it: - remove stale comment in delkin_cb.c There should be no functional changes caused by this patch. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 2 +- drivers/ide/au1xxx-ide.c | 2 +- drivers/ide/buddha.c | 3 +-- drivers/ide/cmd640.c | 2 -- drivers/ide/delkin_cb.c | 2 +- drivers/ide/falconide.c | 3 +-- drivers/ide/gayle.c | 3 +-- drivers/ide/icside.c | 3 ++- drivers/ide/ide-4drives.c | 2 +- drivers/ide/ide-cs.c | 2 +- drivers/ide/ide-generic.c | 3 +-- drivers/ide/ide-h8300.c | 2 +- drivers/ide/ide-legacy.c | 1 - drivers/ide/ide-pnp.c | 2 +- drivers/ide/ide-probe.c | 4 +--- drivers/ide/ide_platform.c | 3 +-- drivers/ide/macide.c | 3 +-- drivers/ide/palm_bk3710.c | 2 +- drivers/ide/q40ide.c | 3 +-- drivers/ide/rapide.c | 2 +- drivers/ide/scc_pata.c | 2 +- drivers/ide/setup-pci.c | 1 - drivers/ide/sgiioc4.c | 1 - include/linux/ide.h | 1 - 24 files changed, 20 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 403d0e4265d..8d39cc9bdf9 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -216,6 +216,7 @@ static const struct ide_port_info at91_ide_port_info __initdata = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA | IDE_HFLAG_SINGLE | IDE_HFLAG_NO_IO_32BIT | IDE_HFLAG_UNMASK_IRQS, .pio_mask = ATA_PIO6, + .chipset = ide_generic, }; /* @@ -304,7 +305,6 @@ static int __init at91_ide_probe(struct platform_device *pdev) ide_std_init_ports(&hw, tf_base, ctl_base + 6); hw.irq = board->irq_pin; - hw.chipset = ide_generic; hw.dev = &pdev->dev; host = ide_host_alloc(&at91_ide_port_info, hws); diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 46013644c96..9b31f830e2f 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -499,6 +499,7 @@ static const struct ide_port_info au1xxx_port_info = { #ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA .mwdma_mask = ATA_MWDMA2, #endif + .chipset = ide_au1xxx, }; static int au_ide_probe(struct platform_device *dev) @@ -548,7 +549,6 @@ static int au_ide_probe(struct platform_device *dev) auide_setup_ports(&hw, ahwif); hw.irq = ahwif->irq; hw.dev = &dev->dev; - hw.chipset = ide_au1xxx; ret = ide_host_add(&au1xxx_port_info, hws, &host); if (ret) diff --git a/drivers/ide/buddha.c b/drivers/ide/buddha.c index d028f8864bc..9aa2cd9be31 100644 --- a/drivers/ide/buddha.c +++ b/drivers/ide/buddha.c @@ -139,13 +139,12 @@ static void __init buddha_setup_ports(hw_regs_t *hw, unsigned long base, hw->irq = IRQ_AMIGA_PORTS; hw->ack_intr = ack_intr; - - hw->chipset = ide_generic; } static const struct ide_port_info buddha_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, + .chipset = ide_generic, }; /* diff --git a/drivers/ide/cmd640.c b/drivers/ide/cmd640.c index 8890276fef7..e862a2503ab 100644 --- a/drivers/ide/cmd640.c +++ b/drivers/ide/cmd640.c @@ -762,11 +762,9 @@ static int __init cmd640x_init(void) ide_std_init_ports(&hw[0], 0x1f0, 0x3f6); hw[0].irq = 14; - hw[0].chipset = ide_cmd640; ide_std_init_ports(&hw[1], 0x170, 0x376); hw[1].irq = 15; - hw[1].chipset = ide_cmd640; printk(KERN_INFO "cmd640: buggy cmd640%c interface on %s, config=0x%02x" "\n", 'a' + cmd640_chip_version - 1, bus_type, cfr); diff --git a/drivers/ide/delkin_cb.c b/drivers/ide/delkin_cb.c index f153b95619b..a0de834a81c 100644 --- 
a/drivers/ide/delkin_cb.c +++ b/drivers/ide/delkin_cb.c @@ -68,6 +68,7 @@ static const struct ide_port_info delkin_cb_port_info = { IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, .init_chipset = delkin_cb_init_chipset, + .chipset = ide_pci, }; static int __devinit @@ -97,7 +98,6 @@ delkin_cb_probe (struct pci_dev *dev, const struct pci_device_id *id) ide_std_init_ports(&hw, base + 0x10, base + 0x1e); hw.irq = dev->irq; hw.dev = &dev->dev; - hw.chipset = ide_pci; /* this enables IRQ sharing */ rc = ide_host_add(&delkin_cb_port_info, hws, &host); if (rc) diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index 0e2df6755ec..770cfa67bdc 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -111,6 +111,7 @@ static const struct ide_port_info falconide_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, + .chipset = ide_generic, }; static void __init falconide_setup_ports(hw_regs_t *hw) @@ -128,8 +129,6 @@ static void __init falconide_setup_ports(hw_regs_t *hw) hw->irq = IRQ_MFP_IDE; hw->ack_intr = NULL; - - hw->chipset = ide_generic; } /* diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c index c7119516c5a..71db2f9c336 100644 --- a/drivers/ide/gayle.c +++ b/drivers/ide/gayle.c @@ -106,14 +106,13 @@ static void __init gayle_setup_ports(hw_regs_t *hw, unsigned long base, hw->irq = IRQ_AMIGA_PORTS; hw->ack_intr = ack_intr; - - hw->chipset = ide_generic; } static const struct ide_port_info gayle_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, + .chipset = ide_generic, }; /* diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 36da913cc55..6352a44ed17 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -398,11 +398,11 @@ static void icside_setup_ports(hw_regs_t *hw, void __iomem *base, hw->irq = ec->irq; hw->dev = &ec->dev; - hw->chipset = ide_acorn; } static const struct ide_port_info icside_v5_port_info = { .host_flags = IDE_HFLAG_NO_DMA, + .chipset = ide_acorn, }; static int __devinit @@ -457,6 +457,7 @@ static const struct ide_port_info icside_v6_port_info __initdata = { .host_flags = IDE_HFLAG_SERIALIZE | IDE_HFLAG_MMIO, .mwdma_mask = ATA_MWDMA2, .swdma_mask = ATA_SWDMA2, + .chipset = ide_acorn, }; static int __devinit diff --git a/drivers/ide/ide-4drives.c b/drivers/ide/ide-4drives.c index 78aca75a2c4..617ca7a5ec8 100644 --- a/drivers/ide/ide-4drives.c +++ b/drivers/ide/ide-4drives.c @@ -25,6 +25,7 @@ static const struct ide_port_info ide_4drives_port_info = { .port_ops = &ide_4drives_port_ops, .host_flags = IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA | IDE_HFLAG_4DRIVES, + .chipset = ide_4drives, }; static int __init ide_4drives_init(void) @@ -52,7 +53,6 @@ static int __init ide_4drives_init(void) ide_std_init_ports(&hw, base, ctl); hw.irq = 14; - hw.chipset = ide_4drives; return ide_host_add(&ide_4drives_port_info, hws, NULL); } diff --git a/drivers/ide/ide-cs.c b/drivers/ide/ide-cs.c index 9e47f3529d5..43d09dcae28 100644 --- a/drivers/ide/ide-cs.c +++ b/drivers/ide/ide-cs.c @@ -155,6 +155,7 @@ static const struct ide_port_info idecs_port_info = { .port_ops = &idecs_port_ops, .host_flags = IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, + .chipset = ide_pci, }; static struct ide_host *idecs_register(unsigned long io, unsigned long ctl, @@ -181,7 +182,6 @@ static struct ide_host *idecs_register(unsigned long io, unsigned long ctl, memset(&hw, 0, sizeof(hw)); ide_std_init_ports(&hw, io, ctl); hw.irq = irq; - hw.chipset = ide_pci; 
hw.dev = &handle->dev; rc = ide_host_add(&idecs_port_info, hws, &host); diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c index 7812ca0be13..0427759d018 100644 --- a/drivers/ide/ide-generic.c +++ b/drivers/ide/ide-generic.c @@ -29,6 +29,7 @@ MODULE_PARM_DESC(probe_mask, "probe mask for legacy ISA IDE ports"); static const struct ide_port_info ide_generic_port_info = { .host_flags = IDE_HFLAG_NO_DMA, + .chipset = ide_generic, }; #ifdef CONFIG_ARM @@ -132,8 +133,6 @@ static int __init ide_generic_init(void) #else hw.irq = legacy_irqs[i]; #endif - hw.chipset = ide_generic; - rc = ide_host_add(&ide_generic_port_info, hws, NULL); if (rc) { release_region(io_addr + 0x206, 1); diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index c06ebdc4a13..40eff6c9759 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -73,12 +73,12 @@ static inline void hw_setup(hw_regs_t *hw) hw->io_ports_array[i] = CONFIG_H8300_IDE_BASE + H8300_IDE_GAP*i; hw->io_ports.ctl_addr = CONFIG_H8300_IDE_ALT; hw->irq = EXT_IRQ0 + CONFIG_H8300_IDE_IRQ; - hw->chipset = ide_generic; } static const struct ide_port_info h8300_port_info = { .tp_ops = &h8300_tp_ops, .host_flags = IDE_HFLAG_NO_IO_32BIT | IDE_HFLAG_NO_DMA, + .chipset = ide_generic, }; static int __init h8300_ide_init(void) diff --git a/drivers/ide/ide-legacy.c b/drivers/ide/ide-legacy.c index 8c5dcbf2254..0c5b29c56cb 100644 --- a/drivers/ide/ide-legacy.c +++ b/drivers/ide/ide-legacy.c @@ -33,7 +33,6 @@ static void ide_legacy_init_one(hw_regs_t **hws, hw_regs_t *hw, ide_std_init_ports(hw, base, ctl); hw->irq = irq; - hw->chipset = d->chipset; hw->config = config; hws[port_no] = hw; diff --git a/drivers/ide/ide-pnp.c b/drivers/ide/ide-pnp.c index 6e80b774e88..47043fda239 100644 --- a/drivers/ide/ide-pnp.c +++ b/drivers/ide/ide-pnp.c @@ -29,6 +29,7 @@ static struct pnp_device_id idepnp_devices[] = { static const struct ide_port_info ide_pnp_port_info = { .host_flags = IDE_HFLAG_NO_DMA, + .chipset = ide_generic, }; static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) @@ -62,7 +63,6 @@ static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) memset(&hw, 0, sizeof(hw)); ide_std_init_ports(&hw, base, ctl); hw.irq = pnp_irq(dev, 0); - hw.chipset = ide_generic; rc = ide_host_add(&ide_pnp_port_info, hws, &host); if (rc) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 727a67109ff..f17ba1932ad 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1048,8 +1048,7 @@ static void ide_init_port(ide_hwif_t *hwif, unsigned int port, { hwif->channel = port; - if (d->chipset) - hwif->chipset = d->chipset; + hwif->chipset = d->chipset ? d->chipset : ide_pci; if (d->init_iops) d->init_iops(hwif); @@ -1178,7 +1177,6 @@ static void ide_init_port_hw(ide_hwif_t *hwif, hw_regs_t *hw) { memcpy(&hwif->io_ports, &hw->io_ports, sizeof(hwif->io_ports)); hwif->irq = hw->irq; - hwif->chipset = hw->chipset; hwif->dev = hw->dev; hwif->gendev.parent = hw->parent ? 
hw->parent : hw->dev; hwif->ack_intr = hw->ack_intr; diff --git a/drivers/ide/ide_platform.c b/drivers/ide/ide_platform.c index 051b4ab0f35..813653362a2 100644 --- a/drivers/ide/ide_platform.c +++ b/drivers/ide/ide_platform.c @@ -40,12 +40,11 @@ static void __devinit plat_ide_setup_ports(hw_regs_t *hw, hw->io_ports.ctl_addr = (unsigned long)ctrl; hw->irq = irq; - - hw->chipset = ide_generic; } static const struct ide_port_info platform_ide_port_info = { .host_flags = IDE_HFLAG_NO_DMA, + .chipset = ide_generic, }; static int __devinit plat_ide_probe(struct platform_device *pdev) diff --git a/drivers/ide/macide.c b/drivers/ide/macide.c index 4b1718e8328..3af9e96da61 100644 --- a/drivers/ide/macide.c +++ b/drivers/ide/macide.c @@ -76,13 +76,12 @@ static void __init macide_setup_ports(hw_regs_t *hw, unsigned long base, hw->irq = irq; hw->ack_intr = ack_intr; - - hw->chipset = ide_generic; } static const struct ide_port_info macide_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, + .chipset = ide_generic, }; static const char *mac_ide_name[] = diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index 09d813d313f..a455c25f43c 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -306,6 +306,7 @@ static struct ide_port_info __devinitdata palm_bk3710_port_info = { .host_flags = IDE_HFLAG_MMIO, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, + .chipset = ide_palm3710, }; static int __init palm_bk3710_probe(struct platform_device *pdev) @@ -363,7 +364,6 @@ static int __init palm_bk3710_probe(struct platform_device *pdev) (base + IDE_PALM_ATA_PRI_CTL_OFFSET); hw.irq = irq->start; hw.dev = &pdev->dev; - hw.chipset = ide_palm3710; palm_bk3710_port_info.udma_mask = rate < 100000000 ? ATA_UDMA4 : ATA_UDMA5; diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index c7934667924..7488d4ff3d7 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -70,8 +70,6 @@ static void q40_ide_setup_ports(hw_regs_t *hw, unsigned long base, hw->irq = irq; hw->ack_intr = ack_intr; - - hw->chipset = ide_generic; } static void q40ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, @@ -119,6 +117,7 @@ static const struct ide_port_info q40ide_port_info = { .tp_ops = &q40ide_tp_ops, .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, + .chipset = ide_generic, }; /* diff --git a/drivers/ide/rapide.c b/drivers/ide/rapide.c index d5003ca6980..bd4d7a8a666 100644 --- a/drivers/ide/rapide.c +++ b/drivers/ide/rapide.c @@ -13,6 +13,7 @@ static const struct ide_port_info rapide_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, + .chipset = ide_generic, }; static void rapide_setup_ports(hw_regs_t *hw, void __iomem *base, @@ -49,7 +50,6 @@ rapide_probe(struct expansion_card *ec, const struct ecard_id *id) memset(&hw, 0, sizeof(hw)); rapide_setup_ports(&hw, base, base + 0x818, 1 << 6, ec->irq); - hw.chipset = ide_generic; hw.dev = &ec->dev; ret = ide_host_add(&rapide_port_info, hws, &host); diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 5be41f25204..9e3aef31733 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -567,7 +567,6 @@ static int scc_ide_setup_pci_device(struct pci_dev *dev, hw.io_ports_array[i] = ports->dma + 0x20 + i * 4; hw.irq = dev->irq; hw.dev = &dev->dev; - hw.chipset = ide_pci; rc = ide_host_add(d, hws, &host); if (rc) @@ -823,6 +822,7 @@ static const struct ide_port_info scc_chipset __devinitdata = { .host_flags = IDE_HFLAG_SINGLE, .irq_flags = IRQF_SHARED, 
.pio_mask = ATA_PIO4, + .chipset = ide_pci, }; /** diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 7a3a12d6e63..82519ddc910 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -344,7 +344,6 @@ static int ide_hw_configure(struct pci_dev *dev, const struct ide_port_info *d, memset(hw, 0, sizeof(*hw)); hw->dev = &dev->dev; - hw->chipset = d->chipset ? d->chipset : ide_pci; ide_std_init_ports(hw, base, ctl | 2); return 0; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index e5d2a48a84d..676d41c7add 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -575,7 +575,6 @@ sgiioc4_ide_setup_pci_device(struct pci_dev *dev) memset(&hw, 0, sizeof(hw)); sgiioc4_init_hwif_ports(&hw, cmd_base, ctl, irqport); hw.irq = dev->irq; - hw.chipset = ide_pci; hw.dev = &dev->dev; /* Initializing chipset IRQ Registers */ diff --git a/include/linux/ide.h b/include/linux/ide.h index fc61328a4cd..9652edbd26a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -186,7 +186,6 @@ typedef struct hw_regs_s { int irq; /* our irq number */ ide_ack_intr_t *ack_intr; /* acknowledge interrupt */ - hwif_chipset_t chipset; struct device *dev, *parent; unsigned long config; } hw_regs_t; -- cgit v1.2.3-70-g09d2 From dca3983059a4481e4ae97bbf0ac4b4c21429e1a5 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 17 May 2009 19:12:24 +0200 Subject: ide: pass number of ports to ide_host_{alloc,add}() (v2) Pass number of ports to ide_host_{alloc,add}() and then update all users accordingly. v2: - drop no longer needed NULL initializers in buddha.c, cmd640.c and gayle.c (noticed by Sergei) There should be no functional changes caused by this patch. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 5 ++--- drivers/ide/au1xxx-ide.c | 4 ++-- drivers/ide/buddha.c | 4 ++-- drivers/ide/cmd640.c | 5 +++-- drivers/ide/cs5520.c | 4 ++-- drivers/ide/delkin_cb.c | 4 ++-- drivers/ide/falconide.c | 4 ++-- drivers/ide/gayle.c | 4 ++-- drivers/ide/icside.c | 8 ++++---- drivers/ide/ide-4drives.c | 4 ++-- drivers/ide/ide-cs.c | 4 ++-- drivers/ide/ide-generic.c | 4 ++-- drivers/ide/ide-h8300.c | 4 ++-- drivers/ide/ide-legacy.c | 4 ++-- drivers/ide/ide-pnp.c | 4 ++-- drivers/ide/ide-probe.c | 9 +++++---- drivers/ide/ide_platform.c | 4 ++-- drivers/ide/macide.c | 4 ++-- drivers/ide/palm_bk3710.c | 4 ++-- drivers/ide/pmac.c | 4 ++-- drivers/ide/q40ide.c | 4 ++-- drivers/ide/rapide.c | 4 ++-- drivers/ide/scc_pata.c | 4 ++-- drivers/ide/setup-pci.c | 6 +++--- drivers/ide/sgiioc4.c | 4 ++-- drivers/ide/tx4938ide.c | 5 ++--- drivers/ide/tx4939ide.c | 5 ++--- include/linux/ide.h | 5 +++-- 28 files changed, 64 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 8d39cc9bdf9..11fe1ffdff7 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -247,8 +247,7 @@ irqreturn_t at91_irq_handler(int irq, void *dev_id) static int __init at91_ide_probe(struct platform_device *pdev) { int ret; - hw_regs_t hw; - hw_regs_t *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; struct ide_host *host; struct resource *res; unsigned long tf_base = 0, ctl_base = 0; @@ -307,7 +306,7 @@ static int __init at91_ide_probe(struct platform_device *pdev) hw.irq = board->irq_pin; hw.dev = &pdev->dev; - host = ide_host_alloc(&at91_ide_port_info, hws); + host = ide_host_alloc(&at91_ide_port_info, hws, 1); if (!host) { perr("failed to allocate ide host\n"); return 
-ENOMEM; diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 9b31f830e2f..32f5be68601 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -508,7 +508,7 @@ static int au_ide_probe(struct platform_device *dev) struct resource *res; struct ide_host *host; int ret = 0; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; #if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA) char *mode = "MWDMA2"; @@ -550,7 +550,7 @@ static int au_ide_probe(struct platform_device *dev) hw.irq = ahwif->irq; hw.dev = &dev->dev; - ret = ide_host_add(&au1xxx_port_info, hws, &host); + ret = ide_host_add(&au1xxx_port_info, hws, 1, &host); if (ret) goto out; diff --git a/drivers/ide/buddha.c b/drivers/ide/buddha.c index 9aa2cd9be31..0450652cdab 100644 --- a/drivers/ide/buddha.c +++ b/drivers/ide/buddha.c @@ -160,7 +160,7 @@ static int __init buddha_init(void) while ((z = zorro_find_device(ZORRO_WILDCARD, z))) { unsigned long board; - hw_regs_t hw[MAX_NUM_HWIFS], *hws[] = { NULL, NULL, NULL, NULL }; + hw_regs_t hw[MAX_NUM_HWIFS], *hws[MAX_NUM_HWIFS]; if (z->id == ZORRO_PROD_INDIVIDUAL_COMPUTERS_BUDDHA) { buddha_num_hwifs = BUDDHA_NUM_HWIFS; @@ -224,7 +224,7 @@ fail_base2: hws[i] = &hw[i]; } - ide_host_add(&buddha_port_info, hws, NULL); + ide_host_add(&buddha_port_info, hws, i, NULL); } return 0; diff --git a/drivers/ide/cmd640.c b/drivers/ide/cmd640.c index e862a2503ab..edb3a7a35c8 100644 --- a/drivers/ide/cmd640.c +++ b/drivers/ide/cmd640.c @@ -708,7 +708,7 @@ static int __init cmd640x_init(void) int second_port_cmd640 = 0, rc; const char *bus_type, *port2; u8 b, cfr; - hw_regs_t hw[2], *hws[] = { NULL, NULL, NULL, NULL }; + hw_regs_t hw[2], *hws[2]; if (cmd640_vlb && probe_for_cmd640_vlb()) { bus_type = "VLB"; @@ -822,7 +822,8 @@ static int __init cmd640x_init(void) cmd640_dump_regs(); #endif - return ide_host_add(&cmd640_port_info, hws, NULL); + return ide_host_add(&cmd640_port_info, hws, second_port_cmd640 ? 
2 : 1, + NULL); } module_param_named(probe_vlb, cmd640_vlb, bool, 0); diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c index 87987a7d36c..a9023d7843f 100644 --- a/drivers/ide/cs5520.c +++ b/drivers/ide/cs5520.c @@ -110,7 +110,7 @@ static const struct ide_port_info cyrix_chipset __devinitdata = { static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_device_id *id) { const struct ide_port_info *d = &cyrix_chipset; - hw_regs_t hw[4], *hws[] = { NULL, NULL, NULL, NULL }; + hw_regs_t hw[2], *hws[] = { NULL, NULL }; ide_setup_pci_noise(dev, d); @@ -136,7 +136,7 @@ static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_devic ide_pci_setup_ports(dev, d, &hw[0], &hws[0]); hw[0].irq = 14; - return ide_host_add(d, hws, NULL); + return ide_host_add(d, hws, 2, NULL); } static const struct pci_device_id cs5520_pci_tbl[] = { diff --git a/drivers/ide/delkin_cb.c b/drivers/ide/delkin_cb.c index a0de834a81c..d4a76f22ed1 100644 --- a/drivers/ide/delkin_cb.c +++ b/drivers/ide/delkin_cb.c @@ -77,7 +77,7 @@ delkin_cb_probe (struct pci_dev *dev, const struct pci_device_id *id) struct ide_host *host; unsigned long base; int rc; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; rc = pci_enable_device(dev); if (rc) { @@ -99,7 +99,7 @@ delkin_cb_probe (struct pci_dev *dev, const struct pci_device_id *id) hw.irq = dev->irq; hw.dev = &dev->dev; - rc = ide_host_add(&delkin_cb_port_info, hws, &host); + rc = ide_host_add(&delkin_cb_port_info, hws, 1, &host); if (rc) goto out_disable; diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index 770cfa67bdc..adb5b0cf762 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -138,7 +138,7 @@ static void __init falconide_setup_ports(hw_regs_t *hw) static int __init falconide_init(void) { struct ide_host *host; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; int rc; if (!MACH_IS_ATARI || !ATARIHW_PRESENT(IDE)) @@ -153,7 +153,7 @@ static int __init falconide_init(void) falconide_setup_ports(&hw); - host = ide_host_alloc(&falconide_port_info, hws); + host = ide_host_alloc(&falconide_port_info, hws, 1); if (host == NULL) { rc = -ENOMEM; goto err; diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c index 71db2f9c336..253ff34afd8 100644 --- a/drivers/ide/gayle.c +++ b/drivers/ide/gayle.c @@ -125,7 +125,7 @@ static int __init gayle_init(void) unsigned long base, ctrlport, irqport; ide_ack_intr_t *ack_intr; int a4000, i, rc; - hw_regs_t hw[GAYLE_NUM_HWIFS], *hws[] = { NULL, NULL, NULL, NULL }; + hw_regs_t hw[GAYLE_NUM_HWIFS], *hws[GAYLE_NUM_HWIFS]; if (!MACH_IS_AMIGA) return -ENODEV; @@ -170,7 +170,7 @@ found: hws[i] = &hw[i]; } - rc = ide_host_add(&gayle_port_info, hws, NULL); + rc = ide_host_add(&gayle_port_info, hws, i, NULL); if (rc) release_mem_region(res_start, res_n); diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 6352a44ed17..6223b80beb3 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -410,7 +410,7 @@ icside_register_v5(struct icside_state *state, struct expansion_card *ec) { void __iomem *base; struct ide_host *host; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; int ret; base = ecardm_iomap(ec, ECARD_RES_MEMC, 0, 0); @@ -431,7 +431,7 @@ icside_register_v5(struct icside_state *state, struct expansion_card *ec) icside_setup_ports(&hw, base, &icside_cardinfo_v5, ec); - host = ide_host_alloc(&icside_v5_port_info, hws); + host = 
ide_host_alloc(&icside_v5_port_info, hws, 1); if (host == NULL) return -ENODEV; @@ -467,7 +467,7 @@ icside_register_v6(struct icside_state *state, struct expansion_card *ec) struct ide_host *host; unsigned int sel = 0; int ret; - hw_regs_t hw[2], *hws[] = { &hw[0], &hw[1], NULL, NULL }; + hw_regs_t hw[2], *hws[] = { &hw[0], &hw[1] }; struct ide_port_info d = icside_v6_port_info; ioc_base = ecardm_iomap(ec, ECARD_RES_IOCFAST, 0, 0); @@ -507,7 +507,7 @@ icside_register_v6(struct icside_state *state, struct expansion_card *ec) icside_setup_ports(&hw[0], easi_base, &icside_cardinfo_v6_1, ec); icside_setup_ports(&hw[1], easi_base, &icside_cardinfo_v6_2, ec); - host = ide_host_alloc(&d, hws); + host = ide_host_alloc(&d, hws, 2); if (host == NULL) return -ENODEV; diff --git a/drivers/ide/ide-4drives.c b/drivers/ide/ide-4drives.c index 617ca7a5ec8..189b8bd9957 100644 --- a/drivers/ide/ide-4drives.c +++ b/drivers/ide/ide-4drives.c @@ -31,7 +31,7 @@ static const struct ide_port_info ide_4drives_port_info = { static int __init ide_4drives_init(void) { unsigned long base = 0x1f0, ctl = 0x3f6; - hw_regs_t hw, *hws[] = { &hw, &hw, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw, &hw }; if (probe_4drives == 0) return -ENODEV; @@ -54,7 +54,7 @@ static int __init ide_4drives_init(void) ide_std_init_ports(&hw, base, ctl); hw.irq = 14; - return ide_host_add(&ide_4drives_port_info, hws, NULL); + return ide_host_add(&ide_4drives_port_info, hws, 2, NULL); } module_init(ide_4drives_init); diff --git a/drivers/ide/ide-cs.c b/drivers/ide/ide-cs.c index 43d09dcae28..63309ad04cb 100644 --- a/drivers/ide/ide-cs.c +++ b/drivers/ide/ide-cs.c @@ -164,7 +164,7 @@ static struct ide_host *idecs_register(unsigned long io, unsigned long ctl, struct ide_host *host; ide_hwif_t *hwif; int i, rc; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; if (!request_region(io, 8, DRV_NAME)) { printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n", @@ -184,7 +184,7 @@ static struct ide_host *idecs_register(unsigned long io, unsigned long ctl, hw.irq = irq; hw.dev = &handle->dev; - rc = ide_host_add(&idecs_port_info, hws, &host); + rc = ide_host_add(&idecs_port_info, hws, 1, &host); if (rc) goto out_release; diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c index 0427759d018..0d40848540d 100644 --- a/drivers/ide/ide-generic.c +++ b/drivers/ide/ide-generic.c @@ -86,7 +86,7 @@ static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary) static int __init ide_generic_init(void) { - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; unsigned long io_addr; int i, rc = 0, primary = 0, secondary = 0; @@ -133,7 +133,7 @@ static int __init ide_generic_init(void) #else hw.irq = legacy_irqs[i]; #endif - rc = ide_host_add(&ide_generic_port_info, hws, NULL); + rc = ide_host_add(&ide_generic_port_info, hws, 1, NULL); if (rc) { release_region(io_addr + 0x206, 1); release_region(io_addr, 8); diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 40eff6c9759..0b5fabe2806 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -83,7 +83,7 @@ static const struct ide_port_info h8300_port_info = { static int __init h8300_ide_init(void) { - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; printk(KERN_INFO DRV_NAME ": H8/300 generic IDE interface\n"); @@ -96,7 +96,7 @@ static int __init h8300_ide_init(void) hw_setup(&hw); - return ide_host_add(&h8300_port_info, hws, NULL); + return 
ide_host_add(&h8300_port_info, hws, 1, NULL); out_busy: printk(KERN_ERR "ide-h8300: IDE I/F resource already used.\n"); diff --git a/drivers/ide/ide-legacy.c b/drivers/ide/ide-legacy.c index 0c5b29c56cb..98389e53990 100644 --- a/drivers/ide/ide-legacy.c +++ b/drivers/ide/ide-legacy.c @@ -40,7 +40,7 @@ static void ide_legacy_init_one(hw_regs_t **hws, hw_regs_t *hw, int ide_legacy_device_add(const struct ide_port_info *d, unsigned long config) { - hw_regs_t hw[2], *hws[] = { NULL, NULL, NULL, NULL }; + hw_regs_t hw[2], *hws[] = { NULL, NULL }; memset(&hw, 0, sizeof(hw)); @@ -52,6 +52,6 @@ int ide_legacy_device_add(const struct ide_port_info *d, unsigned long config) (d->host_flags & IDE_HFLAG_SINGLE)) return -ENOENT; - return ide_host_add(d, hws, NULL); + return ide_host_add(d, hws, 2, NULL); } EXPORT_SYMBOL_GPL(ide_legacy_device_add); diff --git a/drivers/ide/ide-pnp.c b/drivers/ide/ide-pnp.c index 47043fda239..6bca0f05ee9 100644 --- a/drivers/ide/ide-pnp.c +++ b/drivers/ide/ide-pnp.c @@ -37,7 +37,7 @@ static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) struct ide_host *host; unsigned long base, ctl; int rc; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; printk(KERN_INFO DRV_NAME ": generic PnP IDE interface\n"); @@ -64,7 +64,7 @@ static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) ide_std_init_ports(&hw, base, ctl); hw.irq = pnp_irq(dev, 0); - rc = ide_host_add(&ide_pnp_port_info, hws, &host); + rc = ide_host_add(&ide_pnp_port_info, hws, 1, &host); if (rc) goto out; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index f17ba1932ad..6c7451a6e60 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1261,7 +1261,8 @@ out_nomem: return -ENOMEM; } -struct ide_host *ide_host_alloc(const struct ide_port_info *d, hw_regs_t **hws) +struct ide_host *ide_host_alloc(const struct ide_port_info *d, hw_regs_t **hws, + unsigned int n_ports) { struct ide_host *host; struct device *dev = hws[0] ? 
hws[0]->dev : NULL; @@ -1272,7 +1273,7 @@ struct ide_host *ide_host_alloc(const struct ide_port_info *d, hw_regs_t **hws) if (host == NULL) return NULL; - for (i = 0; i < MAX_HOST_PORTS; i++) { + for (i = 0; i < n_ports; i++) { ide_hwif_t *hwif; int idx; @@ -1443,12 +1444,12 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d, EXPORT_SYMBOL_GPL(ide_host_register); int ide_host_add(const struct ide_port_info *d, hw_regs_t **hws, - struct ide_host **hostp) + unsigned int n_ports, struct ide_host **hostp) { struct ide_host *host; int rc; - host = ide_host_alloc(d, hws); + host = ide_host_alloc(d, hws, n_ports); if (host == NULL) return -ENOMEM; diff --git a/drivers/ide/ide_platform.c b/drivers/ide/ide_platform.c index 813653362a2..47413c2b5f8 100644 --- a/drivers/ide/ide_platform.c +++ b/drivers/ide/ide_platform.c @@ -54,7 +54,7 @@ static int __devinit plat_ide_probe(struct platform_device *pdev) struct pata_platform_info *pdata; struct ide_host *host; int ret = 0, mmio = 0; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; struct ide_port_info d = platform_ide_port_info; pdata = pdev->dev.platform_data; @@ -98,7 +98,7 @@ static int __devinit plat_ide_probe(struct platform_device *pdev) if (mmio) d.host_flags |= IDE_HFLAG_MMIO; - ret = ide_host_add(&d, hws, &host); + ret = ide_host_add(&d, hws, 1, &host); if (ret) goto out; diff --git a/drivers/ide/macide.c b/drivers/ide/macide.c index 3af9e96da61..31aa2781860 100644 --- a/drivers/ide/macide.c +++ b/drivers/ide/macide.c @@ -96,7 +96,7 @@ static int __init macide_init(void) ide_ack_intr_t *ack_intr; unsigned long base; int irq; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; if (!MACH_IS_MAC) return -ENODEV; @@ -126,7 +126,7 @@ static int __init macide_init(void) macide_setup_ports(&hw, base, irq, ack_intr); - return ide_host_add(&macide_port_info, hws, NULL); + return ide_host_add(&macide_port_info, hws, 1, NULL); } module_init(macide_init); diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index a455c25f43c..4507a6d801b 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -316,7 +316,7 @@ static int __init palm_bk3710_probe(struct platform_device *pdev) void __iomem *base; unsigned long rate, mem_size; int i, rc; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; clk = clk_get(&pdev->dev, "IDECLK"); if (IS_ERR(clk)) @@ -369,7 +369,7 @@ static int __init palm_bk3710_probe(struct platform_device *pdev) ATA_UDMA5; /* Register the IDE interface with Linux */ - rc = ide_host_add(&palm_bk3710_port_info, hws, NULL); + rc = ide_host_add(&palm_bk3710_port_info, hws, 1, NULL); if (rc) goto out; diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index f76e4e6b408..f4f806476e0 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1029,7 +1029,7 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw) const int *bidp; struct ide_host *host; ide_hwif_t *hwif; - hw_regs_t *hws[] = { hw, NULL, NULL, NULL }; + hw_regs_t *hws[] = { hw }; struct ide_port_info d = pmac_port_info; int rc; @@ -1077,7 +1077,7 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw) /* Make sure we have sane timings */ sanitize_timings(pmif); - host = ide_host_alloc(&d, hws); + host = ide_host_alloc(&d, hws, 1); if (host == NULL) return -ENOMEM; hwif = host->ports[0]; diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index 
7488d4ff3d7..e46229fe5ea 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -135,7 +135,7 @@ static const char *q40_ide_names[Q40IDE_NUM_HWIFS]={ static int __init q40ide_init(void) { int i; - hw_regs_t hw[Q40IDE_NUM_HWIFS], *hws[] = { NULL, NULL, NULL, NULL }; + hw_regs_t hw[Q40IDE_NUM_HWIFS], *hws[] = { NULL, NULL }; if (!MACH_IS_Q40) return -ENODEV; @@ -162,7 +162,7 @@ static int __init q40ide_init(void) hws[i] = &hw[i]; } - return ide_host_add(&q40ide_port_info, hws, NULL); + return ide_host_add(&q40ide_port_info, hws, Q40IDE_NUM_HWIFS, NULL); } module_init(q40ide_init); diff --git a/drivers/ide/rapide.c b/drivers/ide/rapide.c index bd4d7a8a666..c4da3dd39f5 100644 --- a/drivers/ide/rapide.c +++ b/drivers/ide/rapide.c @@ -36,7 +36,7 @@ rapide_probe(struct expansion_card *ec, const struct ecard_id *id) void __iomem *base; struct ide_host *host; int ret; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; ret = ecard_request_resources(ec); if (ret) @@ -52,7 +52,7 @@ rapide_probe(struct expansion_card *ec, const struct ecard_id *id) rapide_setup_ports(&hw, base, base + 0x818, 1 << 6, ec->irq); hw.dev = &ec->dev; - ret = ide_host_add(&rapide_port_info, hws, &host); + ret = ide_host_add(&rapide_port_info, hws, 1, &host); if (ret) goto release; diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 9e3aef31733..9415f8c8a41 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -559,7 +559,7 @@ static int scc_ide_setup_pci_device(struct pci_dev *dev, { struct scc_ports *ports = pci_get_drvdata(dev); struct ide_host *host; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; int i, rc; memset(&hw, 0, sizeof(hw)); @@ -568,7 +568,7 @@ static int scc_ide_setup_pci_device(struct pci_dev *dev, hw.irq = dev->irq; hw.dev = &dev->dev; - rc = ide_host_add(d, hws, &host); + rc = ide_host_add(d, hws, 1, &host); if (rc) return rc; diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 82519ddc910..d78f4c99451 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -538,7 +538,7 @@ int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d, void *priv) { struct ide_host *host; - hw_regs_t hw[4], *hws[] = { NULL, NULL, NULL, NULL }; + hw_regs_t hw[2], *hws[] = { NULL, NULL }; int ret; ret = ide_setup_pci_controller(dev, d, 1); @@ -547,7 +547,7 @@ int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d, ide_pci_setup_ports(dev, d, &hw[0], &hws[0]); - host = ide_host_alloc(d, hws); + host = ide_host_alloc(d, hws, 2); if (host == NULL) { ret = -ENOMEM; goto out; @@ -596,7 +596,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, ide_pci_setup_ports(pdev[i], d, &hw[i*2], &hws[i*2]); } - host = ide_host_alloc(d, hws); + host = ide_host_alloc(d, hws, 4); if (host == NULL) { ret = -ENOMEM; goto out; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 676d41c7add..3f8ee357ffb 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -546,7 +546,7 @@ sgiioc4_ide_setup_pci_device(struct pci_dev *dev) unsigned long cmd_base, irqport; unsigned long bar0, cmd_phys_base, ctl; void __iomem *virt_base; - hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; int rc; /* Get the CmdBlk and CtrlBlk Base Registers */ @@ -580,7 +580,7 @@ sgiioc4_ide_setup_pci_device(struct pci_dev *dev) /* Initializing chipset IRQ Registers */ writel(0x03, (void __iomem *)(irqport + IOC4_INTR_SET * 4)); - rc = 
ide_host_add(&sgiioc4_port_info, hws, NULL); + rc = ide_host_add(&sgiioc4_port_info, hws, 1, NULL); if (!rc) return 0; diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index e33d764e294..16adc18499f 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -130,8 +130,7 @@ static const struct ide_port_info tx4938ide_port_info __initdata = { static int __init tx4938ide_probe(struct platform_device *pdev) { - hw_regs_t hw; - hw_regs_t *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; struct ide_host *host; struct resource *res; struct tx4938ide_platform_info *pdata = pdev->dev.platform_data; @@ -183,7 +182,7 @@ static int __init tx4938ide_probe(struct platform_device *pdev) tx4938ide_tune_ebusc(pdata->ebus_ch, pdata->gbus_clock, 0); else d.port_ops = NULL; - ret = ide_host_add(&d, hws, &host); + ret = ide_host_add(&d, hws, 1, &host); if (!ret) platform_set_drvdata(pdev, host); return ret; diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 564422d2397..fa57920d003 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -537,8 +537,7 @@ static const struct ide_port_info tx4939ide_port_info __initdata = { static int __init tx4939ide_probe(struct platform_device *pdev) { - hw_regs_t hw; - hw_regs_t *hws[] = { &hw, NULL, NULL, NULL }; + hw_regs_t hw, *hws[] = { &hw }; struct ide_host *host; struct resource *res; int irq, ret; @@ -581,7 +580,7 @@ static int __init tx4939ide_probe(struct platform_device *pdev) hw.dev = &pdev->dev; pr_info("TX4939 IDE interface (base %#lx, irq %d)\n", mapbase, irq); - host = ide_host_alloc(&tx4939ide_port_info, hws); + host = ide_host_alloc(&tx4939ide_port_info, hws, 1); if (!host) return -ENOMEM; /* use extra_base for base address of the all registers */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 9652edbd26a..a3cd568553d 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1456,11 +1456,12 @@ void ide_undecoded_slave(ide_drive_t *); void ide_port_apply_params(ide_hwif_t *); int ide_sysfs_register_port(ide_hwif_t *); -struct ide_host *ide_host_alloc(const struct ide_port_info *, hw_regs_t **); +struct ide_host *ide_host_alloc(const struct ide_port_info *, hw_regs_t **, + unsigned int); void ide_host_free(struct ide_host *); int ide_host_register(struct ide_host *, const struct ide_port_info *, hw_regs_t **); -int ide_host_add(const struct ide_port_info *, hw_regs_t **, +int ide_host_add(const struct ide_port_info *, hw_regs_t **, unsigned int, struct ide_host **); void ide_host_remove(struct ide_host *); int ide_legacy_device_add(const struct ide_port_info *, unsigned long); -- cgit v1.2.3-70-g09d2 From 9f36d31437922354d104a2db407f397e79e4027e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 17 May 2009 19:12:25 +0200 Subject: ide: remove hw_regs_t typedef Remove hw_regs_t typedef and rename struct hw_regs_s to struct ide_hw. There should be no functional changes caused by this patch. 
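The change is purely mechanical: the struct tag is renamed and the typedef dropped, so every 'hw_regs_t' becomes 'struct ide_hw' while the layout stays identical. A trimmed-down sketch of the rename (only a few representative fields are shown; they mirror the hw_regs_s definition visible in the previous patch's include/linux/ide.h hunk, which also carries the io_ports union and the ack_intr hook):

	struct device;	/* opaque here; a real struct elsewhere in the kernel */

	/* Before this patch: */
	typedef struct hw_regs_s {
		int irq;			/* our irq number */
		struct device *dev, *parent;
		unsigned long config;
	} hw_regs_t;

	static hw_regs_t old_hw, *old_hws[] = { &old_hw };

	/* After this patch: same layout, plain struct tag, no typedef: */
	struct ide_hw {
		int irq;			/* our irq number */
		struct device *dev, *parent;
		unsigned long config;
	};

	static struct ide_hw hw, *hws[] = { &hw };

Dropping the typedef follows the usual kernel style preference for bare struct tags over typedef'ed aggregates.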
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 2 +- drivers/ide/au1xxx-ide.c | 4 ++-- drivers/ide/buddha.c | 4 ++-- drivers/ide/cmd640.c | 2 +- drivers/ide/cs5520.c | 2 +- drivers/ide/delkin_cb.c | 2 +- drivers/ide/falconide.c | 4 ++-- drivers/ide/gayle.c | 4 ++-- drivers/ide/icside.c | 6 +++--- drivers/ide/ide-4drives.c | 2 +- drivers/ide/ide-cs.c | 2 +- drivers/ide/ide-generic.c | 2 +- drivers/ide/ide-h8300.c | 6 +++--- drivers/ide/ide-legacy.c | 4 ++-- drivers/ide/ide-pnp.c | 2 +- drivers/ide/ide-probe.c | 10 +++++----- drivers/ide/ide_platform.c | 4 ++-- drivers/ide/macide.c | 4 ++-- drivers/ide/palm_bk3710.c | 2 +- drivers/ide/pmac.c | 11 ++++++----- drivers/ide/q40ide.c | 6 +++--- drivers/ide/rapide.c | 4 ++-- drivers/ide/scc_pata.c | 2 +- drivers/ide/setup-pci.c | 16 ++++++++-------- drivers/ide/sgiioc4.c | 4 ++-- drivers/ide/tx4938ide.c | 2 +- drivers/ide/tx4939ide.c | 2 +- include/linux/ide.h | 14 +++++++------- 28 files changed, 65 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 11fe1ffdff7..fc0949a8cfd 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -247,7 +247,7 @@ irqreturn_t at91_irq_handler(int irq, void *dev_id) static int __init at91_ide_probe(struct platform_device *pdev) { int ret; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; struct ide_host *host; struct resource *res; unsigned long tf_base = 0, ctl_base = 0; diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 32f5be68601..58121bd6c11 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -449,7 +449,7 @@ static int auide_ddma_init(ide_hwif_t *hwif, const struct ide_port_info *d) } #endif -static void auide_setup_ports(hw_regs_t *hw, _auide_hwif *ahwif) +static void auide_setup_ports(struct ide_hw *hw, _auide_hwif *ahwif) { int i; unsigned long *ata_regs = hw->io_ports_array; @@ -508,7 +508,7 @@ static int au_ide_probe(struct platform_device *dev) struct resource *res; struct ide_host *host; int ret = 0; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; #if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA) char *mode = "MWDMA2"; diff --git a/drivers/ide/buddha.c b/drivers/ide/buddha.c index 0450652cdab..e3c6a591330 100644 --- a/drivers/ide/buddha.c +++ b/drivers/ide/buddha.c @@ -121,7 +121,7 @@ static int xsurf_ack_intr(ide_hwif_t *hwif) return 1; } -static void __init buddha_setup_ports(hw_regs_t *hw, unsigned long base, +static void __init buddha_setup_ports(struct ide_hw *hw, unsigned long base, unsigned long ctl, unsigned long irq_port, ide_ack_intr_t *ack_intr) { @@ -160,7 +160,7 @@ static int __init buddha_init(void) while ((z = zorro_find_device(ZORRO_WILDCARD, z))) { unsigned long board; - hw_regs_t hw[MAX_NUM_HWIFS], *hws[MAX_NUM_HWIFS]; + struct ide_hw hw[MAX_NUM_HWIFS], *hws[MAX_NUM_HWIFS]; if (z->id == ZORRO_PROD_INDIVIDUAL_COMPUTERS_BUDDHA) { buddha_num_hwifs = BUDDHA_NUM_HWIFS; diff --git a/drivers/ide/cmd640.c b/drivers/ide/cmd640.c index edb3a7a35c8..1683ed5c732 100644 --- a/drivers/ide/cmd640.c +++ b/drivers/ide/cmd640.c @@ -708,7 +708,7 @@ static int __init cmd640x_init(void) int second_port_cmd640 = 0, rc; const char *bus_type, *port2; u8 b, cfr; - hw_regs_t hw[2], *hws[2]; + struct ide_hw hw[2], *hws[2]; if (cmd640_vlb && probe_for_cmd640_vlb()) { bus_type = "VLB"; diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c index a9023d7843f..bd066bb9d61 100644 --- 
a/drivers/ide/cs5520.c +++ b/drivers/ide/cs5520.c @@ -110,7 +110,7 @@ static const struct ide_port_info cyrix_chipset __devinitdata = { static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_device_id *id) { const struct ide_port_info *d = &cyrix_chipset; - hw_regs_t hw[2], *hws[] = { NULL, NULL }; + struct ide_hw hw[2], *hws[] = { NULL, NULL }; ide_setup_pci_noise(dev, d); diff --git a/drivers/ide/delkin_cb.c b/drivers/ide/delkin_cb.c index d4a76f22ed1..1e10eba62ce 100644 --- a/drivers/ide/delkin_cb.c +++ b/drivers/ide/delkin_cb.c @@ -77,7 +77,7 @@ delkin_cb_probe (struct pci_dev *dev, const struct pci_device_id *id) struct ide_host *host; unsigned long base; int rc; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; rc = pci_enable_device(dev); if (rc) { diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index adb5b0cf762..22fa27389c3 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -114,7 +114,7 @@ static const struct ide_port_info falconide_port_info = { .chipset = ide_generic, }; -static void __init falconide_setup_ports(hw_regs_t *hw) +static void __init falconide_setup_ports(struct ide_hw *hw) { int i; @@ -138,7 +138,7 @@ static void __init falconide_setup_ports(hw_regs_t *hw) static int __init falconide_init(void) { struct ide_host *host; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; int rc; if (!MACH_IS_ATARI || !ATARIHW_PRESENT(IDE)) diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c index 253ff34afd8..4451a6a5dfe 100644 --- a/drivers/ide/gayle.c +++ b/drivers/ide/gayle.c @@ -88,7 +88,7 @@ static int gayle_ack_intr_a1200(ide_hwif_t *hwif) return 1; } -static void __init gayle_setup_ports(hw_regs_t *hw, unsigned long base, +static void __init gayle_setup_ports(struct ide_hw *hw, unsigned long base, unsigned long ctl, unsigned long irq_port, ide_ack_intr_t *ack_intr) { @@ -125,7 +125,7 @@ static int __init gayle_init(void) unsigned long base, ctrlport, irqport; ide_ack_intr_t *ack_intr; int a4000, i, rc; - hw_regs_t hw[GAYLE_NUM_HWIFS], *hws[GAYLE_NUM_HWIFS]; + struct ide_hw hw[GAYLE_NUM_HWIFS], *hws[GAYLE_NUM_HWIFS]; if (!MACH_IS_AMIGA) return -ENODEV; diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 6223b80beb3..c5269fa1f73 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -381,7 +381,7 @@ static int icside_dma_off_init(ide_hwif_t *hwif, const struct ide_port_info *d) return -EOPNOTSUPP; } -static void icside_setup_ports(hw_regs_t *hw, void __iomem *base, +static void icside_setup_ports(struct ide_hw *hw, void __iomem *base, struct cardinfo *info, struct expansion_card *ec) { unsigned long port = (unsigned long)base + info->dataoffset; @@ -410,7 +410,7 @@ icside_register_v5(struct icside_state *state, struct expansion_card *ec) { void __iomem *base; struct ide_host *host; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; int ret; base = ecardm_iomap(ec, ECARD_RES_MEMC, 0, 0); @@ -467,7 +467,7 @@ icside_register_v6(struct icside_state *state, struct expansion_card *ec) struct ide_host *host; unsigned int sel = 0; int ret; - hw_regs_t hw[2], *hws[] = { &hw[0], &hw[1] }; + struct ide_hw hw[2], *hws[] = { &hw[0], &hw[1] }; struct ide_port_info d = icside_v6_port_info; ioc_base = ecardm_iomap(ec, ECARD_RES_IOCFAST, 0, 0); diff --git a/drivers/ide/ide-4drives.c b/drivers/ide/ide-4drives.c index 189b8bd9957..979d342c338 100644 --- a/drivers/ide/ide-4drives.c +++ b/drivers/ide/ide-4drives.c @@ -31,7 +31,7 @@ static const struct 
ide_port_info ide_4drives_port_info = { static int __init ide_4drives_init(void) { unsigned long base = 0x1f0, ctl = 0x3f6; - hw_regs_t hw, *hws[] = { &hw, &hw }; + struct ide_hw hw, *hws[] = { &hw, &hw }; if (probe_4drives == 0) return -ENODEV; diff --git a/drivers/ide/ide-cs.c b/drivers/ide/ide-cs.c index 63309ad04cb..527908ff298 100644 --- a/drivers/ide/ide-cs.c +++ b/drivers/ide/ide-cs.c @@ -164,7 +164,7 @@ static struct ide_host *idecs_register(unsigned long io, unsigned long ctl, struct ide_host *host; ide_hwif_t *hwif; int i, rc; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; if (!request_region(io, 8, DRV_NAME)) { printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n", diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c index 0d40848540d..54d7c4685d2 100644 --- a/drivers/ide/ide-generic.c +++ b/drivers/ide/ide-generic.c @@ -86,7 +86,7 @@ static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary) static int __init ide_generic_init(void) { - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; unsigned long io_addr; int i, rc = 0, primary = 0, secondary = 0; diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 0b5fabe2806..520f42c5445 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -64,11 +64,11 @@ static const struct ide_tp_ops h8300_tp_ops = { #define H8300_IDE_GAP (2) -static inline void hw_setup(hw_regs_t *hw) +static inline void hw_setup(struct ide_hw *hw) { int i; - memset(hw, 0, sizeof(hw_regs_t)); + memset(hw, 0, sizeof(*hw)); for (i = 0; i <= 7; i++) hw->io_ports_array[i] = CONFIG_H8300_IDE_BASE + H8300_IDE_GAP*i; hw->io_ports.ctl_addr = CONFIG_H8300_IDE_ALT; @@ -83,7 +83,7 @@ static const struct ide_port_info h8300_port_info = { static int __init h8300_ide_init(void) { - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; printk(KERN_INFO DRV_NAME ": H8/300 generic IDE interface\n"); diff --git a/drivers/ide/ide-legacy.c b/drivers/ide/ide-legacy.c index 98389e53990..b9654a7bb7b 100644 --- a/drivers/ide/ide-legacy.c +++ b/drivers/ide/ide-legacy.c @@ -1,7 +1,7 @@ #include #include -static void ide_legacy_init_one(hw_regs_t **hws, hw_regs_t *hw, +static void ide_legacy_init_one(struct ide_hw **hws, struct ide_hw *hw, u8 port_no, const struct ide_port_info *d, unsigned long config) { @@ -40,7 +40,7 @@ static void ide_legacy_init_one(hw_regs_t **hws, hw_regs_t *hw, int ide_legacy_device_add(const struct ide_port_info *d, unsigned long config) { - hw_regs_t hw[2], *hws[] = { NULL, NULL }; + struct ide_hw hw[2], *hws[] = { NULL, NULL }; memset(&hw, 0, sizeof(hw)); diff --git a/drivers/ide/ide-pnp.c b/drivers/ide/ide-pnp.c index 6bca0f05ee9..017b1df3b80 100644 --- a/drivers/ide/ide-pnp.c +++ b/drivers/ide/ide-pnp.c @@ -37,7 +37,7 @@ static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) struct ide_host *host; unsigned long base, ctl; int rc; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; printk(KERN_INFO DRV_NAME ": generic PnP IDE interface\n"); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 6c7451a6e60..29363829a3f 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1173,7 +1173,7 @@ static void ide_init_port_data(ide_hwif_t *hwif, unsigned int index) ide_port_init_devices_data(hwif); } -static void ide_init_port_hw(ide_hwif_t *hwif, hw_regs_t *hw) +static void ide_init_port_hw(ide_hwif_t *hwif, struct ide_hw *hw) { memcpy(&hwif->io_ports, &hw->io_ports, 
sizeof(hwif->io_ports)); hwif->irq = hw->irq; @@ -1261,8 +1261,8 @@ out_nomem: return -ENOMEM; } -struct ide_host *ide_host_alloc(const struct ide_port_info *d, hw_regs_t **hws, - unsigned int n_ports) +struct ide_host *ide_host_alloc(const struct ide_port_info *d, + struct ide_hw **hws, unsigned int n_ports) { struct ide_host *host; struct device *dev = hws[0] ? hws[0]->dev : NULL; @@ -1349,7 +1349,7 @@ static void ide_disable_port(ide_hwif_t *hwif) } int ide_host_register(struct ide_host *host, const struct ide_port_info *d, - hw_regs_t **hws) + struct ide_hw **hws) { ide_hwif_t *hwif, *mate = NULL; int i, j = 0; @@ -1443,7 +1443,7 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d, } EXPORT_SYMBOL_GPL(ide_host_register); -int ide_host_add(const struct ide_port_info *d, hw_regs_t **hws, +int ide_host_add(const struct ide_port_info *d, struct ide_hw **hws, unsigned int n_ports, struct ide_host **hostp) { struct ide_host *host; diff --git a/drivers/ide/ide_platform.c b/drivers/ide/ide_platform.c index 47413c2b5f8..ee9b55ecc62 100644 --- a/drivers/ide/ide_platform.c +++ b/drivers/ide/ide_platform.c @@ -21,7 +21,7 @@ #include #include -static void __devinit plat_ide_setup_ports(hw_regs_t *hw, +static void __devinit plat_ide_setup_ports(struct ide_hw *hw, void __iomem *base, void __iomem *ctrl, struct pata_platform_info *pdata, @@ -54,7 +54,7 @@ static int __devinit plat_ide_probe(struct platform_device *pdev) struct pata_platform_info *pdata; struct ide_host *host; int ret = 0, mmio = 0; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; struct ide_port_info d = platform_ide_port_info; pdata = pdev->dev.platform_data; diff --git a/drivers/ide/macide.c b/drivers/ide/macide.c index 31aa2781860..1447c8c9056 100644 --- a/drivers/ide/macide.c +++ b/drivers/ide/macide.c @@ -62,7 +62,7 @@ int macide_ack_intr(ide_hwif_t* hwif) return 0; } -static void __init macide_setup_ports(hw_regs_t *hw, unsigned long base, +static void __init macide_setup_ports(struct ide_hw *hw, unsigned long base, int irq, ide_ack_intr_t *ack_intr) { int i; @@ -96,7 +96,7 @@ static int __init macide_init(void) ide_ack_intr_t *ack_intr; unsigned long base; int irq; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; if (!MACH_IS_MAC) return -ENODEV; diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index 4507a6d801b..3c1dc015215 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -316,7 +316,7 @@ static int __init palm_bk3710_probe(struct platform_device *pdev) void __iomem *base; unsigned long rate, mem_size; int i, rc; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; clk = clk_get(&pdev->dev, "IDECLK"); if (IS_ERR(clk)) diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index f4f806476e0..97642a7a79c 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1023,13 +1023,14 @@ static const struct ide_port_info pmac_port_info = { * Setup, register & probe an IDE channel driven by this driver, this is * called by one of the 2 probe functions (macio or PCI). 
*/ -static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw) +static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, + struct ide_hw *hw) { struct device_node *np = pmif->node; const int *bidp; struct ide_host *host; ide_hwif_t *hwif; - hw_regs_t *hws[] = { hw }; + struct ide_hw *hws[] = { hw }; struct ide_port_info d = pmac_port_info; int rc; @@ -1124,7 +1125,7 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw) return 0; } -static void __devinit pmac_ide_init_ports(hw_regs_t *hw, unsigned long base) +static void __devinit pmac_ide_init_ports(struct ide_hw *hw, unsigned long base) { int i; @@ -1144,7 +1145,7 @@ pmac_ide_macio_attach(struct macio_dev *mdev, const struct of_device_id *match) unsigned long regbase; pmac_ide_hwif_t *pmif; int irq, rc; - hw_regs_t hw; + struct ide_hw hw; pmif = kzalloc(sizeof(*pmif), GFP_KERNEL); if (pmif == NULL) @@ -1268,7 +1269,7 @@ pmac_ide_pci_attach(struct pci_dev *pdev, const struct pci_device_id *id) void __iomem *base; unsigned long rbase, rlen; int rc; - hw_regs_t hw; + struct ide_hw hw; np = pci_device_to_OF_node(pdev); if (np == NULL) { diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index e46229fe5ea..ab49a97023d 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -51,11 +51,11 @@ static int q40ide_default_irq(unsigned long base) /* * Addresses are pretranslated for Q40 ISA access. */ -static void q40_ide_setup_ports(hw_regs_t *hw, unsigned long base, +static void q40_ide_setup_ports(struct ide_hw *hw, unsigned long base, ide_ack_intr_t *ack_intr, int irq) { - memset(hw, 0, sizeof(hw_regs_t)); + memset(hw, 0, sizeof(*hw)); /* BIG FAT WARNING: assumption: only DATA port is ever used in 16 bit mode */ hw->io_ports.data_addr = Q40_ISA_IO_W(base); @@ -135,7 +135,7 @@ static const char *q40_ide_names[Q40IDE_NUM_HWIFS]={ static int __init q40ide_init(void) { int i; - hw_regs_t hw[Q40IDE_NUM_HWIFS], *hws[] = { NULL, NULL }; + struct ide_hw hw[Q40IDE_NUM_HWIFS], *hws[] = { NULL, NULL }; if (!MACH_IS_Q40) return -ENODEV; diff --git a/drivers/ide/rapide.c b/drivers/ide/rapide.c index c4da3dd39f5..00f54248f41 100644 --- a/drivers/ide/rapide.c +++ b/drivers/ide/rapide.c @@ -16,7 +16,7 @@ static const struct ide_port_info rapide_port_info = { .chipset = ide_generic, }; -static void rapide_setup_ports(hw_regs_t *hw, void __iomem *base, +static void rapide_setup_ports(struct ide_hw *hw, void __iomem *base, void __iomem *ctrl, unsigned int sz, int irq) { unsigned long port = (unsigned long)base; @@ -36,7 +36,7 @@ rapide_probe(struct expansion_card *ec, const struct ecard_id *id) void __iomem *base; struct ide_host *host; int ret; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; ret = ecard_request_resources(ec); if (ret) diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 9415f8c8a41..1104bb301eb 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -559,7 +559,7 @@ static int scc_ide_setup_pci_device(struct pci_dev *dev, { struct scc_ports *ports = pci_get_drvdata(dev); struct ide_host *host; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; int i, rc; memset(&hw, 0, sizeof(hw)); diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index d78f4c99451..5314edffc30 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -301,11 +301,11 @@ static int ide_pci_check_iomem(struct pci_dev *dev, const struct ide_port_info * } /** - * ide_hw_configure - configure a hw_regs_t instance + * 
ide_hw_configure - configure a struct ide_hw instance * @dev: PCI device holding interface * @d: IDE port info * @port: port number - * @hw: hw_regs_t instance corresponding to this port + * @hw: struct ide_hw instance corresponding to this port * * Perform the initial set up for the hardware interface structure. This * is done per interface port rather than per PCI device. There may be @@ -315,7 +315,7 @@ static int ide_pci_check_iomem(struct pci_dev *dev, const struct ide_port_info * */ static int ide_hw_configure(struct pci_dev *dev, const struct ide_port_info *d, - unsigned int port, hw_regs_t *hw) + unsigned int port, struct ide_hw *hw) { unsigned long ctl = 0, base = 0; @@ -445,8 +445,8 @@ out: * ide_pci_setup_ports - configure ports/devices on PCI IDE * @dev: PCI device * @d: IDE port info - * @hw: hw_regs_t instances corresponding to this PCI IDE device - * @hws: hw_regs_t pointers table to update + * @hw: struct ide_hw instances corresponding to this PCI IDE device + * @hws: struct ide_hw pointers table to update * * Scan the interfaces attached to this device and do any * necessary per port setup. Attach the devices and ask the @@ -458,7 +458,7 @@ out: */ void ide_pci_setup_ports(struct pci_dev *dev, const struct ide_port_info *d, - hw_regs_t *hw, hw_regs_t **hws) + struct ide_hw *hw, struct ide_hw **hws) { int channels = (d->host_flags & IDE_HFLAG_SINGLE) ? 1 : 2, port; u8 tmp; @@ -538,7 +538,7 @@ int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d, void *priv) { struct ide_host *host; - hw_regs_t hw[2], *hws[] = { NULL, NULL }; + struct ide_hw hw[2], *hws[] = { NULL, NULL }; int ret; ret = ide_setup_pci_controller(dev, d, 1); @@ -586,7 +586,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, struct pci_dev *pdev[] = { dev1, dev2 }; struct ide_host *host; int ret, i; - hw_regs_t hw[4], *hws[] = { NULL, NULL, NULL, NULL }; + struct ide_hw hw[4], *hws[] = { NULL, NULL, NULL, NULL }; for (i = 0; i < 2; i++) { ret = ide_setup_pci_controller(pdev[i], d, !i); diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 3f8ee357ffb..5f37f168f94 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -91,7 +91,7 @@ typedef struct { static void -sgiioc4_init_hwif_ports(hw_regs_t * hw, unsigned long data_port, +sgiioc4_init_hwif_ports(struct ide_hw *hw, unsigned long data_port, unsigned long ctrl_port, unsigned long irq_port) { unsigned long reg = data_port; @@ -546,7 +546,7 @@ sgiioc4_ide_setup_pci_device(struct pci_dev *dev) unsigned long cmd_base, irqport; unsigned long bar0, cmd_phys_base, ctl; void __iomem *virt_base; - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; int rc; /* Get the CmdBlk and CtrlBlk Base Registers */ diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index 16adc18499f..ea89fddeed9 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -130,7 +130,7 @@ static const struct ide_port_info tx4938ide_port_info __initdata = { static int __init tx4938ide_probe(struct platform_device *pdev) { - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; struct ide_host *host; struct resource *res; struct tx4938ide_platform_info *pdata = pdev->dev.platform_data; diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index fa57920d003..9f73fd43d1f 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -537,7 +537,7 @@ static const struct ide_port_info tx4939ide_port_info __initdata = { static int __init tx4939ide_probe(struct platform_device 
*pdev) { - hw_regs_t hw, *hws[] = { &hw }; + struct ide_hw hw, *hws[] = { &hw }; struct ide_host *host; struct resource *res; int irq, ret; diff --git a/include/linux/ide.h b/include/linux/ide.h index a3cd568553d..b1b903a0dac 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -178,7 +178,7 @@ typedef u8 hwif_chipset_t; /* * Structure to hold all information about the location of this port */ -typedef struct hw_regs_s { +struct ide_hw { union { struct ide_io_ports io_ports; unsigned long io_ports_array[IDE_NR_PORTS]; @@ -188,9 +188,9 @@ typedef struct hw_regs_s { ide_ack_intr_t *ack_intr; /* acknowledge interrupt */ struct device *dev, *parent; unsigned long config; -} hw_regs_t; +}; -static inline void ide_std_init_ports(hw_regs_t *hw, +static inline void ide_std_init_ports(struct ide_hw *hw, unsigned long io_addr, unsigned long ctl_addr) { @@ -1212,7 +1212,7 @@ static inline int ide_pci_is_in_compatibility_mode(struct pci_dev *dev) } void ide_pci_setup_ports(struct pci_dev *, const struct ide_port_info *, - hw_regs_t *, hw_regs_t **); + struct ide_hw *, struct ide_hw **); void ide_setup_pci_noise(struct pci_dev *, const struct ide_port_info *); #ifdef CONFIG_BLK_DEV_IDEDMA_PCI @@ -1456,12 +1456,12 @@ void ide_undecoded_slave(ide_drive_t *); void ide_port_apply_params(ide_hwif_t *); int ide_sysfs_register_port(ide_hwif_t *); -struct ide_host *ide_host_alloc(const struct ide_port_info *, hw_regs_t **, +struct ide_host *ide_host_alloc(const struct ide_port_info *, struct ide_hw **, unsigned int); void ide_host_free(struct ide_host *); int ide_host_register(struct ide_host *, const struct ide_port_info *, - hw_regs_t **); -int ide_host_add(const struct ide_port_info *, hw_regs_t **, unsigned int, + struct ide_hw **); +int ide_host_add(const struct ide_port_info *, struct ide_hw **, unsigned int, struct ide_host **); void ide_host_remove(struct ide_host *); int ide_legacy_device_add(const struct ide_port_info *, unsigned long); -- cgit v1.2.3-70-g09d2 From 9d21493b4beb8f918ba248032fefa393074a5e2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 17 May 2009 20:55:16 -0700 Subject: net: tx scalability works : trans_start The struct net_device trans_start field is a hot spot on SMP and high-performance devices, particularly multiqueue ones, because every transmitter dirties it. Its main uses are the tx watchdog and bonding alive checks. But as most devices don't use NETIF_F_LLTX, we have to lock a netdev_queue before calling their ndo_start_xmit(). So it makes sense to move trans_start from net_device to netdev_queue. Its update will occur on an already present (and in exclusive state) cache line, for free. We can do this transition smoothly: an old driver continues to update dev->trans_start, while an updated one updates txq->trans_start. Further patches could also put tx_bytes/tx_packets counters in netdev_queue to avoid dirtying dev->stats (the vlan device comes to mind). Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- drivers/net/bonding/bond_main.c | 8 ++++---- include/linux/netdevice.h | 11 +++++++++++ net/sched/sch_generic.c | 40 +++++++++++++++++++++++++++++++--------- 3 files changed, 46 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 815191dd03c..30b9ea6d62b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2795,7 +2795,7 @@ void bond_loadbalance_arp_mon(struct work_struct *work) */ bond_for_each_slave(bond, slave, i) { if (slave->link != BOND_LINK_UP) { - if (time_before_eq(jiffies, slave->dev->trans_start + delta_in_ticks) && + if (time_before_eq(jiffies, dev_trans_start(slave->dev) + delta_in_ticks) && time_before_eq(jiffies, slave->dev->last_rx + delta_in_ticks)) { slave->link = BOND_LINK_UP; @@ -2827,7 +2827,7 @@ void bond_loadbalance_arp_mon(struct work_struct *work) * when the source ip is 0, so don't take the link down * if we don't know our ip yet */ - if (time_after_eq(jiffies, slave->dev->trans_start + 2*delta_in_ticks) || + if (time_after_eq(jiffies, dev_trans_start(slave->dev) + 2*delta_in_ticks) || (time_after_eq(jiffies, slave->dev->last_rx + 2*delta_in_ticks))) { slave->link = BOND_LINK_DOWN; @@ -2938,7 +2938,7 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks) * the bond has an IP address) */ if ((slave->state == BOND_STATE_ACTIVE) && - (time_after_eq(jiffies, slave->dev->trans_start + + (time_after_eq(jiffies, dev_trans_start(slave->dev) + 2 * delta_in_ticks) || (time_after_eq(jiffies, slave_last_rx(bond, slave) + 2 * delta_in_ticks)))) { @@ -2982,7 +2982,7 @@ static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks) write_lock_bh(&bond->curr_slave_lock); if (!bond->curr_active_slave && - time_before_eq(jiffies, slave->dev->trans_start + + time_before_eq(jiffies, dev_trans_start(slave->dev) + delta_in_ticks)) { slave->link = BOND_LINK_UP; bond_change_active_slave(bond, slave); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2af89b662ca..cd547d04a8c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -470,6 +470,10 @@ struct netdev_queue { */ spinlock_t _xmit_lock ____cacheline_aligned_in_smp; int xmit_lock_owner; + /* + * please use this field instead of dev->trans_start + */ + unsigned long trans_start; } ____cacheline_aligned_in_smp; @@ -819,6 +823,11 @@ struct net_device * One part is mostly used on xmit path (device) */ /* These may be needed for future network-power-down code. */ + + /* + * trans_start here is expensive for high speed devices on SMP, + * please use netdev_queue->trans_start instead. 
+ */ unsigned long trans_start; /* Time (in jiffies) of last Tx */ int watchdog_timeo; /* used by dev_watchdog() */ @@ -1541,6 +1550,8 @@ static inline int netif_carrier_ok(const struct net_device *dev) return !test_bit(__LINK_STATE_NOCARRIER, &dev->state); } +extern unsigned long dev_trans_start(struct net_device *dev); + extern void __netdev_watchdog_up(struct net_device *dev); extern void netif_carrier_on(struct net_device *dev); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5f5efe4e607..27d03816ec3 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -196,6 +196,21 @@ void __qdisc_run(struct Qdisc *q) clear_bit(__QDISC_STATE_RUNNING, &q->state); } +unsigned long dev_trans_start(struct net_device *dev) +{ + unsigned long val, res = dev->trans_start; + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + val = netdev_get_tx_queue(dev, i)->trans_start; + if (val && time_after(val, res)) + res = val; + } + dev->trans_start = res; + return res; +} +EXPORT_SYMBOL(dev_trans_start); + static void dev_watchdog(unsigned long arg) { struct net_device *dev = (struct net_device *)arg; @@ -205,25 +220,30 @@ static void dev_watchdog(unsigned long arg) if (netif_device_present(dev) && netif_running(dev) && netif_carrier_ok(dev)) { - int some_queue_stopped = 0; + int some_queue_timedout = 0; unsigned int i; + unsigned long trans_start; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); - if (netif_tx_queue_stopped(txq)) { - some_queue_stopped = 1; + /* + * old device drivers set dev->trans_start + */ + trans_start = txq->trans_start ? : dev->trans_start; + if (netif_tx_queue_stopped(txq) && + time_after(jiffies, (trans_start + + dev->watchdog_timeo))) { + some_queue_timedout = 1; break; } } - if (some_queue_stopped && - time_after(jiffies, (dev->trans_start + - dev->watchdog_timeo))) { + if (some_queue_timedout) { char drivername[64]; - WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n", - dev->name, netdev_drivername(dev, drivername, 64)); + WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", + dev->name, netdev_drivername(dev, drivername, 64), i); dev->netdev_ops->ndo_tx_timeout(dev); } if (!mod_timer(&dev->watchdog_timer, @@ -602,8 +622,10 @@ static void transition_one_qdisc(struct net_device *dev, clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); - if (need_watchdog_p && new_qdisc != &noqueue_qdisc) + if (need_watchdog_p && new_qdisc != &noqueue_qdisc) { + dev_queue->trans_start = 0; *need_watchdog_p = 1; + } } void dev_activate(struct net_device *dev) -- cgit v1.2.3-70-g09d2 From f2a3e626202a87734a47153935ec9d15c7fcf761 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 15 May 2009 06:04:12 +0000 Subject: mdio: Add 10GBASE-T SNR register definition These do not have an in-kernel user but may be useful to user-space. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/mdio.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 26b4eb3bbee..825f1e2e2ec 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -46,6 +46,8 @@ /* Media-dependent registers. */ #define MDIO_PMA_10GBT_TXPWR 131 /* 10GBASE-T TX power control */ +#define MDIO_PMA_10GBT_SNR 133 /* 10GBASE-T SNR margin, lane A. + * Lanes B-D are numbered 134-136. 
*/ #define MDIO_PMA_10GBR_FECABLE 170 /* 10GBASE-R FEC ability */ #define MDIO_PCS_10GBX_STAT1 24 /* 10GBASE-X PCS status 1 */ #define MDIO_PCS_10GBRT_STAT1 32 /* 10GBASE-R/-T PCS status 1 */ @@ -188,6 +190,11 @@ /* PMA 10GBASE-T TX power register. */ #define MDIO_PMA_10GBT_TXPWR_SHORT 0x0001 /* Short-reach mode */ +/* PMA 10GBASE-T SNR registers. */ +/* Value is SNR margin in dB, clamped to range [-127, 127], plus 0x8000. */ +#define MDIO_PMA_10GBT_SNR_BIAS 0x8000 +#define MDIO_PMA_10GBT_SNR_MAX 127 + /* PMA 10GBASE-R FEC ability register. */ #define MDIO_PMA_10GBR_FECABLE_ABLE 0x0001 /* FEC ability */ #define MDIO_PMA_10GBR_FECABLE_ERRABLE 0x0002 /* FEC error indic. ability */ -- cgit v1.2.3-70-g09d2 From e0b221bf4e07edf2fda645e457dc3c35c2f2f3a9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 15 May 2009 06:05:49 +0000 Subject: mdio: Add XENPAK LASI register definitions These registers were originally defined for XENPAK modules, but are also implemented by many other 10G PHYs. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/mdio.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 825f1e2e2ec..56851646529 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -55,6 +55,14 @@ #define MDIO_AN_10GBT_CTRL 32 /* 10GBASE-T auto-negotiation control */ #define MDIO_AN_10GBT_STAT 33 /* 10GBASE-T auto-negotiation status */ +/* LASI (Link Alarm Status Interrupt) registers, defined by XENPAK MSA. */ +#define MDIO_PMA_LASI_RXCTRL 0x9000 /* RX_ALARM control */ +#define MDIO_PMA_LASI_TXCTRL 0x9001 /* TX_ALARM control */ +#define MDIO_PMA_LASI_CTRL 0x9002 /* LASI control */ +#define MDIO_PMA_LASI_RXSTAT 0x9003 /* RX_ALARM status */ +#define MDIO_PMA_LASI_TXSTAT 0x9004 /* TX_ALARM status */ +#define MDIO_PMA_LASI_STAT 0x9005 /* LASI status */ + /* Control register 1. */ /* Enable extended speed selection */ #define MDIO_CTRL1_SPEEDSELEXT (BMCR_SPEED1000 | BMCR_SPEED100) @@ -218,6 +226,26 @@ #define MDIO_AN_10GBT_STAT_MS 0x4000 /* Master/slave config */ #define MDIO_AN_10GBT_STAT_MSFLT 0x8000 /* Master/slave config fault */ +/* LASI RX_ALARM control/status registers. */ +#define MDIO_PMA_LASI_RX_PHYXSLFLT 0x0001 /* PHY XS RX local fault */ +#define MDIO_PMA_LASI_RX_PCSLFLT 0x0008 /* PCS RX local fault */ +#define MDIO_PMA_LASI_RX_PMALFLT 0x0010 /* PMA/PMD RX local fault */ +#define MDIO_PMA_LASI_RX_OPTICPOWERFLT 0x0020 /* RX optical power fault */ +#define MDIO_PMA_LASI_RX_WISLFLT 0x0200 /* WIS local fault */ + +/* LASI TX_ALARM control/status registers. */ +#define MDIO_PMA_LASI_TX_PHYXSLFLT 0x0001 /* PHY XS TX local fault */ +#define MDIO_PMA_LASI_TX_PCSLFLT 0x0008 /* PCS TX local fault */ +#define MDIO_PMA_LASI_TX_PMALFLT 0x0010 /* PMA/PMD TX local fault */ +#define MDIO_PMA_LASI_TX_LASERPOWERFLT 0x0080 /* Laser output power fault */ +#define MDIO_PMA_LASI_TX_LASERTEMPFLT 0x0100 /* Laser temperature fault */ +#define MDIO_PMA_LASI_TX_LASERBICURRFLT 0x0200 /* Laser bias current fault */ + +/* LASI control/status registers. 
*/ +#define MDIO_PMA_LASI_LSALARM 0x0001 /* LS_ALARM enable/status */ +#define MDIO_PMA_LASI_TXALARM 0x0002 /* TX_ALARM enable/status */ +#define MDIO_PMA_LASI_RXALARM 0x0004 /* RX_ALARM enable/status */ + /* Mapping between MDIO PRTAD/DEVAD and mii_ioctl_data::phy_id */ #define MDIO_PHY_ID_C45 0x8000 -- cgit v1.2.3-70-g09d2 From 888a589f6be07d624e21e2174d98375e9f95911b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:59:37 -0700 Subject: mm, x86: remove MEMORY_HOTPLUG_RESERVE related code after: | commit b263295dbffd33b0fbff670720fa178c30e3392a | Author: Christoph Lameter | Date: Wed Jan 30 13:30:47 2008 +0100 | | x86: 64-bit, make sparsemem vmemmap the only memory model we don't have MEMORY_HOTPLUG_RESERVE anymore. Historically, x86-64 had an architecture-specific method for memory hotplug whereby it scanned the SRAT for physical memory ranges that could be potentially used for memory hot-add later. By reserving those ranges without physical memory, the memmap would be allocated and left dormant until needed. This depended on the DISCONTIG memory model which has been removed so the code implementing HOTPLUG_RESERVE is now dead. This patch removes the dead code used by MEMORY_HOTPLUG_RESERVE. (Changelog authored by Mel.) v2: updated changelog, and remove hotadd= in doc [ Impact: remove dead code ] Signed-off-by: Yinghai Lu Reviewed-by: Christoph Lameter Reviewed-by: Mel Gorman Workflow-found-OK-by: Andrew Morton LKML-Reference: <4A0C4910.7090508@kernel.org> Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/boot-options.txt | 5 --- arch/x86/include/asm/numa_64.h | 3 -- arch/x86/mm/numa_64.c | 5 --- arch/x86/mm/srat_64.c | 63 ++++++---------------------- include/linux/mm.h | 2 - mm/page_alloc.c | 69 ------------------------------- 6 files changed, 12 insertions(+), 135 deletions(-) (limited to 'include/linux') diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 34c13040a71..2db5893d6c9 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -150,11 +150,6 @@ NUMA Otherwise, the remaining system RAM is allocated to an additional node. - numa=hotadd=percent - Only allow hotadd memory to preallocate page structures upto - percent of already available memory. - numa=hotadd=0 will disable hotadd memory. 
- ACPI acpi=off Don't enable ACPI diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 064ed6df4cb..7feff0648d7 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, extern void numa_init_array(void); extern int numa_off; -extern void srat_reserve_add_area(int nodeid); -extern int hotadd_percent; - extern s16 apicid_to_node[MAX_LOCAL_APIC]; extern unsigned long numa_free_all_bootmem(void); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index fb61d81a656..a6a93c39523 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -272,9 +272,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<> PAGE_SHIFT; unsigned long e_pfn = end >> PAGE_SHIFT; - int ret = 0, changed = 0; + int changed = 0; struct bootnode *nd = &nodes_add[node]; /* I had some trouble with strange memory hotadd regions breaking @@ -210,7 +201,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) mistakes */ if ((signed long)(end - start) < NODE_MIN_SIZE) { printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return -1; + return; } /* This check might be a bit too strict, but I'm keeping it for now. */ @@ -218,12 +209,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug area %lu -> %lu has existing memory\n", s_pfn, e_pfn); - return -1; - } - - if (!hotadd_enough_memory(&nodes_add[node])) { - printk(KERN_ERR "SRAT: Hotplug area too large\n"); - return -1; + return; } /* Looks good */ @@ -245,11 +231,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); } - ret = update_end_of_memory(nd->end); - if (changed) - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); - return ret; + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", + nd->start, nd->end); } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -310,13 +294,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start, end); e820_register_active_regions(node, start >> PAGE_SHIFT, end >> PAGE_SHIFT); - push_node_boundaries(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && - (reserve_hotadd(node, start, end) < 0)) { - /* Ignore hotadd region. 
Undo damage */ - printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { + update_nodes_add(node, start, end); + /* restore nodes[node] */ *nd = oldnode; if ((nd->start | nd->end) == 0) node_clear(node, nodes_parsed); @@ -510,26 +491,6 @@ static int null_slit_node_compare(int a, int b) } #endif /* CONFIG_NUMA_EMU */ -void __init srat_reserve_add_area(int nodeid) -{ - if (found_add_area && nodes_add[nodeid].end) { - u64 total_mb; - - printk(KERN_INFO "SRAT: Reserving hot-add memory space " - "for node %d at %Lx-%Lx\n", - nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); - total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) - >> PAGE_SHIFT; - total_mb *= sizeof(struct page); - total_mb >>= 20; - printk(KERN_INFO "SRAT: This will cost you %Lu MB of " - "pre-allocated memory.\n", (unsigned long long)total_mb); - reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, - nodes_add[nodeid].end - nodes_add[nodeid].start, - BOOTMEM_DEFAULT); - } -} - int __node_distance(int a, int b) { int index; diff --git a/include/linux/mm.h b/include/linux/mm.h index bff1f0d475c..511b0986709 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1031,8 +1031,6 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn); extern void remove_all_active_ranges(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe753ecf2aa..474c7e9dd51 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve; static int __meminitdata nr_nodemap_entries; static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; - static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; @@ -3102,64 +3098,6 @@ void __init sparse_memory_present_with_active_regions(int nid) early_node_map[i].end_pfn); } -/** - * push_node_boundaries - Push node boundaries to at least the requested boundary - * @nid: The nid of the node to push the boundary for - * @start_pfn: The start pfn of the node - * @end_pfn: The end pfn of the node - * - * In reserve-based hot-add, mem_map is allocated that is unused until hotadd - * time. Specifically, on x86_64, SRAT will report ranges that can potentially - * be hotplugged even though no physical memory exists. This function allows - * an arch to push out the node boundaries so mem_map is allocated that can - * be used later. 
- */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering push_node_boundaries(%u, %lu, %lu)\n", - nid, start_pfn, end_pfn); - - /* Initialise the boundary for this node if necessary */ - if (node_boundary_end_pfn[nid] == 0) - node_boundary_start_pfn[nid] = -1UL; - - /* Update the boundaries */ - if (node_boundary_start_pfn[nid] > start_pfn) - node_boundary_start_pfn[nid] = start_pfn; - if (node_boundary_end_pfn[nid] < end_pfn) - node_boundary_end_pfn[nid] = end_pfn; -} - -/* If necessary, push the node boundary out for reserve hotadd */ -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering account_node_boundary(%u, %lu, %lu)\n", - nid, *start_pfn, *end_pfn); - - /* Return if boundary information has not been provided */ - if (node_boundary_end_pfn[nid] == 0) - return; - - /* Check the boundaries and update if necessary */ - if (node_boundary_start_pfn[nid] < *start_pfn) - *start_pfn = node_boundary_start_pfn[nid]; - if (node_boundary_end_pfn[nid] > *end_pfn) - *end_pfn = node_boundary_end_pfn[nid]; -} -#else -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) {} - -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) {} -#endif - - /** * get_pfn_range_for_nid - Return the start and end page frames for a node * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. @@ -3185,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, if (*start_pfn == -1UL) *start_pfn = 0; - - /* Push the node boundaries out if requested */ - account_node_boundary(nid, start_pfn, end_pfn); } /* @@ -3793,10 +3728,6 @@ void __init remove_all_active_ranges(void) { memset(early_node_map, 0, sizeof(early_node_map)); nr_nodemap_entries = 0; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); - memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ } /* Compare two active node_active_regions */ -- cgit v1.2.3-70-g09d2 From 302b4215daa0a704c843da40fd2529e5757a72da Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:32 +0800 Subject: PCI: support the ATS capability The PCIe ATS capability enables the Endpoint to request DMA address translations from the IOMMU and to cache them on the device side, thus alleviating IOMMU pressure and improving hardware performance in I/O virtualization environments. Signed-off-by: Yu Zhao Acked-by: Jesse Barnes Signed-off-by: David Woodhouse --- drivers/pci/iov.c | 105 +++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 37 +++++++++++++++++ include/linux/pci.h | 2 + include/linux/pci_regs.h | 10 +++++ 4 files changed, 154 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index b497daab3d4..0a7a1b40286 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -5,6 +5,7 @@ * * PCI Express I/O Virtualization (IOV) support. * Single Root IOV 1.0 + * Address Translation Service 1.0 */ #include @@ -679,3 +680,107 @@ irqreturn_t pci_sriov_migration(struct pci_dev *dev) return sriov_migration(dev) ?
IRQ_HANDLED : IRQ_NONE; } EXPORT_SYMBOL_GPL(pci_sriov_migration); + +static int ats_alloc_one(struct pci_dev *dev, int ps) +{ + int pos; + u16 cap; + struct pci_ats *ats; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS); + if (!pos) + return -ENODEV; + + ats = kzalloc(sizeof(*ats), GFP_KERNEL); + if (!ats) + return -ENOMEM; + + ats->pos = pos; + ats->stu = ps; + pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap); + ats->qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : + PCI_ATS_MAX_QDEP; + dev->ats = ats; + + return 0; +} + +static void ats_free_one(struct pci_dev *dev) +{ + kfree(dev->ats); + dev->ats = NULL; +} + +/** + * pci_enable_ats - enable the ATS capability + * @dev: the PCI device + * @ps: the IOMMU page shift + * + * Returns 0 on success, or negative on failure. + */ +int pci_enable_ats(struct pci_dev *dev, int ps) +{ + int rc; + u16 ctrl; + + BUG_ON(dev->ats); + + if (ps < PCI_ATS_MIN_STU) + return -EINVAL; + + rc = ats_alloc_one(dev, ps); + if (rc) + return rc; + + ctrl = PCI_ATS_CTRL_ENABLE; + ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU); + pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); + + return 0; +} + +/** + * pci_disable_ats - disable the ATS capability + * @dev: the PCI device + */ +void pci_disable_ats(struct pci_dev *dev) +{ + u16 ctrl; + + BUG_ON(!dev->ats); + + pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl); + ctrl &= ~PCI_ATS_CTRL_ENABLE; + pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); + + ats_free_one(dev); +} + +/** + * pci_ats_queue_depth - query the ATS Invalidate Queue Depth + * @dev: the PCI device + * + * Returns the queue depth on success, or negative on failure. + * + * The ATS spec uses 0 in the Invalidate Queue Depth field to + * indicate that the function can accept 32 Invalidate Request. + * But here we use the `real' values (i.e. 1~32) for the Queue + * Depth. + */ +int pci_ats_queue_depth(struct pci_dev *dev) +{ + int pos; + u16 cap; + + if (dev->ats) + return dev->ats->qdep; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS); + if (!pos) + return -ENODEV; + + pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap); + + return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : + PCI_ATS_MAX_QDEP; +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d03f6b99f29..3c2ec64f78e 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -229,6 +229,13 @@ struct pci_sriov { u8 __iomem *mstate; /* VF Migration State Array */ }; +/* Address Translation Service */ +struct pci_ats { + int pos; /* capability position */ + int stu; /* Smallest Translation Unit */ + int qdep; /* Invalidate Queue Depth */ +}; + #ifdef CONFIG_PCI_IOV extern int pci_iov_init(struct pci_dev *dev); extern void pci_iov_release(struct pci_dev *dev); @@ -236,6 +243,20 @@ extern int pci_iov_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type); extern void pci_restore_iov_state(struct pci_dev *dev); extern int pci_iov_bus_range(struct pci_bus *bus); + +extern int pci_enable_ats(struct pci_dev *dev, int ps); +extern void pci_disable_ats(struct pci_dev *dev); +extern int pci_ats_queue_depth(struct pci_dev *dev); +/** + * pci_ats_enabled - query the ATS status + * @dev: the PCI device + * + * Returns 1 if ATS capability is enabled, or 0 if not. 
+ */ +static inline int pci_ats_enabled(struct pci_dev *dev) +{ + return !!dev->ats; +} #else static inline int pci_iov_init(struct pci_dev *dev) { @@ -257,6 +278,22 @@ static inline int pci_iov_bus_range(struct pci_bus *bus) { return 0; } + +static inline int pci_enable_ats(struct pci_dev *dev, int ps) +{ + return -ENODEV; +} +static inline void pci_disable_ats(struct pci_dev *dev) +{ +} +static inline int pci_ats_queue_depth(struct pci_dev *dev) +{ + return -ENODEV; +} +static inline int pci_ats_enabled(struct pci_dev *dev) +{ + return 0; +} #endif /* CONFIG_PCI_IOV */ #endif /* DRIVERS_PCI_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 72698d89e76..bd3e4a798c4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -188,6 +188,7 @@ struct pci_cap_saved_state { struct pcie_link_state; struct pci_vpd; struct pci_sriov; +struct pci_ats; /* * The pci_dev structure is used to describe PCI devices. @@ -285,6 +286,7 @@ struct pci_dev { struct pci_sriov *sriov; /* SR-IOV capability related */ struct pci_dev *physfn; /* the PF this VF is associated with */ }; + struct pci_ats *ats; /* Address Translation Service */ #endif }; diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index e4d08c1b2e0..c03189c56c7 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -501,6 +501,7 @@ #define PCI_EXT_CAP_ID_DSN 3 #define PCI_EXT_CAP_ID_PWR 4 #define PCI_EXT_CAP_ID_ARI 14 +#define PCI_EXT_CAP_ID_ATS 15 #define PCI_EXT_CAP_ID_SRIOV 16 /* Advanced Error Reporting */ @@ -619,6 +620,15 @@ #define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */ #define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */ +/* Address Translation Service */ +#define PCI_ATS_CAP 0x04 /* ATS Capability Register */ +#define PCI_ATS_CAP_QDEP(x) ((x) & 0x1f) /* Invalidate Queue Depth */ +#define PCI_ATS_MAX_QDEP 32 /* Max Invalidate Queue Depth */ +#define PCI_ATS_CTRL 0x06 /* ATS Control Register */ +#define PCI_ATS_CTRL_ENABLE 0x8000 /* ATS Enable */ +#define PCI_ATS_CTRL_STU(x) ((x) & 0x1f) /* Smallest Translation Unit */ +#define PCI_ATS_MIN_STU 12 /* shift of minimum STU block */ + /* Single Root I/O Virtualization */ #define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */ #define PCI_SRIOV_CAP_VFM 0x01 /* VF Migration Capable */ -- cgit v1.2.3-70-g09d2 From 1cde26f928863d90e9e7c1217880c8450464d305 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 May 2009 14:41:30 +0200 Subject: virtio_blk: SG_IO passthru support Add support for SG_IO passthru to virtio_blk. We add the scsi command block after the normal outhdr, and the scsi inhdr with full status information as well as the sense buffer before the regular inhdr.
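For orientation, a minimal sketch of the resulting virtqueue layout for a packet command, reconstructed from the do_req() changes in this patch; the 96-byte sense size matches the value hardcoded there, and the layout is shown for a read:

    /*
     * Scatterlist order for a SCSI packet command (read):
     *
     *   out: virtio_blk_outhdr   - type/ioprio/sector
     *   out: scsi command block  - req->cmd, req->cmd_len bytes
     *   in:  data segments       - filled in by blk_rq_map_sg()
     *   in:  sense buffer        - req->sense, 96 bytes
     *   in:  virtio_scsi_inhdr   - errors/data_len/sense_len/residual
     *   in:  status              - 1 byte
     *
     * For a write, the data segments become "out" instead.
     */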
[hch: forward ported, added the VIRTIO_BLK_F_SCSI flags, some comments and tested the whole beast] [axboe: updated to use ->resid and not dual-path the byte count] Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Rusty Russell (+ checkpatch.pl tweak) Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 64 ++++++++++++++++++++++++++++++++++++---------- include/linux/virtio_blk.h | 8 ++++++ 2 files changed, 58 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index dfe9ee5f169..62275dbdf2e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -37,6 +37,7 @@ struct virtblk_req struct list_head list; struct request *req; struct virtio_blk_outhdr out_hdr; + struct virtio_scsi_inhdr in_hdr; u8 status; }; @@ -49,7 +50,9 @@ static void blk_done(struct virtqueue *vq) spin_lock_irqsave(&vblk->lock, flags); while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) { + unsigned int nr_bytes; int error; + switch (vbr->status) { case VIRTIO_BLK_S_OK: error = 0; @@ -62,6 +65,12 @@ static void blk_done(struct virtqueue *vq) break; } + if (blk_pc_request(vbr->req)) { + vbr->req->resid_len = vbr->in_hdr.residual; + vbr->req->sense_len = vbr->in_hdr.sense_len; + vbr->req->errors = vbr->in_hdr.errors; + } + __blk_end_request_all(vbr->req, error); list_del(&vbr->list); mempool_free(vbr, vblk->pool); @@ -74,7 +83,7 @@ static void blk_done(struct virtqueue *vq) static bool do_req(struct request_queue *q, struct virtio_blk *vblk, struct request *req) { - unsigned long num, out, in; + unsigned long num, out = 0, in = 0; struct virtblk_req *vbr; vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); @@ -99,18 +108,36 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, if (blk_barrier_rq(vbr->req)) vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER; - sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr)); - num = blk_rq_map_sg(q, vbr->req, vblk->sg+1); - sg_set_buf(&vblk->sg[num+1], &vbr->status, sizeof(vbr->status)); + sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - if (rq_data_dir(vbr->req) == WRITE) { - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out = 1 + num; - in = 1; - } else { - vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - out = 1; - in = 1 + num; + /* + * If this is a packet command we need a couple of additional headers. + * Behind the normal outhdr we put a segment with the scsi command + * block, and before the normal inhdr we put the sense data and the + * inhdr with additional status information before the normal inhdr. 
+ */ + if (blk_pc_request(vbr->req)) + sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len); + + num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); + + if (blk_pc_request(vbr->req)) { + sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96); + sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, + sizeof(vbr->in_hdr)); + } + + sg_set_buf(&vblk->sg[num + out + in++], &vbr->status, + sizeof(vbr->status)); + + if (num) { + if (rq_data_dir(vbr->req) == WRITE) { + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; + out += num; + } else { + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; + in += num; + } } if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) { @@ -148,8 +175,16 @@ static void do_virtblk_request(struct request_queue *q) static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long data) { - return scsi_cmd_ioctl(bdev->bd_disk->queue, - bdev->bd_disk, mode, cmd, + struct gendisk *disk = bdev->bd_disk; + struct virtio_blk *vblk = disk->private_data; + + /* + * Only allow the generic SCSI ioctls if the host can support it. + */ + if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI)) + return -ENOIOCTLCMD; + + return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, (void __user *)data); } @@ -356,6 +391,7 @@ static struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, + VIRTIO_BLK_F_SCSI, }; static struct virtio_driver virtio_blk = { diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index 94c56d29869..4dbcbc1c348 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -15,6 +15,7 @@ #define VIRTIO_BLK_F_GEOMETRY 4 /* Legacy geometry available */ #define VIRTIO_BLK_F_RO 5 /* Disk is read-only */ #define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is available*/ +#define VIRTIO_BLK_F_SCSI 7 /* Supports scsi command passthru */ struct virtio_blk_config { @@ -55,6 +56,13 @@ struct virtio_blk_outhdr __u64 sector; }; +struct virtio_scsi_inhdr { + __u32 errors; + __u32 data_len; + __u32 sense_len; + __u32 residual; +}; + /* And this is the final byte of the write scatter-gather list. */ #define VIRTIO_BLK_S_OK 0 #define VIRTIO_BLK_S_IOERR 1 -- cgit v1.2.3-70-g09d2 From aa5d2b515b6fca5f8a56eac84f7fa0a68c1ce9b7 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:34 +0800 Subject: VT-d: parse ATSR in DMA Remapping Reporting Structure Parse the Root Port ATS Capability Reporting Structure in the DMA Remapping Reporting Structure ACPI table. 
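As a usage sketch, a caller deciding whether ATS may be enabled for a device would combine this with the ATS capability check from the earlier PCI patch; the helper name below is illustrative and not part of the patch:

    /* Sketch: ATS is usable only if the endpoint implements the ATS
     * capability and its root port is covered by an ATSR entry. */
    static int dev_ats_usable(struct pci_dev *dev)
    {
        if (!pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS))
            return 0;
        return dmar_find_matched_atsr_unit(dev);
    }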
Signed-off-by: Yu Zhao Signed-off-by: David Woodhouse --- drivers/pci/dmar.c | 112 +++++++++++++++++++++++++++++++++++++++++--- include/linux/dmar.h | 9 ++++ include/linux/intel-iommu.h | 1 + 3 files changed, 116 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index f23460a5d10..6d7f9619b8a 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -267,6 +267,84 @@ rmrr_parse_dev(struct dmar_rmrr_unit *rmrru) } return ret; } + +static LIST_HEAD(dmar_atsr_units); + +static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr) +{ + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = kzalloc(sizeof(*atsru), GFP_KERNEL); + if (!atsru) + return -ENOMEM; + + atsru->hdr = hdr; + atsru->include_all = atsr->flags & 0x1; + + list_add(&atsru->list, &dmar_atsr_units); + + return 0; +} + +static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru) +{ + int rc; + struct acpi_dmar_atsr *atsr; + + if (atsru->include_all) + return 0; + + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + rc = dmar_parse_dev_scope((void *)(atsr + 1), + (void *)atsr + atsr->header.length, + &atsru->devices_cnt, &atsru->devices, + atsr->segment); + if (rc || !atsru->devices_cnt) { + list_del(&atsru->list); + kfree(atsru); + } + + return rc; +} + +int dmar_find_matched_atsr_unit(struct pci_dev *dev) +{ + int i; + struct pci_bus *bus; + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + list_for_each_entry(atsru, &dmar_atsr_units, list) { + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + if (atsr->segment == pci_domain_nr(dev->bus)) + goto found; + } + + return 0; + +found: + for (bus = dev->bus; bus; bus = bus->parent) { + struct pci_dev *bridge = bus->self; + + if (!bridge || !bridge->is_pcie || + bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) + return 0; + + if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) { + for (i = 0; i < atsru->devices_cnt; i++) + if (atsru->devices[i] == bridge) + return 1; + break; + } + } + + if (atsru->include_all) + return 1; + + return 0; +} #endif static void __init @@ -274,22 +352,28 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) { struct acpi_dmar_hardware_unit *drhd; struct acpi_dmar_reserved_memory *rmrr; + struct acpi_dmar_atsr *atsr; switch (header->type) { case ACPI_DMAR_TYPE_HARDWARE_UNIT: - drhd = (struct acpi_dmar_hardware_unit *)header; + drhd = container_of(header, struct acpi_dmar_hardware_unit, + header); printk (KERN_INFO PREFIX - "DRHD (flags: 0x%08x)base: 0x%016Lx\n", - drhd->flags, (unsigned long long)drhd->address); + "DRHD base: %#016Lx flags: %#x\n", + (unsigned long long)drhd->address, drhd->flags); break; case ACPI_DMAR_TYPE_RESERVED_MEMORY: - rmrr = (struct acpi_dmar_reserved_memory *)header; - + rmrr = container_of(header, struct acpi_dmar_reserved_memory, + header); printk (KERN_INFO PREFIX - "RMRR base: 0x%016Lx end: 0x%016Lx\n", + "RMRR base: %#016Lx end: %#016Lx\n", (unsigned long long)rmrr->base_address, (unsigned long long)rmrr->end_address); break; + case ACPI_DMAR_TYPE_ATSR: + atsr = container_of(header, struct acpi_dmar_atsr, header); + printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags); + break; } } @@ -361,6 +445,11 @@ parse_dmar_table(void) case ACPI_DMAR_TYPE_RESERVED_MEMORY: #ifdef CONFIG_DMAR ret = dmar_parse_one_rmrr(entry_header); +#endif + break; + case ACPI_DMAR_TYPE_ATSR: +#ifdef CONFIG_DMAR + ret = 
dmar_parse_one_atsr(entry_header); #endif break; default: @@ -431,11 +520,19 @@ int __init dmar_dev_scope_init(void) #ifdef CONFIG_DMAR { struct dmar_rmrr_unit *rmrr, *rmrr_n; + struct dmar_atsr_unit *atsr, *atsr_n; + list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) { ret = rmrr_parse_dev(rmrr); if (ret) return ret; } + + list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) { + ret = atsr_parse_dev(atsr); + if (ret) + return ret; + } } #endif @@ -468,6 +565,9 @@ int __init dmar_table_init(void) #ifdef CONFIG_DMAR if (list_empty(&dmar_rmrr_units)) printk(KERN_INFO PREFIX "No RMRR found\n"); + + if (list_empty(&dmar_atsr_units)) + printk(KERN_INFO PREFIX "No ATSR found\n"); #endif #ifdef CONFIG_INTR_REMAP diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e397dc342cd..7c9a207e5da 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -185,6 +185,15 @@ struct dmar_rmrr_unit { #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) + +struct dmar_atsr_unit { + struct list_head list; /* list of ATSR units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + struct pci_dev **devices; /* target devices */ + int devices_cnt; /* target device count */ + u8 include_all:1; /* include all ports */ +}; + /* Intel DMAR initialization functions */ extern int intel_iommu_init(void); #else diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 29e05a034c0..0a1939f200f 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -331,6 +331,7 @@ static inline void __iommu_flush_cache( } extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); +extern int dmar_find_matched_atsr_unit(struct pci_dev *dev); extern int alloc_iommu(struct dmar_drhd_unit *drhd); extern void free_iommu(struct intel_iommu *iommu); -- cgit v1.2.3-70-g09d2 From 6ba6c3a4cacfd68bf970e3e04e2ff0d66fa0f695 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:35 +0800 Subject: VT-d: add device IOTLB invalidation support Support device IOTLB invalidation to flush the translation cached in the Endpoint. 
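A short sketch of how a caller is expected to issue such an invalidation with the new helper (pdev, addr and iommu are placeholders; the sid/qdep derivation follows the convention the IOMMU driver adopts in a later patch):

    /* Flush the cached translation for one page of device pdev */
    u16 sid  = (pdev->bus->number << 8) | pdev->devfn; /* requester id */
    int qdep = pci_ats_queue_depth(pdev);              /* invalidate queue depth */

    qi_flush_dev_iotlb(iommu, sid, qdep, addr, 0 /* mask 0: single page */);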
Signed-off-by: Yu Zhao Signed-off-by: David Woodhouse --- drivers/pci/dmar.c | 77 ++++++++++++++++++++++++++++++++++++++++----- include/linux/intel-iommu.h | 14 ++++++++- 2 files changed, 82 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 6d7f9619b8a..7b287cb38b7 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -699,7 +699,8 @@ void free_iommu(struct intel_iommu *iommu) */ static inline void reclaim_free_desc(struct q_inval *qi) { - while (qi->desc_status[qi->free_tail] == QI_DONE) { + while (qi->desc_status[qi->free_tail] == QI_DONE || + qi->desc_status[qi->free_tail] == QI_ABORT) { qi->desc_status[qi->free_tail] = QI_FREE; qi->free_tail = (qi->free_tail + 1) % QI_LENGTH; qi->free_cnt++; @@ -709,10 +710,13 @@ static inline void reclaim_free_desc(struct q_inval *qi) static int qi_check_fault(struct intel_iommu *iommu, int index) { u32 fault; - int head; + int head, tail; struct q_inval *qi = iommu->qi; int wait_index = (index + 1) % QI_LENGTH; + if (qi->desc_status[wait_index] == QI_ABORT) + return -EAGAIN; + fault = readl(iommu->reg + DMAR_FSTS_REG); /* @@ -722,7 +726,11 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) */ if (fault & DMA_FSTS_IQE) { head = readl(iommu->reg + DMAR_IQH_REG); - if ((head >> 4) == index) { + if ((head >> DMAR_IQ_SHIFT) == index) { + printk(KERN_ERR "VT-d detected invalid descriptor: " + "low=%llx, high=%llx\n", + (unsigned long long)qi->desc[index].low, + (unsigned long long)qi->desc[index].high); memcpy(&qi->desc[index], &qi->desc[wait_index], sizeof(struct qi_desc)); __iommu_flush_cache(iommu, &qi->desc[index], @@ -732,6 +740,32 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) } } + /* + * If ITE happens, all pending wait_desc commands are aborted. + * No new descriptors are fetched until the ITE is cleared. + */ + if (fault & DMA_FSTS_ITE) { + head = readl(iommu->reg + DMAR_IQH_REG); + head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH; + head |= 1; + tail = readl(iommu->reg + DMAR_IQT_REG); + tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH; + + writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); + + do { + if (qi->desc_status[head] == QI_IN_USE) + qi->desc_status[head] = QI_ABORT; + head = (head - 2 + QI_LENGTH) % QI_LENGTH; + } while (head != tail); + + if (qi->desc_status[wait_index] == QI_ABORT) + return -EAGAIN; + } + + if (fault & DMA_FSTS_ICE) + writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG); + return 0; } @@ -741,7 +775,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) */ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) { - int rc = 0; + int rc; struct q_inval *qi = iommu->qi; struct qi_desc *hw, wait_desc; int wait_index, index; @@ -752,6 +786,9 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) hw = qi->desc; +restart: + rc = 0; + spin_lock_irqsave(&qi->q_lock, flags); while (qi->free_cnt < 3) { spin_unlock_irqrestore(&qi->q_lock, flags); @@ -782,7 +819,7 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) * update the HW tail register indicating the presence of * new descriptors. 
*/ - writel(qi->free_head << 4, iommu->reg + DMAR_IQT_REG); + writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG); while (qi->desc_status[wait_index] != QI_DONE) { /* @@ -794,18 +831,21 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) */ rc = qi_check_fault(iommu, index); if (rc) - goto out; + break; spin_unlock(&qi->q_lock); cpu_relax(); spin_lock(&qi->q_lock); } -out: - qi->desc_status[index] = qi->desc_status[wait_index] = QI_DONE; + + qi->desc_status[index] = QI_DONE; reclaim_free_desc(qi); spin_unlock_irqrestore(&qi->q_lock, flags); + if (rc == -EAGAIN) + goto restart; + return rc; } @@ -857,6 +897,27 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, qi_submit_sync(&desc, iommu); } +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, + u64 addr, unsigned mask) +{ + struct qi_desc desc; + + if (mask) { + BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1)); + addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1; + desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; + } else + desc.high = QI_DEV_IOTLB_ADDR(addr); + + if (qdep >= QI_DEV_IOTLB_MAX_INVS) + qdep = 0; + + desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | + QI_DIOTLB_TYPE; + + qi_submit_sync(&desc, iommu); +} + /* * Disable Queued Invalidation interface. */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 0a1939f200f..40561b224a1 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -53,6 +53,7 @@ #define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ #define DMAR_IQH_REG 0x80 /* Invalidation queue head register */ #define DMAR_IQT_REG 0x88 /* Invalidation queue tail register */ +#define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ #define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ #define DMAR_ICS_REG 0x98 /* Invalidation complete status register */ #define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ @@ -198,6 +199,8 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define DMA_FSTS_PPF ((u32)2) #define DMA_FSTS_PFO ((u32)1) #define DMA_FSTS_IQE (1 << 4) +#define DMA_FSTS_ICE (1 << 5) +#define DMA_FSTS_ITE (1 << 6) #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) /* FRCD_REG, 32 bits access */ @@ -226,7 +229,8 @@ do { \ enum { QI_FREE, QI_IN_USE, - QI_DONE + QI_DONE, + QI_ABORT }; #define QI_CC_TYPE 0x1 @@ -255,6 +259,12 @@ enum { #define QI_CC_DID(did) (((u64)did) << 16) #define QI_CC_GRAN(gran) (((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4)) +#define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) +#define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16) +#define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) +#define QI_DEV_IOTLB_SIZE 1 +#define QI_DEV_IOTLB_MAX_INVS 32 + struct qi_desc { u64 low, high; }; @@ -344,6 +354,8 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); +extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, + u64 addr, unsigned mask); extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); -- cgit v1.2.3-70-g09d2 From 93a23a7271dfb811b3adb72779054c3a24433112 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 18 May 2009 13:51:37 +0800 Subject: VT-d: support the device IOTLB Enable the device IOTLB (i.e. ATS) for both the bare metal and KVM environments. 
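Reduced to its essentials, the enable path this patch adds to domain_context_mapping_one() looks roughly like the following sketch (locking, pass-through handling and error paths omitted):

    /* Use a device-IOTLB context entry when the device supports ATS */
    info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
    translation = info ? CONTEXT_TT_DEV_IOTLB : CONTEXT_TT_MULTI_LEVEL;
    context_set_translation_type(context, translation);
    /* ... program the rest of the context entry ... */
    if (info)
        pci_enable_ats(info->dev, VTD_PAGE_SHIFT); /* via iommu_enable_dev_iotlb() */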
Signed-off-by: Yu Zhao Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 109 ++++++++++++++++++++++++++++++++++++++---- include/linux/dma_remapping.h | 1 + include/linux/intel-iommu.h | 1 + 3 files changed, 102 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 6d7cb84c63e..c3cdfc90c13 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -252,6 +252,7 @@ struct device_domain_info { u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ + struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ }; @@ -945,6 +946,77 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, (unsigned long long)DMA_TLB_IAIG(val)); } +static struct device_domain_info *iommu_support_dev_iotlb( + struct dmar_domain *domain, int segment, u8 bus, u8 devfn) +{ + int found = 0; + unsigned long flags; + struct device_domain_info *info; + struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn); + + if (!ecap_dev_iotlb_support(iommu->ecap)) + return NULL; + + if (!iommu->qi) + return NULL; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) + if (info->bus == bus && info->devfn == devfn) { + found = 1; + break; + } + spin_unlock_irqrestore(&device_domain_lock, flags); + + if (!found || !info->dev) + return NULL; + + if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS)) + return NULL; + + if (!dmar_find_matched_atsr_unit(info->dev)) + return NULL; + + info->iommu = iommu; + + return info; +} + +static void iommu_enable_dev_iotlb(struct device_domain_info *info) +{ + if (!info) + return; + + pci_enable_ats(info->dev, VTD_PAGE_SHIFT); +} + +static void iommu_disable_dev_iotlb(struct device_domain_info *info) +{ + if (!info->dev || !pci_ats_enabled(info->dev)) + return; + + pci_disable_ats(info->dev); +} + +static void iommu_flush_dev_iotlb(struct dmar_domain *domain, + u64 addr, unsigned mask) +{ + u16 sid, qdep; + unsigned long flags; + struct device_domain_info *info; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) { + if (!info->dev || !pci_ats_enabled(info->dev)) + continue; + + sid = info->bus << 8 | info->devfn; + qdep = pci_ats_queue_depth(info->dev); + qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); + } + spin_unlock_irqrestore(&device_domain_lock, flags); +} + static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int pages) { @@ -965,6 +1037,8 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, else iommu->flush.flush_iotlb(iommu, did, addr, mask, DMA_TLB_PSI_FLUSH); + if (did) + iommu_flush_dev_iotlb(iommu->domains[did], addr, mask); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -1305,6 +1379,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, unsigned long ndomains; int id; int agaw; + struct device_domain_info *info = NULL; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1372,15 +1447,21 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, context_set_domain_id(context, id); + if (translation != CONTEXT_TT_PASS_THROUGH) { + info = iommu_support_dev_iotlb(domain, segment, bus, devfn); + translation = info ? 
CONTEXT_TT_DEV_IOTLB : + CONTEXT_TT_MULTI_LEVEL; + } /* * In pass through mode, AW must be programmed to indicate the largest * AGAW value supported by hardware. And ASR is ignored by hardware. */ - if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) { - context_set_address_width(context, iommu->agaw); - context_set_address_root(context, virt_to_phys(pgd)); - } else + if (unlikely(translation == CONTEXT_TT_PASS_THROUGH)) context_set_address_width(context, iommu->msagaw); + else { + context_set_address_root(context, virt_to_phys(pgd)); + context_set_address_width(context, iommu->agaw); + } context_set_translation_type(context, translation); context_set_fault_enable(context); @@ -1402,6 +1483,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment, } else { iommu_flush_write_buffer(iommu); } + iommu_enable_dev_iotlb(info); spin_unlock_irqrestore(&iommu->lock, flags); spin_lock_irqsave(&domain->iommu_lock, flags); @@ -1552,6 +1634,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); free_devinfo_mem(info); @@ -2259,10 +2342,16 @@ static void flush_unmaps(void) continue; iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); for (j = 0; j < deferred_flush[i].next; j++) { - __free_iova(&deferred_flush[i].domain[j]->iovad, - deferred_flush[i].iova[j]); + unsigned long mask; + struct iova *iova = deferred_flush[i].iova[j]; + + mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT; + mask = ilog2(mask >> VTD_PAGE_SHIFT); + iommu_flush_dev_iotlb(deferred_flush[i].domain[j], + iova->pfn_lo << PAGE_SHIFT, mask); + __free_iova(&deferred_flush[i].domain[j]->iovad, iova); } deferred_flush[i].next = 0; } @@ -2943,6 +3032,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, pdev); free_devinfo_mem(info); @@ -2993,6 +3083,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags1); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, info->dev); @@ -3197,11 +3288,11 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return -EFAULT; } - ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); + ret = vm_domain_add_dev_info(dmar_domain, pdev); if (ret) return ret; - ret = vm_domain_add_dev_info(dmar_domain, pdev); + ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); return ret; } diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index e0a03aff63d..5619f852273 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -14,6 +14,7 @@ #define DMA_PTE_SNP (1 << 11) #define CONTEXT_TT_MULTI_LEVEL 0 +#define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THROUGH 2 struct intel_iommu; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 40561b224a1..482dc91fd53 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -124,6 +124,7 @@ 
static inline void dmar_writeq(void __iomem *addr, u64 val) #define ecap_pass_through(e) ((e >> 6) & 0x1) #define ecap_eim_support(e) ((e >> 4) & 0x1) #define ecap_ir_support(e) ((e >> 3) & 0x1) +#define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1) #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) #define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ -- cgit v1.2.3-70-g09d2 From 7004bf252c53da18f6b55103e0c92f777f846806 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 May 2009 00:34:33 +0000 Subject: net: add tx_packets/tx_bytes/tx_dropped counters in struct netdev_queue offsetof(struct net_device, features)=0x44 offsetof(struct net_device, stats.tx_packets)=0x54 offsetof(struct net_device, stats.tx_bytes)=0x5c offsetof(struct net_device, stats.tx_dropped)=0x6c Network drivers that touch dev->stats.tx_packets/stats.tx_bytes in their tx path can slow down SMP operations, since they dirty a cache line that should stay shared (dev->features is needed in both the rx and tx paths). We could move the stats field away in struct net_device, but it would not help much: two cache lines would still be dirtied in the tx path, when we can get by with one. The better solution is to add tx_packets/tx_bytes/tx_dropped to struct netdev_queue: this structure is already touched in the tx path, so the counter updates then come for free (no increase in size). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 23 ++++++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cd547d04a8c..f8574e76b74 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -474,6 +474,9 @@ struct netdev_queue { * please use this field instead of dev->trans_start */ unsigned long trans_start; + unsigned long tx_bytes; + unsigned long tx_packets; + unsigned long tx_dropped; } ____cacheline_aligned_in_smp; diff --git a/net/core/dev.c b/net/core/dev.c index 14dd725aaab..6d3630d1627 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4943,13 +4943,30 @@ void netdev_run_todo(void) * the internal statistics structure is used. */ const struct net_device_stats *dev_get_stats(struct net_device *dev) - { +{ const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_get_stats) return ops->ndo_get_stats(dev); - else - return &dev->stats; + else { + unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; + struct net_device_stats *stats = &dev->stats; + unsigned int i; + struct netdev_queue *txq; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + tx_bytes += txq->tx_bytes; + tx_packets += txq->tx_packets; + tx_dropped += txq->tx_dropped; + } + if (tx_bytes || tx_packets || tx_dropped) { + stats->tx_bytes = tx_bytes; + stats->tx_packets = tx_packets; + stats->tx_dropped = tx_dropped; + } + return stats; + } } EXPORT_SYMBOL(dev_get_stats); -- cgit v1.2.3-70-g09d2 From 75834fc3b6fcff00327f5d2a18760c1e8e0179c5 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 18 May 2009 10:26:10 -0400 Subject: SELinux: move SELINUX_MAGIC into magic.h The selinuxfs superblock magic is used inside the IMA code, but is being defined in two places and could someday get out of sync. This patch moves the declaration into magic.h so it is only done once.
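With the constant now shared, userspace can recognize selinuxfs with the very same definition. A minimal sketch (hypothetical, assuming the exported linux/magic.h is in the include path and selinuxfs is mounted at its usual /selinux location):

/* Hypothetical example, not part of this patch. */
#include <stdio.h>
#include <sys/vfs.h>
#include <linux/magic.h>

int main(void)
{
	struct statfs sfs;

	if (statfs("/selinux", &sfs) == 0 &&
	    (unsigned long)sfs.f_type == SELINUX_MAGIC)
		printf("selinuxfs detected\n");
	return 0;
}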
Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/magic.h | 1 + security/integrity/ima/ima_policy.c | 8 +++----- security/selinux/include/security.h | 3 +-- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/magic.h b/include/linux/magic.h index 5b4e28bcb78..927138cf305 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -9,6 +9,7 @@ #define DEBUGFS_MAGIC 0x64626720 #define SYSFS_MAGIC 0x62656572 #define SECURITYFS_MAGIC 0x73636673 +#define SELINUX_MAGIC 0xf97cff8c #define TMPFS_MAGIC 0x01021994 #define SQUASHFS_MAGIC 0x73717368 #define EFS_SUPER_MAGIC 0x414A53 diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index dec6dcb1c8d..31d677f7c65 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -49,14 +49,12 @@ struct ima_measure_rule_entry { * written in terms of .action, .func, .mask, .fsmagic, and .uid */ static struct ima_measure_rule_entry default_rules[] = { - {.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC, - .flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = DEBUGFS_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = TMPFS_MAGIC,.flags = IMA_FSMAGIC}, - {.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC, - .flags = IMA_FSMAGIC}, - {.action = DONT_MEASURE,.fsmagic = 0xF97CFF8C,.flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,.flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = SELINUX_MAGIC,.flags = IMA_FSMAGIC}, {.action = MEASURE,.func = FILE_MMAP,.mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC, diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index a7be3f01fb0..ca835795a8b 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -8,14 +8,13 @@ #ifndef _SELINUX_SECURITY_H_ #define _SELINUX_SECURITY_H_ +#include #include "flask.h" #define SECSID_NULL 0x00000000 /* unspecified SID */ #define SECSID_WILD 0xffffffff /* wildcard SID */ #define SECCLASS_NULL 0x0000 /* no class */ -#define SELINUX_MAGIC 0xf97cff8c - /* Identify specific policy version changes */ #define POLICYDB_VERSION_BASE 15 #define POLICYDB_VERSION_BOOL 16 -- cgit v1.2.3-70-g09d2 From 39549eef3587f1c1e8c65c88a2400d10fd30ea17 Mon Sep 17 00:00:00 2001 From: Wolfgang Grandegger Date: Fri, 15 May 2009 23:39:29 +0000 Subject: can: CAN Network device driver and Netlink interface The CAN network device driver interface provides a generic interface to set up, configure and monitor CAN network devices. It exports a set of common data structures and functions, which all real CAN network device drivers should use. Please have a look at the SJA1000 or MSCAN driver to understand how to use them. The name of the module is can-dev.ko. Furthermore, it adds a Netlink interface that allows configuring the CAN device with the program "ip" from the iproute2 utility suite. For further information please check "Documentation/networking/can.txt" Signed-off-by: Wolfgang Grandegger Signed-off-by: Oliver Hartkopp Signed-off-by: David S.
Miller --- drivers/net/can/Kconfig | 23 ++ drivers/net/can/Makefile | 5 + drivers/net/can/dev.c | 657 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/can/Kbuild | 1 + include/linux/can/dev.h | 70 +++++ include/linux/can/netlink.h | 113 ++++++++ 6 files changed, 869 insertions(+) create mode 100644 drivers/net/can/dev.c create mode 100644 include/linux/can/dev.h create mode 100644 include/linux/can/netlink.h (limited to 'include/linux') diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index 57def0d5737..77adb8ef6e4 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -12,6 +12,29 @@ config CAN_VCAN This driver can also be built as a module. If so, the module will be called vcan. +config CAN_DEV + tristate "Platform CAN drivers with Netlink support" + depends on CAN + default y + ---help--- + Enables the common framework for platform CAN drivers with Netlink + support. This is the standard library for CAN drivers. + If unsure, say Y. + +config CAN_CALC_BITTIMING + bool "CAN bit-timing calculation" + depends on CAN_DEV + default y + ---help--- + If enabled, CAN bit-timing parameters will be calculated for the + bit-rate specified via Netlink argument "bitrate" when the device + gets started. This works fine for the most common CAN controllers + with standard bit-rates but may fail for exotic bit-rates or CAN + source clock frequencies. Disabling saves some space, but then the + bit-timing parameters must be specified directly using the Netlink + arguments "tq", "prop_seg", "phase_seg1", "phase_seg2" and "sjw". + If unsure, say Y. + config CAN_DEBUG_DEVICES bool "CAN devices debugging messages" depends on CAN diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile index c4bead705cd..6c865475cd8 100644 --- a/drivers/net/can/Makefile +++ b/drivers/net/can/Makefile @@ -3,3 +3,8 @@ # obj-$(CONFIG_CAN_VCAN) += vcan.o + +obj-$(CONFIG_CAN_DEV) += can-dev.o +can-dev-y := dev.o + +ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c new file mode 100644 index 00000000000..52b0e7d8901 --- /dev/null +++ b/drivers/net/can/dev.c @@ -0,0 +1,657 @@ +/* + * Copyright (C) 2005 Marc Kleine-Budde, Pengutronix + * Copyright (C) 2006 Andrey Volkov, Varma Electronics + * Copyright (C) 2008-2009 Wolfgang Grandegger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MOD_DESC "CAN device driver interface" + +MODULE_DESCRIPTION(MOD_DESC); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Wolfgang Grandegger "); + +#ifdef CONFIG_CAN_CALC_BITTIMING +#define CAN_CALC_MAX_ERROR 50 /* in one-tenth of a percent */ + +/* + * Bit-timing calculation derived from: + * + * Code based on LinCAN sources and H8S2638 project + * Copyright 2004-2006 Pavel Pisa - DCE FELK CVUT cz + * Copyright 2005 Stanislav Marek + * email: pisa@cmp.felk.cvut.cz + * + * Calculates proper bit-timing parameters for a specified bit-rate + * and sample-point, which can then be used to set the bit-timing + * registers of the CAN controller. You can find more information + * in the header file linux/can/netlink.h. + */ +static int can_update_spt(const struct can_bittiming_const *btc, + int sampl_pt, int tseg, int *tseg1, int *tseg2) +{ + *tseg2 = tseg + 1 - (sampl_pt * (tseg + 1)) / 1000; + if (*tseg2 < btc->tseg2_min) + *tseg2 = btc->tseg2_min; + if (*tseg2 > btc->tseg2_max) + *tseg2 = btc->tseg2_max; + *tseg1 = tseg - *tseg2; + if (*tseg1 > btc->tseg1_max) { + *tseg1 = btc->tseg1_max; + *tseg2 = tseg - *tseg1; + } + return 1000 * (tseg + 1 - *tseg2) / (tseg + 1); +} + +static int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt) +{ + struct can_priv *priv = netdev_priv(dev); + const struct can_bittiming_const *btc = priv->bittiming_const; + long rate, best_rate = 0; + long best_error = 1000000000, error = 0; + int best_tseg = 0, best_brp = 0, brp = 0; + int tsegall, tseg = 0, tseg1 = 0, tseg2 = 0; + int spt_error = 1000, spt = 0, sampl_pt; + u64 v64; + + if (!priv->bittiming_const) + return -ENOTSUPP; + + /* Use CiA recommended sample points */ + if (bt->sample_point) { + sampl_pt = bt->sample_point; + } else { + if (bt->bitrate > 800000) + sampl_pt = 750; + else if (bt->bitrate > 500000) + sampl_pt = 800; + else + sampl_pt = 875; + } + + /* tseg even = round down, odd = round up */ + for (tseg = (btc->tseg1_max + btc->tseg2_max) * 2 + 1; + tseg >= (btc->tseg1_min + btc->tseg2_min) * 2; tseg--) { + tsegall = 1 + tseg / 2; + /* Compute all possible tseg choices (tseg=tseg1+tseg2) */ + brp = priv->clock.freq / (tsegall * bt->bitrate) + tseg % 2; + /* choose brp step which is possible in system */ + brp = (brp / btc->brp_inc) * btc->brp_inc; + if ((brp < btc->brp_min) || (brp > btc->brp_max)) + continue; + rate = priv->clock.freq / (brp * tsegall); + error = bt->bitrate - rate; + /* tseg brp biterror */ + if (error < 0) + error = -error; + if (error > best_error) + continue; + best_error = error; + if (error == 0) { + spt = can_update_spt(btc, sampl_pt, tseg / 2, + &tseg1, &tseg2); + error = sampl_pt - spt; + if (error < 0) + error = -error; + if (error > spt_error) + continue; + spt_error = error; + } + best_tseg = tseg / 2; + best_brp = brp; + best_rate = rate; + if (error == 0) + break; + } + + if (best_error) { + /* Error in one-tenth of a percent */ + error = (best_error * 1000) / bt->bitrate; + if (error > CAN_CALC_MAX_ERROR) { + dev_err(dev->dev.parent, + "bitrate error %ld.%ld%% too high\n", + error / 10, error % 10); + return -EDOM; + } else { + dev_warn(dev->dev.parent, "bitrate error %ld.%ld%%\n", + error / 10, error % 10); + } + } + + /* real sample point */ +
bt->sample_point = can_update_spt(btc, sampl_pt, best_tseg, + &tseg1, &tseg2); + + v64 = (u64)best_brp * 1000000000UL; + do_div(v64, priv->clock.freq); + bt->tq = (u32)v64; + bt->prop_seg = tseg1 / 2; + bt->phase_seg1 = tseg1 - bt->prop_seg; + bt->phase_seg2 = tseg2; + bt->sjw = 1; + bt->brp = best_brp; + /* real bit-rate */ + bt->bitrate = priv->clock.freq / (bt->brp * (tseg1 + tseg2 + 1)); + + return 0; +} +#else /* !CONFIG_CAN_CALC_BITTIMING */ +static int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt) +{ + dev_err(dev->dev.parent, "bit-timing calculation not available\n"); + return -EINVAL; +} +#endif /* CONFIG_CAN_CALC_BITTIMING */ + +/* + * Checks the validity of the specified bit-timing parameters prop_seg, + * phase_seg1, phase_seg2 and sjw and tries to determine the bitrate + * prescaler value brp. You can find more information in the header + * file linux/can/netlink.h. + */ +static int can_fixup_bittiming(struct net_device *dev, struct can_bittiming *bt) +{ + struct can_priv *priv = netdev_priv(dev); + const struct can_bittiming_const *btc = priv->bittiming_const; + int tseg1, alltseg; + u64 brp64; + + if (!priv->bittiming_const) + return -ENOTSUPP; + + tseg1 = bt->prop_seg + bt->phase_seg1; + if (!bt->sjw) + bt->sjw = 1; + if (bt->sjw > btc->sjw_max || + tseg1 < btc->tseg1_min || tseg1 > btc->tseg1_max || + bt->phase_seg2 < btc->tseg2_min || bt->phase_seg2 > btc->tseg2_max) + return -ERANGE; + + brp64 = (u64)priv->clock.freq * (u64)bt->tq; + if (btc->brp_inc > 1) + do_div(brp64, btc->brp_inc); + brp64 += 500000000UL - 1; + do_div(brp64, 1000000000UL); /* the practicable BRP */ + if (btc->brp_inc > 1) + brp64 *= btc->brp_inc; + bt->brp = (u32)brp64; + + if (bt->brp < btc->brp_min || bt->brp > btc->brp_max) + return -EINVAL; + + alltseg = bt->prop_seg + bt->phase_seg1 + bt->phase_seg2 + 1; + bt->bitrate = priv->clock.freq / (bt->brp * alltseg); + bt->sample_point = ((tseg1 + 1) * 1000) / alltseg; + + return 0; +} + +int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt) +{ + struct can_priv *priv = netdev_priv(dev); + int err; + + /* Check if the CAN device has bit-timing parameters */ + if (priv->bittiming_const) { + + /* Non-expert mode? Check if the bitrate has been pre-defined */ + if (!bt->tq) + /* Determine bit-timing parameters */ + err = can_calc_bittiming(dev, bt); + else + /* Check bit-timing params and calculate proper brp */ + err = can_fixup_bittiming(dev, bt); + if (err) + return err; + } + + return 0; +} + +/* + * Local echo of CAN messages + * + * CAN network devices *should* support a local echo functionality + * (see Documentation/networking/can.txt). To test the handling of CAN + * interfaces that do not support the local echo both driver types are + * implemented. In the case that the driver does not support the echo + * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core + * to perform the echo as a fallback solution. + */ +static void can_flush_echo_skb(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + struct net_device_stats *stats = &dev->stats; + int i; + + for (i = 0; i < CAN_ECHO_SKB_MAX; i++) { + if (priv->echo_skb[i]) { + kfree_skb(priv->echo_skb[i]); + priv->echo_skb[i] = NULL; + stats->tx_dropped++; + stats->tx_aborted_errors++; + } + } +} + +/* + * Put the skb on the stack to be looped back locally later on + * + * The function is typically called in the start_xmit function + * of the device driver. The driver must protect access to + * priv->echo_skb, if necessary.
+ */ +void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, int idx) +{ + struct can_priv *priv = netdev_priv(dev); + + /* check flag whether this packet has to be looped back */ + if (!(dev->flags & IFF_ECHO) || skb->pkt_type != PACKET_LOOPBACK) { + kfree_skb(skb); + return; + } + + if (!priv->echo_skb[idx]) { + struct sock *srcsk = skb->sk; + + if (atomic_read(&skb->users) != 1) { + struct sk_buff *old_skb = skb; + + skb = skb_clone(old_skb, GFP_ATOMIC); + kfree_skb(old_skb); + if (!skb) + return; + } else + skb_orphan(skb); + + skb->sk = srcsk; + + /* make settings for echo to reduce code in irq context */ + skb->protocol = htons(ETH_P_CAN); + skb->pkt_type = PACKET_BROADCAST; + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->dev = dev; + + /* save this skb for tx interrupt echo handling */ + priv->echo_skb[idx] = skb; + } else { + /* locking problem with netif_stop_queue() ?? */ + dev_err(dev->dev.parent, "%s: BUG! echo_skb is occupied!\n", + __func__); + kfree_skb(skb); + } +} +EXPORT_SYMBOL_GPL(can_put_echo_skb); + +/* + * Get the skb from the stack and loop it back locally + * + * The function is typically called when the TX done interrupt + * is handled in the device driver. The driver must protect + * access to priv->echo_skb, if necessary. + */ +void can_get_echo_skb(struct net_device *dev, int idx) +{ + struct can_priv *priv = netdev_priv(dev); + + if ((dev->flags & IFF_ECHO) && priv->echo_skb[idx]) { + netif_rx(priv->echo_skb[idx]); + priv->echo_skb[idx] = NULL; + } +} +EXPORT_SYMBOL_GPL(can_get_echo_skb); + +/* + * CAN device restart for bus-off recovery + */ +void can_restart(unsigned long data) +{ + struct net_device *dev = (struct net_device *)data; + struct can_priv *priv = netdev_priv(dev); + struct net_device_stats *stats = &dev->stats; + struct sk_buff *skb; + struct can_frame *cf; + int err; + + BUG_ON(netif_carrier_ok(dev)); + + /* + * No synchronization needed because the device is bus-off and + * no messages can come in or go out. + */ + can_flush_echo_skb(dev); + + /* send restart message upstream */ + skb = dev_alloc_skb(sizeof(struct can_frame)); + if (skb == NULL) { + err = -ENOMEM; + goto out; + } + skb->dev = dev; + skb->protocol = htons(ETH_P_CAN); + cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame)); + memset(cf, 0, sizeof(struct can_frame)); + cf->can_id = CAN_ERR_FLAG | CAN_ERR_RESTARTED; + cf->can_dlc = CAN_ERR_DLC; + + netif_rx(skb); + + dev->last_rx = jiffies; + stats->rx_packets++; + stats->rx_bytes += cf->can_dlc; + + dev_dbg(dev->dev.parent, "restarted\n"); + priv->can_stats.restarts++; + + /* Now restart the device */ + err = priv->do_set_mode(dev, CAN_MODE_START); + +out: + netif_carrier_on(dev); + if (err) + dev_err(dev->dev.parent, "Error %d during restart\n", err); +} + +int can_restart_now(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + + /* + * A manual restart is only permitted if automatic restart is + * disabled and the device is in the bus-off state + */ + if (priv->restart_ms) + return -EINVAL; + if (priv->state != CAN_STATE_BUS_OFF) + return -EBUSY; + + /* Runs as soon as possible in the timer context */ + mod_timer(&priv->restart_timer, jiffies); + + return 0; +} + +/* + * CAN bus-off + * + * This function should be called when the device goes bus-off to + * tell the netif layer that no more packets can be sent or received. + * If enabled, a timer is started to trigger bus-off recovery.
+ */ +void can_bus_off(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + + dev_dbg(dev->dev.parent, "bus-off\n"); + + netif_carrier_off(dev); + priv->can_stats.bus_off++; + + if (priv->restart_ms) + mod_timer(&priv->restart_timer, + jiffies + (priv->restart_ms * HZ) / 1000); +} +EXPORT_SYMBOL_GPL(can_bus_off); + +static void can_setup(struct net_device *dev) +{ + dev->type = ARPHRD_CAN; + dev->mtu = sizeof(struct can_frame); + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->tx_queue_len = 10; + + /* New-style flags. */ + dev->flags = IFF_NOARP; + dev->features = NETIF_F_NO_CSUM; +} + +/* + * Allocate and setup space for the CAN network device + */ +struct net_device *alloc_candev(int sizeof_priv) +{ + struct net_device *dev; + struct can_priv *priv; + + dev = alloc_netdev(sizeof_priv, "can%d", can_setup); + if (!dev) + return NULL; + + priv = netdev_priv(dev); + + priv->state = CAN_STATE_STOPPED; + + init_timer(&priv->restart_timer); + + return dev; +} +EXPORT_SYMBOL_GPL(alloc_candev); + +/* + * Free space of the CAN network device + */ +void free_candev(struct net_device *dev) +{ + free_netdev(dev); +} +EXPORT_SYMBOL_GPL(free_candev); + +/* + * Common open function when the device gets opened. + * + * This function should be called in the open function of the device + * driver. + */ +int open_candev(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + + if (!priv->bittiming.tq && !priv->bittiming.bitrate) { + dev_err(dev->dev.parent, "bit-timing not yet defined\n"); + return -EINVAL; + } + + setup_timer(&priv->restart_timer, can_restart, (unsigned long)dev); + + return 0; +} +EXPORT_SYMBOL(open_candev); + +/* + * Common close function for cleanup before the device gets closed. + * + * This function should be called in the close function of the device + * driver. 
+ */ +void close_candev(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + + if (del_timer_sync(&priv->restart_timer)) + dev_put(dev); + can_flush_echo_skb(dev); +} +EXPORT_SYMBOL_GPL(close_candev); + +/* + * CAN netlink interface + */ +static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = { + [IFLA_CAN_STATE] = { .type = NLA_U32 }, + [IFLA_CAN_CTRLMODE] = { .len = sizeof(struct can_ctrlmode) }, + [IFLA_CAN_RESTART_MS] = { .type = NLA_U32 }, + [IFLA_CAN_RESTART] = { .type = NLA_U32 }, + [IFLA_CAN_BITTIMING] = { .len = sizeof(struct can_bittiming) }, + [IFLA_CAN_BITTIMING_CONST] + = { .len = sizeof(struct can_bittiming_const) }, + [IFLA_CAN_CLOCK] = { .len = sizeof(struct can_clock) }, +}; + +static int can_changelink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct can_priv *priv = netdev_priv(dev); + int err; + + /* We need synchronization with dev->stop() */ + ASSERT_RTNL(); + + if (data[IFLA_CAN_CTRLMODE]) { + struct can_ctrlmode *cm; + + /* Do not allow changing controller mode while running */ + if (dev->flags & IFF_UP) + return -EBUSY; + cm = nla_data(data[IFLA_CAN_CTRLMODE]); + priv->ctrlmode &= ~cm->mask; + priv->ctrlmode |= cm->flags; + } + + if (data[IFLA_CAN_BITTIMING]) { + struct can_bittiming bt; + + /* Do not allow changing bittiming while running */ + if (dev->flags & IFF_UP) + return -EBUSY; + memcpy(&bt, nla_data(data[IFLA_CAN_BITTIMING]), sizeof(bt)); + if ((!bt.bitrate && !bt.tq) || (bt.bitrate && bt.tq)) + return -EINVAL; + err = can_get_bittiming(dev, &bt); + if (err) + return err; + memcpy(&priv->bittiming, &bt, sizeof(bt)); + + if (priv->do_set_bittiming) { + /* Finally, set the bit-timing registers */ + err = priv->do_set_bittiming(dev); + if (err) + return err; + } + } + + if (data[IFLA_CAN_RESTART_MS]) { + /* Do not allow changing restart delay while running */ + if (dev->flags & IFF_UP) + return -EBUSY; + priv->restart_ms = nla_get_u32(data[IFLA_CAN_RESTART_MS]); + } + + if (data[IFLA_CAN_RESTART]) { + /* Do not allow a restart while not running */ + if (!(dev->flags & IFF_UP)) + return -EINVAL; + err = can_restart_now(dev); + if (err) + return err; + } + + return 0; +} + +static int can_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + struct can_ctrlmode cm = {.flags = priv->ctrlmode}; + enum can_state state = priv->state; + + if (priv->do_get_state) + priv->do_get_state(dev, &state); + NLA_PUT_U32(skb, IFLA_CAN_STATE, state); + NLA_PUT(skb, IFLA_CAN_CTRLMODE, sizeof(cm), &cm); + NLA_PUT_U32(skb, IFLA_CAN_RESTART_MS, priv->restart_ms); + NLA_PUT(skb, IFLA_CAN_BITTIMING, + sizeof(priv->bittiming), &priv->bittiming); + NLA_PUT(skb, IFLA_CAN_CLOCK, sizeof(priv->clock), &priv->clock); + if (priv->bittiming_const) + NLA_PUT(skb, IFLA_CAN_BITTIMING_CONST, + sizeof(*priv->bittiming_const), priv->bittiming_const); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int can_fill_xstats(struct sk_buff *skb, const struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + + NLA_PUT(skb, IFLA_INFO_XSTATS, + sizeof(priv->can_stats), &priv->can_stats); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static struct rtnl_link_ops can_link_ops __read_mostly = { + .kind = "can", + .maxtype = IFLA_CAN_MAX, + .policy = can_policy, + .setup = can_setup, + .changelink = can_changelink, + .fill_info = can_fill_info, + .fill_xstats = can_fill_xstats, +}; + +/* + * Register the CAN network device + */ +int register_candev(struct
net_device *dev) +{ + dev->rtnl_link_ops = &can_link_ops; + return register_netdev(dev); +} +EXPORT_SYMBOL_GPL(register_candev); + +/* + * Unregister the CAN network device + */ +void unregister_candev(struct net_device *dev) +{ + unregister_netdev(dev); +} +EXPORT_SYMBOL_GPL(unregister_candev); + +static __init int can_dev_init(void) +{ + int err; + + err = rtnl_link_register(&can_link_ops); + if (!err) + printk(KERN_INFO MOD_DESC "\n"); + + return err; +} +module_init(can_dev_init); + +static __exit void can_dev_exit(void) +{ + rtnl_link_unregister(&can_link_ops); +} +module_exit(can_dev_exit); + +MODULE_ALIAS_RTNL_LINK("can"); diff --git a/include/linux/can/Kbuild b/include/linux/can/Kbuild index eff898aac02..8cb05aae661 100644 --- a/include/linux/can/Kbuild +++ b/include/linux/can/Kbuild @@ -1,3 +1,4 @@ header-y += raw.h header-y += bcm.h header-y += error.h +header-y += netlink.h diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h new file mode 100644 index 00000000000..4a37a56f6cd --- /dev/null +++ b/include/linux/can/dev.h @@ -0,0 +1,70 @@ +/* + * linux/can/dev.h + * + * Definitions for the CAN network device driver interface + * + * Copyright (C) 2006 Andrey Volkov + * Varma Electronics Oy + * + * Copyright (C) 2008 Wolfgang Grandegger + * + * Send feedback to + */ + +#ifndef CAN_DEV_H +#define CAN_DEV_H + +#include +#include + +/* + * CAN mode + */ +enum can_mode { + CAN_MODE_STOP = 0, + CAN_MODE_START, + CAN_MODE_SLEEP +}; + +/* + * CAN common private data + */ +#define CAN_ECHO_SKB_MAX 4 + +struct can_priv { + struct can_device_stats can_stats; + + struct can_bittiming bittiming; + struct can_bittiming_const *bittiming_const; + struct can_clock clock; + + enum can_state state; + u32 ctrlmode; + + int restart_ms; + struct timer_list restart_timer; + + struct sk_buff *echo_skb[CAN_ECHO_SKB_MAX]; + + int (*do_set_bittiming)(struct net_device *dev); + int (*do_set_mode)(struct net_device *dev, enum can_mode mode); + int (*do_get_state)(const struct net_device *dev, + enum can_state *state); +}; + +struct net_device *alloc_candev(int sizeof_priv); +void free_candev(struct net_device *dev); + +int open_candev(struct net_device *dev); +void close_candev(struct net_device *dev); + +int register_candev(struct net_device *dev); +void unregister_candev(struct net_device *dev); + +int can_restart_now(struct net_device *dev); +void can_bus_off(struct net_device *dev); + +void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, int idx); +void can_get_echo_skb(struct net_device *dev, int idx); + +#endif /* CAN_DEV_H */ diff --git a/include/linux/can/netlink.h b/include/linux/can/netlink.h new file mode 100644 index 00000000000..9ecbb7871c0 --- /dev/null +++ b/include/linux/can/netlink.h @@ -0,0 +1,113 @@ +/* + * linux/can/netlink.h + * + * Definitions for the CAN netlink interface + * + * Copyright (c) 2009 Wolfgang Grandegger + * + * Send feedback to + * + */ + +#ifndef CAN_NETLINK_H +#define CAN_NETLINK_H + +#include + +/* + * CAN bit-timing parameters + * + * For further information, please read chapter "8 BIT TIMING + * REQUIREMENTS" of the "Bosch CAN Specification version 2.0" + * at http://www.semiconductors.bosch.de/pdf/can2spec.pdf.
+ */ +struct can_bittiming { + __u32 bitrate; /* Bit-rate in bits/second */ + __u32 sample_point; /* Sample point in one-tenth of a percent */ + __u32 tq; /* Time quanta (TQ) in nanoseconds */ + __u32 prop_seg; /* Propagation segment in TQs */ + __u32 phase_seg1; /* Phase buffer segment 1 in TQs */ + __u32 phase_seg2; /* Phase buffer segment 2 in TQs */ + __u32 sjw; /* Synchronisation jump width in TQs */ + __u32 brp; /* Bit-rate prescaler */ +}; + +/* + * CAN hardware-dependent bit-timing constant + * + * Used for calculating and checking bit-timing parameters + */ +struct can_bittiming_const { + char name[16]; /* Name of the CAN controller hardware */ + __u32 tseg1_min; /* Time segment 1 = prop_seg + phase_seg1 */ + __u32 tseg1_max; + __u32 tseg2_min; /* Time segment 2 = phase_seg2 */ + __u32 tseg2_max; + __u32 sjw_max; /* Synchronisation jump width */ + __u32 brp_min; /* Bit-rate prescaler */ + __u32 brp_max; + __u32 brp_inc; +}; + +/* + * CAN clock parameters + */ +struct can_clock { + __u32 freq; /* CAN system clock frequency in Hz */ +}; + +/* + * CAN operational and error states + */ +enum can_state { + CAN_STATE_ERROR_ACTIVE = 0, /* RX/TX error count < 96 */ + CAN_STATE_ERROR_WARNING, /* RX/TX error count < 128 */ + CAN_STATE_ERROR_PASSIVE, /* RX/TX error count < 256 */ + CAN_STATE_BUS_OFF, /* RX/TX error count >= 256 */ + CAN_STATE_STOPPED, /* Device is stopped */ + CAN_STATE_SLEEPING, /* Device is sleeping */ + CAN_STATE_MAX +}; + +/* + * CAN controller mode + */ +struct can_ctrlmode { + __u32 mask; + __u32 flags; +}; + +#define CAN_CTRLMODE_LOOPBACK 0x1 /* Loopback mode */ +#define CAN_CTRLMODE_LISTENONLY 0x2 /* Listen-only mode */ +#define CAN_CTRLMODE_3_SAMPLES 0x4 /* Triple sampling mode */ + +/* + * CAN device statistics + */ +struct can_device_stats { + __u32 bus_error; /* Bus errors */ + __u32 error_warning; /* Changes to error warning state */ + __u32 error_passive; /* Changes to error passive state */ + __u32 bus_off; /* Changes to bus off state */ + __u32 arbitration_lost; /* Arbitration lost errors */ + __u32 restarts; /* CAN controller re-starts */ +}; + +/* + * CAN netlink interface + */ +enum { + IFLA_CAN_UNSPEC, + IFLA_CAN_BITTIMING, + IFLA_CAN_BITTIMING_CONST, + IFLA_CAN_CLOCK, + IFLA_CAN_STATE, + IFLA_CAN_CTRLMODE, + IFLA_CAN_RESTART_MS, + IFLA_CAN_RESTART, + __IFLA_CAN_MAX +}; + +#define IFLA_CAN_MAX (__IFLA_CAN_MAX - 1) + +#endif /* CAN_NETLINK_H */ -- cgit v1.2.3-70-g09d2 From f534e52f091a7b9f51fb6726710bdf731b663e94 Mon Sep 17 00:00:00 2001 From: Wolfgang Grandegger Date: Fri, 15 May 2009 23:39:31 +0000 Subject: can: SJA1000 generic platform bus driver This driver adds support for the SJA1000 chips connected to the "platform bus", which can be found on various embedded systems. Signed-off-by: Sascha Hauer Signed-off-by: Wolfgang Grandegger Signed-off-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Signed-off-by: David S.
Miller --- drivers/net/can/Kconfig | 10 ++ drivers/net/can/sja1000/Makefile | 1 + drivers/net/can/sja1000/sja1000_platform.c | 164 +++++++++++++++++++++++++++++ include/linux/can/platform/sja1000.h | 32 ++++++ 4 files changed, 207 insertions(+) create mode 100644 drivers/net/can/sja1000/sja1000_platform.c create mode 100644 include/linux/can/platform/sja1000.h (limited to 'include/linux') diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index c7e5ce339ea..8fe79741d3c 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -41,6 +41,16 @@ config CAN_SJA1000 ---help--- Driver for the SJA1000 CAN controllers from Philips or NXP +config CAN_SJA1000_PLATFORM + depends on CAN_SJA1000 + tristate "Generic Platform Bus based SJA1000 driver" + ---help--- + This driver adds support for the SJA1000 chips connected to + the "platform bus" (the Linux abstraction for devices attached + directly to the processor), which can be found on various + boards from Phytec (http://www.phytec.de) like the PCM027 and + PCM038. + config CAN_DEBUG_DEVICES bool "CAN devices debugging messages" depends on CAN diff --git a/drivers/net/can/sja1000/Makefile b/drivers/net/can/sja1000/Makefile index 893ab2cfe24..5115cc90ab5 100644 --- a/drivers/net/can/sja1000/Makefile +++ b/drivers/net/can/sja1000/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_CAN_SJA1000) += sja1000.o +obj-$(CONFIG_CAN_SJA1000_PLATFORM) += sja1000_platform.o ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG diff --git a/drivers/net/can/sja1000/sja1000_platform.c b/drivers/net/can/sja1000/sja1000_platform.c new file mode 100644 index 00000000000..8017229d6fd --- /dev/null +++ b/drivers/net/can/sja1000/sja1000_platform.c @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2005 Sascha Hauer, Pengutronix + * Copyright (C) 2007 Wolfgang Grandegger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sja1000.h" + +#define DRV_NAME "sja1000_platform" + +MODULE_AUTHOR("Sascha Hauer "); +MODULE_DESCRIPTION("Socket-CAN driver for SJA1000 on the platform bus"); +MODULE_LICENSE("GPL v2"); + +static u8 sp_read_reg(const struct net_device *dev, int reg) +{ + return ioread8((void __iomem *)(dev->base_addr + reg)); +} + +static void sp_write_reg(const struct net_device *dev, int reg, u8 val) +{ + iowrite8(val, (void __iomem *)(dev->base_addr + reg)); +} + +static int sp_probe(struct platform_device *pdev) +{ + int err; + void __iomem *addr; + struct net_device *dev; + struct sja1000_priv *priv; + struct resource *res_mem, *res_irq; + struct sja1000_platform_data *pdata; + + pdata = pdev->dev.platform_data; + if (!pdata) { + dev_err(&pdev->dev, "No platform data provided!\n"); + err = -ENODEV; + goto exit; + } + + res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + res_irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (!res_mem || !res_irq) { + err = -ENODEV; + goto exit; + } + + if (!request_mem_region(res_mem->start, resource_size(res_mem), + DRV_NAME)) { + err = -EBUSY; + goto exit; + } + + addr = ioremap_nocache(res_mem->start, resource_size(res_mem)); + if (!addr) { + err = -ENOMEM; + goto exit_release; + } + + dev = alloc_sja1000dev(0); + if (!dev) { + err = -ENOMEM; + goto exit_iounmap; + } + priv = netdev_priv(dev); + + dev->base_addr = (unsigned long)addr; + dev->irq = res_irq->start; + priv->irq_flags = res_irq->flags & IRQF_TRIGGER_MASK; + priv->read_reg = sp_read_reg; + priv->write_reg = sp_write_reg; + priv->can.clock.freq = pdata->clock; + priv->ocr = pdata->ocr; + priv->cdr = pdata->cdr; + + dev_set_drvdata(&pdev->dev, dev); + SET_NETDEV_DEV(dev, &pdev->dev); + + err = register_sja1000dev(dev); + if (err) { + dev_err(&pdev->dev, "registering %s failed (err=%d)\n", + DRV_NAME, err); + goto exit_free; + } + + dev_info(&pdev->dev, "%s device registered (base_addr=%#lx, irq=%d)\n", + DRV_NAME, dev->base_addr, dev->irq); + return 0; + + exit_free: + free_sja1000dev(dev); + exit_iounmap: + iounmap(addr); + exit_release: + release_mem_region(res_mem->start, resource_size(res_mem)); + exit: + return err; +} + +static int sp_remove(struct platform_device *pdev) +{ + struct net_device *dev = dev_get_drvdata(&pdev->dev); + struct resource *res; + + unregister_sja1000dev(dev); + dev_set_drvdata(&pdev->dev, NULL); + + if (dev->base_addr) + iounmap((void __iomem *)dev->base_addr); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + release_mem_region(res->start, resource_size(res)); + + free_sja1000dev(dev); + + return 0; +} + +static struct platform_driver sp_driver = { + .probe = sp_probe, + .remove = sp_remove, + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, +}; + +static int __init sp_init(void) +{ + return platform_driver_register(&sp_driver); +} + +static void __exit sp_exit(void) +{ + platform_driver_unregister(&sp_driver); +} + +module_init(sp_init); +module_exit(sp_exit); diff --git a/include/linux/can/platform/sja1000.h b/include/linux/can/platform/sja1000.h new file mode 100644 index 00000000000..37966e630ff --- /dev/null +++ b/include/linux/can/platform/sja1000.h @@ -0,0 +1,32 @@ +#ifndef 
_CAN_PLATFORM_SJA1000_H_ +#define _CAN_PLATFORM_SJA1000_H_ + +/* clock divider register */ +#define CDR_CLKOUT_MASK 0x07 +#define CDR_CLK_OFF 0x08 /* Clock off (CLKOUT pin) */ +#define CDR_RXINPEN 0x20 /* TX1 output is RX irq output */ +#define CDR_CBP 0x40 /* CAN input comparator bypass */ +#define CDR_PELICAN 0x80 /* PeliCAN mode */ + +/* output control register */ +#define OCR_MODE_BIPHASE 0x00 +#define OCR_MODE_TEST 0x01 +#define OCR_MODE_NORMAL 0x02 +#define OCR_MODE_CLOCK 0x03 +#define OCR_TX0_INVERT 0x04 +#define OCR_TX0_PULLDOWN 0x08 +#define OCR_TX0_PULLUP 0x10 +#define OCR_TX0_PUSHPULL 0x18 +#define OCR_TX1_INVERT 0x20 +#define OCR_TX1_PULLDOWN 0x40 +#define OCR_TX1_PULLUP 0x80 +#define OCR_TX1_PUSHPULL 0xc0 + +struct sja1000_platform_data { + u32 clock; /* CAN bus oscillator frequency in Hz */ + + u8 ocr; /* output control register */ + u8 cdr; /* clock divider register */ +}; + +#endif /* !_CAN_PLATFORM_SJA1000_H_ */ -- cgit v1.2.3-70-g09d2 From 69e3c75f4d541a6eb151b3ef91f34033cb3ad6e1 Mon Sep 17 00:00:00 2001 From: Johann Baudy Date: Mon, 18 May 2009 22:11:22 -0700 Subject: net: TX_RING and packet mmap New packet socket feature that makes the packet socket more efficient for transmission. - It reduces the number of system calls through a PACKET_TX_RING mechanism, based on PACKET_RX_RING (a circular buffer allocated in kernel space which is mmapped from user space). - It minimizes CPU copies by using fragmented SKBs (almost zero copy). Signed-off-by: Johann Baudy Signed-off-by: David S. Miller --- Documentation/networking/packet_mmap.txt | 140 +++++++- include/linux/if_packet.h | 20 +- include/linux/skbuff.h | 3 + net/packet/af_packet.c | 588 +++++++++++++++++++++++++------ 4 files changed, 616 insertions(+), 135 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 07c53d59603..a22fd85e379 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -4,16 +4,18 @@ This file documents the CONFIG_PACKET_MMAP option available with the PACKET socket interface on 2.4 and 2.6 kernels. This type of sockets is used for -capture network traffic with utilities like tcpdump or any other that uses -the libpcap library. - -You can find the latest version of this document at +capture network traffic with utilities like tcpdump or any other that needs +raw access to a network interface. +You can find the latest version of this document at: http://pusa.uv.es/~ulisses/packet_mmap/ -Please send me your comments to +A howto can be found at: + http://wiki.gnu-log.net (packet_mmap) +Please send your comments to Ulisses Alonso Camaró + Johann Baudy ------------------------------------------------------------------------------- + Why use PACKET_MMAP @@ -25,19 +27,24 @@ to capture each packet, it requires two if you want to get packet's timestamp (like libpcap always does). In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size -configurable circular buffer mapped in user space. This way reading packets just -needs to wait for them, most of the time there is no need to issue a single -system call. By using a shared buffer between the kernel and the user -also has the benefit of minimizing packet copies. - -It's fine to use PACKET_MMAP to improve the performance of the capture process, -but it isn't everything.
At least, if you are capturing at high speeds (this -is relative to the cpu speed), you should check if the device driver of your -network interface card supports some sort of interrupt load mitigation or -(even better) if it supports NAPI, also make sure it is enabled. +configurable circular buffer mapped in user space that can be used to either +send or receive packets. This way reading packets just needs to wait for them, +most of the time there is no need to issue a single system call. Concerning +transmission, multiple packets can be sent through one system call to get the +highest bandwidth. +By using a shared buffer between the kernel and the user also has the benefit +of minimizing packet copies. + +It's fine to use PACKET_MMAP to improve the performance of the capture and +transmission process, but it isn't everything. At least, if you are capturing +at high speeds (this is relative to the cpu speed), you should check if the +device driver of your network interface card supports some sort of interrupt +load mitigation or (even better) if it supports NAPI, also make sure it is +enabled. For transmission, check the MTU (Maximum Transmission Unit) used and +supported by devices of your network. -------------------------------------------------------------------------------- -+ How to use CONFIG_PACKET_MMAP ++ How to use CONFIG_PACKET_MMAP to improve capture process -------------------------------------------------------------------------------- From the user standpoint, you should use the higher level libpcap library, which @@ -57,7 +64,7 @@ the low level details or want to improve libpcap by including PACKET_MMAP support. -------------------------------------------------------------------------------- -+ How to use CONFIG_PACKET_MMAP directly ++ How to use CONFIG_PACKET_MMAP directly to improve capture process -------------------------------------------------------------------------------- From the system calls stand point, the use of PACKET_MMAP involves @@ -66,6 +73,7 @@ the following process: [setup] socket() -------> creation of the capture socket setsockopt() ---> allocation of the circular buffer (ring) + option: PACKET_RX_RING mmap() ---------> mapping of the allocated buffer to the user process @@ -96,6 +104,65 @@ Next I will describe PACKET_MMAP settings and it's constraints, also the mapping of the circular buffer in the user process and the use of this buffer. +-------------------------------------------------------------------------------- ++ How to use CONFIG_PACKET_MMAP directly to improve transmission process +-------------------------------------------------------------------------------- +Transmission process is similar to capture as shown below. + +[setup] socket() -------> creation of the transmission socket + setsockopt() ---> allocation of the circular buffer (ring) + option: PACKET_TX_RING + bind() ---------> bind transmission socket with a network interface + mmap() ---------> mapping of the allocated buffer to the + user process + +[transmission] poll() ---------> wait for free packets (optional) + send() ---------> send all packets that are set as ready in + the ring + The flag MSG_DONTWAIT can be used to return + before end of transfer. + +[shutdown] close() --------> destruction of the transmission socket and + deallocation of all associated resources. + +Binding the socket to your network interface is mandatory (with zero copy) to +know the header size of frames used in the circular buffer. 
+ +As with capture, each frame contains two parts: + + -------------------- +| struct tpacket_hdr | Header. It contains the status of +| | this frame +|--------------------| +| data buffer | +. . Data that will be sent over the network interface. +. . + -------------------- + + bind() associates the socket with your network interface thanks to + the sll_ifindex parameter of struct sockaddr_ll. + + Initialization example: + + struct sockaddr_ll my_addr; + struct ifreq s_ifr; + ... + + strncpy (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name)); + + /* get interface index of eth0 */ + ioctl(this->socket, SIOCGIFINDEX, &s_ifr); + + /* fill sockaddr_ll struct to prepare binding */ + my_addr.sll_family = AF_PACKET; + my_addr.sll_protocol = htons(ETH_P_ALL); + my_addr.sll_ifindex = s_ifr.ifr_ifindex; + + /* bind socket to eth0 */ + bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll)); + + A complete tutorial is available at: http://wiki.gnu-log.net/ + -------------------------------------------------------------------------------- + PACKET_MMAP settings -------------------------------------------------------------------------------- To setup PACKET_MMAP from user level code is done with a call like + - Capture process setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req)) + - Transmission process + setsockopt(fd, SOL_PACKET, PACKET_TX_RING, (void *) &req, sizeof(req)) The most significant argument in the previous call is the req parameter, this parameter must to have the following structure: @@ -117,11 +187,11 @@ this parameter must to have the following structure: }; This structure is defined in /usr/include/linux/if_packet.h and establishes a -circular buffer (ring) of unswappable memory mapped in the capture process. +circular buffer (ring) of unswappable memory. Being mapped in the capture process allows reading the captured frames and related meta-information like timestamps without requiring a system call. -Captured frames are grouped in blocks. Each block is a physically contiguous +Frames are grouped in blocks. Each block is a physically contiguous region of memory and holds tp_block_size/tp_frame_size frames. The total number of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because @@ -336,6 +406,7 @@ struct tpacket_hdr). If this field is 0 means that the frame is ready to be used for the kernel, If not, there is a frame the user can read and the following flags apply: ++++ Capture process: from include/linux/if_packet.h #define TP_STATUS_COPY 2 @@ -391,6 +462,37 @@ packets are in the ring: It doesn't incur in a race condition to first check the status value and then poll for frames. + +++ Transmission process +Those defines are also used for transmission: + + #define TP_STATUS_AVAILABLE 0 // Frame is available + #define TP_STATUS_SEND_REQUEST 1 // Frame will be sent on next send() + #define TP_STATUS_SENDING 2 // Frame is currently in transmission + #define TP_STATUS_WRONG_FORMAT 4 // Frame format is not correct + +First, the kernel initializes all frames to TP_STATUS_AVAILABLE. To send a +packet, the user fills a data buffer of an available frame, sets tp_len to +current data buffer size and sets its status field to TP_STATUS_SEND_REQUEST. +This can be done on multiple frames. Once the user is ready to transmit, it +calls send(). Then all buffers with status equal to TP_STATUS_SEND_REQUEST are +forwarded to the network device.
The kernel updates each status of sent +frames with TP_STATUS_SENDING until the end of transfer. +At the end of each transfer, buffer status returns to TP_STATUS_AVAILABLE. + + header->tp_len = in_i_size; + header->tp_status = TP_STATUS_SEND_REQUEST; + retval = send(this->socket, NULL, 0, 0); + +The user can also use poll() to check if a buffer is available: +(status == TP_STATUS_SENDING) + + struct pollfd pfd; + pfd.fd = fd; + pfd.revents = 0; + pfd.events = POLLOUT; + retval = poll(&pfd, 1, timeout); + -------------------------------------------------------------------------------- + THANKS -------------------------------------------------------------------------------- diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h index 18db0668065..5b2badeb949 100644 --- a/include/linux/if_packet.h +++ b/include/linux/if_packet.h @@ -46,6 +46,8 @@ struct sockaddr_ll #define PACKET_VERSION 10 #define PACKET_HDRLEN 11 #define PACKET_RESERVE 12 +#define PACKET_TX_RING 13 +#define PACKET_LOSS 14 struct tpacket_stats { @@ -63,14 +65,22 @@ struct tpacket_auxdata __u16 tp_vlan_tci; }; +/* Rx ring - header status */ +#define TP_STATUS_KERNEL 0x0 +#define TP_STATUS_USER 0x1 +#define TP_STATUS_COPY 0x2 +#define TP_STATUS_LOSING 0x4 +#define TP_STATUS_CSUMNOTREADY 0x8 + +/* Tx ring - header status */ +#define TP_STATUS_AVAILABLE 0x0 +#define TP_STATUS_SEND_REQUEST 0x1 +#define TP_STATUS_SENDING 0x2 +#define TP_STATUS_WRONG_FORMAT 0x4 + struct tpacket_hdr { unsigned long tp_status; -#define TP_STATUS_KERNEL 0 -#define TP_STATUS_USER 1 -#define TP_STATUS_COPY 2 -#define TP_STATUS_LOSING 4 -#define TP_STATUS_CSUMNOTREADY 8 unsigned int tp_len; unsigned int tp_snaplen; unsigned short tp_mac; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1b5c3d298f4..aff494ba6a3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -203,6 +203,9 @@ struct skb_shared_info { #ifdef CONFIG_HAS_DMA dma_addr_t dma_maps[MAX_SKB_FRAGS + 1]; #endif + /* Intermediate layers must ensure that destructor_arg + * remains valid until skb destructor */ + void * destructor_arg; }; /* We divide dataref into two halves. The higher 16 bits hold references diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f546e81acc4..766e6b41f7c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -39,6 +39,7 @@ * will simply extend the hardware address * byte arrays at the end of sockaddr_ll * and packet_mreq. + * Johann Baudy : Added TX RING. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -157,7 +158,25 @@ struct packet_mreq_max }; #ifdef CONFIG_PACKET_MMAP -static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing); +static int packet_set_ring(struct sock *sk, struct tpacket_req *req, + int closing, int tx_ring); + +struct packet_ring_buffer { + char * *pg_vec; + unsigned int head; + unsigned int frames_per_block; + unsigned int frame_size; + unsigned int frame_max; + + unsigned int pg_vec_order; + unsigned int pg_vec_pages; + unsigned int pg_vec_len; + + atomic_t pending; +}; + +struct packet_sock; +static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); #endif static void packet_flush_mclist(struct sock *sk); @@ -167,11 +186,8 @@ struct packet_sock { struct sock sk; struct tpacket_stats stats; #ifdef CONFIG_PACKET_MMAP - char * *pg_vec; - unsigned int head; - unsigned int frames_per_block; - unsigned int frame_size; - unsigned int frame_max; + struct packet_ring_buffer rx_ring; + struct packet_ring_buffer tx_ring; int copy_thresh; #endif struct packet_type prot_hook; @@ -185,12 +201,10 @@ struct packet_sock { struct packet_mclist *mclist; #ifdef CONFIG_PACKET_MMAP atomic_t mapped; - unsigned int pg_vec_order; - unsigned int pg_vec_pages; - unsigned int pg_vec_len; enum tpacket_versions tp_version; unsigned int tp_hdrlen; unsigned int tp_reserve; + unsigned int tp_loss:1; #endif }; @@ -206,36 +220,33 @@ struct packet_skb_cb { #ifdef CONFIG_PACKET_MMAP -static void *packet_lookup_frame(struct packet_sock *po, unsigned int position, - int status) +static void __packet_set_status(struct packet_sock *po, void *frame, int status) { - unsigned int pg_vec_pos, frame_offset; union { struct tpacket_hdr *h1; struct tpacket2_hdr *h2; void *raw; } h; - pg_vec_pos = position / po->frames_per_block; - frame_offset = position % po->frames_per_block; - - h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size); + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: - if (status != (h.h1->tp_status ? TP_STATUS_USER : - TP_STATUS_KERNEL)) - return NULL; + h.h1->tp_status = status; + flush_dcache_page(virt_to_page(&h.h1->tp_status)); break; case TPACKET_V2: - if (status != (h.h2->tp_status ? 
TP_STATUS_USER : - TP_STATUS_KERNEL)) - return NULL; + h.h2->tp_status = status; + flush_dcache_page(virt_to_page(&h.h2->tp_status)); break; + default: + printk(KERN_ERR "TPACKET version not supported\n"); + BUG(); } - return h.raw; + + smp_wmb(); } -static void __packet_set_status(struct packet_sock *po, void *frame, int status) +static int __packet_get_status(struct packet_sock *po, void *frame) { union { struct tpacket_hdr *h1; @@ -243,16 +254,66 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) void *raw; } h; + smp_rmb(); + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: - h.h1->tp_status = status; - break; + flush_dcache_page(virt_to_page(&h.h1->tp_status)); + return h.h1->tp_status; case TPACKET_V2: - h.h2->tp_status = status; - break; + flush_dcache_page(virt_to_page(&h.h2->tp_status)); + return h.h2->tp_status; + default: + printk(KERN_ERR "TPACKET version not supported\n"); + BUG(); + return 0; } } + +static void *packet_lookup_frame(struct packet_sock *po, + struct packet_ring_buffer *rb, + unsigned int position, + int status) +{ + unsigned int pg_vec_pos, frame_offset; + union { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + void *raw; + } h; + + pg_vec_pos = position / rb->frames_per_block; + frame_offset = position % rb->frames_per_block; + + h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size); + + if (status != __packet_get_status(po, h.raw)) + return NULL; + + return h.raw; +} + +static inline void *packet_current_frame(struct packet_sock *po, + struct packet_ring_buffer *rb, + int status) +{ + return packet_lookup_frame(po, rb, rb->head, status); +} + +static inline void *packet_previous_frame(struct packet_sock *po, + struct packet_ring_buffer *rb, + int status) +{ + unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; + return packet_lookup_frame(po, rb, previous, status); +} + +static inline void packet_increment_head(struct packet_ring_buffer *buff) +{ + buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; +} + #endif static inline struct packet_sock *pkt_sk(struct sock *sk) @@ -648,7 +709,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe macoff = netoff - maclen; } - if (macoff + snaplen > po->frame_size) { + if (macoff + snaplen > po->rx_ring.frame_size) { if (po->copy_thresh && atomic_read(&sk->sk_rmem_alloc) + skb->truesize < (unsigned)sk->sk_rcvbuf) { @@ -661,16 +722,16 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe if (copy_skb) skb_set_owner_r(copy_skb, sk); } - snaplen = po->frame_size - macoff; + snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) snaplen = 0; } spin_lock(&sk->sk_receive_queue.lock); - h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL); + h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL); if (!h.raw) goto ring_is_full; - po->head = po->head != po->frame_max ? 
po->head+1 : 0; + packet_increment_head(&po->rx_ring); po->stats.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; @@ -727,7 +788,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe __packet_set_status(po, h.raw, status); smp_mb(); - { struct page *p_start, *p_end; u8 *h_end = h.raw + macoff + snaplen - 1; @@ -760,10 +820,249 @@ ring_is_full: goto drop_n_restore; } -#endif +static void tpacket_destruct_skb(struct sk_buff *skb) +{ + struct packet_sock *po = pkt_sk(skb->sk); + void * ph; + BUG_ON(skb == NULL); -static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, + if (likely(po->tx_ring.pg_vec)) { + ph = skb_shinfo(skb)->destructor_arg; + BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); + BUG_ON(atomic_read(&po->tx_ring.pending) == 0); + atomic_dec(&po->tx_ring.pending); + __packet_set_status(po, ph, TP_STATUS_AVAILABLE); + } + + sock_wfree(skb); +} + +static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff * skb, + void * frame, struct net_device *dev, int size_max, + __be16 proto, unsigned char * addr) +{ + union { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + void *raw; + } ph; + int to_write, offset, len, tp_len, nr_frags, len_max; + struct socket *sock = po->sk.sk_socket; + struct page *page; + void *data; + int err; + + ph.raw = frame; + + skb->protocol = proto; + skb->dev = dev; + skb->priority = po->sk.sk_priority; + skb_shinfo(skb)->destructor_arg = ph.raw; + + switch (po->tp_version) { + case TPACKET_V2: + tp_len = ph.h2->tp_len; + break; + default: + tp_len = ph.h1->tp_len; + break; + } + if (unlikely(tp_len > size_max)) { + printk(KERN_ERR "packet size is too long (%d > %d)\n", + tp_len, size_max); + return -EMSGSIZE; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reset_network_header(skb); + + data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); + to_write = tp_len; + + if (sock->type == SOCK_DGRAM) { + err = dev_hard_header(skb, dev, ntohs(proto), addr, + NULL, tp_len); + if (unlikely(err < 0)) + return -EINVAL; + } else if (dev->hard_header_len ) { + /* net device doesn't like empty head */ + if (unlikely(tp_len <= dev->hard_header_len)) { + printk(KERN_ERR "packet size is too short " + "(%d < %d)\n", tp_len, + dev->hard_header_len); + return -EINVAL; + } + + skb_push(skb, dev->hard_header_len); + err = skb_store_bits(skb, 0, data, + dev->hard_header_len); + if (unlikely(err)) + return err; + + data += dev->hard_header_len; + to_write -= dev->hard_header_len; + } + + err = -EFAULT; + page = virt_to_page(data); + offset = offset_in_page(data); + len_max = PAGE_SIZE - offset; + len = ((to_write > len_max) ? len_max : to_write); + + skb->data_len = to_write; + skb->len += to_write; + skb->truesize += to_write; + atomic_add(to_write, &po->sk.sk_wmem_alloc); + + while (likely(to_write)) { + nr_frags = skb_shinfo(skb)->nr_frags; + + if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { + printk(KERN_ERR "Packet exceed the number " + "of skb frags(%lu)\n", + MAX_SKB_FRAGS); + return -EFAULT; + } + + flush_dcache_page(page); + get_page(page); + skb_fill_page_desc(skb, + nr_frags, + page++, offset, len); + to_write -= len; + offset = 0; + len_max = PAGE_SIZE; + len = ((to_write > len_max) ? 
len_max : to_write); + } + + return tp_len; +} + +static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) +{ + struct socket *sock; + struct sk_buff *skb; + struct net_device *dev; + __be16 proto; + int ifindex, err, reserve = 0; + void * ph; + struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name; + int tp_len, size_max; + unsigned char *addr; + int len_sum = 0; + int status = 0; + + sock = po->sk.sk_socket; + + mutex_lock(&po->pg_vec_lock); + + err = -EBUSY; + if (saddr == NULL) { + ifindex = po->ifindex; + proto = po->num; + addr = NULL; + } else { + err = -EINVAL; + if (msg->msg_namelen < sizeof(struct sockaddr_ll)) + goto out; + if (msg->msg_namelen < (saddr->sll_halen + + offsetof(struct sockaddr_ll, + sll_addr))) + goto out; + ifindex = saddr->sll_ifindex; + proto = saddr->sll_protocol; + addr = saddr->sll_addr; + } + + dev = dev_get_by_index(sock_net(&po->sk), ifindex); + err = -ENXIO; + if (unlikely(dev == NULL)) + goto out; + + reserve = dev->hard_header_len; + + err = -ENETDOWN; + if (unlikely(!(dev->flags & IFF_UP))) + goto out_put; + + size_max = po->tx_ring.frame_size + - sizeof(struct skb_shared_info) + - po->tp_hdrlen + - LL_ALLOCATED_SPACE(dev) + - sizeof(struct sockaddr_ll); + + if (size_max > dev->mtu + reserve) + size_max = dev->mtu + reserve; + + do { + ph = packet_current_frame(po, &po->tx_ring, + TP_STATUS_SEND_REQUEST); + + if (unlikely(ph == NULL)) { + schedule(); + continue; + } + + status = TP_STATUS_SEND_REQUEST; + skb = sock_alloc_send_skb(&po->sk, + LL_ALLOCATED_SPACE(dev) + + sizeof(struct sockaddr_ll), + 0, &err); + + if (unlikely(skb == NULL)) + goto out_status; + + tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, + addr); + + if (unlikely(tp_len < 0)) { + if (po->tp_loss) { + __packet_set_status(po, ph, + TP_STATUS_AVAILABLE); + packet_increment_head(&po->tx_ring); + kfree_skb(skb); + continue; + } else { + status = TP_STATUS_WRONG_FORMAT; + err = tp_len; + goto out_status; + } + } + + skb->destructor = tpacket_destruct_skb; + __packet_set_status(po, ph, TP_STATUS_SENDING); + atomic_inc(&po->tx_ring.pending); + + status = TP_STATUS_SEND_REQUEST; + err = dev_queue_xmit(skb); + if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0)) + goto out_xmit; + packet_increment_head(&po->tx_ring); + len_sum += tp_len; + } + while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT)) + && (atomic_read(&po->tx_ring.pending)))) + ); + + err = len_sum; + goto out_put; + +out_xmit: + skb->destructor = sock_wfree; + atomic_dec(&po->tx_ring.pending); +out_status: + __packet_set_status(po, ph, status); + kfree_skb(skb); +out_put: + dev_put(dev); +out: + mutex_unlock(&po->pg_vec_lock); + return err; +} +#endif + +static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -854,6 +1153,19 @@ out: return err; } +static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ +#ifdef CONFIG_PACKET_MMAP + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + if (po->tx_ring.pg_vec) + return tpacket_snd(po, msg); + else +#endif + return packet_snd(sock, msg, len); +} + /* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. 
@@ -864,6 +1176,9 @@ static int packet_release(struct socket *sock) struct sock *sk = sock->sk; struct packet_sock *po; struct net *net; +#ifdef CONFIG_PACKET_MMAP + struct tpacket_req req; +#endif if (!sk) return 0; @@ -893,11 +1208,13 @@ static int packet_release(struct socket *sock) packet_flush_mclist(sk); #ifdef CONFIG_PACKET_MMAP - if (po->pg_vec) { - struct tpacket_req req; - memset(&req, 0, sizeof(req)); - packet_set_ring(sk, &req, 1); - } + memset(&req, 0, sizeof(req)); + + if (po->rx_ring.pg_vec) + packet_set_ring(sk, &req, 1, 0); + + if (po->tx_ring.pg_vec) + packet_set_ring(sk, &req, 1, 1); #endif /* @@ -1391,7 +1708,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (level != SOL_PACKET) return -ENOPROTOOPT; - switch(optname) { + switch (optname) { case PACKET_ADD_MEMBERSHIP: case PACKET_DROP_MEMBERSHIP: { @@ -1415,6 +1732,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv #ifdef CONFIG_PACKET_MMAP case PACKET_RX_RING: + case PACKET_TX_RING: { struct tpacket_req req; @@ -1422,7 +1740,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return -EINVAL; if (copy_from_user(&req,optval,sizeof(req))) return -EFAULT; - return packet_set_ring(sk, &req, 0); + return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING); } case PACKET_COPY_THRESH: { @@ -1442,7 +1760,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->pg_vec) + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; @@ -1461,13 +1779,26 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->pg_vec) + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; po->tp_reserve = val; return 0; } + case PACKET_LOSS: + { + unsigned int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) + return -EBUSY; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + po->tp_loss = !!val; + return 0; + } #endif case PACKET_AUXDATA: { @@ -1517,7 +1848,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, if (len < 0) return -EINVAL; - switch(optname) { + switch (optname) { case PACKET_STATISTICS: if (len > sizeof(struct tpacket_stats)) len = sizeof(struct tpacket_stats); @@ -1573,6 +1904,12 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, val = po->tp_reserve; data = &val; break; + case PACKET_LOSS: + if (len > sizeof(unsigned int)) + len = sizeof(unsigned int); + val = po->tp_loss; + data = &val; + break; #endif default: return -ENOPROTOOPT; @@ -1643,7 +1980,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, { struct sock *sk = sock->sk; - switch(cmd) { + switch (cmd) { case SIOCOUTQ: { int amount = atomic_read(&sk->sk_wmem_alloc); @@ -1705,13 +2042,17 @@ static unsigned int packet_poll(struct file * file, struct socket *sock, unsigned int mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); - if (po->pg_vec) { - unsigned last = po->head ? 
po->head-1 : po->frame_max; - - if (packet_lookup_frame(po, last, TP_STATUS_USER)) + if (po->rx_ring.pg_vec) { + if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) mask |= POLLIN | POLLRDNORM; } spin_unlock_bh(&sk->sk_receive_queue.lock); + spin_lock_bh(&sk->sk_write_queue.lock); + if (po->tx_ring.pg_vec) { + if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) + mask |= POLLOUT | POLLWRNORM; + } + spin_unlock_bh(&sk->sk_write_queue.lock); return mask; } @@ -1788,21 +2129,33 @@ out_free_pgvec: goto out; } -static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) +static int packet_set_ring(struct sock *sk, struct tpacket_req *req, + int closing, int tx_ring) { char **pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); int was_running, order = 0; + struct packet_ring_buffer *rb; + struct sk_buff_head *rb_queue; __be16 num; - int err = 0; + int err; - if (req->tp_block_nr) { - int i; + rb = tx_ring ? &po->tx_ring : &po->rx_ring; + rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; - /* Sanity tests and some calculations */ + err = -EBUSY; + if (!closing) { + if (atomic_read(&po->mapped)) + goto out; + if (atomic_read(&rb->pending)) + goto out; + } - if (unlikely(po->pg_vec)) - return -EBUSY; + if (req->tp_block_nr) { + /* Sanity tests and some calculations */ + err = -EBUSY; + if (unlikely(rb->pg_vec)) + goto out; switch (po->tp_version) { case TPACKET_V1: @@ -1813,42 +2166,35 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing break; } + err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) - return -EINVAL; + goto out; if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) - return -EINVAL; + goto out; if (unlikely(req->tp_frame_size < po->tp_hdrlen + - po->tp_reserve)) - return -EINVAL; + po->tp_reserve)) + goto out; if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) - return -EINVAL; + goto out; - po->frames_per_block = req->tp_block_size/req->tp_frame_size; - if (unlikely(po->frames_per_block <= 0)) - return -EINVAL; - if (unlikely((po->frames_per_block * req->tp_block_nr) != - req->tp_frame_nr)) - return -EINVAL; + rb->frames_per_block = req->tp_block_size/req->tp_frame_size; + if (unlikely(rb->frames_per_block <= 0)) + goto out; + if (unlikely((rb->frames_per_block * req->tp_block_nr) != + req->tp_frame_nr)) + goto out; err = -ENOMEM; order = get_order(req->tp_block_size); pg_vec = alloc_pg_vec(req, order); if (unlikely(!pg_vec)) goto out; - - for (i = 0; i < req->tp_block_nr; i++) { - void *ptr = pg_vec[i]; - int k; - - for (k = 0; k < po->frames_per_block; k++) { - __packet_set_status(po, ptr, TP_STATUS_KERNEL); - ptr += req->tp_frame_size; - } - } - /* Done */ - } else { + } + /* Done */ + else { + err = -EINVAL; if (unlikely(req->tp_frame_nr)) - return -EINVAL; + goto out; } lock_sock(sk); @@ -1872,23 +2218,24 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing if (closing || atomic_read(&po->mapped) == 0) { err = 0; #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; }) - - spin_lock_bh(&sk->sk_receive_queue.lock); - pg_vec = XC(po->pg_vec, pg_vec); - po->frame_max = (req->tp_frame_nr - 1); - po->head = 0; - po->frame_size = req->tp_frame_size; - spin_unlock_bh(&sk->sk_receive_queue.lock); - - order = XC(po->pg_vec_order, order); - req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr); - - po->pg_vec_pages = req->tp_block_size/PAGE_SIZE; - po->prot_hook.func = po->pg_vec ? 
tpacket_rcv : packet_rcv;
-		skb_queue_purge(&sk->sk_receive_queue);
+		spin_lock_bh(&rb_queue->lock);
+		pg_vec = XC(rb->pg_vec, pg_vec);
+		rb->frame_max = (req->tp_frame_nr - 1);
+		rb->head = 0;
+		rb->frame_size = req->tp_frame_size;
+		spin_unlock_bh(&rb_queue->lock);
+
+		order = XC(rb->pg_vec_order, order);
+		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
+
+		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
+		po->prot_hook.func = (po->rx_ring.pg_vec) ?
+						tpacket_rcv : packet_rcv;
+		skb_queue_purge(rb_queue);
 #undef XC
 		if (atomic_read(&po->mapped))
-			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
+			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
+			       atomic_read(&po->mapped));
 	}
 	mutex_unlock(&po->pg_vec_lock);
@@ -1909,11 +2256,13 @@ out:
 	return err;
 }

-static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+static int packet_mmap(struct file *file, struct socket *sock,
+		struct vm_area_struct *vma)
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po = pkt_sk(sk);
-	unsigned long size;
+	unsigned long size, expected_size;
+	struct packet_ring_buffer *rb;
 	unsigned long start;
 	int err = -EINVAL;
 	int i;
@@ -1921,26 +2270,43 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st
 	if (vma->vm_pgoff)
 		return -EINVAL;

-	size = vma->vm_end - vma->vm_start;
-
 	mutex_lock(&po->pg_vec_lock);
-	if (po->pg_vec == NULL)
+
+	expected_size = 0;
+	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
+		if (rb->pg_vec) {
+			expected_size += rb->pg_vec_len
+						* rb->pg_vec_pages
+						* PAGE_SIZE;
+		}
+	}
+
+	if (expected_size == 0)
 		goto out;
-	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
+
+	size = vma->vm_end - vma->vm_start;
+	if (size != expected_size)
 		goto out;

 	start = vma->vm_start;
-	for (i = 0; i < po->pg_vec_len; i++) {
-		struct page *page = virt_to_page(po->pg_vec[i]);
-		int pg_num;
-
-		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
-			err = vm_insert_page(vma, start, page);
-			if (unlikely(err))
-				goto out;
-			start += PAGE_SIZE;
+	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
+		if (rb->pg_vec == NULL)
+			continue;
+
+		for (i = 0; i < rb->pg_vec_len; i++) {
+			struct page *page = virt_to_page(rb->pg_vec[i]);
+			int pg_num;
+
+			for (pg_num = 0; pg_num < rb->pg_vec_pages;
+					pg_num++, page++) {
+				err = vm_insert_page(vma, start, page);
+				if (unlikely(err))
+					goto out;
+				start += PAGE_SIZE;
+			}
 		}
 	}
+
 	atomic_inc(&po->mapped);
 	vma->vm_ops = &packet_mmap_ops;
 	err = 0;
-- cgit v1.2.3-70-g09d2


From 690cc3ffe33ac4a2857583c22d4c6244ae11684d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Wed, 13 May 2009 16:55:10 +0000
Subject: syscall: Implement a convenience function restart_syscall

Currently, when we have a signal pending, we have the functionality to
restart the current system call. There are other cases, such as nasty
lock ordering issues, where it makes sense to have a simple fix that
uses trylock and restarts the system call, buying time to figure out
how to rework the locking strategy.

Signed-off-by: Eric W. Biederman
Signed-off-by: David S.
Miller
---
 include/linux/sched.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049..d853f6bb0ba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2178,6 +2178,12 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
 	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
 }

+static inline int restart_syscall(void)
+{
+	set_tsk_thread_flag(current, TIF_SIGPENDING);
+	return -ERESTARTNOINTR;
+}
+
 static inline int signal_pending(struct task_struct *p)
 {
 	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
-- cgit v1.2.3-70-g09d2


From 93f154b594fe47e4a7e5358b309add449a046cd3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 18 May 2009 22:19:19 -0700
Subject: net: release dst entry in dev_hard_start_xmit()

One point of contention under high network load is the dst_release()
performed when a transmitted skb is freed. This is because NIC tx
completion calls dev_kfree_skb() long after the original call to
dev_queue_xmit(skb). The CPU cache is cold by then, and the atomic op in
dst_release() stalls. On SMP, this is quite visible if one CPU is 100%
handling softirqs for a network device, since dst_clone() is done by the
other cpus, causing cache line ping-pongs.

The right place to release the dst is in dev_hard_start_xmit(), for most
devices except virtual ones and a few other exceptions.

David Miller suggested defining a new device flag, set in
alloc_netdev_mq() (so that most devices set it at init time), and
carefully unset in devices which don't want a NULL skb->dst in their
ndo_start_xmit().

The list of devices that must clear this flag is:

- the loopback device, because it calls netif_rx() and, quoting Patrick:
  "ip_route_input() doesn't accept loopback addresses, so loopback
  packets already need to have a dst_entry attached."

- appletalk/ipddp.c : needs skb->dst in its xmit function

- all devices that call dev_queue_xmit() again from their xmit function
  (as some classifiers need skb->dst): bonding, vlan, macvlan, eql, ifb,
  hdlc_fr

Signed-off-by: Eric Dumazet
Signed-off-by: David S.
Miller --- drivers/net/appletalk/ipddp.c | 1 + drivers/net/bonding/bond_main.c | 1 + drivers/net/eql.c | 1 + drivers/net/ifb.c | 1 + drivers/net/loopback.c | 1 + drivers/net/macvlan.c | 1 + drivers/net/wan/hdlc_fr.c | 1 + include/linux/if.h | 3 +++ net/8021q/vlan_dev.c | 1 + net/core/dev.c | 9 +++++++++ 10 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index da64ba88d7f..f939e92fcf8 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -71,6 +71,7 @@ static struct net_device * __init ipddp_init(void) if (!dev) return ERR_PTR(-ENOMEM); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; strcpy(dev->name, "ipddp%d"); if (version_printed++ == 0) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 96d7689995c..92a9d69c565 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5148,6 +5148,7 @@ int bond_create(char *name, struct bond_params *params) goto out_rtnl; } + bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if (!name) { res = dev_alloc_name(bond_dev, "bond%d"); if (res < 0) diff --git a/drivers/net/eql.c b/drivers/net/eql.c index 5210bb1027c..19b7dd98394 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -194,6 +194,7 @@ static void __init eql_setup(struct net_device *dev) dev->type = ARPHRD_SLIP; dev->tx_queue_len = 5; /* Hands them off fast */ + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } static int eql_open(struct net_device *dev) diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 60a26300193..96713ef0629 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -156,6 +156,7 @@ static void ifb_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; random_ether_addr(dev->dev_addr); } diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 6f71157bea8..da472c68748 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -170,6 +170,7 @@ static void loopback_setup(struct net_device *dev) dev->tx_queue_len = 0; dev->type = ARPHRD_LOOPBACK; /* 0x0001*/ dev->flags = IFF_LOOPBACK; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO | NETIF_F_NO_CSUM diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 329cd50d0e2..d5334b41e4b 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -414,6 +414,7 @@ static void macvlan_setup(struct net_device *dev) { ether_setup(dev); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->netdev_ops = &macvlan_netdev_ops; dev->destructor = free_netdev; dev->header_ops = &macvlan_hard_header_ops, diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index 80053010109..bfa0161a02d 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -1054,6 +1054,7 @@ static void pvc_setup(struct net_device *dev) dev->flags = IFF_POINTOPOINT; dev->hard_header_len = 10; dev->addr_len = 2; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } static const struct net_device_ops pvc_ops = { diff --git a/include/linux/if.h b/include/linux/if.h index 1108f3e099e..b9a6229f3be 100644 --- a/include/linux/if.h +++ b/include/linux/if.h @@ -67,6 +67,9 @@ #define IFF_ISATAP 0x80 /* ISATAP interface (RFC4214) */ #define IFF_MASTER_ARPMON 0x100 /* bonding master, ARP mon in use */ #define IFF_WAN_HDLC 0x200 /* WAN HDLC device */ +#define IFF_XMIT_DST_RELEASE 0x400 /* dev_hard_start_xmit() is allowed to + * release skb->dst 
+ */ #define IF_GET_IFACE 0x0001 /* for querying only */ #define IF_GET_PROTO 0x0002 diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 8faacee6863..ff7572ac548 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -739,6 +739,7 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; diff --git a/net/core/dev.c b/net/core/dev.c index 6d3630d1627..92ebeca2990 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1688,6 +1688,14 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } + /* + * If device doesnt need skb->dst, release it right now while + * its hot in this cpu cache + */ + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) && skb->dst) { + dst_release(skb->dst); + skb->dst = NULL; + } rc = ops->ndo_start_xmit(skb, dev); /* * TODO: if skb_orphan() was called by @@ -5045,6 +5053,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); + dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); return dev; -- cgit v1.2.3-70-g09d2 From bfa568d19a3faed3b94978ad48ac15d1b0d7e5bc Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 2 May 2009 06:16:47 +0400 Subject: powerpc/85xx: Add PCI IDs for MPC8569 family processors This patch adds PCI IDs for MPC8569 and MPC8569E processors, plus adds appropriate quirks for these IDs, and thus makes PCI-E actually work on MPC8569E-MDS boards. Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/sysdev/fsl_pci.c | 2 ++ include/linux/pci_ids.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 376603df7c4..94d8b3feb1e 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -285,6 +285,8 @@ DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8543, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8547E, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8545E, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8545, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8569E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8569, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8568E, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8568, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8567E, quirk_fsl_pcie_header); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 06ba90c211a..7cc5b80327f 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2274,6 +2274,8 @@ #define PCI_DEVICE_ID_MPC8547E 0x0018 #define PCI_DEVICE_ID_MPC8545E 0x0019 #define PCI_DEVICE_ID_MPC8545 0x001a +#define PCI_DEVICE_ID_MPC8569E 0x0061 +#define PCI_DEVICE_ID_MPC8569 0x0060 #define PCI_DEVICE_ID_MPC8568E 0x0020 #define PCI_DEVICE_ID_MPC8568 0x0021 #define PCI_DEVICE_ID_MPC8567E 0x0022 -- cgit v1.2.3-70-g09d2 From 01af9507ff36578dad89b1cc88ff37ac18e719cb Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Wed, 15 Apr 2009 14:38:40 -0500 Subject: powerpc/85xx: Add P2020DS board support The P2020 is a dual e500v2 core based SOC with: * 3 PCIe controllers * 2 General purpose DMA controllers * 2 sRIO controllers * 3 eTSECS * USB 2.0 * 
SDHC * SPI, I2C, DUART * enhanced localbus * and optional Security (P2020E) security w/XOR acceleration The p2020 DS reference board is pretty similar to the existing MPC85xx DS boards and has a ULI 1575 connected on one of the PCIe controllers. Signed-off-by: Ted Peters Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/p2020ds.dts | 704 +++++++++++++++++++++++++++++++ arch/powerpc/platforms/85xx/mpc85xx_ds.c | 43 +- arch/powerpc/platforms/fsl_uli1575.c | 1 + arch/powerpc/sysdev/fsl_pci.c | 2 + include/linux/pci_ids.h | 2 + 5 files changed, 747 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/boot/dts/p2020ds.dts (limited to 'include/linux') diff --git a/arch/powerpc/boot/dts/p2020ds.dts b/arch/powerpc/boot/dts/p2020ds.dts new file mode 100644 index 00000000000..11019142813 --- /dev/null +++ b/arch/powerpc/boot/dts/p2020ds.dts @@ -0,0 +1,704 @@ +/* + * P2020 DS Device Tree Source + * + * Copyright 2009 Freescale Semiconductor Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +/dts-v1/; +/ { + model = "fsl,P2020"; + compatible = "fsl,P2020DS"; + #address-cells = <2>; + #size-cells = <2>; + + aliases { + ethernet0 = &enet0; + ethernet1 = &enet1; + ethernet2 = &enet2; + serial0 = &serial0; + serial1 = &serial1; + pci0 = &pci0; + pci1 = &pci1; + pci2 = &pci2; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + PowerPC,P2020@0 { + device_type = "cpu"; + reg = <0x0>; + next-level-cache = <&L2>; + }; + + PowerPC,P2020@1 { + device_type = "cpu"; + reg = <0x1>; + next-level-cache = <&L2>; + }; + }; + + memory { + device_type = "memory"; + }; + + localbus@ffe05000 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "fsl,elbc", "simple-bus"; + reg = <0 0xffe05000 0 0x1000>; + interrupts = <19 2>; + interrupt-parent = <&mpic>; + + ranges = <0x0 0x0 0x0 0xe8000000 0x08000000 + 0x1 0x0 0x0 0xe0000000 0x08000000 + 0x2 0x0 0x0 0xffa00000 0x00040000 + 0x3 0x0 0x0 0xffdf0000 0x00008000 + 0x4 0x0 0x0 0xffa40000 0x00040000 + 0x5 0x0 0x0 0xffa80000 0x00040000 + 0x6 0x0 0x0 0xffac0000 0x00040000>; + + nor@0,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "cfi-flash"; + reg = <0x0 0x0 0x8000000>; + bank-width = <2>; + device-width = <1>; + + ramdisk@0 { + reg = <0x0 0x03000000>; + read-only; + }; + + diagnostic@3000000 { + reg = <0x03000000 0x00e00000>; + read-only; + }; + + dink@3e00000 { + reg = <0x03e00000 0x00200000>; + read-only; + }; + + kernel@4000000 { + reg = <0x04000000 0x00400000>; + read-only; + }; + + jffs2@4400000 { + reg = <0x04400000 0x03b00000>; + }; + + dtb@7f00000 { + reg = <0x07f00000 0x00080000>; + read-only; + }; + + u-boot@7f80000 { + reg = <0x07f80000 0x00080000>; + read-only; + }; + }; + + nand@2,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,elbc-fcm-nand"; + reg = <0x2 0x0 0x40000>; + + u-boot@0 { + reg = <0x0 0x02000000>; + read-only; + }; + + jffs2@2000000 { + reg = <0x02000000 0x10000000>; + }; + + ramdisk@12000000 { + reg = <0x12000000 0x08000000>; + read-only; + }; + + kernel@1a000000 { + reg = <0x1a000000 0x04000000>; + }; + + dtb@1e000000 { + reg = <0x1e000000 0x01000000>; + read-only; + }; + + empty@1f000000 { + reg = <0x1f000000 0x21000000>; + }; + }; + + nand@4,0 { + compatible = "fsl,elbc-fcm-nand"; + reg = <0x4 0x0 0x40000>; + }; + + nand@5,0 { + compatible = "fsl,elbc-fcm-nand"; + 
reg = <0x5 0x0 0x40000>; + }; + + nand@6,0 { + compatible = "fsl,elbc-fcm-nand"; + reg = <0x6 0x0 0x40000>; + }; + }; + + soc@ffe00000 { + #address-cells = <1>; + #size-cells = <1>; + device_type = "soc"; + compatible = "fsl,p2020-immr", "simple-bus"; + ranges = <0x0 0 0xffe00000 0x100000>; + bus-frequency = <0>; // Filled out by uboot. + + ecm-law@0 { + compatible = "fsl,ecm-law"; + reg = <0x0 0x1000>; + fsl,num-laws = <12>; + }; + + ecm@1000 { + compatible = "fsl,p2020-ecm", "fsl,ecm"; + reg = <0x1000 0x1000>; + interrupts = <17 2>; + interrupt-parent = <&mpic>; + }; + + memory-controller@2000 { + compatible = "fsl,p2020-memory-controller"; + reg = <0x2000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <18 2>; + }; + + i2c@3000 { + #address-cells = <1>; + #size-cells = <0>; + cell-index = <0>; + compatible = "fsl-i2c"; + reg = <0x3000 0x100>; + interrupts = <43 2>; + interrupt-parent = <&mpic>; + dfsrr; + }; + + i2c@3100 { + #address-cells = <1>; + #size-cells = <0>; + cell-index = <1>; + compatible = "fsl-i2c"; + reg = <0x3100 0x100>; + interrupts = <43 2>; + interrupt-parent = <&mpic>; + dfsrr; + }; + + serial0: serial@4500 { + cell-index = <0>; + device_type = "serial"; + compatible = "ns16550"; + reg = <0x4500 0x100>; + clock-frequency = <0>; + interrupts = <42 2>; + interrupt-parent = <&mpic>; + }; + + serial1: serial@4600 { + cell-index = <1>; + device_type = "serial"; + compatible = "ns16550"; + reg = <0x4600 0x100>; + clock-frequency = <0>; + interrupts = <42 2>; + interrupt-parent = <&mpic>; + }; + + spi@7000 { + compatible = "fsl,espi"; + reg = <0x7000 0x1000>; + interrupts = <59 0x2>; + interrupt-parent = <&mpic>; + }; + + dma@c300 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,eloplus-dma"; + reg = <0xc300 0x4>; + ranges = <0x0 0xc100 0x200>; + cell-index = <1>; + dma-channel@0 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x0 0x80>; + cell-index = <0>; + interrupt-parent = <&mpic>; + interrupts = <76 2>; + }; + dma-channel@80 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x80 0x80>; + cell-index = <1>; + interrupt-parent = <&mpic>; + interrupts = <77 2>; + }; + dma-channel@100 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x100 0x80>; + cell-index = <2>; + interrupt-parent = <&mpic>; + interrupts = <78 2>; + }; + dma-channel@180 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x180 0x80>; + cell-index = <3>; + interrupt-parent = <&mpic>; + interrupts = <79 2>; + }; + }; + + gpio: gpio-controller@f000 { + #gpio-cells = <2>; + compatible = "fsl,mpc8572-gpio"; + reg = <0xf000 0x100>; + interrupts = <47 0x2>; + interrupt-parent = <&mpic>; + gpio-controller; + }; + + L2: l2-cache-controller@20000 { + compatible = "fsl,p2020-l2-cache-controller"; + reg = <0x20000 0x1000>; + cache-line-size = <32>; // 32 bytes + cache-size = <0x80000>; // L2, 512k + interrupt-parent = <&mpic>; + interrupts = <16 2>; + }; + + dma@21300 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,eloplus-dma"; + reg = <0x21300 0x4>; + ranges = <0x0 0x21100 0x200>; + cell-index = <0>; + dma-channel@0 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x0 0x80>; + cell-index = <0>; + interrupt-parent = <&mpic>; + interrupts = <20 2>; + }; + dma-channel@80 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x80 0x80>; + cell-index = <1>; + interrupt-parent = <&mpic>; + interrupts = <21 2>; + }; + dma-channel@100 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x100 0x80>; + cell-index = <2>; + interrupt-parent = <&mpic>; + interrupts = 
<22 2>; + }; + dma-channel@180 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x180 0x80>; + cell-index = <3>; + interrupt-parent = <&mpic>; + interrupts = <23 2>; + }; + }; + + usb@22000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl-usb2-dr"; + reg = <0x22000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <28 0x2>; + phy_type = "ulpi"; + }; + + enet0: ethernet@24000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <0>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x24000 0x1000>; + ranges = <0x0 0x24000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <29 2 30 2 34 2>; + interrupt-parent = <&mpic>; + tbi-handle = <&tbi0>; + phy-handle = <&phy0>; + phy-connection-type = "rgmii-id"; + + mdio@520 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,gianfar-mdio"; + reg = <0x520 0x20>; + + phy0: ethernet-phy@0 { + interrupt-parent = <&mpic>; + interrupts = <3 1>; + reg = <0x0>; + }; + phy1: ethernet-phy@1 { + interrupt-parent = <&mpic>; + interrupts = <3 1>; + reg = <0x1>; + }; + phy2: ethernet-phy@2 { + interrupt-parent = <&mpic>; + interrupts = <3 1>; + reg = <0x2>; + }; + tbi0: tbi-phy@11 { + reg = <0x11>; + device_type = "tbi-phy"; + }; + }; + }; + + enet1: ethernet@25000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <1>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x25000 0x1000>; + ranges = <0x0 0x25000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <35 2 36 2 40 2>; + interrupt-parent = <&mpic>; + tbi-handle = <&tbi1>; + phy-handle = <&phy1>; + phy-connection-type = "rgmii-id"; + + mdio@520 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,gianfar-tbi"; + reg = <0x520 0x20>; + + tbi1: tbi-phy@11 { + reg = <0x11>; + device_type = "tbi-phy"; + }; + }; + }; + + enet2: ethernet@26000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <2>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x26000 0x1000>; + ranges = <0x0 0x26000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <31 2 32 2 33 2>; + interrupt-parent = <&mpic>; + tbi-handle = <&tbi2>; + phy-handle = <&phy2>; + phy-connection-type = "rgmii-id"; + + mdio@520 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,gianfar-tbi"; + reg = <0x520 0x20>; + + tbi2: tbi-phy@11 { + reg = <0x11>; + device_type = "tbi-phy"; + }; + }; + }; + + sdhci@2e000 { + compatible = "fsl,p2020-esdhc", "fsl,esdhc"; + reg = <0x2e000 0x1000>; + interrupts = <72 0x2>; + interrupt-parent = <&mpic>; + /* Filled in by U-Boot */ + clock-frequency = <0>; + }; + + crypto@30000 { + compatible = "fsl,sec3.1", "fsl,sec3.0", "fsl,sec2.4", + "fsl,sec2.2", "fsl,sec2.1", "fsl,sec2.0"; + reg = <0x30000 0x10000>; + interrupts = <45 2 58 2>; + interrupt-parent = <&mpic>; + fsl,num-channels = <4>; + fsl,channel-fifo-len = <24>; + fsl,exec-units-mask = <0xbfe>; + fsl,descriptor-types-mask = <0x3ab0ebf>; + }; + + mpic: pic@40000 { + interrupt-controller; + #address-cells = <0>; + #interrupt-cells = <2>; + reg = <0x40000 0x40000>; + compatible = "chrp,open-pic"; + device_type = "open-pic"; + }; + + msi@41600 { + compatible = "fsl,mpic-msi"; + reg = <0x41600 0x80>; + msi-available-ranges = <0 0x100>; + interrupts = < + 0xe0 0 + 0xe1 0 + 0xe2 0 + 0xe3 0 + 0xe4 0 + 0xe5 0 + 0xe6 0 + 0xe7 0>; + interrupt-parent = <&mpic>; + }; + + global-utilities@e0000 { //global utilities block + compatible = 
"fsl,p2020-guts"; + reg = <0xe0000 0x1000>; + fsl,has-rstcr; + }; + }; + + pci0: pcie@ffe08000 { + compatible = "fsl,mpc8548-pcie"; + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0 0xffe08000 0 0x1000>; + bus-range = <0 255>; + ranges = <0x2000000 0x0 0x80000000 0 0x80000000 0x0 0x20000000 + 0x1000000 0x0 0x00000000 0 0xffc00000 0x0 0x10000>; + clock-frequency = <33333333>; + interrupt-parent = <&mpic>; + interrupts = <24 2>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + /* IDSEL 0x0 */ + 0000 0x0 0x0 0x1 &mpic 0x8 0x1 + 0000 0x0 0x0 0x2 &mpic 0x9 0x1 + 0000 0x0 0x0 0x3 &mpic 0xa 0x1 + 0000 0x0 0x0 0x4 &mpic 0xb 0x1 + >; + pcie@0 { + reg = <0x0 0x0 0x0 0x0 0x0>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + ranges = <0x2000000 0x0 0x80000000 + 0x2000000 0x0 0x80000000 + 0x0 0x20000000 + + 0x1000000 0x0 0x0 + 0x1000000 0x0 0x0 + 0x0 0x10000>; + }; + }; + + pci1: pcie@ffe09000 { + compatible = "fsl,mpc8548-pcie"; + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0 0xffe09000 0 0x1000>; + bus-range = <0 255>; + ranges = <0x2000000 0x0 0xa0000000 0 0xa0000000 0x0 0x20000000 + 0x1000000 0x0 0x00000000 0 0xffc10000 0x0 0x10000>; + clock-frequency = <33333333>; + interrupt-parent = <&mpic>; + interrupts = <25 2>; + interrupt-map-mask = <0xff00 0x0 0x0 0x7>; + interrupt-map = < + + // IDSEL 0x11 func 0 - PCI slot 1 + 0x8800 0x0 0x0 0x1 &i8259 0x9 0x2 + 0x8800 0x0 0x0 0x2 &i8259 0xa 0x2 + + // IDSEL 0x11 func 1 - PCI slot 1 + 0x8900 0x0 0x0 0x1 &i8259 0x9 0x2 + 0x8900 0x0 0x0 0x2 &i8259 0xa 0x2 + + // IDSEL 0x11 func 2 - PCI slot 1 + 0x8a00 0x0 0x0 0x1 &i8259 0x9 0x2 + 0x8a00 0x0 0x0 0x2 &i8259 0xa 0x2 + + // IDSEL 0x11 func 3 - PCI slot 1 + 0x8b00 0x0 0x0 0x1 &i8259 0x9 0x2 + 0x8b00 0x0 0x0 0x2 &i8259 0xa 0x2 + + // IDSEL 0x11 func 4 - PCI slot 1 + 0x8c00 0x0 0x0 0x1 &i8259 0x9 0x2 + 0x8c00 0x0 0x0 0x2 &i8259 0xa 0x2 + + // IDSEL 0x11 func 5 - PCI slot 1 + 0x8d00 0x0 0x0 0x1 &i8259 0x9 0x2 + 0x8d00 0x0 0x0 0x2 &i8259 0xa 0x2 + + // IDSEL 0x11 func 6 - PCI slot 1 + 0x8e00 0x0 0x0 0x1 &i8259 0x9 0x2 + 0x8e00 0x0 0x0 0x2 &i8259 0xa 0x2 + + // IDSEL 0x11 func 7 - PCI slot 1 + 0x8f00 0x0 0x0 0x1 &i8259 0x9 0x2 + 0x8f00 0x0 0x0 0x2 &i8259 0xa 0x2 + + // IDSEL 0x1d Audio + 0xe800 0x0 0x0 0x1 &i8259 0x6 0x2 + + // IDSEL 0x1e Legacy + 0xf000 0x0 0x0 0x1 &i8259 0x7 0x2 + 0xf100 0x0 0x0 0x1 &i8259 0x7 0x2 + + // IDSEL 0x1f IDE/SATA + 0xf800 0x0 0x0 0x1 &i8259 0xe 0x2 + 0xf900 0x0 0x0 0x1 &i8259 0x5 0x2 + >; + + pcie@0 { + reg = <0x0 0x0 0x0 0x0 0x0>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + ranges = <0x2000000 0x0 0xa0000000 + 0x2000000 0x0 0xa0000000 + 0x0 0x20000000 + + 0x1000000 0x0 0x0 + 0x1000000 0x0 0x0 + 0x0 0x10000>; + uli1575@0 { + reg = <0x0 0x0 0x0 0x0 0x0>; + #size-cells = <2>; + #address-cells = <3>; + ranges = <0x2000000 0x0 0xa0000000 + 0x2000000 0x0 0xa0000000 + 0x0 0x20000000 + + 0x1000000 0x0 0x0 + 0x1000000 0x0 0x0 + 0x0 0x10000>; + isa@1e { + device_type = "isa"; + #interrupt-cells = <2>; + #size-cells = <1>; + #address-cells = <2>; + reg = <0xf000 0x0 0x0 0x0 0x0>; + ranges = <0x1 0x0 0x1000000 0x0 0x0 + 0x1000>; + interrupt-parent = <&i8259>; + + i8259: interrupt-controller@20 { + reg = <0x1 0x20 0x2 + 0x1 0xa0 0x2 + 0x1 0x4d0 0x2>; + interrupt-controller; + device_type = "interrupt-controller"; + #address-cells = <0>; + #interrupt-cells = <2>; + compatible = "chrp,iic"; + interrupts = <4 1>; + 
interrupt-parent = <&mpic>; + }; + + i8042@60 { + #size-cells = <0>; + #address-cells = <1>; + reg = <0x1 0x60 0x1 0x1 0x64 0x1>; + interrupts = <1 3 12 3>; + interrupt-parent = + <&i8259>; + + keyboard@0 { + reg = <0x0>; + compatible = "pnpPNP,303"; + }; + + mouse@1 { + reg = <0x1>; + compatible = "pnpPNP,f03"; + }; + }; + + rtc@70 { + compatible = "pnpPNP,b00"; + reg = <0x1 0x70 0x2>; + }; + + gpio@400 { + reg = <0x1 0x400 0x80>; + }; + }; + }; + }; + + }; + + pci2: pcie@ffe0a000 { + compatible = "fsl,mpc8548-pcie"; + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0 0xffe0a000 0 0x1000>; + bus-range = <0 255>; + ranges = <0x2000000 0x0 0xc0000000 0 0xc0000000 0x0 0x20000000 + 0x1000000 0x0 0x00000000 0 0xffc20000 0x0 0x10000>; + clock-frequency = <33333333>; + interrupt-parent = <&mpic>; + interrupts = <26 2>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + /* IDSEL 0x0 */ + 0000 0x0 0x0 0x1 &mpic 0x0 0x1 + 0000 0x0 0x0 0x2 &mpic 0x1 0x1 + 0000 0x0 0x0 0x3 &mpic 0x2 0x1 + 0000 0x0 0x0 0x4 &mpic 0x3 0x1 + >; + pcie@0 { + reg = <0x0 0x0 0x0 0x0 0x0>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + ranges = <0x2000000 0x0 0xc0000000 + 0x2000000 0x0 0xc0000000 + 0x0 0x20000000 + + 0x1000000 0x0 0x0 + 0x1000000 0x0 0x0 + 0x0 0x10000>; + }; + }; +}; diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c index de66de7a9ca..53d5851a6c9 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c @@ -163,7 +163,8 @@ static void __init mpc85xx_ds_setup_arch(void) #ifdef CONFIG_PCI for_each_node_by_type(np, "pci") { if (of_device_is_compatible(np, "fsl,mpc8540-pci") || - of_device_is_compatible(np, "fsl,mpc8548-pcie")) { + of_device_is_compatible(np, "fsl,mpc8548-pcie") || + of_device_is_compatible(np, "fsl,p2020-pcie")) { struct resource rsrc; of_address_to_resource(np, 0, &rsrc); if ((rsrc.start & 0xfffff) == primary_phb_addr) @@ -195,9 +196,9 @@ static int __init mpc8544_ds_probe(void) primary_phb_addr = 0xb000; #endif return 1; - } else { - return 0; } + + return 0; } static struct of_device_id __initdata mpc85xxds_ids[] = { @@ -214,6 +215,7 @@ static int __init mpc85xxds_publish_devices(void) } machine_device_initcall(mpc8544_ds, mpc85xxds_publish_devices); machine_device_initcall(mpc8572_ds, mpc85xxds_publish_devices); +machine_device_initcall(p2020_ds, mpc85xxds_publish_devices); /* * Called very early, device-tree isn't unflattened @@ -227,9 +229,26 @@ static int __init mpc8572_ds_probe(void) primary_phb_addr = 0x8000; #endif return 1; - } else { - return 0; } + + return 0; +} + +/* + * Called very early, device-tree isn't unflattened + */ +static int __init p2020_ds_probe(void) +{ + unsigned long root = of_get_flat_dt_root(); + + if (of_flat_dt_is_compatible(root, "fsl,P2020DS")) { +#ifdef CONFIG_PCI + primary_phb_addr = 0x9000; +#endif + return 1; + } + + return 0; } define_machine(mpc8544_ds) { @@ -259,3 +278,17 @@ define_machine(mpc8572_ds) { .calibrate_decr = generic_calibrate_decr, .progress = udbg_progress, }; + +define_machine(p2020_ds) { + .name = "P2020 DS", + .probe = p2020_ds_probe, + .setup_arch = mpc85xx_ds_setup_arch, + .init_IRQ = mpc85xx_ds_pic_init, +#ifdef CONFIG_PCI + .pcibios_fixup_bus = fsl_pcibios_fixup_bus, +#endif + .get_irq = mpic_get_irq, + .restart = fsl_rstcr_restart, + .calibrate_decr = generic_calibrate_decr, + .progress = udbg_progress, +}; diff --git 
a/arch/powerpc/platforms/fsl_uli1575.c b/arch/powerpc/platforms/fsl_uli1575.c
index efd41f40984..fd23a1d4b39 100644
--- a/arch/powerpc/platforms/fsl_uli1575.c
+++ b/arch/powerpc/platforms/fsl_uli1575.c
@@ -55,6 +55,7 @@ static inline bool is_quirk_valid(void)
 {
 	return (machine_is(mpc86xx_hpcn) ||
 		machine_is(mpc8544_ds) ||
+		machine_is(p2020_ds) ||
 		machine_is(mpc8572_ds));
 }

diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 94d8b3feb1e..b20171d9df0 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -302,6 +302,8 @@ DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8536, quirk_fsl_pcie_header);
 DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8641, quirk_fsl_pcie_header);
 DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8641D, quirk_fsl_pcie_header);
 DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8610, quirk_fsl_pcie_header);
+DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2020E, quirk_fsl_pcie_header);
+DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2020, quirk_fsl_pcie_header);
 #endif /* CONFIG_PPC_85xx || CONFIG_PPC_86xx */

 #if defined(CONFIG_PPC_83xx) || defined(CONFIG_PPC_MPC512x)

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 7cc5b80327f..bfa0402d343 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2288,6 +2288,8 @@
 #define PCI_DEVICE_ID_MPC8572		0x0041
 #define PCI_DEVICE_ID_MPC8536E		0x0050
 #define PCI_DEVICE_ID_MPC8536		0x0051
+#define PCI_DEVICE_ID_P2020E		0x0070
+#define PCI_DEVICE_ID_P2020		0x0071
 #define PCI_DEVICE_ID_MPC8641		0x7010
 #define PCI_DEVICE_ID_MPC8641D		0x7011
 #define PCI_DEVICE_ID_MPC8610		0x7018
-- cgit v1.2.3-70-g09d2


From 4200efd9acda4accf24640f1e77d24fdcdb524df Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 19 May 2009 09:22:19 +0200
Subject: sched: properly define the sched_group::cpumask and sched_domain::span fields

Properly document the variable-size structure tricks we are doing
wrt. struct sched_group and struct sched_domain, and use the field[0]
GCC extension instead of defining a VLA array.

Don't use unions for this, as pointed out by Linus.

[ Impact: cleanup, un-confuse Sparse and LLVM ]

Reported-by: Jeff Garzik
Acked-by: Linus Torvalds
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 25 ++++++++++++++++++++++---
 kernel/sched.c        |  5 +++--
 2 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index de7b3b21777..dbb1043e865 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -839,7 +839,17 @@ struct sched_group {
 	 */
 	u32 reciprocal_cpu_power;

-	unsigned long cpumask[];
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_group' in kernel/sched.c)
+	 */
+	unsigned long cpumask[0];
 };

 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -925,8 +935,17 @@ struct sched_domain {
 	char *name;
 #endif

-	/* span of all CPUs in this domain */
-	unsigned long span[];
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_domain' in kernel/sched.c)
+	 */
+	unsigned long span[0];
 };

 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
diff --git a/kernel/sched.c b/kernel/sched.c
index 497c09ba61e..228acae8821 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;

 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
-- cgit v1.2.3-70-g09d2


From 79eb63e9e5875b84341a3a05f8e6ae9cdb4bb6f6 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh
Date: Sun, 17 May 2009 18:57:15 +0300
Subject: block: Add blk_make_request(), takes bio, returns a request

New block API: given a struct bio, allocate a new request. This is
the parallel of generic_make_request for BLOCK_PC command users.

The passed bio may be a chained bio. The bio is bounced if needed
inside the call to this member.

This is part of the effort to un-export blk_rq_append_bio().

Signed-off-by: Boaz Harrosh
CC: Jeff Garzik
Signed-off-by: Jens Axboe
---
 block/blk-core.c       | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  2 ++
 2 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index e3f7e6a3a09..bec1d69952d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -890,6 +890,51 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_get_request);

+/**
+ * blk_make_request - given a bio, allocate a corresponding struct request.
+ *
+ * @bio:  The bio describing the memory mappings that will be submitted for IO.
+ *        It may be a chained bio properly constructed by the block/bio layer.
+ *
+ * blk_make_request is the parallel of generic_make_request for BLOCK_PC
+ * type commands, where the struct request needs to be further initialized by
+ * the caller. It is passed a &struct bio, which describes the memory info of
+ * the I/O transfer.
+ *
+ * The caller of blk_make_request must make sure that bi_io_vec
+ * is set to describe the memory buffers, so that bio_data_dir() will return
+ * the needed direction of the request. (All bio's in the passed bio-chain
+ * must be set up accordingly.)
+ *
+ * If called under non-sleepable conditions, mapped bio buffers must not
+ * need bouncing, by calling the appropriate masked or flagged allocator,
+ * suitable for the target device. Otherwise the call to blk_queue_bounce will
+ * BUG.
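+ *
+ * Usage sketch (an illustrative addition, not part of the original patch;
+ * it assumes q, bio and bd_disk are already in scope for a BLOCK_PC-style
+ * caller, and elides error handling of the execute step):
+ *
+ *	rq = blk_make_request(q, bio, GFP_KERNEL);
+ *	if (IS_ERR(rq))
+ *		return PTR_ERR(rq);
+ *	rq->cmd_type = REQ_TYPE_BLOCK_PC;
+ *	blk_execute_rq(q, bd_disk, rq, 0);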
+ */ +struct request *blk_make_request(struct request_queue *q, struct bio *bio, + gfp_t gfp_mask) +{ + struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); + + if (unlikely(!rq)) + return ERR_PTR(-ENOMEM); + + for_each_bio(bio) { + struct bio *bounce_bio = bio; + int ret; + + blk_queue_bounce(q, &bounce_bio); + ret = blk_rq_append_bio(q, rq, bounce_bio); + if (unlikely(ret)) { + blk_put_request(rq); + return ERR_PTR(ret); + } + } + + return rq; +} +EXPORT_SYMBOL(blk_make_request); + /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f9d60a78c08..88a83e112c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -740,6 +740,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); +extern struct request *blk_make_request(struct request_queue *, struct bio *, + gfp_t); extern void blk_insert_request(struct request_queue *, struct request *, int, void *); extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); -- cgit v1.2.3-70-g09d2 From a411f4bbb89f1f08687b344064d6775bce1e4658 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 17 May 2009 19:00:01 +0300 Subject: block: Un-export blk_rq_append_bio OSD was the last in-tree user of blk_rq_append_bio(). Now that it is fixed blk_rq_append_bio is un-exported and is only used internally by block layer. Signed-off-by: Boaz Harrosh Signed-off-by: Jens Axboe --- block/blk-map.c | 1 - block/blk.h | 2 ++ include/linux/blkdev.h | 6 ------ 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/block/blk-map.c b/block/blk-map.c index caa05a66774..ef2492adca7 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -24,7 +24,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, } return 0; } -EXPORT_SYMBOL(blk_rq_append_bio); static int __blk_rq_unmap_user(struct bio *bio) { diff --git a/block/blk.h b/block/blk.h index 9e0042ca949..c863ec2281e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -13,6 +13,8 @@ extern struct kobj_type blk_queue_ktype; void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); +int blk_rq_append_bio(struct request_queue *q, struct request *rq, + struct bio *bio); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 88a83e112c9..564445be7a6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -757,12 +757,6 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -/* - * Temporary export, until SCSI gets fixed up. - */ -extern int blk_rq_append_bio(struct request_queue *q, struct request *rq, - struct bio *bio); - /* * A queue has just exitted congestion. 
From c44d70a340554a33071339064a303ac0f1a31623 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:24:08 +0200 Subject: perf_counter: fix counter inheritance race Context rotation should not occur while we are in the middle of walking the counter list when inheriting counters ... [ Impact: fix occasionally incorrect perf stat results ] Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c8c1dfc22c9..13cb2fbbf33 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -508,6 +508,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + int rr_allowed; struct task_struct *task; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7af16d1c480..4d8f97375f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1120,7 +1120,8 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx->rr_allowed) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); @@ -3108,6 +3109,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); INIT_LIST_HEAD(&ctx->event_list); + ctx->rr_allowed = 1; ctx->task = task; } @@ -3348,6 +3350,9 @@ void perf_counter_init_task(struct task_struct *child) */ mutex_lock(&parent_ctx->mutex); + parent_ctx->rr_allowed = 0; + barrier(); /* irqs */ + /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ @@ -3361,6 +3366,9 @@ void perf_counter_init_task(struct task_struct *child) break; } + barrier(); /* irqs */ + parent_ctx->rr_allowed = 1; + mutex_unlock(&parent_ctx->mutex); } -- cgit v1.2.3-70-g09d2 From 645069299a1c7358cf7330afe293f07552f11a5d Mon Sep 17 00:00:00 2001 From: Sascha Hlusiak Date: Tue, 19 May 2009 12:56:52 +0000 Subject: sit: stateless autoconf for isatap Router Solicitations (RS) will be sent periodically. The rs_delay can be specified when adding the PRL entry and defaults to 15 minutes. The RS is sent from every link-local address that's assigned to the tunnel interface. It's directed to the (guessed) link-local address of the router and is sent through the tunnel. Better: send to ff02::2 encapsulated in unicast directed to router-v4. Signed-off-by: Sascha Hlusiak Signed-off-by: David S.
Miller --- include/linux/if_tunnel.h | 2 +- include/net/ipip.h | 7 ++++++ net/ipv6/ndisc.c | 1 + net/ipv6/sit.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h index 5a9aae4adb4..5eb9b0f857e 100644 --- a/include/linux/if_tunnel.h +++ b/include/linux/if_tunnel.h @@ -44,7 +44,7 @@ struct ip_tunnel_prl { __u16 flags; __u16 __reserved; __u32 datalen; - __u32 __reserved2; + __u32 rs_delay; /* data follows */ }; diff --git a/include/net/ipip.h b/include/net/ipip.h index fdf9bd74370..5d3036fa151 100644 --- a/include/net/ipip.h +++ b/include/net/ipip.h @@ -28,11 +28,18 @@ struct ip_tunnel unsigned int prl_count; /* # of entries in PRL */ }; +/* ISATAP: default interval between RS in secondy */ +#define IPTUNNEL_RS_DEFAULT_DELAY (900) + struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry *next; __be32 addr; u16 flags; + unsigned long rs_delay; + struct timer_list rs_timer; + struct ip_tunnel *tunnel; + spinlock_t lock; }; #define IPTUNNEL_XMIT() do { \ diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index ab65cc51b00..e09f12ee57c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -658,6 +658,7 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, &icmp6h, NULL, send_sllao ? ND_OPT_SOURCE_LL_ADDR : 0); } +EXPORT_SYMBOL(ndisc_send_rs); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 3fd060076e7..b3a59bd40f0 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -15,6 +15,7 @@ * Roger Venning : 6to4 support * Nate Thompson : 6to4 support * Fred Templin : isatap support + * Sascha Hlusiak : stateless autoconf for isatap */ #include @@ -222,6 +223,44 @@ failed: return NULL; } +static void ipip6_tunnel_rs_timer(unsigned long data) +{ + struct ip_tunnel_prl_entry *p = (struct ip_tunnel_prl_entry *) data; + struct inet6_dev *ifp; + struct inet6_ifaddr *addr; + + spin_lock(&p->lock); + ifp = __in6_dev_get(p->tunnel->dev); + + read_lock_bh(&ifp->lock); + for (addr = ifp->addr_list; addr; addr = addr->if_next) { + struct in6_addr rtr; + + if (!(ipv6_addr_type(&addr->addr) & IPV6_ADDR_LINKLOCAL)) + continue; + + /* Send RS to guessed linklocal address of router + * + * Better: send to ff02::2 encapsuled in unicast directly + * to router-v4 instead of guessing the v6 address. + * + * Cisco/Windows seem to not set the u/l bit correctly, + * so we won't guess right. 
+ */ + ipv6_addr_set(&rtr, htonl(0xFE800000), 0, 0, 0); + if (!__ipv6_isatap_ifid(rtr.s6_addr + 8, + p->addr)) { + ndisc_send_rs(p->tunnel->dev, &addr->addr, &rtr); + } + } + read_unlock_bh(&ifp->lock); + + mod_timer(&p->rs_timer, jiffies + HZ * p->rs_delay); + spin_unlock(&p->lock); + + return; +} + static struct ip_tunnel_prl_entry * __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) { @@ -280,6 +319,7 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t, continue; kp[c].addr = prl->addr; kp[c].flags = prl->flags; + kp[c].rs_delay = prl->rs_delay; c++; if (kprl.addr != htonl(INADDR_ANY)) break; @@ -329,11 +369,23 @@ ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) } p->next = t->prl; + p->tunnel = t; t->prl = p; t->prl_count++; + + spin_lock_init(&p->lock); + setup_timer(&p->rs_timer, ipip6_tunnel_rs_timer, (unsigned long) p); update: p->addr = a->addr; p->flags = a->flags; + p->rs_delay = a->rs_delay; + if (p->rs_delay == 0) + p->rs_delay = IPTUNNEL_RS_DEFAULT_DELAY; + spin_lock(&p->lock); + del_timer(&p->rs_timer); + if (p->flags & PRL_DEFAULT) + mod_timer(&p->rs_timer, jiffies + 1); + spin_unlock(&p->lock); out: write_unlock(&ipip6_lock); return err; @@ -352,6 +404,9 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) if ((*p)->addr == a->addr) { x = *p; *p = x->next; + spin_lock(&x->lock); + del_timer(&x->rs_timer); + spin_unlock(&x->lock); kfree(x); t->prl_count--; goto out; @@ -362,6 +417,9 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) while (t->prl) { x = t->prl; t->prl = t->prl->next; + spin_lock(&x->lock); + del_timer(&x->rs_timer); + spin_unlock(&x->lock); kfree(x); t->prl_count--; } -- cgit v1.2.3-70-g09d2 From 86579a4cccf18a2ddbf7de8fc5a0f5d9b94ed76d Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Mon, 18 May 2009 16:04:44 -0700 Subject: Input: ads7846 - support swapping x and y axes Signed-off-by: Michael Roth Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ads7846.c | 7 +++++++ include/linux/spi/ads7846.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index 2b01e56568f..b5ad252f5cf 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -97,6 +97,8 @@ struct ads7846 { u16 x_plate_ohms; u16 pressure_max; + bool swap_xy; + struct ads7846_packet *packet; struct spi_transfer xfer[18]; @@ -599,6 +601,10 @@ static void ads7846_rx(void *ads) dev_dbg(&ts->spi->dev, "DOWN\n"); #endif } + + if (ts->swap_xy) + swap(x, y); + input_report_abs(input, ABS_X, x); input_report_abs(input, ABS_Y, y); input_report_abs(input, ABS_PRESSURE, Rt); @@ -917,6 +923,7 @@ static int __devinit ads7846_probe(struct spi_device *spi) ts->spi = spi; ts->input = input_dev; ts->vref_mv = pdata->vref_mv; + ts->swap_xy = pdata->swap_xy; hrtimer_init(&ts->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ts->timer.function = ads7846_timer; diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h index 2ea20320c09..51948eb6927 100644 --- a/include/linux/spi/ads7846.h +++ b/include/linux/spi/ads7846.h @@ -17,6 +17,7 @@ struct ads7846_platform_data { u16 vref_mv; /* external vref value, milliVolts */ bool keep_vref_on; /* set to keep vref on for differential * measurements as well */ + bool swap_xy; /* swap x and y axes */ /* Settling time of the analog signals; a function of Vcc and the * capacitance on the X/Y drivers. 
If set to non-zero, two samples -- cgit v1.2.3-70-g09d2 From 0a7ae2ff0d29bb3b327edff4c8ab67b3834fa811 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 May 2009 08:54:31 +0200 Subject: block: change the tag sync vs async restriction logic Make sync and async requests fully share the tag space, but disallow async requests from using the last two slots. Signed-off-by: Jens Axboe --- block/blk-barrier.c | 2 +- block/blk-core.c | 2 +- block/blk-tag.c | 15 +++++++++------ block/elevator.c | 8 ++++---- include/linux/blkdev.h | 7 ++++++- 5 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 0ab81a0a750..0d98054cdbd 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -218,7 +218,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) } else skip |= QUEUE_ORDSEQ_PREFLUSH; - if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight) + if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q)) rq = NULL; else skip |= QUEUE_ORDSEQ_DRAIN; diff --git a/block/blk-core.c b/block/blk-core.c index 49065075d46..1c748403882 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1815,7 +1815,7 @@ void blk_dequeue_request(struct request *rq) * the driver side. */ if (blk_account_rq(rq)) - q->in_flight++; + q->in_flight[rq_is_sync(rq)]++; } /** diff --git a/block/blk-tag.c b/block/blk-tag.c index c260f7c30dd..2e5cfeb5933 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -336,7 +336,7 @@ EXPORT_SYMBOL(blk_queue_end_tag); int blk_queue_start_tag(struct request_queue *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; - unsigned max_depth, offset; + unsigned max_depth; int tag; if (unlikely((rq->cmd_flags & REQ_QUEUED))) { @@ -355,13 +355,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) * to starve sync IO on behalf of flooding async IO.
*/ max_depth = bqt->max_depth; - if (rq_is_sync(rq)) - offset = 0; - else - offset = max_depth >> 2; + if (!rq_is_sync(rq) && max_depth > 1) { + max_depth -= 2; + if (!max_depth) + max_depth = 1; + if (q->in_flight[0] > max_depth) + return 1; + } do { - tag = find_next_zero_bit(bqt->tag_map, max_depth, offset); + tag = find_first_zero_bit(bqt->tag_map, max_depth); if (tag >= max_depth) return 1; diff --git a/block/elevator.c b/block/elevator.c index 918920056e4..ebee948293e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -546,7 +546,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) * in_flight count again */ if (blk_account_rq(rq)) { - q->in_flight--; + q->in_flight[rq_is_sync(rq)]--; if (blk_sorted_rq(rq)) elv_deactivate_rq(q, rq); } @@ -685,7 +685,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) if (unplug_it && blk_queue_plugged(q)) { int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - - q->in_flight; + - queue_in_flight(q); if (nrq >= q->unplug_thresh) __generic_unplug_device(q); @@ -823,7 +823,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq) * request is released from the driver, io must be done */ if (blk_account_rq(rq)) { - q->in_flight--; + q->in_flight[rq_is_sync(rq)]--; if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn) e->ops->elevator_completed_req_fn(q, rq); } @@ -838,7 +838,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq) if (!list_empty(&q->queue_head)) next = list_entry_rq(q->queue_head.next); - if (!q->in_flight && + if (!queue_in_flight(q) && blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 564445be7a6..a967dd775db 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -404,7 +404,7 @@ struct request_queue struct list_head tag_busy_list; unsigned int nr_sorted; - unsigned int in_flight; + unsigned int in_flight[2]; unsigned int rq_timeout; struct timer_list timeout; @@ -511,6 +511,11 @@ static inline void queue_flag_clear_unlocked(unsigned int flag, __clear_bit(flag, &q->queue_flags); } +static inline int queue_in_flight(struct request_queue *q) +{ + return q->in_flight[0] + q->in_flight[1]; +} + static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) { WARN_ON_ONCE(!queue_is_locked(q)); -- cgit v1.2.3-70-g09d2 From d7b629a34fc4134a43c730b5f0197855dc4948d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:19 +0200 Subject: perf_counter: Solve the rotate_ctx vs inherit race differently Instead of disabling RR scheduling of the counters, use a different list that does not get rotated to iterate the counters on inheritance. 
[ Impact: cleanup, optimization ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.237504544@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 15 +++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 13cb2fbbf33..c8c1dfc22c9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -508,7 +508,6 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; - int rr_allowed; struct task_struct *task; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4d8f97375f3..64113e6d194 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1120,8 +1120,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - if (ctx->rr_allowed) - rotate_ctx(ctx); + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); @@ -3109,7 +3108,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx, mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); INIT_LIST_HEAD(&ctx->event_list); - ctx->rr_allowed = 1; ctx->task = task; } @@ -3350,14 +3348,14 @@ void perf_counter_init_task(struct task_struct *child) */ mutex_lock(&parent_ctx->mutex); - parent_ctx->rr_allowed = 0; - barrier(); /* irqs */ - /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { + list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { + if (counter != counter->group_leader) + continue; + if (!counter->hw_event.inherit) continue; @@ -3366,9 +3364,6 @@ void perf_counter_init_task(struct task_struct *child) break; } - barrier(); /* irqs */ - parent_ctx->rr_allowed = 1; - mutex_unlock(&parent_ctx->mutex); } -- cgit v1.2.3-70-g09d2 From 26b119bc811a73bac6ecf95bdf284bf31c7955f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:20 +0200 Subject: perf_counter: Log irq_period changes For the dynamic irq_period code, log whenever we change the period so that analyzing code can normalize the event flow. 
[ Impact: add new feature to allow more precise profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.298769743@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 ++++++++ kernel/perf_counter.c | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c8c1dfc22c9..f612941ef46 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -257,6 +257,14 @@ enum perf_event_type { */ PERF_EVENT_COMM = 3, + /* + * struct { + * struct perf_event_header header; + * u64 irq_period; + * }; + */ + PERF_EVENT_PERIOD = 4, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 64113e6d194..db02eb16c77 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1046,7 +1046,9 @@ int perf_counter_task_enable(void) return 0; } -void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_log_period(struct perf_counter *counter, u64 period); + +static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; u64 irq_period; @@ -1072,6 +1074,8 @@ void perf_adjust_freq(struct perf_counter_context *ctx) if (!irq_period) irq_period = 1; + perf_log_period(counter, irq_period); + counter->hw.irq_period = irq_period; counter->hw.interrupts = 0; } @@ -2406,6 +2410,40 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } +/* + * + */ + +static void perf_log_period(struct perf_counter *counter, u64 period) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 period; + } freq_event = { + .header = { + .type = PERF_EVENT_PERIOD, + .misc = 0, + .size = sizeof(freq_event), + }, + .time = sched_clock(), + .period = period, + }; + + if (counter->hw.irq_period == period) + return; + + ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); + if (ret) + return; + + perf_output_put(&handle, freq_event); + perf_output_end(&handle); +} + /* * Generic counter overflow handling. */ -- cgit v1.2.3-70-g09d2 From 89f536ccfa8b370ff4d054f4061858ca9322c25a Mon Sep 17 00:00:00 2001 From: Stephane Chatty Date: Wed, 20 May 2009 15:41:24 +0200 Subject: HID: add new multitouch and digitizer constants Added constants to hid.h for all digitizer usages (including the new multitouch ones that are not yet in the official USB spec but are being pushed by Microsoft as described in their paper "Digitizer Drivers for Windows Touch and Pen-Based Computers"). Updated hid-debug.c to support the new MT input constants such as ABS_MT_POSITION_X.
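A hedged sketch of how a driver might consume these constants: the event hook below routes a few of the new digitizer usages into the MT input events that hid-debug.c now knows how to print. The function and the exact usage-to-axis mapping are illustrative assumptions, not part of this patch; where SYN_MT_REPORT is emitted depends on the device's report layout.

	/* Hypothetical HID event hook; the mapping shown is an illustrative
	 * assumption, not code from this patch. */
	static int my_mt_event(struct hid_device *hid, struct hid_field *field,
			       struct hid_usage *usage, __s32 value)
	{
		struct input_dev *input = field->hidinput->input;

		switch (usage->hid) {
		case HID_DG_WIDTH:
			input_event(input, EV_ABS, ABS_MT_TOUCH_MAJOR, value);
			return 1;	/* handled, skip generic mapping */
		case HID_DG_HEIGHT:
			input_event(input, EV_ABS, ABS_MT_TOUCH_MINOR, value);
			return 1;
		case HID_DG_CONTACTID:
			/* close out one contact's data for this frame */
			input_event(input, EV_SYN, SYN_MT_REPORT, 0);
			return 1;
		}
		return 0;	/* fall through to generic handling */
	}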
Signed-off-by: Stephane Chatty Signed-off-by: Jiri Kosina --- drivers/hid/hid-debug.c | 23 +++++++++++++++++++++-- include/linux/hid.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 47ac1a7d66e..04359ed64b8 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -137,6 +137,14 @@ static const struct hid_usage_entry hid_usage_table[] = { {0, 0x44, "BarrelSwitch"}, {0, 0x45, "Eraser"}, {0, 0x46, "TabletPick"}, + {0, 0x47, "Confidence"}, + {0, 0x48, "Width"}, + {0, 0x49, "Height"}, + {0, 0x51, "ContactID"}, + {0, 0x52, "InputMode"}, + {0, 0x53, "DeviceIndex"}, + {0, 0x54, "ContactCount"}, + {0, 0x55, "ContactMaximumNumber"}, { 15, 0, "PhysicalInterfaceDevice" }, {0, 0x00, "Undefined"}, {0, 0x01, "Physical_Interface_Device"}, @@ -514,9 +522,11 @@ static const char *events[EV_MAX + 1] = { [EV_FF_STATUS] = "ForceFeedbackStatus", }; -static const char *syncs[2] = { +static const char *syncs[3] = { [SYN_REPORT] = "Report", [SYN_CONFIG] = "Config", + [SYN_MT_REPORT] = "MT Report", }; + static const char *keys[KEY_MAX + 1] = { [KEY_RESERVED] = "Reserved", [KEY_ESC] = "Esc", [KEY_1] = "1", [KEY_2] = "2", @@ -734,8 +744,17 @@ static const char *absolutes[ABS_MAX + 1] = { [ABS_HAT2Y] = "Hat2Y", [ABS_HAT3X] = "Hat3X", [ABS_HAT3Y] = "Hat 3Y", [ABS_PRESSURE] = "Pressure", [ABS_DISTANCE] = "Distance", [ABS_TILT_X] = "XTilt", - [ABS_TILT_Y] = "YTilt", [ABS_TOOL_WIDTH] = "Tool Width", + [ABS_TILT_Y] = "YTilt", [ABS_TOOL_WIDTH] = "ToolWidth", [ABS_VOLUME] = "Volume", [ABS_MISC] = "Misc", + [ABS_MT_TOUCH_MAJOR] = "MTMajor", + [ABS_MT_TOUCH_MINOR] = "MTMinor", + [ABS_MT_WIDTH_MAJOR] = "MTMajorW", + [ABS_MT_WIDTH_MINOR] = "MTMinorW", + [ABS_MT_ORIENTATION] = "MTOrientation", + [ABS_MT_POSITION_X] = "MTPositionX", + [ABS_MT_POSITION_Y] = "MTPositionY", + [ABS_MT_TOOL_TYPE] = "MTToolType", + [ABS_MT_BLOB_ID] = "MTBlobID", }; static const char *misc[MSC_MAX + 1] = { diff --git a/include/linux/hid.h b/include/linux/hid.h index a72876e4358..53489fd4d70 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -238,6 +238,42 @@ struct hid_item { #define HID_GD_RIGHT 0x00010092 #define HID_GD_LEFT 0x00010093 +#define HID_DG_DIGITIZER 0x000d0001 +#define HID_DG_PEN 0x000d0002 +#define HID_DG_LIGHTPEN 0x000d0003 +#define HID_DG_TOUCHSCREEN 0x000d0004 +#define HID_DG_TOUCHPAD 0x000d0005 +#define HID_DG_STYLUS 0x000d0020 +#define HID_DG_PUCK 0x000d0021 +#define HID_DG_FINGER 0x000d0022 +#define HID_DG_TIPPRESSURE 0x000d0030 +#define HID_DG_BARRELPRESSURE 0x000d0031 +#define HID_DG_INRANGE 0x000d0032 +#define HID_DG_TOUCH 0x000d0033 +#define HID_DG_UNTOUCH 0x000d0034 +#define HID_DG_TAP 0x000d0035 +#define HID_DG_TABLETFUNCTIONKEY 0x000d0039 +#define HID_DG_PROGRAMCHANGEKEY 0x000d003a +#define HID_DG_INVERT 0x000d003c +#define HID_DG_TIPSWITCH 0x000d0042 +#define HID_DG_TIPSWITCH2 0x000d0043 +#define HID_DG_BARRELSWITCH 0x000d0044 +#define HID_DG_ERASER 0x000d0045 +#define HID_DG_TABLETPICK 0x000d0046 +/* + * as of May 20, 2009 the usages below are not yet in the official USB spec + * but are being pushed by Microsft as described in their paper "Digitizer + * Drivers for Windows Touch and Pen-Based Computers" + */ +#define HID_DG_CONFIDENCE 0x000d0047 +#define HID_DG_WIDTH 0x000d0048 +#define HID_DG_HEIGHT 0x000d0049 +#define HID_DG_CONTACTID 0x000d0051 +#define HID_DG_INPUTMODE 0x000d0052 +#define HID_DG_DEVICEINDEX 0x000d0053 +#define HID_DG_CONTACTCOUNT 
0x000d0054 +#define HID_DG_CONTACTMAX 0x000d0055 + /* * HID report types --- Ouch! HID spec says 1 2 3! */ -- cgit v1.2.3-70-g09d2 From d95ed9275edcb8995bda31005bb3f55e087626d7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 19 May 2009 18:27:17 +0000 Subject: af_packet: Teach to listen for multiple unicast addresses. The PACKET_ADD_MEMBERSHIP and PACKET_DROP_MEMBERSHIP setsockopt calls for af_packet already have all of the infrastructure needed to subscribe to multiple MAC addresses. All that is missing is a flag to say that the address we want to listen on is a unicast address. So introduce PACKET_MR_UNICAST and wire it up to dev_unicast_add and dev_unicast_delete. Additionally, I noticed that errors from dev_mc_add were not propagated from packet_dev_mc, so fix that. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/linux/if_packet.h | 1 + net/packet/af_packet.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h index 5b2badeb949..dea7d6b7cf9 100644 --- a/include/linux/if_packet.h +++ b/include/linux/if_packet.h @@ -145,5 +145,6 @@ struct packet_mreq #define PACKET_MR_MULTICAST 0 #define PACKET_MR_PROMISC 1 #define PACKET_MR_ALLMULTI 2 +#define PACKET_MR_UNICAST 3 #endif diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 766e6b41f7c..c7c5d524967 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1570,9 +1570,9 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, switch (i->type) { case PACKET_MR_MULTICAST: if (what > 0) - dev_mc_add(dev, i->addr, i->alen, 0); + return dev_mc_add(dev, i->addr, i->alen, 0); else - dev_mc_delete(dev, i->addr, i->alen, 0); + return dev_mc_delete(dev, i->addr, i->alen, 0); break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what);
- rename and export opencount_get to ima_counts_get - replace ima_shm_check calls with ima_counts_get - export ima_path_check Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/exec.c | 5 ++-- fs/namei.c | 6 +++-- include/linux/ima.h | 11 +++++---- ipc/shm.c | 4 ++-- mm/shmem.c | 2 +- security/integrity/ima/ima_main.c | 48 +++++++++++++++++++++++---------------- 6 files changed, 46 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 998e856c307..618d6d1e2c5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -130,7 +130,8 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; - error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); + error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN, + IMA_COUNT_UPDATE); if (error) goto exit; @@ -680,7 +681,7 @@ struct file *open_exec(const char *name) err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; - err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); + err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN, IMA_COUNT_UPDATE); if (err) goto out_path_put; diff --git a/fs/namei.c b/fs/namei.c index 78f253cd2d4..b05a2b1dea6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -853,7 +853,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); if (!err) - err = ima_path_check(&nd->path, MAY_EXEC); + err = ima_path_check(&nd->path, MAY_EXEC, + IMA_COUNT_UPDATE); if (err) break; @@ -1515,7 +1516,8 @@ int may_open(struct path *path, int acc_mode, int flag) return error; error = ima_path_check(path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), + IMA_COUNT_UPDATE); if (error) return error; /* diff --git a/include/linux/ima.h b/include/linux/ima.h index 0e2aa45cb0c..b1b827d091a 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -13,14 +13,17 @@ #include struct linux_binprm; +#define IMA_COUNT_UPDATE 1 +#define IMA_COUNT_LEAVE 0 + #ifdef CONFIG_IMA extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_inode_alloc(struct inode *inode); extern void ima_inode_free(struct inode *inode); -extern int ima_path_check(struct path *path, int mask); +extern int ima_path_check(struct path *path, int mask, int update_counts); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); -extern void ima_shm_check(struct file *file); +extern void ima_counts_get(struct file *file); #else static inline int ima_bprm_check(struct linux_binprm *bprm) @@ -38,7 +41,7 @@ static inline void ima_inode_free(struct inode *inode) return; } -static inline int ima_path_check(struct path *path, int mask) +static inline int ima_path_check(struct path *path, int mask, int update_counts) { return 0; } @@ -53,7 +56,7 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) return 0; } -static inline void ima_shm_check(struct file *file) +static inline void ima_counts_get(struct file *file) { return; } diff --git a/ipc/shm.c b/ipc/shm.c index faa46da99eb..47b464229cd 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -384,7 +384,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; - ima_shm_check(file); + ima_counts_get(file); id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (id < 0) { @@ -891,7 +891,7 @@ long do_shmat(int shmid, char 
__user *shmaddr, int shmflg, ulong *raddr) file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations); if (!file) goto out_free; - ima_shm_check(file); + ima_counts_get(file); file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; diff --git a/mm/shmem.c b/mm/shmem.c index b25f95ce3db..a817f75f144 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2684,7 +2684,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); - ima_shm_check(file); + ima_counts_get(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index c4228c0eb2d..a2eb23310ea 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -125,6 +125,15 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, return rc; } +static void ima_update_counts(struct ima_iint_cache *iint, int mask) +{ + iint->opencount++; + if ((mask & MAY_WRITE) || (mask == 0)) + iint->writecount++; + else if (mask & (MAY_READ | MAY_EXEC)) + iint->readcount++; +} + /** * ima_path_check - based on policy, collect/store measurement. * @path: contains a pointer to the path to be measured @@ -143,7 +152,7 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, * Return 0 on success, an error code on failure. * (Based on the results of appraise_measurement().) */ -int ima_path_check(struct path *path, int mask) +int ima_path_check(struct path *path, int mask, int update_counts) { struct inode *inode = path->dentry->d_inode; struct ima_iint_cache *iint; @@ -157,11 +166,8 @@ int ima_path_check(struct path *path, int mask) return 0; mutex_lock(&iint->mutex); - iint->opencount++; - if ((mask & MAY_WRITE) || (mask == 0)) - iint->writecount++; - else if (mask & (MAY_READ | MAY_EXEC)) - iint->readcount++; + if (update_counts) + ima_update_counts(iint, mask); rc = ima_must_measure(iint, inode, MAY_READ, PATH_CHECK); if (rc < 0) @@ -197,6 +203,7 @@ out: kref_put(&iint->refcount, iint_free); return 0; } +EXPORT_SYMBOL_GPL(ima_path_check); static int process_measurement(struct file *file, const unsigned char *filename, int mask, int function) @@ -225,7 +232,16 @@ out: return rc; } -static void opencount_get(struct file *file) +/* + * ima_opens_get - increment file counts + * + * - for IPC shm and shmat file. + * - for nfsd exported files. + * + * Increment the counts for these files to prevent unnecessary + * imbalance messages. + */ +void ima_counts_get(struct file *file) { struct inode *inode = file->f_dentry->d_inode; struct ima_iint_cache *iint; @@ -237,8 +253,14 @@ static void opencount_get(struct file *file) return; mutex_lock(&iint->mutex); iint->opencount++; + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + iint->readcount++; + + if (file->f_mode & FMODE_WRITE) + iint->writecount++; mutex_unlock(&iint->mutex); } +EXPORT_SYMBOL_GPL(ima_counts_get); /** * ima_file_mmap - based on policy, collect/store measurement. @@ -263,18 +285,6 @@ int ima_file_mmap(struct file *file, unsigned long prot) return 0; } -/* - * ima_shm_check - IPC shm and shmat create/fput a file - * - * Maintain the opencount for these files to prevent unnecessary - * imbalance messages. - */ -void ima_shm_check(struct file *file) -{ - opencount_get(file); - return; -} - /** * ima_bprm_check - based on policy, collect/store measurement. 
* @bprm: contains the linux_binprm structure -- cgit v1.2.3-70-g09d2 From 4ea7e38696c7e798c47ebbecadfd392f23f814f9 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 21 May 2009 07:36:08 +0000 Subject: dropmon: add ability to detect when hardware drops rx packets Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal every time a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. Signed-off-by: Neil Horman include/linux/net_dropmon.h | 8 ++ include/trace/napi.h | 11 +++ net/core/dev.c | 5 + net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 149 insertions(+), 5 deletions(-) Signed-off-by: David S. Miller --- include/linux/net_dropmon.h | 8 +++ include/trace/napi.h | 11 ++++ net/core/dev.c | 5 +- net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 ++ net/core/netpoll.c | 2 + 6 files changed, 149 insertions(+), 5 deletions(-) create mode 100644 include/trace/napi.h (limited to 'include/linux') diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb81a63..e8a8b5c50ed 100644 --- a/include/linux/net_dropmon.h +++ b/include/linux/net_dropmon.h @@ -2,12 +2,20 @@ #define __NET_DROPMON_H #include +#include struct net_dm_drop_point { __u8 pc[8]; __u32 count; }; +#define is_drop_point_hw(x) do {\ + int ____i, ____j = 0;\ + for (____i = 0; ____i < 8; ____i++)\ + ____j |= x[____i];\ + ____j;\ +} while (0) + #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 diff --git a/include/trace/napi.h b/include/trace/napi.h new file mode 100644 index 00000000000..a8989c4547e --- /dev/null +++ b/include/trace/napi.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_NAPI_H_ +#define _TRACE_NAPI_H_ + +#include +#include + +DECLARE_TRACE(napi_poll, + TP_PROTO(struct napi_struct *napi), + TP_ARGS(napi)); + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index 92ebeca2990..3942266d1f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2771,8 +2772,10 @@ static void net_rx_action(struct softirq_action *h) * accidentally calling ->poll() when NAPI is not scheduled.
*/ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2797b711a97..a6c2ac2828f 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include #include #include +#include #include +#include #include @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,6 +50,13 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; @@ -159,24 +170,80 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * Ratelimit our check time to dm_hw_check_delta jiffies + */ + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } + rcu_read_unlock(); +} + + +static void free_dm_hw_stat(struct rcu_head *head) +{ + struct dm_hw_stat_delta *n; + n = container_of(head, struct dm_hw_stat_delta, rcu); + kfree(n); +} + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *temp; + + spin_lock(&trace_state_lock); switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } break; default: rc = 1; break; } + if (!rc) + trace_state = state; + + spin_unlock(&trace_state_lock); + if (rc) return -EINPROGRESS; return rc; @@ -204,6 +271,44 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct 
net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *tmp; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + INIT_RCU_HEAD(&new_stat->rcu); + spin_lock(&trace_state_lock); + list_add_rcu(&new_stat->list, &hw_stats_list); + spin_unlock(&trace_state_lock); + break; + case NETDEV_UNREGISTER: + spin_lock(&trace_state_lock); + list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { + if (new_stat->dev == dev) { + new_stat->dev = NULL; + if (trace_state == TRACE_OFF) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + break; + } + } + } + spin_unlock(&trace_state_lock); + break; + } +out: + return NOTIFY_DONE; +} static struct genl_ops dropmon_ops[] = { { @@ -220,6 +325,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -243,12 +352,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -259,6 +374,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb45665e4..b07b25bd2cd 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,3 +28,6 @@ DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +DEFINE_TRACE(napi_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 64f51eec657..00b14e2c50e 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); -- cgit v1.2.3-70-g09d2 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of <linux/perf_counter.h> in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes.
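The lazy-allocation scheme is easiest to see in isolation. The following condensed sketch restates the pattern that find_get_context() uses in the diff below (allocate, initialize, publish with cmpxchg, back off if another task won the race); the wrapper name get_task_ctx is hypothetical and the body is a paraphrase of that code, not an exact excerpt:

	/* Condensed restatement of the install pattern in find_get_context() */
	static struct perf_counter_context *get_task_ctx(struct task_struct *task)
	{
		struct perf_counter_context *ctx, *tctx;

		ctx = task->perf_counter_ctxp;
		if (ctx)
			return ctx;

		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			return ERR_PTR(-ENOMEM);
		__perf_counter_init_context(ctx, task);

		smp_wmb();	/* publish initialized contents before the pointer */
		tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
		if (tctx) {
			kfree(ctx);	/* lost the race; use the winner's context */
			ctx = tctx;
		}
		return ctx;
	}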
The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + include/linux/init_task.h | 13 --- include/linux/perf_counter.h | 4 +- include/linux/sched.h | 6 +- kernel/exit.c | 3 +- kernel/fork.c | 1 + kernel/perf_counter.c | 218 +++++++++++++++++++++++++++---------------- 7 files changed, 145 insertions(+), 101 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e9021a90802..b4f64402a82 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include #include #include #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 503afaa0afa..d87247d2641 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,18 +108,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ - .perf_counter_ctx.event_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ - .perf_counter_ctx.lock = \ - __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), -#else -# define INIT_PERF_COUNTERS(tsk) -#endif - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) @@ -183,7 +171,6 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f612941ef46..07130900546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -449,7 +449,6 @@ struct perf_counter { struct hw_perf_counter hw; struct perf_counter_context *ctx; - struct task_struct *task; struct file *filp; struct perf_counter *parent; @@ -498,7 +497,6 @@ struct perf_counter { * Used as a container for task counters and CPU counters as well: */ struct perf_counter_context { -#ifdef CONFIG_PERF_COUNTERS /* * Protect the states of the counters in the list, * nr_active, and the list: @@ -516,6 +514,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + atomic_t refcount; struct task_struct *task; /* @@ -523,7 +522,6 @@ struct perf_counter_context { */ u64 time; u64 timestamp; -#endif }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index ff59d123151..9714d450f41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,7 +71,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -99,6 +98,7 @@ struct robust_list_head; struct bio; struct bts_tracer; struct fs_struct; +struct perf_counter_context; /* * List of flags we want to share for kernel threads, @@ -1387,7 +1387,9 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif - struct perf_counter_context perf_counter_ctx; +#ifdef CONFIG_PERF_COUNTERS + struct perf_counter_context *perf_counter_ctxp; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; diff --git a/kernel/exit.c b/kernel/exit.c index f9dfedd94af..99ad4063ee4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); #ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); + WARN_ON_ONCE(tsk->perf_counter_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index d32fef4d38e..e72a09f5355 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 08584c16049..06ea3eae886 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -97,6 +97,17 @@ void perf_enable(void) hw_perf_enable(); } +static void get_ctx(struct perf_counter_context *ctx) +{ + atomic_inc(&ctx->refcount); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) + kfree(ctx); +} + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -118,11 +129,17 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) ctx->nr_counters++; } +/* + * Remove a counter from the lists for its context. + * Must be called with counter->mutex and ctx->mutex held. 
+ */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + if (list_empty(&counter->list_entry)) + return; ctx->nr_counters--; list_del_init(&counter->list_entry); @@ -216,8 +233,6 @@ static void __perf_counter_remove_from_context(void *info) counter_sched_out(counter, cpuctx, ctx); - counter->task = NULL; - list_del_counter(counter, ctx); if (!ctx->task) { @@ -279,7 +294,6 @@ retry: */ if (!list_empty(&counter->list_entry)) { list_del_counter(counter, ctx); - counter->task = NULL; } spin_unlock_irq(&ctx->lock); } @@ -568,11 +582,17 @@ static void __perf_install_in_context(void *info) * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no counters. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); /* @@ -653,7 +673,6 @@ perf_install_in_context(struct perf_counter_context *ctx, return; } - counter->task = task; retry: task_oncpu_function_call(task, __perf_install_in_context, counter); @@ -693,10 +712,14 @@ static void __perf_counter_enable(void *info) * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); counter->prev_state = counter->state; @@ -852,10 +875,10 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; struct pt_regs *regs; - if (likely(!cpuctx->task_ctx)) + if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); @@ -871,6 +894,8 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + if (!cpuctx->task_ctx) + return; __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } @@ -969,8 +994,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, void perf_counter_task_sched_in(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; + if (likely(!ctx)) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -985,11 +1012,11 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) int perf_counter_task_disable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1020,12 +1047,12 @@ int 
perf_counter_task_disable(void) int perf_counter_task_enable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; int cpu; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1128,19 +1155,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) return; cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &curr->perf_counter_ctx; + ctx = curr->perf_counter_ctxp; perf_adjust_freq(&cpuctx->ctx); - perf_adjust_freq(ctx); + if (ctx) + perf_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); - __perf_counter_task_sched_out(ctx); + if (ctx) + __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); - perf_counter_task_sched_in(curr, cpu); + if (ctx) + perf_counter_task_sched_in(curr, cpu); } /* @@ -1176,6 +1207,22 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1186,6 +1233,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; + struct perf_counter_context *tctx; struct task_struct *task; /* @@ -1225,15 +1273,36 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); - ctx = &task->perf_counter_ctx; - ctx->task = task; - /* Reuse ptrace permission checks for now. */ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_context(ctx); + put_task_struct(task); return ERR_PTR(-EACCES); } + ctx = task->perf_counter_ctxp; + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!ctx) { + put_task_struct(task); + return ERR_PTR(-ENOMEM); + } + __perf_counter_init_context(ctx, task); + /* + * Make sure other cpus see correct values for *ctx + * once task->perf_counter_ctxp is visible to them. + */ + smp_wmb(); + tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); + if (tctx) { + /* + * We raced with some other task; use + * the context they set. 
+ */ + kfree(ctx); + ctx = tctx; + } + } + return ctx; } @@ -1242,6 +1311,7 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + put_ctx(counter->ctx); kfree(counter); } @@ -2247,7 +2317,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) perf_counter_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); + perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event); } void perf_counter_comm(struct task_struct *task) @@ -2256,7 +2326,9 @@ void perf_counter_comm(struct task_struct *task) if (!atomic_read(&nr_comm_tracking)) return; - + if (!current->perf_counter_ctxp) + return; + comm_event = (struct perf_comm_event){ .task = task, .event = { @@ -2372,7 +2444,7 @@ got_name: perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event); kfree(buf); } @@ -2384,6 +2456,8 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, if (!atomic_read(&nr_mmap_tracking)) return; + if (!current->perf_counter_ctxp) + return; mmap_event = (struct perf_mmap_event){ .file = file, @@ -2985,6 +3059,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; + get_ctx(ctx); counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -3149,21 +3224,6 @@ err_put_context: goto out_fput; } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - ctx->task = task; -} - /* * inherit a counter from parent task to child task: */ @@ -3195,7 +3255,6 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link it up in the child's context: */ - child_counter->task = child; add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; @@ -3294,40 +3353,15 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; /* - * If we do not self-reap then we have to wait for the - * child task to unschedule (it will happen for sure), - * so that its counter is at its final count. (This - * condition triggers rarely - child tasks usually get - * off their CPU before the parent has a chance to - * get this far into the reaping action) + * Protect against concurrent operations on child_counter + * due its fd getting closed, etc. */ - if (child != current) { - wait_task_inactive(child, 0); - update_counter_times(child_counter); - list_del_counter(child_counter, child_ctx); - } else { - struct perf_cpu_context *cpuctx; - unsigned long flags; - - /* - * Disable and unlink this counter. 
- * - * Be careful about zapping the list - IRQ/NMI context - * could still be processing it: - */ - local_irq_save(flags); - perf_disable(); - - cpuctx = &__get_cpu_var(perf_cpu_context); + mutex_lock(&child_counter->mutex); - group_sched_out(child_counter, cpuctx, child_ctx); - update_counter_times(child_counter); + update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); - list_del_counter(child_counter, child_ctx); - - perf_enable(); - local_irq_restore(flags); - } + mutex_unlock(&child_counter->mutex); parent_counter = child_counter->parent; /* @@ -3346,19 +3380,29 @@ __perf_counter_exit_task(struct task_struct *child, * * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. + * (XXX not sure that is true when we get called from flush_old_exec. + * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + unsigned long flags; WARN_ON_ONCE(child != current); - child_ctx = &child->perf_counter_ctx; + child_ctx = child->perf_counter_ctxp; - if (likely(!child_ctx->nr_counters)) + if (likely(!child_ctx)) return; + local_irq_save(flags); + __perf_counter_task_sched_out(child_ctx); + child->perf_counter_ctxp = NULL; + local_irq_restore(flags); + + mutex_lock(&child_ctx->mutex); + again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) @@ -3371,6 +3415,10 @@ again: */ if (!list_empty(&child_ctx->counter_list)) goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); } /* @@ -3382,19 +3430,25 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; - child_ctx = &child->perf_counter_ctx; - parent_ctx = &parent->perf_counter_ctx; - - __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = NULL; /* * This is executed from the parent task context, so inherit - * counters that have been marked for cloning: + * counters that have been marked for cloning. + * First allocate and initialize a context for the child. */ - if (likely(!parent_ctx->nr_counters)) + child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!child_ctx) + return; + + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) return; + __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = child_ctx; + /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. -- cgit v1.2.3-70-g09d2 From 564c2b210add41df9a3a5aaa365c1d97cff6110d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:27:22 +1000 Subject: perf_counter: Optimize context switch between identical inherited contexts When monitoring a process and its descendants with a set of inherited counters, we can often get the situation in a context switch where both the old (outgoing) and new (incoming) process have the same set of counters, and their values are ultimately going to be added together. In that situation it doesn't matter which set of counters are used to count the activity for the new process, so there is really no need to go through the process of reading the hardware counters and updating the old task's counters and then setting up the PMU for the new task. This optimizes the context switch in this situation. 
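In outline, the fast path added by this patch is the following minimal sketch, distilled from the kernel/perf_counter.c hunks in the diff below (locking, the context-switch software event, and the fallback to the full sched-out path are elided):

	static int context_equiv(struct perf_counter_context *ctx1,
				 struct perf_counter_context *ctx2)
	{
		/*
		 * Equivalent iff cloned from the same generation of the
		 * same parent context, with the same number of enabled
		 * counters.
		 */
		return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
			&& ctx1->parent_gen == ctx2->parent_gen
			&& ctx1->nr_enabled == ctx2->nr_enabled;
	}

	void perf_counter_task_sched_out(struct task_struct *task,
					 struct task_struct *next, int cpu)
	{
		struct perf_counter_context *ctx = task->perf_counter_ctxp;
		struct perf_counter_context *next_ctx = next->perf_counter_ctxp;

		if (ctx && next_ctx && context_equiv(ctx, next_ctx)) {
			/*
			 * Swap the contexts: both tasks keep a valid
			 * context and the PMU keeps counting without
			 * interruption.
			 */
			task->perf_counter_ctxp = next_ctx;
			next->perf_counter_ctxp = ctx;
			ctx->task = next;
			next_ctx->task = task;
			return;
		}
		/* ... otherwise do the full __perf_counter_sched_out() path ... */
	}
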
Instead of scheduling out the perf_counter_context for the old task and scheduling in the new context, we simply transfer the old context to the new task and keep using it without interruption. The new context gets transferred to the old task. This means that both tasks still have a valid perf_counter_context, so no special case is introduced when the old task gets scheduled in again, either on this CPU or another CPU. The equivalence of contexts is detected by keeping a pointer in each cloned context pointing to the context it was cloned from. To cope with the situation where a context is changed by adding or removing counters after it has been cloned, we also keep a generation number on each context which is incremented every time a context is changed. When a context is cloned we take a copy of the parent's generation number, and two cloned contexts are equivalent only if they have the same parent and the same generation number. In order that the parent context pointer remains valid (and is not reused), we increment the parent context's reference count for each context cloned from it. Since we don't have individual fds for the counters in a cloned context, the only thing that can make two clones of a given parent different after they have been cloned is enabling or disabling all counters with prctl. To account for this, we keep a count of the number of enabled counters in each context. Two contexts must have the same number of enabled counters to be considered equivalent. Here are some measurements of the context switch time as measured with the lat_ctx benchmark from lmbench, comparing the times obtained with and without this patch series:

                -----Unmodified-----    With this patch series
  Counters:     none  2 HW  4H+4S       none  2 HW  4H+4S

  2 processes:
  Average       3.44  6.45  11.24       3.12  3.39  3.60
  St dev        0.04  0.04   0.13       0.05  0.17  0.19

  8 processes:
  Average       6.45  8.79  14.00       5.57  6.23  7.57
  St dev        1.27  1.04   0.88       1.42  1.46  1.42

  32 processes:
  Average       5.56  8.43  13.78       5.28  5.55  7.15
  St dev        0.41  0.47   0.53       0.54  0.57  0.81

The numbers are the mean and standard deviation of 20 runs of lat_ctx. The "none" columns are lat_ctx run directly without any counters. The "2 HW" columns are with lat_ctx run under perfstat, counting cycles and instructions. The "4H+4S" columns are lat_ctx run under perfstat with 4 hardware counters and 4 software counters (cycles, instructions, cache references, cache misses, task clock, context switch, cpu migrations, and page faults). [ Impact: performance optimization of counter context-switches ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 12 ++++- kernel/perf_counter.c | 109 +++++++++++++++++++++++++++++++++++++------ kernel/sched.c | 2 +- 3 files changed, 107 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 07130900546..4cae01a5045 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -513,6 +513,7 @@ struct perf_counter_context { struct list_head event_list; int nr_counters; int nr_active; + int nr_enabled; int is_active; atomic_t refcount; struct task_struct *task; @@ -522,6 +523,14 @@ struct perf_counter_context { */ u64 time; u64 timestamp; + + /* + * These fields let us detect when two contexts have both + * been cloned (inherited) from a common ancestor.
+ */ + struct perf_counter_context *parent_ctx; + u32 parent_gen; + u32 generation; }; /** @@ -552,7 +561,8 @@ extern int perf_max_counters; extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); -extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 06ea3eae886..c10055416de 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -104,8 +104,11 @@ static void get_ctx(struct perf_counter_context *ctx) static void put_ctx(struct perf_counter_context *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); kfree(ctx); + } } static void @@ -127,6 +130,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + ctx->nr_enabled++; } /* @@ -141,6 +146,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + ctx->nr_enabled--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -203,6 +210,22 @@ group_sched_out(struct perf_counter *group_counter, cpuctx->exclusive = 0; } +/* + * Mark this context as not being a clone of another. + * Called when counters are added to or removed from this context. + * We also increment our generation number so that anything that + * was cloned from this context before this will not match anything + * cloned from this context after this. 
+ */ +static void unclone_ctx(struct perf_counter_context *ctx) +{ + ++ctx->generation; + if (!ctx->parent_ctx) + return; + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; +} + /* * Cross CPU call to remove a performance counter * @@ -263,6 +286,7 @@ static void perf_counter_remove_from_context(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; + unclone_ctx(ctx); if (!task) { /* * Per cpu counters are removed via an smp call and @@ -378,6 +402,7 @@ static void __perf_counter_disable(void *info) else counter_sched_out(counter, cpuctx, ctx); counter->state = PERF_COUNTER_STATE_OFF; + ctx->nr_enabled--; } spin_unlock_irqrestore(&ctx->lock, flags); @@ -419,6 +444,7 @@ static void perf_counter_disable(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + ctx->nr_enabled--; } spin_unlock_irq(&ctx->lock); @@ -727,6 +753,7 @@ static void __perf_counter_enable(void *info) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + ctx->nr_enabled++; /* * If the counter is in a group and isn't the group leader, @@ -817,6 +844,7 @@ static void perf_counter_enable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + ctx->nr_enabled++; } out: spin_unlock_irq(&ctx->lock); @@ -861,6 +889,25 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, spin_unlock(&ctx->lock); } +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled counters. + * If the number of enabled counters is the same, then the set + * of enabled counters should be the same, because these are both + * inherited contexts, therefore we can't access individual counters + * in them directly with an fd; we can only enable/disable all + * counters via prctl, or enable/disable all counters in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_counter_context *ctx1, + struct perf_counter_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && ctx1->nr_enabled == ctx2->nr_enabled; +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -872,10 +919,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, * accessing the counter control register. If a NMI hits, then it will * not restart the counter. 
*/ -void perf_counter_task_sched_out(struct task_struct *task, int cpu) +void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter_context *next_ctx; struct pt_regs *regs; if (likely(!ctx || !cpuctx->task_ctx)) @@ -885,6 +934,16 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + + next_ctx = next->perf_counter_ctxp; + if (next_ctx && context_equiv(ctx, next_ctx)) { + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + return; + } + __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -998,6 +1057,8 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) if (likely(!ctx)) return; + if (cpuctx->task_ctx == ctx) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -3252,6 +3313,16 @@ inherit_counter(struct perf_counter *parent_counter, if (IS_ERR(child_counter)) return child_counter; + /* + * Make the child state follow the state of the parent counter, + * not its hw_event.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en,dis}able_family. + */ + if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + else + child_counter->state = PERF_COUNTER_STATE_OFF; + /* * Link it up in the child's context: */ @@ -3277,16 +3348,6 @@ inherit_counter(struct perf_counter *parent_counter, mutex_lock(&parent_counter->mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); - /* - * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en,dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - mutex_unlock(&parent_counter->mutex); return child_counter; @@ -3429,6 +3490,7 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter_context *child_ctx, *parent_ctx; struct perf_counter *counter; struct task_struct *parent = current; + int inherited_all = 1; child->perf_counter_ctxp = NULL; @@ -3463,12 +3525,31 @@ void perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) + if (!counter->hw_event.inherit) { + inherited_all = 0; continue; + } if (inherit_group(counter, parent, - parent_ctx, child, child_ctx)) + parent_ctx, child, child_ctx)) { + inherited_all = 0; break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. 
+ */ + if (parent_ctx->parent_ctx) { + child_ctx->parent_ctx = parent_ctx->parent_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); } mutex_unlock(&parent_ctx->mutex); diff --git a/kernel/sched.c b/kernel/sched.c index 419a39d0988..4c0d58bce6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5091,7 +5091,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, cpu); + perf_counter_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3-70-g09d2 From 910431c7f2e963017d767b29c80ae706421e569f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 May 2009 12:32:15 +0200 Subject: perf_counter: fix !PERF_COUNTERS build failure Update the !CONFIG_PERF_COUNTERS prototype too, for perf_counter_task_sched_out(). [ Impact: build fix ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4cae01a5045..2eedae8498d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -625,7 +625,8 @@ extern void perf_counter_init(void); static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void -perf_counter_task_sched_out(struct task_struct *task, int cpu) { } +perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *child) { } -- cgit v1.2.3-70-g09d2 From e31a16d6f64ef0e324c6f54d5112703c3f13a9c4 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Thu, 21 May 2009 21:47:03 +0800 Subject: wireless: move some utility functions from mac80211 to cfg80211 The patch moves some utility functions from mac80211 to cfg80211. These functions perform generic 802.11 operations, so they are not mac80211-specific. Moving them allows fullmac drivers to benefit from these utility functions as well. Signed-off-by: Zhu Yi Signed-off-by: Samuel Ortiz Signed-off-by: John W.
Linville --- drivers/net/wireless/ath/ar9170/main.c | 2 +- drivers/net/wireless/ath/ath5k/pcu.c | 4 +- drivers/net/wireless/ath/ath9k/hw.c | 8 +- drivers/net/wireless/b43/main.c | 2 +- drivers/net/wireless/rt2x00/rt2x00crypto.c | 2 +- include/linux/ieee80211.h | 9 + include/net/cfg80211.h | 47 +++++ include/net/mac80211.h | 28 --- net/mac80211/ieee80211_i.h | 2 - net/mac80211/mesh.h | 4 - net/mac80211/rx.c | 89 +-------- net/mac80211/util.c | 73 ------- net/mac80211/wme.c | 30 +-- net/wireless/util.c | 305 +++++++++++++++++++++++++++++ 14 files changed, 375 insertions(+), 230 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath/ar9170/main.c b/drivers/net/wireless/ath/ar9170/main.c index 4ef1d2fc859..99df9ddae9c 100644 --- a/drivers/net/wireless/ath/ar9170/main.c +++ b/drivers/net/wireless/ath/ar9170/main.c @@ -1555,7 +1555,7 @@ static int ar9170_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, switch (key->alg) { case ALG_WEP: - if (key->keylen == LEN_WEP40) + if (key->keylen == WLAN_KEY_LEN_WEP40) ktype = AR9170_ENC_ALG_WEP64; else ktype = AR9170_ENC_ALG_WEP128; diff --git a/drivers/net/wireless/ath/ath5k/pcu.c b/drivers/net/wireless/ath/ath5k/pcu.c index 579aa0a96ab..ec35503f6a4 100644 --- a/drivers/net/wireless/ath/ath5k/pcu.c +++ b/drivers/net/wireless/ath/ath5k/pcu.c @@ -1038,9 +1038,9 @@ int ath5k_keycache_type(const struct ieee80211_key_conf *key) case ALG_CCMP: return AR5K_KEYTABLE_TYPE_CCM; case ALG_WEP: - if (key->keylen == LEN_WEP40) + if (key->keylen == WLAN_KEY_LEN_WEP40) return AR5K_KEYTABLE_TYPE_40; - else if (key->keylen == LEN_WEP104) + else if (key->keylen == WLAN_KEY_LEN_WEP104) return AR5K_KEYTABLE_TYPE_104; return -EINVAL; default: diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c index 4acfab51491..1579c9407ed 100644 --- a/drivers/net/wireless/ath/ath9k/hw.c +++ b/drivers/net/wireless/ath/ath9k/hw.c @@ -2472,14 +2472,14 @@ bool ath9k_hw_set_keycache_entry(struct ath_hw *ah, u16 entry, } break; case ATH9K_CIPHER_WEP: - if (k->kv_len < LEN_WEP40) { + if (k->kv_len < WLAN_KEY_LEN_WEP40) { DPRINTF(ah->ah_sc, ATH_DBG_ANY, "WEP key length %u too small\n", k->kv_len); return false; } - if (k->kv_len <= LEN_WEP40) + if (k->kv_len <= WLAN_KEY_LEN_WEP40) keyType = AR_KEYTABLE_TYPE_40; - else if (k->kv_len <= LEN_WEP104) + else if (k->kv_len <= WLAN_KEY_LEN_WEP104) keyType = AR_KEYTABLE_TYPE_104; else keyType = AR_KEYTABLE_TYPE_128; @@ -2498,7 +2498,7 @@ bool ath9k_hw_set_keycache_entry(struct ath_hw *ah, u16 entry, key2 = get_unaligned_le32(k->kv_val + 6); key3 = get_unaligned_le16(k->kv_val + 10); key4 = get_unaligned_le32(k->kv_val + 12); - if (k->kv_len <= LEN_WEP104) + if (k->kv_len <= WLAN_KEY_LEN_WEP104) key4 &= 0xff; /* diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c index ec8516eadc4..cb4a8712946 100644 --- a/drivers/net/wireless/b43/main.c +++ b/drivers/net/wireless/b43/main.c @@ -3637,7 +3637,7 @@ static int b43_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, err = -EINVAL; switch (key->alg) { case ALG_WEP: - if (key->keylen == LEN_WEP40) + if (key->keylen == WLAN_KEY_LEN_WEP40) algorithm = B43_SEC_ALGO_WEP40; else algorithm = B43_SEC_ALGO_WEP104; diff --git a/drivers/net/wireless/rt2x00/rt2x00crypto.c b/drivers/net/wireless/rt2x00/rt2x00crypto.c index 57ab42cfed3..bc4e81e2184 100644 --- a/drivers/net/wireless/rt2x00/rt2x00crypto.c +++ b/drivers/net/wireless/rt2x00/rt2x00crypto.c @@ -33,7 +33,7 @@ enum cipher rt2x00crypto_key_to_cipher(struct 
ieee80211_key_conf *key) { switch (key->alg) { case ALG_WEP: - if (key->keylen == LEN_WEP40) + if (key->keylen == WLAN_KEY_LEN_WEP40) return CIPHER_WEP64; else return CIPHER_WEP128; diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 05c29c01174..34de8b21f6d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -493,6 +493,7 @@ struct ieee80211s_hdr { /* Mesh flags */ #define MESH_FLAGS_AE_A4 0x1 #define MESH_FLAGS_AE_A5_A6 0x2 +#define MESH_FLAGS_AE 0x3 #define MESH_FLAGS_PS_DEEP 0x4 /** @@ -1085,6 +1086,14 @@ enum ieee80211_spectrum_mgmt_actioncode { WLAN_ACTION_SPCT_CHL_SWITCH = 4, }; +/* Security key length */ +enum ieee80211_key_len { + WLAN_KEY_LEN_WEP40 = 5, + WLAN_KEY_LEN_WEP104 = 13, + WLAN_KEY_LEN_CCMP = 16, + WLAN_KEY_LEN_TKIP = 32, +}; + /* * IEEE 802.11-2007 7.3.2.9 Country information element * diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 389f1d20adf..f20da7d63b1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1244,6 +1244,53 @@ extern int ieee80211_radiotap_iterator_init( extern int ieee80211_radiotap_iterator_next( struct ieee80211_radiotap_iterator *iterator); +extern const unsigned char rfc1042_header[6]; +extern const unsigned char bridge_tunnel_header[6]; + +/** + * ieee80211_get_hdrlen_from_skb - get header length from data + * + * Given an skb with a raw 802.11 header at the data pointer this function + * returns the 802.11 header length in bytes (not including encryption + * headers). If the data in the sk_buff is too short to contain a valid 802.11 + * header the function returns 0. + * + * @skb: the frame + */ +unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb); + +/** + * ieee80211_hdrlen - get header length in bytes from frame control + * @fc: frame control field in little-endian format + */ +unsigned int ieee80211_hdrlen(__le16 fc); + +/** + * ieee80211_data_to_8023 - convert an 802.11 data frame to 802.3 + * @skb: the 802.11 data frame + * @addr: the device MAC address + * @iftype: the virtual interface type + */ +int ieee80211_data_to_8023(struct sk_buff *skb, u8 *addr, + enum nl80211_iftype iftype); + +/** + * ieee80211_data_from_8023 - convert an 802.3 frame to 802.11 + * @skb: the 802.3 frame + * @addr: the device MAC address + * @iftype: the virtual interface type + * @bssid: the network bssid (used only for iftype STATION and ADHOC) + * @qos: build 802.11 QoS data frame + */ +int ieee80211_data_from_8023(struct sk_buff *skb, u8 *addr, + enum nl80211_iftype iftype, u8 *bssid, bool qos); + +/** + * cfg80211_classify8021d - determine the 802.1p/1d tag for a data frame + * @skb: the data frame + */ +unsigned int cfg80211_classify8021d(struct sk_buff *skb); + /* * Regulatory helper functions for wiphys */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 2d0610581ef..d72346ff324 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -671,16 +671,6 @@ enum ieee80211_key_alg { ALG_AES_CMAC, }; -/** - * enum ieee80211_key_len - key length - * @LEN_WEP40: WEP 5-byte long key - * @LEN_WEP104: WEP 13-byte long key - */ -enum ieee80211_key_len { - LEN_WEP40 = 5, - LEN_WEP104 = 13, -}; - /** * enum ieee80211_key_flags - key flags * @@ -1811,24 +1801,6 @@ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct sk_buff * ieee80211_get_buffered_bc(struct ieee80211_hw *hw, struct ieee80211_vif *vif); -/** - * ieee80211_get_hdrlen_from_skb - get header length from data - * - * Given an skb with a raw 802.11 header at the data 
pointer this function - * returns the 802.11 header length in bytes (not including encryption - * headers). If the data in the sk_buff is too short to contain a valid 802.11 - * header the function returns 0. - * - * @skb: the frame - */ -unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb); - -/** - * ieee80211_hdrlen - get header length in bytes from frame control - * @fc: frame control field in little-endian format - */ -unsigned int ieee80211_hdrlen(__le16 fc); - /** * ieee80211_get_tkip_key - get a TKIP rc4 for skb * diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8db8d16d206..c088c46704a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1085,8 +1085,6 @@ static inline int __ieee80211_resume(struct ieee80211_hw *hw) /* utility functions/constants */ extern void *mac80211_wiphy_privid; /* for wiphy privid */ -extern const unsigned char rfc1042_header[6]; -extern const unsigned char bridge_tunnel_header[6]; u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type); int ieee80211_frame_duration(struct ieee80211_local *local, size_t len, diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 832bb503ca9..c7d72819cdd 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -191,12 +191,8 @@ struct mesh_rmc { #define PLINK_CATEGORY 30 #define MESH_PATH_SEL_CATEGORY 32 -/* Mesh Header Flags */ -#define IEEE80211S_FLAGS_AE 0x3 - /* Public interfaces */ /* Various */ -int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr); int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr, struct ieee80211_sub_if_data *sdata); int mesh_rmc_check(u8 *addr, struct ieee80211s_hdr *mesh_hdr, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index f3a041cc5dc..6a9b8e63a6b 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1247,93 +1247,12 @@ ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) } static int -ieee80211_data_to_8023(struct ieee80211_rx_data *rx) +__ieee80211_data_to_8023(struct ieee80211_rx_data *rx) { struct net_device *dev = rx->dev; - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data; - u16 hdrlen, ethertype; - u8 *payload; - u8 dst[ETH_ALEN]; - u8 src[ETH_ALEN] __aligned(2); - struct sk_buff *skb = rx->skb; struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) - return -1; - - hdrlen = ieee80211_hdrlen(hdr->frame_control); - - /* convert IEEE 802.11 header + possible LLC headers into Ethernet - * header - * IEEE 802.11 address fields: - * ToDS FromDS Addr1 Addr2 Addr3 Addr4 - * 0 0 DA SA BSSID n/a - * 0 1 DA BSSID SA n/a - * 1 0 BSSID SA DA n/a - * 1 1 RA TA DA SA - */ - memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN); - memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN); - - switch (hdr->frame_control & - cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { - case cpu_to_le16(IEEE80211_FCTL_TODS): - if (unlikely(sdata->vif.type != NL80211_IFTYPE_AP && - sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) - return -1; - break; - case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): - if (unlikely(sdata->vif.type != NL80211_IFTYPE_WDS && - sdata->vif.type != NL80211_IFTYPE_MESH_POINT)) - return -1; - if (ieee80211_vif_is_mesh(&sdata->vif)) { - struct ieee80211s_hdr *meshdr = (struct ieee80211s_hdr *) - (skb->data + hdrlen); - hdrlen += ieee80211_get_mesh_hdrlen(meshdr); - if (meshdr->flags & MESH_FLAGS_AE_A5_A6) { - memcpy(dst, meshdr->eaddr1, ETH_ALEN); 
- memcpy(src, meshdr->eaddr2, ETH_ALEN); - } - } - break; - case cpu_to_le16(IEEE80211_FCTL_FROMDS): - if (sdata->vif.type != NL80211_IFTYPE_STATION || - (is_multicast_ether_addr(dst) && - !compare_ether_addr(src, dev->dev_addr))) - return -1; - break; - case cpu_to_le16(0): - if (sdata->vif.type != NL80211_IFTYPE_ADHOC) - return -1; - break; - } - - if (unlikely(skb->len - hdrlen < 8)) - return -1; - - payload = skb->data + hdrlen; - ethertype = (payload[6] << 8) | payload[7]; - - if (likely((compare_ether_addr(payload, rfc1042_header) == 0 && - ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || - compare_ether_addr(payload, bridge_tunnel_header) == 0)) { - /* remove RFC1042 or Bridge-Tunnel encapsulation and - * replace EtherType */ - skb_pull(skb, hdrlen + 6); - memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN); - memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN); - } else { - struct ethhdr *ehdr; - __be16 len; - - skb_pull(skb, hdrlen); - len = htons(skb->len); - ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr)); - memcpy(ehdr->h_dest, dst, ETH_ALEN); - memcpy(ehdr->h_source, src, ETH_ALEN); - ehdr->h_proto = len; - } - return 0; + return ieee80211_data_to_8023(rx->skb, dev->dev_addr, sdata->vif.type); } /* @@ -1472,7 +1391,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) if (!(rx->flags & IEEE80211_RX_AMSDU)) return RX_CONTINUE; - err = ieee80211_data_to_8023(rx); + err = __ieee80211_data_to_8023(rx); if (unlikely(err)) return RX_DROP_UNUSABLE; @@ -1658,7 +1577,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return RX_DROP_MONITOR; - err = ieee80211_data_to_8023(rx); + err = __ieee80211_data_to_8023(rx); if (unlikely(err)) return RX_DROP_UNUSABLE; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ffb6e88f2ec..949d857debd 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -35,15 +35,6 @@ /* privid for wiphys to determine whether they belong to us or not */ void *mac80211_wiphy_privid = &mac80211_wiphy_privid; -/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ -/* Ethernet-II snap header (RFC1042 for most EtherTypes) */ -const unsigned char rfc1042_header[] __aligned(2) = - { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; - -/* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ -const unsigned char bridge_tunnel_header[] __aligned(2) = - { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; - struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) { struct ieee80211_local *local; @@ -103,70 +94,6 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, return NULL; } -unsigned int ieee80211_hdrlen(__le16 fc) -{ - unsigned int hdrlen = 24; - - if (ieee80211_is_data(fc)) { - if (ieee80211_has_a4(fc)) - hdrlen = 30; - if (ieee80211_is_data_qos(fc)) - hdrlen += IEEE80211_QOS_CTL_LEN; - goto out; - } - - if (ieee80211_is_ctl(fc)) { - /* - * ACK and CTS are 10 bytes, all others 16. 
To see how - * to get this condition consider - * subtype mask: 0b0000000011110000 (0x00F0) - * ACK subtype: 0b0000000011010000 (0x00D0) - * CTS subtype: 0b0000000011000000 (0x00C0) - * bits that matter: ^^^ (0x00E0) - * value of those: 0b0000000011000000 (0x00C0) - */ - if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0)) - hdrlen = 10; - else - hdrlen = 16; - } -out: - return hdrlen; -} -EXPORT_SYMBOL(ieee80211_hdrlen); - -unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) -{ - const struct ieee80211_hdr *hdr = (const struct ieee80211_hdr *)skb->data; - unsigned int hdrlen; - - if (unlikely(skb->len < 10)) - return 0; - hdrlen = ieee80211_hdrlen(hdr->frame_control); - if (unlikely(hdrlen > skb->len)) - return 0; - return hdrlen; -} -EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); - -int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) -{ - int ae = meshhdr->flags & IEEE80211S_FLAGS_AE; - /* 7.1.3.5a.2 */ - switch (ae) { - case 0: - return 6; - case 1: - return 12; - case 2: - return 18; - case 3: - return 24; - default: - return 6; - } -} - void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { struct sk_buff *skb = tx->skb; diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 45b74f38b86..694343b9102 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -23,34 +23,6 @@ */ const int ieee802_1d_to_ac[8] = { 2, 3, 3, 2, 1, 1, 0, 0 }; -static const char llc_ip_hdr[8] = {0xAA, 0xAA, 0x3, 0, 0, 0, 0x08, 0}; - -/* Given a data frame determine the 802.1p/1d tag to use. */ -static unsigned int classify_1d(struct sk_buff *skb) -{ - unsigned int dscp; - - /* skb->priority values from 256->263 are magic values to - * directly indicate a specific 802.1d priority. This is used - * to allow 802.1d priority to be passed directly in from VLAN - * tags, etc. 
- */ - if (skb->priority >= 256 && skb->priority <= 263) - return skb->priority - 256; - - switch (skb->protocol) { - case htons(ETH_P_IP): - dscp = ip_hdr(skb)->tos & 0xfc; - break; - - default: - return 0; - } - - return dscp >> 5; -} - - static int wme_downgrade_ac(struct sk_buff *skb) { switch (skb->priority) { @@ -94,7 +66,7 @@ static u16 classify80211(struct ieee80211_local *local, struct sk_buff *skb) /* use the data classifier to determine what 802.1d tag the * data frame has */ - skb->priority = classify_1d(skb); + skb->priority = cfg80211_classify8021d(skb); /* in case we are a client verify acm is not set for this ac */ while (unlikely(local->wmm_acm & BIT(skb->priority))) { diff --git a/net/wireless/util.c b/net/wireless/util.c index b94c8604ad7..d072bff463a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -4,7 +4,9 @@ * Copyright 2007-2009 Johannes Berg */ #include +#include #include +#include #include "core.h" struct ieee80211_rate * @@ -198,3 +200,306 @@ int cfg80211_validate_key_settings(struct key_params *params, int key_idx, return 0; } + +/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ +/* Ethernet-II snap header (RFC1042 for most EtherTypes) */ +const unsigned char rfc1042_header[] __aligned(2) = + { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; +EXPORT_SYMBOL(rfc1042_header); + +/* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ +const unsigned char bridge_tunnel_header[] __aligned(2) = + { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; +EXPORT_SYMBOL(bridge_tunnel_header); + +unsigned int ieee80211_hdrlen(__le16 fc) +{ + unsigned int hdrlen = 24; + + if (ieee80211_is_data(fc)) { + if (ieee80211_has_a4(fc)) + hdrlen = 30; + if (ieee80211_is_data_qos(fc)) + hdrlen += IEEE80211_QOS_CTL_LEN; + goto out; + } + + if (ieee80211_is_ctl(fc)) { + /* + * ACK and CTS are 10 bytes, all others 16. 
To see how + * to get this condition consider + * subtype mask: 0b0000000011110000 (0x00F0) + * ACK subtype: 0b0000000011010000 (0x00D0) + * CTS subtype: 0b0000000011000000 (0x00C0) + * bits that matter: ^^^ (0x00E0) + * value of those: 0b0000000011000000 (0x00C0) + */ + if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0)) + hdrlen = 10; + else + hdrlen = 16; + } +out: + return hdrlen; +} +EXPORT_SYMBOL(ieee80211_hdrlen); + +unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) +{ + const struct ieee80211_hdr *hdr = + (const struct ieee80211_hdr *)skb->data; + unsigned int hdrlen; + + if (unlikely(skb->len < 10)) + return 0; + hdrlen = ieee80211_hdrlen(hdr->frame_control); + if (unlikely(hdrlen > skb->len)) + return 0; + return hdrlen; +} +EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); + +int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) +{ + int ae = meshhdr->flags & MESH_FLAGS_AE; + /* 7.1.3.5a.2 */ + switch (ae) { + case 0: + return 6; + case 1: + return 12; + case 2: + return 18; + case 3: + return 24; + default: + return 6; + } +} + +int ieee80211_data_to_8023(struct sk_buff *skb, u8 *addr, + enum nl80211_iftype iftype) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + u16 hdrlen, ethertype; + u8 *payload; + u8 dst[ETH_ALEN]; + u8 src[ETH_ALEN] __aligned(2); + + if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) + return -1; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + + /* convert IEEE 802.11 header + possible LLC headers into Ethernet + * header + * IEEE 802.11 address fields: + * ToDS FromDS Addr1 Addr2 Addr3 Addr4 + * 0 0 DA SA BSSID n/a + * 0 1 DA BSSID SA n/a + * 1 0 BSSID SA DA n/a + * 1 1 RA TA DA SA + */ + memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN); + memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN); + + switch (hdr->frame_control & + cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { + case cpu_to_le16(IEEE80211_FCTL_TODS): + if (unlikely(iftype != NL80211_IFTYPE_AP && + iftype != NL80211_IFTYPE_AP_VLAN)) + return -1; + break; + case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): + if (unlikely(iftype != NL80211_IFTYPE_WDS && + iftype != NL80211_IFTYPE_MESH_POINT)) + return -1; + if (iftype == NL80211_IFTYPE_MESH_POINT) { + struct ieee80211s_hdr *meshdr = + (struct ieee80211s_hdr *) (skb->data + hdrlen); + hdrlen += ieee80211_get_mesh_hdrlen(meshdr); + if (meshdr->flags & MESH_FLAGS_AE_A5_A6) { + memcpy(dst, meshdr->eaddr1, ETH_ALEN); + memcpy(src, meshdr->eaddr2, ETH_ALEN); + } + } + break; + case cpu_to_le16(IEEE80211_FCTL_FROMDS): + if (iftype != NL80211_IFTYPE_STATION || + (is_multicast_ether_addr(dst) && + !compare_ether_addr(src, addr))) + return -1; + break; + case cpu_to_le16(0): + if (iftype != NL80211_IFTYPE_ADHOC) + return -1; + break; + } + + if (unlikely(skb->len - hdrlen < 8)) + return -1; + + payload = skb->data + hdrlen; + ethertype = (payload[6] << 8) | payload[7]; + + if (likely((compare_ether_addr(payload, rfc1042_header) == 0 && + ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || + compare_ether_addr(payload, bridge_tunnel_header) == 0)) { + /* remove RFC1042 or Bridge-Tunnel encapsulation and + * replace EtherType */ + skb_pull(skb, hdrlen + 6); + memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN); + memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN); + } else { + struct ethhdr *ehdr; + __be16 len; + + skb_pull(skb, hdrlen); + len = htons(skb->len); + ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr)); + memcpy(ehdr->h_dest, dst, ETH_ALEN); + 
memcpy(ehdr->h_source, src, ETH_ALEN); + ehdr->h_proto = len; + } + return 0; +} +EXPORT_SYMBOL(ieee80211_data_to_8023); + +int ieee80211_data_from_8023(struct sk_buff *skb, u8 *addr, + enum nl80211_iftype iftype, u8 *bssid, bool qos) +{ + struct ieee80211_hdr hdr; + u16 hdrlen, ethertype; + __le16 fc; + const u8 *encaps_data; + int encaps_len, skip_header_bytes; + int nh_pos, h_pos; + int head_need; + + if (unlikely(skb->len < ETH_HLEN)) + return -EINVAL; + + nh_pos = skb_network_header(skb) - skb->data; + h_pos = skb_transport_header(skb) - skb->data; + + /* convert Ethernet header to proper 802.11 header (based on + * operation mode) */ + ethertype = (skb->data[12] << 8) | skb->data[13]; + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); + + switch (iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); + /* DA BSSID SA */ + memcpy(hdr.addr1, skb->data, ETH_ALEN); + memcpy(hdr.addr2, addr, ETH_ALEN); + memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); + hdrlen = 24; + break; + case NL80211_IFTYPE_STATION: + fc |= cpu_to_le16(IEEE80211_FCTL_TODS); + /* BSSID SA DA */ + memcpy(hdr.addr1, bssid, ETH_ALEN); + memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); + memcpy(hdr.addr3, skb->data, ETH_ALEN); + hdrlen = 24; + break; + case NL80211_IFTYPE_ADHOC: + /* DA SA BSSID */ + memcpy(hdr.addr1, skb->data, ETH_ALEN); + memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); + memcpy(hdr.addr3, bssid, ETH_ALEN); + hdrlen = 24; + break; + default: + return -EOPNOTSUPP; + } + + if (qos) { + fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); + hdrlen += 2; + } + + hdr.frame_control = fc; + hdr.duration_id = 0; + hdr.seq_ctrl = 0; + + skip_header_bytes = ETH_HLEN; + if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) { + encaps_data = bridge_tunnel_header; + encaps_len = sizeof(bridge_tunnel_header); + skip_header_bytes -= 2; + } else if (ethertype > 0x600) { + encaps_data = rfc1042_header; + encaps_len = sizeof(rfc1042_header); + skip_header_bytes -= 2; + } else { + encaps_data = NULL; + encaps_len = 0; + } + + skb_pull(skb, skip_header_bytes); + nh_pos -= skip_header_bytes; + h_pos -= skip_header_bytes; + + head_need = hdrlen + encaps_len - skb_headroom(skb); + + if (head_need > 0 || skb_cloned(skb)) { + head_need = max(head_need, 0); + if (head_need) + skb_orphan(skb); + + if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) { + printk(KERN_ERR "failed to reallocate Tx buffer\n"); + return -ENOMEM; + } + skb->truesize += head_need; + } + + if (encaps_data) { + memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); + nh_pos += encaps_len; + h_pos += encaps_len; + } + + memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); + + nh_pos += hdrlen; + h_pos += hdrlen; + + /* Update skb pointers to various headers since this modified frame + * is going to go through Linux networking code that may potentially + * need things like pointer to IP header. */ + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, nh_pos); + skb_set_transport_header(skb, h_pos); + + return 0; +} +EXPORT_SYMBOL(ieee80211_data_from_8023); + +/* Given a data frame determine the 802.1p/1d tag to use. */ +unsigned int cfg80211_classify8021d(struct sk_buff *skb) +{ + unsigned int dscp; + + /* skb->priority values from 256->263 are magic values to + * directly indicate a specific 802.1d priority. This is used + * to allow 802.1d priority to be passed directly in from VLAN + * tags, etc. 
+ */ + if (skb->priority >= 256 && skb->priority <= 263) + return skb->priority - 256; + + switch (skb->protocol) { + case htons(ETH_P_IP): + dscp = ip_hdr(skb)->tos & 0xfc; + break; + default: + return 0; + } + + return dscp >> 5; +} +EXPORT_SYMBOL(cfg80211_classify8021d); -- cgit v1.2.3-70-g09d2 From e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:49 -0400 Subject: block: Do away with the notion of hardsect_size Until now we have had a 1:1 mapping between storage device physical block size and the logical block size used when addressing the device. With SATA 4KB drives coming out that will no longer be the case. The sector size will be 4KB but the logical block size will remain 512 bytes. Hence we need to distinguish between the physical block size and the logical ditto. This patch renames hardsect_size to logical_block_size. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- arch/powerpc/sysdev/axonram.c | 2 +- block/blk-integrity.c | 2 +- block/blk-settings.c | 21 ++++++++++----------- block/blk-sysfs.c | 12 +++++++++--- block/compat_ioctl.c | 2 +- block/ioctl.c | 2 +- drivers/block/cciss.c | 6 +++--- drivers/block/cpqarray.c | 4 ++-- drivers/block/hd.c | 2 +- drivers/block/mg_disk.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/ps3disk.c | 2 +- drivers/block/ub.c | 6 +++--- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/block/xsysace.c | 2 +- drivers/cdrom/gdrom.c | 2 +- drivers/cdrom/viocd.c | 4 ++-- drivers/char/raw.c | 2 +- drivers/ide/ide-cd.c | 12 ++++++------ drivers/md/bitmap.c | 4 ++-- drivers/md/dm-exception-store.c | 2 +- drivers/md/dm-log.c | 3 ++- drivers/md/dm-snap-persistent.c | 2 +- drivers/md/dm-table.c | 12 +++++++----- drivers/md/md.c | 2 +- drivers/memstick/core/mspro_block.c | 2 +- drivers/message/i2o/i2o_block.c | 5 +++-- drivers/mmc/card/block.c | 2 +- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/s390/block/dasd.c | 2 +- drivers/s390/block/dcssblk.c | 2 +- drivers/s390/block/xpram.c | 2 +- drivers/s390/char/tape_block.c | 2 +- drivers/scsi/sd.c | 2 +- drivers/scsi/sr.c | 2 +- fs/bio.c | 3 ++- fs/block_dev.c | 6 +++--- fs/buffer.c | 6 +++--- fs/direct-io.c | 2 +- fs/ext3/super.c | 4 ++-- fs/ext4/super.c | 2 +- fs/gfs2/ops_fstype.c | 4 ++-- fs/gfs2/rgrp.c | 2 +- fs/nilfs2/the_nilfs.c | 2 +- fs/ntfs/super.c | 6 +++--- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/super.c | 2 +- fs/partitions/ibm.c | 2 +- fs/partitions/msdos.c | 4 ++-- fs/udf/super.c | 2 +- fs/xfs/linux-2.6/xfs_buf.c | 2 +- include/linux/blkdev.h | 14 +++++++------- include/linux/device-mapper.h | 2 +- 54 files changed, 108 insertions(+), 98 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 9e105cbc5e5..a4779912a5c 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -250,7 +250,7 @@ axon_ram_probe(struct of_device *device, const struct of_device_id *device_id) set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT); blk_queue_make_request(bank->disk->queue, axon_ram_make_request); - blk_queue_hardsect_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE); + blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE); add_disk(bank->disk); bank->irq_id = irq_of_parse_and_map(device->node, 0); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 91fa8e06b6a..73e28d35568 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -340,7 +340,7 @@ int
blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) kobject_uevent(&bi->kobj, KOBJ_ADD); bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE; - bi->sector_size = disk->queue->hardsect_size; + bi->sector_size = queue_logical_block_size(disk->queue); disk->integrity = bi; } else bi = disk->integrity; diff --git a/block/blk-settings.c b/block/blk-settings.c index 57af728d94b..15c3164537b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -134,7 +134,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; blk_queue_max_sectors(q, SAFE_MAX_SECTORS); - blk_queue_hardsect_size(q, 512); + blk_queue_logical_block_size(q, 512); blk_queue_dma_alignment(q, 511); blk_queue_congestion_threshold(q); q->nr_batching = BLK_BATCH_REQ; @@ -288,21 +288,20 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) EXPORT_SYMBOL(blk_queue_max_segment_size); /** - * blk_queue_hardsect_size - set hardware sector size for the queue + * blk_queue_logical_block_size - set logical block size for the queue * @q: the request queue for the device - * @size: the hardware sector size, in bytes + * @size: the logical block size, in bytes * * Description: - * This should typically be set to the lowest possible sector size - * that the hardware can operate on (possible without reverting to - * even internal read-modify-write operations). Usually the default - * of 512 covers most hardware. + * This should be set to the lowest possible block size that the + * storage device can address. The default of 512 covers most + * hardware. **/ -void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) +void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) { - q->hardsect_size = size; + q->logical_block_size = size; } -EXPORT_SYMBOL(blk_queue_hardsect_size); +EXPORT_SYMBOL(blk_queue_logical_block_size); /* * Returns the minimum that is _not_ zero, unless both are zero. 
@@ -324,7 +323,7 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments); t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - t->hardsect_size = max(t->hardsect_size, b->hardsect_size); + t->logical_block_size = max(t->logical_block_size, b->logical_block_size); if (!t->queue_lock) WARN_ON_ONCE(1); else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3ff9bba3379..13d38b7e4d0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -100,9 +100,9 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_sectors_kb, (page)); } -static ssize_t queue_hw_sector_size_show(struct request_queue *q, char *page) +static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) { - return queue_var_show(q->hardsect_size, page); + return queue_var_show(queue_logical_block_size(q), page); } static ssize_t @@ -249,7 +249,12 @@ static struct queue_sysfs_entry queue_iosched_entry = { static struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = S_IRUGO }, - .show = queue_hw_sector_size_show, + .show = queue_logical_block_size_show, +}; + +static struct queue_sysfs_entry queue_logical_block_size_entry = { + .attr = {.name = "logical_block_size", .mode = S_IRUGO }, + .show = queue_logical_block_size_show, }; static struct queue_sysfs_entry queue_nonrot_entry = { @@ -283,6 +288,7 @@ static struct attribute *default_attrs[] = { &queue_max_sectors_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, + &queue_logical_block_size_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index f87615dea46..9eaa1940273 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -763,7 +763,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ return compat_put_int(arg, block_size(bdev)); case BLKSSZGET: /* get block device hardware sector size */ - return compat_put_int(arg, bdev_hardsect_size(bdev)); + return compat_put_int(arg, bdev_logical_block_size(bdev)); case BLKSECTGET: return compat_put_ushort(arg, bdev_get_queue(bdev)->max_sectors); diff --git a/block/ioctl.c b/block/ioctl.c index ad474d4bbcc..7aa97f65da8 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -311,7 +311,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKBSZGET: /* get the logical block size (cf. 
BLKSSZGET) */ return put_int(arg, block_size(bdev)); case BLKSSZGET: /* get block device hardware sector size */ - return put_int(arg, bdev_hardsect_size(bdev)); + return put_int(arg, bdev_logical_block_size(bdev)); case BLKSECTGET: return put_ushort(arg, bdev_get_queue(bdev)->max_sectors); case BLKRASET: diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index e714e7cce6f..94474f5f8bc 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1389,8 +1389,8 @@ static void cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, disk->queue->queuedata = h; - blk_queue_hardsect_size(disk->queue, - h->drv[drv_index].block_size); + blk_queue_logical_block_size(disk->queue, + h->drv[drv_index].block_size); /* Make sure all queue data is written out before */ /* setting h->drv[drv_index].queue, as setting this */ @@ -2298,7 +2298,7 @@ static int cciss_revalidate(struct gendisk *disk) cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, inq_buff, drv); - blk_queue_hardsect_size(drv->queue, drv->block_size); + blk_queue_logical_block_size(drv->queue, drv->block_size); set_capacity(disk, drv->nr_blocks); kfree(inq_buff); diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index a02dcfc00f1..44fa2018f6b 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -474,7 +474,7 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev) disk->fops = &ida_fops; if (j && !drv->nr_blks) continue; - blk_queue_hardsect_size(hba[i]->queue, drv->blk_size); + blk_queue_logical_block_size(hba[i]->queue, drv->blk_size); set_capacity(disk, drv->nr_blks); disk->queue = hba[i]->queue; disk->private_data = drv; @@ -1546,7 +1546,7 @@ static int revalidate_allvol(ctlr_info_t *host) drv_info_t *drv = &host->drv[i]; if (i && !drv->nr_blks) continue; - blk_queue_hardsect_size(host->queue, drv->blk_size); + blk_queue_logical_block_size(host->queue, drv->blk_size); set_capacity(disk, drv->nr_blks); disk->queue = host->queue; disk->private_data = drv; diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 961de56d00a..f65b3f369eb 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -724,7 +724,7 @@ static int __init hd_init(void) blk_queue_max_sectors(hd_queue, 255); init_timer(&device_timer); device_timer.function = hd_times_out; - blk_queue_hardsect_size(hd_queue, 512); + blk_queue_logical_block_size(hd_queue, 512); if (!NR_HD) { /* diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index c0cd0a03f69..60de5a01e71 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -996,7 +996,7 @@ static int mg_probe(struct platform_device *plat_dev) goto probe_err_6; } blk_queue_max_sectors(host->breq, MG_MAX_SECTS); - blk_queue_hardsect_size(host->breq, MG_SECTOR_SIZE); + blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE); init_timer(&host->timer); host->timer.function = mg_times_out; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index dc7a8c352da..293f5858921 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2657,7 +2657,7 @@ static void pkt_init_queue(struct pktcdvd_device *pd) struct request_queue *q = pd->disk->queue; blk_queue_make_request(q, pkt_make_request); - blk_queue_hardsect_size(q, CD_FRAMESIZE); + blk_queue_logical_block_size(q, CD_FRAMESIZE); blk_queue_max_sectors(q, PACKET_MAX_SECTORS); blk_queue_merge_bvec(q, pkt_merge_bvec); q->queuedata = pd; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 338cee4cc0b..aaeeb544228 100644 --- 
a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -477,7 +477,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev) blk_queue_max_sectors(queue, dev->bounce_size >> 9); blk_queue_segment_boundary(queue, -1UL); blk_queue_dma_alignment(queue, dev->blk_size-1); - blk_queue_hardsect_size(queue, dev->blk_size); + blk_queue_logical_block_size(queue, dev->blk_size); blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH, ps3disk_prepare_flush); diff --git a/drivers/block/ub.c b/drivers/block/ub.c index e67bbae9547..cc54473b8e7 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -722,7 +722,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun, /* * build the command * - * The call to blk_queue_hardsect_size() guarantees that request + * The call to blk_queue_logical_block_size() guarantees that request * is aligned, but it is given in terms of 512 byte units, always. */ block = blk_rq_pos(rq) >> lun->capacity.bshift; @@ -1749,7 +1749,7 @@ static int ub_bd_revalidate(struct gendisk *disk) ub_revalidate(lun->udev, lun); /* XXX Support sector size switching like in sr.c */ - blk_queue_hardsect_size(disk->queue, lun->capacity.bsize); + blk_queue_logical_block_size(disk->queue, lun->capacity.bsize); set_capacity(disk, lun->capacity.nsec); // set_disk_ro(sdkp->disk, lun->readonly); @@ -2324,7 +2324,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum) blk_queue_max_phys_segments(q, UB_MAX_REQ_SG); blk_queue_segment_boundary(q, 0xffffffff); /* Dubious. */ blk_queue_max_sectors(q, UB_MAX_SECTORS); - blk_queue_hardsect_size(q, lun->capacity.bsize); + blk_queue_logical_block_size(q, lun->capacity.bsize); lun->disk = disk; q->queuedata = lun; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 511d4ae2d17..c4845b16946 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -347,7 +347,7 @@ static int virtblk_probe(struct virtio_device *vdev) offsetof(struct virtio_blk_config, blk_size), &blk_size); if (!err) - blk_queue_hardsect_size(vblk->disk->queue, blk_size); + blk_queue_logical_block_size(vblk->disk->queue, blk_size); add_disk(vblk->disk); return 0; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 132120ae4bd..c1996829d5e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -344,7 +344,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); /* Hard sector size and max sectors impersonate the equiv. hardware. */ - blk_queue_hardsect_size(rq, sector_size); + blk_queue_logical_block_size(rq, sector_size); blk_queue_max_sectors(rq, 512); /* Each segment in a request is up to an aligned page in size. 
*/ diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 3a4397edab7..f08491a3a81 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -984,7 +984,7 @@ static int __devinit ace_setup(struct ace_device *ace) ace->queue = blk_init_queue(ace_request, &ace->lock); if (ace->queue == NULL) goto err_blk_initq; - blk_queue_hardsect_size(ace->queue, 512); + blk_queue_logical_block_size(ace->queue, 512); /* * Allocate and initialize GD structure diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 1e366ad8f68..b5621f27c4b 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -739,7 +739,7 @@ static void __devinit probe_gdrom_setupdisk(void) static int __devinit probe_gdrom_setupqueue(void) { - blk_queue_hardsect_size(gd.gdrom_rq, GDROM_HARD_SECTOR); + blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR); /* using DMA so memory will need to be contiguous */ blk_queue_max_hw_segments(gd.gdrom_rq, 1); /* set a large max size to get most from DMA */ diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index f177c2d4017..0fff646cc2f 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -469,8 +469,8 @@ static void vio_handle_cd_event(struct HvLpEvent *event) case viocdopen: if (event->xRc == 0) { di = &viocd_diskinfo[bevent->disk]; - blk_queue_hardsect_size(di->viocd_disk->queue, - bevent->block_size); + blk_queue_logical_block_size(di->viocd_disk->queue, + bevent->block_size); set_capacity(di->viocd_disk, bevent->media_size * bevent->block_size / 512); diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 20d90e6a6e5..db32f0e4c7d 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -71,7 +71,7 @@ static int raw_open(struct inode *inode, struct file *filp) err = bd_claim(bdev, raw_open); if (err) goto out1; - err = set_blocksize(bdev, bdev_hardsect_size(bdev)); + err = set_blocksize(bdev, bdev_logical_block_size(bdev)); if (err) goto out2; filp->f_flags |= O_DIRECT; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 1799328decf..424140c6c40 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -182,7 +182,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive, (sense->information[2] << 8) | (sense->information[3]); - if (drive->queue->hardsect_size == 2048) + if (queue_logical_block_size(drive->queue) == 2048) /* device sector size is 2K */ sector <<= 2; @@ -737,7 +737,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) struct request_queue *q = drive->queue; int write = rq_data_dir(rq) == WRITE; unsigned short sectors_per_frame = - queue_hardsect_size(q) >> SECTOR_BITS; + queue_logical_block_size(q) >> SECTOR_BITS; ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, " "secs_per_frame: %u", @@ -1021,8 +1021,8 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) /* save a private copy of the TOC capacity for error handling */ drive->probed_capacity = toc->capacity * sectors_per_frame; - blk_queue_hardsect_size(drive->queue, - sectors_per_frame << SECTOR_BITS); + blk_queue_logical_block_size(drive->queue, + sectors_per_frame << SECTOR_BITS); /* first read just the header, so we know how long the TOC is */ stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr, @@ -1338,7 +1338,7 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive) /* standard prep_rq_fn that builds 10 byte cmds */ static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) { - int hard_sect = queue_hardsect_size(q); + int 
hard_sect = queue_logical_block_size(q); long block = (long)blk_rq_pos(rq) / (hard_sect >> 9); unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9); @@ -1543,7 +1543,7 @@ static int ide_cdrom_setup(ide_drive_t *drive) nslots = ide_cdrom_probe_capabilities(drive); - blk_queue_hardsect_size(q, CD_FRAMESIZE); + blk_queue_logical_block_size(q, CD_FRAMESIZE); if (ide_cdrom_register(drive, nslots)) { printk(KERN_ERR PFX "%s: %s failed to register device with the" diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 47c68bc75a1..06b0ded1ce2 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, target = rdev->sb_start + offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev->bdev, target, - roundup(size, bdev_hardsect_size(rdev->bdev)), + roundup(size, bdev_logical_block_size(rdev->bdev)), page, READ)) { page->index = index; attach_page_buffers(page, NULL); /* so that free_buffer will @@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) int size = PAGE_SIZE; if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, - bdev_hardsect_size(rdev->bdev)); + bdev_logical_block_size(rdev->bdev)); /* Just make sure we aren't corrupting data or * metadata */ diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index a2e26c24214..75d8081a904 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store, } /* Validate the chunk size against the device block size */ - if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { + if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index be233bc4d91..6fa8ccf91c7 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -413,7 +413,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, * Buffer holds both header and bitset. 
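The md and dm changes in this stretch share one idiom: a buffer that will be read from or written to the device must be padded to a whole number of logical blocks, which is what the roundup()/dm_round_up() calls against the logical block size accomplish. A minimal sketch of that rule, with an invented helper name:

#include <linux/kernel.h>	/* roundup() */
#include <linux/blkdev.h>	/* bdev_logical_block_size() */

/* Invented helper: pad a byte count to the device's logical block size. */
static inline unsigned int pad_to_logical_block(unsigned int bytes,
						struct block_device *bdev)
{
	return roundup(bytes, bdev_logical_block_size(bdev));
}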
*/ buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + - bitset_size, ti->limits.hardsect_size); + bitset_size, + ti->limits.logical_block_size); if (buf_size > dev->bdev->bd_inode->i_size) { DMWARN("log device %s too small: need %llu bytes", diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index e75c6dd76a9..2662a41337e 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) */ if (!ps->store->chunk_size) { ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, - bdev_hardsect_size(ps->store->cow->bdev) >> 9); + bdev_logical_block_size(ps->store->cow->bdev) >> 9); ps->store->chunk_mask = ps->store->chunk_size - 1; ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; chunk_size_supplied = 0; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 429b50b975d..65e2d975985 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -108,7 +108,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs, lhs->max_hw_segments = min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); - lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); + lhs->logical_block_size = max(lhs->logical_block_size, + rhs->logical_block_size); lhs->max_segment_size = min_not_zero(lhs->max_segment_size, rhs->max_segment_size); @@ -529,7 +530,8 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) rs->max_hw_segments = min_not_zero(rs->max_hw_segments, q->max_hw_segments); - rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); + rs->logical_block_size = max(rs->logical_block_size, + queue_logical_block_size(q)); rs->max_segment_size = min_not_zero(rs->max_segment_size, q->max_segment_size); @@ -683,8 +685,8 @@ static void check_for_valid_limits(struct io_restrictions *rs) rs->max_phys_segments = MAX_PHYS_SEGMENTS; if (!rs->max_hw_segments) rs->max_hw_segments = MAX_HW_SEGMENTS; - if (!rs->hardsect_size) - rs->hardsect_size = 1 << SECTOR_SHIFT; + if (!rs->logical_block_size) + rs->logical_block_size = 1 << SECTOR_SHIFT; if (!rs->max_segment_size) rs->max_segment_size = MAX_SEGMENT_SIZE; if (!rs->seg_boundary_mask) @@ -914,7 +916,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) blk_queue_max_sectors(q, t->limits.max_sectors); q->max_phys_segments = t->limits.max_phys_segments; q->max_hw_segments = t->limits.max_hw_segments; - q->hardsect_size = t->limits.hardsect_size; + q->logical_block_size = t->limits.logical_block_size; q->max_segment_size = t->limits.max_segment_size; q->max_hw_sectors = t->limits.max_hw_sectors; q->seg_boundary_mask = t->limits.seg_boundary_mask; diff --git a/drivers/md/md.c b/drivers/md/md.c index fccc8343a25..4cbc19f5c30 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1202,7 +1202,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; - bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c0bebc6a2f2..7847bbc1440 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1242,7 +1242,7 @@ static int 
mspro_block_init_disk(struct memstick_dev *card) sprintf(msb->disk->disk_name, "mspblk%d", disk_id); - blk_queue_hardsect_size(msb->queue, msb->page_size); + blk_queue_logical_block_size(msb->queue, msb->page_size); capacity = be16_to_cpu(sys_info->user_block_count); capacity *= be16_to_cpu(sys_info->block_size); diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 6573ef4408f..335d4c78a77 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -794,8 +794,9 @@ static int i2o_block_transfer(struct request *req) if (c->adaptec) { u8 cmd[10]; u32 scsi_flags; - u16 hwsec = queue_hardsect_size(req->q) >> KERNEL_SECTOR_SHIFT; + u16 hwsec; + hwsec = queue_logical_block_size(req->q) >> KERNEL_SECTOR_SHIFT; memset(cmd, 0, 10); sgl_offset = SGL_OFFSET_12; @@ -1078,7 +1079,7 @@ static int i2o_block_probe(struct device *dev) */ if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) || !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) { - blk_queue_hardsect_size(queue, le32_to_cpu(blocksize)); + blk_queue_logical_block_size(queue, le32_to_cpu(blocksize)); } else osm_warn("unable to get blocksize of %s\n", gd->disk_name); diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index c5df8654645..98ffc41eaf2 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -521,7 +521,7 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card) sprintf(md->disk->disk_name, "mmcblk%d", devidx); - blk_queue_hardsect_size(md->queue.queue, 512); + blk_queue_logical_block_size(md->queue.queue, 512); if (!mmc_card_sd(card) && mmc_card_blockaddr(card)) { /* diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 502622f628b..aaac3b6800b 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -378,7 +378,7 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr) } tr->blkcore_priv->rq->queuedata = tr; - blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize); + blk_queue_logical_block_size(tr->blkcore_priv->rq, tr->blksize); if (tr->discard) blk_queue_set_discard(tr->blkcore_priv->rq, blktrans_discard_request); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index e64f62d5e0f..27a1be0cd4d 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1990,7 +1990,7 @@ static void dasd_setup_queue(struct dasd_block *block) { int max; - blk_queue_hardsect_size(block->request_queue, block->bp_block); + blk_queue_logical_block_size(block->request_queue, block->bp_block); max = block->base->discipline->max_blocks << block->s2b_shift; blk_queue_max_sectors(block->request_queue, max); blk_queue_max_phys_segments(block->request_queue, -1L); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index cfdcf1aed33..a4c7ffcd998 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -602,7 +602,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->gd->private_data = dev_info; dev_info->gd->driverfs_dev = &dev_info->dev; blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request); - blk_queue_hardsect_size(dev_info->dcssblk_queue, 4096); + blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); seg_byte_size = (dev_info->end - dev_info->start + 1); set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 76814f3e898..0ae0c83ef87 100644 --- a/drivers/s390/block/xpram.c +++ 
b/drivers/s390/block/xpram.c @@ -343,7 +343,7 @@ static int __init xpram_setup_blkdev(void) goto out; } blk_queue_make_request(xpram_queues[i], xpram_make_request); - blk_queue_hardsect_size(xpram_queues[i], 4096); + blk_queue_logical_block_size(xpram_queues[i], 4096); } /* diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 1e796767598..47ff695255e 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -222,7 +222,7 @@ tapeblock_setup_device(struct tape_device * device) if (rc) goto cleanup_queue; - blk_queue_hardsect_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE); + blk_queue_logical_block_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE); blk_queue_max_sectors(blkdat->request_queue, TAPEBLOCK_MAX_SEC); blk_queue_max_phys_segments(blkdat->request_queue, -1L); blk_queue_max_hw_segments(blkdat->request_queue, -1L); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 40d2860f235..bcf3bd40bbd 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1510,7 +1510,7 @@ got_data: */ sector_size = 512; } - blk_queue_hardsect_size(sdp->request_queue, sector_size); + blk_queue_logical_block_size(sdp->request_queue, sector_size); { char cap_str_2[10], cap_str_10[10]; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index fddba53c7fe..cd350dfc121 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -727,7 +727,7 @@ static void get_sectorsize(struct scsi_cd *cd) } queue = cd->device->request_queue; - blk_queue_hardsect_size(queue, sector_size); + blk_queue_logical_block_size(queue, sector_size); return; } diff --git a/fs/bio.c b/fs/bio.c index 81dc93e7253..4445c382173 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) sector_t bio_sector_offset(struct bio *bio, unsigned short index, unsigned int offset) { - unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); + unsigned int sector_sz; struct bio_vec *bv; sector_t sectors; int i; + sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); sectors = 0; if (index >= bio->bi_idx) diff --git a/fs/block_dev.c b/fs/block_dev.c index a85fe310fc6..a29b4dcc1bc 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -76,7 +76,7 @@ int set_blocksize(struct block_device *bdev, int size) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ - if (size < bdev_hardsect_size(bdev)) + if (size < bdev_logical_block_size(bdev)) return -EINVAL; /* Don't change the size if it is same as current */ @@ -106,7 +106,7 @@ EXPORT_SYMBOL(sb_set_blocksize); int sb_min_blocksize(struct super_block *sb, int size) { - int minsize = bdev_hardsect_size(sb->s_bdev); + int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) size = minsize; return sb_set_blocksize(sb, size); @@ -1117,7 +1117,7 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_size(struct block_device *bdev, loff_t size) { - unsigned bsize = bdev_hardsect_size(bdev); + unsigned bsize = bdev_logical_block_size(bdev); bdev->bd_inode->i_size = size; while (bsize < PAGE_CACHE_SIZE) { diff --git a/fs/buffer.c b/fs/buffer.c index aed297739eb..36e2bbc60ec 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1085,12 +1085,12 @@ static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, int size) { /* Size must be multiple of hard sectorsize */ - if (unlikely(size & (bdev_hardsect_size(bdev)-1) || + if (unlikely(size & (bdev_logical_block_size(bdev)-1) || (size < 512 || size > PAGE_SIZE))) { 
printk(KERN_ERR "getblk(): invalid block size %d requested\n", size); - printk(KERN_ERR "hardsect size: %d\n", - bdev_hardsect_size(bdev)); + printk(KERN_ERR "logical block size: %d\n", + bdev_logical_block_size(bdev)); dump_stack(); return NULL; diff --git a/fs/direct-io.c b/fs/direct-io.c index 05763bbc205..8b10b87dc01 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, rw = WRITE_ODIRECT; if (bdev) - bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); + bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); if (offset & blocksize_mask) { if (bdev) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 599dbfe504c..acbb94fdf90 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1696,7 +1696,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - hblock = bdev_hardsect_size(sb->s_bdev); + hblock = bdev_logical_block_size(sb->s_bdev); if (sb->s_blocksize != blocksize) { /* * Make sure the blocksize for the filesystem is larger @@ -2119,7 +2119,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, } blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { printk(KERN_ERR "EXT3-fs: blocksize too small for journal device.\n"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2958f4e6f22..a30549f7a30 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2962,7 +2962,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, } blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { printk(KERN_ERR "EXT4-fs: blocksize too small for journal device.\n"); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1ff9473ea75..a3b2ac989fc 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent) } /* Set up the buffer cache and SB for real */ - if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { + if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too small for device " "block size (%u)\n", - sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); + sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); goto out; } if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 565038243fa..a971d24e10c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -845,7 +845,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct super_block *sb = sdp->sd_vfs; struct block_device *bdev = sb->s_bdev; const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / - bdev_hardsect_size(sb->s_bdev); + bdev_logical_block_size(sb->s_bdev); u64 blk; sector_t start = 0; sector_t nr_sects = 0; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 7f65b3be4aa..a91f15b8673 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -515,7 +515,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); if (sb->s_blocksize != blocksize) { - int hw_blocksize = bdev_hardsect_size(sb->s_bdev); + int hw_blocksize = bdev_logical_block_size(sb->s_bdev); if (blocksize < hw_blocksize) { printk(KERN_ERR diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index f76951dcd4a..6aa7c471353 100644 --- a/fs/ntfs/super.c 
+++ b/fs/ntfs/super.c @@ -25,7 +25,7 @@ #include #include #include -#include /* For bdev_hardsect_size(). */ +#include /* For bdev_logical_block_size(). */ #include #include #include @@ -2785,13 +2785,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) goto err_out_now; /* We support sector sizes up to the PAGE_CACHE_SIZE. */ - if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) { if (!silent) ntfs_error(sb, "Device has unsupported sector size " "(%i). The maximum supported sector " "size on this architecture is %lu " "bytes.", - bdev_hardsect_size(sb->s_bdev), + bdev_logical_block_size(sb->s_bdev), PAGE_CACHE_SIZE); goto err_out_now; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4f85eceab37..09cc25d0461 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, bdevname(reg->hr_bdev, reg->hr_dev_name); - sectsize = bdev_hardsect_size(reg->hr_bdev); + sectsize = bdev_logical_block_size(reg->hr_bdev); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, "blocksize %u incorrect for device, expected %d", diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 79ff8d9d37e..5c6163f5503 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -713,7 +713,7 @@ static int ocfs2_sb_probe(struct super_block *sb, *bh = NULL; /* may be > 512 */ - *sector_size = bdev_hardsect_size(sb->s_bdev); + *sector_size = bdev_logical_block_size(sb->s_bdev); if (*sector_size > OCFS2_MAX_BLOCKSIZE) { mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", *sector_size, OCFS2_MAX_BLOCKSIZE); diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 46297683cd3..fc71aab0846 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) Sector sect; res = 0; - blocksize = bdev_hardsect_size(bdev); + blocksize = bdev_logical_block_size(bdev); if (blocksize <= 0) goto out_exit; i_size = i_size_read(bdev->bd_inode); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 796511886f2..0028d2ef066 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, Sector sect; unsigned char *data; u32 this_sector, this_size; - int sector_size = bdev_hardsect_size(bdev) / 512; + int sector_size = bdev_logical_block_size(bdev) / 512; int loopct = 0; /* number of links followed without finding a data partition */ int i; @@ -415,7 +415,7 @@ static struct { int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) { - int sector_size = bdev_hardsect_size(bdev) / 512; + int sector_size = bdev_logical_block_size(bdev) / 512; Sector sect; unsigned char *data; struct partition *p; diff --git a/fs/udf/super.c b/fs/udf/super.c index 72348cc855a..0ba44107d8f 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1915,7 +1915,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { ret = udf_load_vrs(sb, &uopt, silent, &fileset); } else { - uopt.blocksize = bdev_hardsect_size(sb->s_bdev); + uopt.blocksize = bdev_logical_block_size(sb->s_bdev); ret = udf_load_vrs(sb, &uopt, silent, &fileset); if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) diff --git a/fs/xfs/linux-2.6/xfs_buf.c 
b/fs/xfs/linux-2.6/xfs_buf.c index e28800a9f2b..1418b916fc2 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early( struct block_device *bdev) { return xfs_setsize_buftarg_flags(btp, - PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); + PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); } int diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 56ce53fce72..872b78b7a10 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -391,7 +391,7 @@ struct request_queue unsigned int max_hw_sectors; unsigned short max_phys_segments; unsigned short max_hw_segments; - unsigned short hardsect_size; + unsigned short logical_block_size; unsigned int max_segment_size; unsigned long seg_boundary_mask; @@ -901,7 +901,7 @@ extern void blk_queue_max_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); -extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); +extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); @@ -988,19 +988,19 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter); #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) -static inline int queue_hardsect_size(struct request_queue *q) +static inline unsigned short queue_logical_block_size(struct request_queue *q) { int retval = 512; - if (q && q->hardsect_size) - retval = q->hardsect_size; + if (q && q->logical_block_size) + retval = q->logical_block_size; return retval; } -static inline int bdev_hardsect_size(struct block_device *bdev) +static inline unsigned short bdev_logical_block_size(struct block_device *bdev) { - return queue_hardsect_size(bdev_get_queue(bdev)); + return queue_logical_block_size(bdev_get_queue(bdev)); } static inline int queue_dma_alignment(struct request_queue *q) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ded2d7c4266..49c2362977f 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -149,7 +149,7 @@ struct io_restrictions { unsigned max_hw_sectors; unsigned max_sectors; unsigned max_segment_size; - unsigned short hardsect_size; + unsigned short logical_block_size; unsigned short max_hw_segments; unsigned short max_phys_segments; unsigned char no_cluster; /* inverted so that 0 is default */ -- cgit v1.2.3-70-g09d2 From ae03bf639a5027d27270123f5f6e3ee6a412781d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:50 -0400 Subject: block: Use accessor functions for queue limits Convert all external users of queue limits to using wrapper functions instead of poking the request queue variables directly. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-barrier.c | 8 ++++---- block/blk-core.c | 16 ++++++++-------- block/blk-map.c | 4 ++-- block/blk-merge.c | 27 ++++++++++++++------------- block/blk-settings.c | 15 ++++++++++++--- block/blk-sysfs.c | 8 ++++---- block/compat_ioctl.c | 2 +- block/ioctl.c | 10 +++++----- block/scsi_ioctl.c | 8 ++++---- drivers/block/pktcdvd.c | 6 ++++-- drivers/cdrom/cdrom.c | 4 ++-- drivers/md/dm-table.c | 28 ++++++++++++++-------------- drivers/md/linear.c | 2 +- drivers/md/multipath.c | 4 ++-- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 8 ++++---- drivers/md/raid5.c | 4 ++-- drivers/scsi/sg.c | 15 ++++++++------- drivers/scsi/st.c | 4 ++-- drivers/usb/storage/scsiglue.c | 4 ++-- fs/bio.c | 19 ++++++++++--------- include/linux/bio.h | 2 +- include/linux/blkdev.h | 36 ++++++++++++++++++++++++++++++++++++ mm/bounce.c | 4 ++-- 25 files changed, 147 insertions(+), 97 deletions(-) (limited to 'include/linux') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 0d98054cdbd..30022b4e2f6 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -388,10 +388,10 @@ int blkdev_issue_discard(struct block_device *bdev, bio->bi_sector = sector; - if (nr_sects > q->max_hw_sectors) { - bio->bi_size = q->max_hw_sectors << 9; - nr_sects -= q->max_hw_sectors; - sector += q->max_hw_sectors; + if (nr_sects > queue_max_hw_sectors(q)) { + bio->bi_size = queue_max_hw_sectors(q) << 9; + nr_sects -= queue_max_hw_sectors(q); + sector += queue_max_hw_sectors(q); } else { bio->bi_size = nr_sects << 9; nr_sects = 0; diff --git a/block/blk-core.c b/block/blk-core.c index 59c4af52311..7a4c40184a6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1437,11 +1437,11 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - if (unlikely(nr_sectors > q->max_hw_sectors)) { + if (unlikely(nr_sectors > queue_max_hw_sectors(q))) { printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - q->max_hw_sectors); + bdevname(bio->bi_bdev, b), + bio_sectors(bio), + queue_max_hw_sectors(q)); goto end_io; } @@ -1608,8 +1608,8 @@ EXPORT_SYMBOL(submit_bio); */ int blk_rq_check_limits(struct request_queue *q, struct request *rq) { - if (blk_rq_sectors(rq) > q->max_sectors || - blk_rq_bytes(rq) > q->max_hw_sectors << 9) { + if (blk_rq_sectors(rq) > queue_max_sectors(q) || + blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } @@ -1621,8 +1621,8 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) * limitation. 
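This commit is one substitution applied tree-wide: read each limit through its queue_*() wrapper instead of dereferencing request_queue fields directly, so the fields can later move without touching every caller again. An invented caller showing the before/after shape of each conversion:

#include <linux/blkdev.h>

/* Invented example; the commented-out form is the pre-conversion shape. */
static bool rq_fits(struct request_queue *q, struct request *rq)
{
	/* was: blk_rq_sectors(rq) <= q->max_sectors && ... */
	return blk_rq_sectors(rq) <= queue_max_sectors(q) &&
	       rq->nr_phys_segments <= queue_max_phys_segments(q);
}

The payoff arrives in the following commit, when the fields move into an embedded struct queue_limits and only the wrappers themselves need to change.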
*/ blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > q->max_phys_segments || - rq->nr_phys_segments > q->max_hw_segments) { + if (rq->nr_phys_segments > queue_max_phys_segments(q) || + rq->nr_phys_segments > queue_max_hw_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } diff --git a/block/blk-map.c b/block/blk-map.c index ef2492adca7..9083cf0180c 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -115,7 +115,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, struct bio *bio = NULL; int ret; - if (len > (q->max_hw_sectors << 9)) + if (len > (queue_max_hw_sectors(q) << 9)) return -EINVAL; if (!len) return -EINVAL; @@ -292,7 +292,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, struct bio *bio; int ret; - if (len > (q->max_hw_sectors << 9)) + if (len > (queue_max_hw_sectors(q) << 9)) return -EINVAL; if (!len || !kbuf) return -EINVAL; diff --git a/block/blk-merge.c b/block/blk-merge.c index 4974dd5767e..39ce64432ba 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -32,11 +32,12 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, * never considered part of another segment, since that * might change with the bounce page. */ - high = page_to_pfn(bv->bv_page) > q->bounce_pfn; + high = page_to_pfn(bv->bv_page) > queue_bounce_pfn(q); if (high || highprv) goto new_segment; if (cluster) { - if (seg_size + bv->bv_len > q->max_segment_size) + if (seg_size + bv->bv_len + > queue_max_segment_size(q)) goto new_segment; if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) goto new_segment; @@ -91,7 +92,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_seg_back_size + nxt->bi_seg_front_size > - q->max_segment_size) + queue_max_segment_size(q)) return 0; if (!bio_has_data(bio)) @@ -134,7 +135,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, int nbytes = bvec->bv_len; if (bvprv && cluster) { - if (sg->length + nbytes > q->max_segment_size) + if (sg->length + nbytes > queue_max_segment_size(q)) goto new_segment; if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) @@ -205,8 +206,8 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments - || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { + if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) || + req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -227,9 +228,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, unsigned short max_sectors; if (unlikely(blk_pc_request(req))) - max_sectors = q->max_hw_sectors; + max_sectors = queue_max_hw_sectors(q); else - max_sectors = q->max_sectors; + max_sectors = queue_max_sectors(q); if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { req->cmd_flags |= REQ_NOMERGE; @@ -251,9 +252,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, unsigned short max_sectors; if (unlikely(blk_pc_request(req))) - max_sectors = q->max_hw_sectors; + max_sectors = queue_max_hw_sectors(q); else - max_sectors = q->max_sectors; + max_sectors = queue_max_sectors(q); if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { @@ -287,7 +288,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? 
*/ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > q->max_sectors) + if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q)) return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; @@ -299,10 +300,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, total_phys_segments--; } - if (total_phys_segments > q->max_phys_segments) + if (total_phys_segments > queue_max_phys_segments(q)) return 0; - if (total_phys_segments > q->max_hw_segments) + if (total_phys_segments > queue_max_hw_segments(q)) return 0; /* Merge is OK... */ diff --git a/block/blk-settings.c b/block/blk-settings.c index 15c3164537b..0b32f984eed 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -219,6 +219,15 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) } EXPORT_SYMBOL(blk_queue_max_sectors); +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) +{ + if (BLK_DEF_MAX_SECTORS > max_sectors) + q->max_hw_sectors = BLK_DEF_MAX_SECTORS; + else + q->max_hw_sectors = max_sectors; +} +EXPORT_SYMBOL(blk_queue_max_hw_sectors); + /** * blk_queue_max_phys_segments - set max phys segments for a request for this queue * @q: the request queue for the device @@ -395,11 +404,11 @@ int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size) { - if (q->max_hw_segments < 2 || q->max_phys_segments < 2) + if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2) return -EINVAL; /* make room for appending the drain */ - --q->max_hw_segments; - --q->max_phys_segments; + blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1); + blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1); q->dma_drain_needed = dma_drain_needed; q->dma_drain_buffer = buf; q->dma_drain_size = size; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 13d38b7e4d0..142a4acddd4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -95,7 +95,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) { - int max_sectors_kb = q->max_sectors >> 1; + int max_sectors_kb = queue_max_sectors(q) >> 1; return queue_var_show(max_sectors_kb, (page)); } @@ -109,7 +109,7 @@ static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { unsigned long max_sectors_kb, - max_hw_sectors_kb = q->max_hw_sectors >> 1, + max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, page_kb = 1 << (PAGE_CACHE_SHIFT - 10); ssize_t ret = queue_var_store(&max_sectors_kb, page, count); @@ -117,7 +117,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) return -EINVAL; spin_lock_irq(q->queue_lock); - q->max_sectors = max_sectors_kb << 1; + blk_queue_max_sectors(q, max_sectors_kb << 1); spin_unlock_irq(q->queue_lock); return ret; @@ -125,7 +125,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) { - int max_hw_sectors_kb = q->max_hw_sectors >> 1; + int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; return queue_var_show(max_hw_sectors_kb, (page)); } diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 9eaa1940273..df18a156d01 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -766,7 +766,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return 
compat_put_int(arg, bdev_logical_block_size(bdev)); case BLKSECTGET: return compat_put_ushort(arg, - bdev_get_queue(bdev)->max_sectors); + queue_max_sectors(bdev_get_queue(bdev))); case BLKRASET: /* compatible, but no compat_ptr (!) */ case BLKFRASET: if (!capable(CAP_SYS_ADMIN)) diff --git a/block/ioctl.c b/block/ioctl.c index 7aa97f65da8..500e4c73cc5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -152,10 +152,10 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, bio->bi_private = &wait; bio->bi_sector = start; - if (len > q->max_hw_sectors) { - bio->bi_size = q->max_hw_sectors << 9; - len -= q->max_hw_sectors; - start += q->max_hw_sectors; + if (len > queue_max_hw_sectors(q)) { + bio->bi_size = queue_max_hw_sectors(q) << 9; + len -= queue_max_hw_sectors(q); + start += queue_max_hw_sectors(q); } else { bio->bi_size = len << 9; len = 0; @@ -313,7 +313,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKSSZGET: /* get block device hardware sector size */ return put_int(arg, bdev_logical_block_size(bdev)); case BLKSECTGET: - return put_ushort(arg, bdev_get_queue(bdev)->max_sectors); + return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a9670dd4b5d..5f8e798ede4 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -75,7 +75,7 @@ static int sg_set_timeout(struct request_queue *q, int __user *p) static int sg_get_reserved_size(struct request_queue *q, int __user *p) { - unsigned val = min(q->sg_reserved_size, q->max_sectors << 9); + unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9); return put_user(val, p); } @@ -89,8 +89,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p) if (size < 0) return -EINVAL; - if (size > (q->max_sectors << 9)) - size = q->max_sectors << 9; + if (size > (queue_max_sectors(q) << 9)) + size = queue_max_sectors(q) << 9; q->sg_reserved_size = size; return 0; @@ -264,7 +264,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->cmd_len > BLK_MAX_CDB) return -EINVAL; - if (hdr->dxfer_len > (q->max_hw_sectors << 9)) + if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) return -EIO; if (hdr->dxfer_len) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 293f5858921..d57f1175948 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -991,13 +991,15 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) */ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) { - if ((pd->settings.size << 9) / CD_FRAMESIZE <= q->max_phys_segments) { + if ((pd->settings.size << 9) / CD_FRAMESIZE + <= queue_max_phys_segments(q)) { /* * The cdrom device can handle one segment/frame */ clear_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; - } else if ((pd->settings.size << 9) / PAGE_SIZE <= q->max_phys_segments) { + } else if ((pd->settings.size << 9) / PAGE_SIZE + <= queue_max_phys_segments(q)) { /* * We can handle this case at the expense of some extra memory * copies during write operations diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index cceace61ef2..71d1b9bab70 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2101,8 +2101,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, nr = nframes; if (cdi->cdda_method == CDDA_BPC_SINGLE) nr = 1; - if (nr * CD_FRAMESIZE_RAW > (q->max_sectors << 9)) - nr 
= (q->max_sectors << 9) / CD_FRAMESIZE_RAW; + if (nr * CD_FRAMESIZE_RAW > (queue_max_sectors(q) << 9)) + nr = (queue_max_sectors(q) << 9) / CD_FRAMESIZE_RAW; len = nr * CD_FRAMESIZE_RAW; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 65e2d975985..e9a73bb242b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -510,7 +510,7 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) * combine_restrictions_low() */ rs->max_sectors = - min_not_zero(rs->max_sectors, q->max_sectors); + min_not_zero(rs->max_sectors, queue_max_sectors(q)); /* * Check if merge fn is supported. @@ -525,25 +525,25 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) rs->max_phys_segments = min_not_zero(rs->max_phys_segments, - q->max_phys_segments); + queue_max_phys_segments(q)); rs->max_hw_segments = - min_not_zero(rs->max_hw_segments, q->max_hw_segments); + min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q)); rs->logical_block_size = max(rs->logical_block_size, queue_logical_block_size(q)); rs->max_segment_size = - min_not_zero(rs->max_segment_size, q->max_segment_size); + min_not_zero(rs->max_segment_size, queue_max_segment_size(q)); rs->max_hw_sectors = - min_not_zero(rs->max_hw_sectors, q->max_hw_sectors); + min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q)); rs->seg_boundary_mask = min_not_zero(rs->seg_boundary_mask, - q->seg_boundary_mask); + queue_segment_boundary(q)); - rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn); + rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q)); rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); } @@ -914,13 +914,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) * restrictions. */ blk_queue_max_sectors(q, t->limits.max_sectors); - q->max_phys_segments = t->limits.max_phys_segments; - q->max_hw_segments = t->limits.max_hw_segments; - q->logical_block_size = t->limits.logical_block_size; - q->max_segment_size = t->limits.max_segment_size; - q->max_hw_sectors = t->limits.max_hw_sectors; - q->seg_boundary_mask = t->limits.seg_boundary_mask; - q->bounce_pfn = t->limits.bounce_pfn; + blk_queue_max_phys_segments(q, t->limits.max_phys_segments); + blk_queue_max_hw_segments(q, t->limits.max_hw_segments); + blk_queue_logical_block_size(q, t->limits.logical_block_size); + blk_queue_max_segment_size(q, t->limits.max_segment_size); + blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors); + blk_queue_segment_boundary(q, t->limits.seg_boundary_mask); + blk_queue_bounce_limit(q, t->limits.bounce_pfn); if (t->limits.no_cluster) queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 7a36e38393a..64f1f3e046e 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->num_sectors = rdev->sectors; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 41ced0cbe82..4ee31aa13c4 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * merge_bvec_fn will be involved in multipath.) 
*/ if (q->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(q) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); conf->working_disks++; @@ -467,7 +467,7 @@ static int multipath_run (mddev_t *mddev) * violating it, not that we ever expect a device with * a merge_bvec_fn to be involved in multipath */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!test_bit(Faulty, &rdev->flags)) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c08d7559be5..925507e7d67 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -144,7 +144,7 @@ static int create_strip_zones (mddev_t *mddev) */ if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!smallest || (rdev1->sectors < smallest->sectors)) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 36df9109cde..e23758b4a34 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; @@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 499620afb44..750550c1166 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; @@ -2145,8 +2145,8 @@ static int run(mddev_t *mddev) * a one page request is never in violation. 
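The clamp repeated across the md personalities (linear, multipath, raid0, raid1, raid10) deserves a gloss: PAGE_SIZE >> 9 is the page size expressed in 512-byte sectors (8 on 4K-page systems), so capping max_sectors at that value guarantees a single-page bio can never violate a member device's merge_bvec_fn restrictions. Collected into one invented helper:

#include <linux/blkdev.h>

/*
 * Invented helper, not from the patch: the recurring md clamp in one place.
 * mddev_t and mdk_rdev_t are the md core types used in the hunks above.
 */
static void clamp_for_merge_bvec(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct request_queue *member = bdev_get_queue(rdev->bdev);

	if (member->merge_bvec_fn &&
	    queue_max_sectors(mddev->queue) > (PAGE_SIZE >> 9))
		blk_queue_max_sectors(mddev->queue, PAGE_SIZE >> 9);
}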
*/ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4616bc3a6e7..7970dc8c522 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3463,10 +3463,10 @@ static int bio_fits_rdev(struct bio *bi) { struct request_queue *q = bdev_get_queue(bi->bi_bdev); - if ((bi->bi_size>>9) > q->max_sectors) + if ((bi->bi_size>>9) > queue_max_sectors(q)) return 0; blk_recount_segments(q, bi); - if (bi->bi_phys_segments > q->max_phys_segments) + if (bi->bi_phys_segments > queue_max_phys_segments(q)) return 0; if (q->merge_bvec_fn) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0fc2c0ae769..9bd407fa98e 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -289,8 +289,8 @@ sg_open(struct inode *inode, struct file *filp) if (list_empty(&sdp->sfds)) { /* no existing opens on this device */ sdp->sgdebug = 0; q = sdp->device->request_queue; - sdp->sg_tablesize = min(q->max_hw_segments, - q->max_phys_segments); + sdp->sg_tablesize = min(queue_max_hw_segments(q), + queue_max_phys_segments(q)); } if ((sfp = sg_add_sfp(sdp, dev))) filp->private_data = sfp; @@ -909,7 +909,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (val < 0) return -EINVAL; val = min_t(int, val, - sdp->device->request_queue->max_sectors * 512); + queue_max_sectors(sdp->device->request_queue) * 512); if (val != sfp->reserve.bufflen) { if (sg_res_in_use(sfp) || sfp->mmap_called) return -EBUSY; @@ -919,7 +919,7 @@ sg_ioctl(struct inode *inode, struct file *filp, return 0; case SG_GET_RESERVED_SIZE: val = min_t(int, sfp->reserve.bufflen, - sdp->device->request_queue->max_sectors * 512); + queue_max_sectors(sdp->device->request_queue) * 512); return put_user(val, ip); case SG_SET_COMMAND_Q: result = get_user(val, ip); @@ -1059,7 +1059,7 @@ sg_ioctl(struct inode *inode, struct file *filp, return -ENODEV; return scsi_ioctl(sdp->device, cmd_in, p); case BLKSECTGET: - return put_user(sdp->device->request_queue->max_sectors * 512, + return put_user(queue_max_sectors(sdp->device->request_queue) * 512, ip); case BLKTRACESETUP: return blk_trace_setup(sdp->device->request_queue, @@ -1377,7 +1377,8 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp) sdp->device = scsidp; INIT_LIST_HEAD(&sdp->sfds); init_waitqueue_head(&sdp->o_excl_wait); - sdp->sg_tablesize = min(q->max_hw_segments, q->max_phys_segments); + sdp->sg_tablesize = min(queue_max_hw_segments(q), + queue_max_phys_segments(q)); sdp->index = k; kref_init(&sdp->d_ref); @@ -2055,7 +2056,7 @@ sg_add_sfp(Sg_device * sdp, int dev) sg_big_buff = def_reserved_size; bufflen = min_t(int, sg_big_buff, - sdp->device->request_queue->max_sectors * 512); + queue_max_sectors(sdp->device->request_queue) * 512); sg_build_reserve(sfp, bufflen); SCSI_LOG_TIMEOUT(3, printk("sg_add_sfp: bufflen=%d, k_use_sg=%d\n", sfp->reserve.bufflen, sfp->reserve.k_use_sg)); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 8681b708344..89bd438e1fe 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3983,8 +3983,8 @@ static int st_probe(struct device *dev) return -ENODEV; } - i = min(SDp->request_queue->max_hw_segments, - SDp->request_queue->max_phys_segments); + i = min(queue_max_hw_segments(SDp->request_queue), + queue_max_phys_segments(SDp->request_queue)); if (st_max_sg_segs < i) i = 
st_max_sg_segs; buffer = new_tape_buffer((SDp->host)->unchecked_isa_dma, i); diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index 4ca3b586064..cfa26d56ce6 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -132,7 +132,7 @@ static int slave_configure(struct scsi_device *sdev) if (us->fflags & US_FL_MAX_SECTORS_MIN) max_sectors = PAGE_CACHE_SIZE >> 9; - if (sdev->request_queue->max_sectors > max_sectors) + if (queue_max_sectors(sdev->request_queue) > max_sectors) blk_queue_max_sectors(sdev->request_queue, max_sectors); } else if (sdev->type == TYPE_TAPE) { @@ -483,7 +483,7 @@ static ssize_t show_max_sectors(struct device *dev, struct device_attribute *att { struct scsi_device *sdev = to_scsi_device(dev); - return sprintf(buf, "%u\n", sdev->request_queue->max_sectors); + return sprintf(buf, "%u\n", queue_max_sectors(sdev->request_queue)); } /* Input routine for the sysfs max_sectors file */ diff --git a/fs/bio.c b/fs/bio.c index 4445c382173..ab423a1024a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -499,11 +499,11 @@ int bio_get_nr_vecs(struct block_device *bdev) struct request_queue *q = bdev_get_queue(bdev); int nr_pages; - nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > q->max_phys_segments) - nr_pages = q->max_phys_segments; - if (nr_pages > q->max_hw_segments) - nr_pages = q->max_hw_segments; + nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > queue_max_phys_segments(q)) + nr_pages = queue_max_phys_segments(q); + if (nr_pages > queue_max_hw_segments(q)) + nr_pages = queue_max_hw_segments(q); return nr_pages; } @@ -562,8 +562,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * make this too complex. 
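bio construction obeys the same wrappers: bio_get_nr_vecs(), converted above, derives a page count from queue_max_sectors() and the two segment limits. A hedged sketch of a caller (invented) sizing an allocation from it:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Invented caller: allocate a bio no larger than the queue will accept. */
static struct bio *alloc_bounded_bio(struct block_device *bdev, gfp_t gfp)
{
	int nr = bio_get_nr_vecs(bdev);	/* bounded by the queue_max_*() limits */

	return bio_alloc(gfp, nr);	/* may return NULL on allocation failure */
}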
*/ - while (bio->bi_phys_segments >= q->max_phys_segments - || bio->bi_phys_segments >= q->max_hw_segments) { + while (bio->bi_phys_segments >= queue_max_phys_segments(q) + || bio->bi_phys_segments >= queue_max_hw_segments(q)) { if (retried_segments) return 0; @@ -634,7 +634,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); + return __bio_add_page(q, bio, page, len, offset, + queue_max_hw_sectors(q)); } /** @@ -654,7 +655,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, q->max_sectors); + return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); } struct bio_map_data { diff --git a/include/linux/bio.h b/include/linux/bio.h index d30ec6f30dd..12737be5860 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -279,7 +279,7 @@ static inline int bio_has_allocated_vec(struct bio *bio) #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ (((addr1) | (mask)) == (((addr2) - 1) | (mask))) #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ - __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask) + __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q))) #define BIO_SEG_BOUNDARY(q, b1, b2) \ BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2))) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 872b78b7a10..29b48f7b4ba 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -898,6 +898,7 @@ extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_queue_max_sectors(struct request_queue *, unsigned int); +extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); @@ -988,6 +989,41 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter); #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) +static inline unsigned long queue_bounce_pfn(struct request_queue *q) +{ + return q->bounce_pfn; +} + +static inline unsigned long queue_segment_boundary(struct request_queue *q) +{ + return q->seg_boundary_mask; +} + +static inline unsigned int queue_max_sectors(struct request_queue *q) +{ + return q->max_sectors; +} + +static inline unsigned int queue_max_hw_sectors(struct request_queue *q) +{ + return q->max_hw_sectors; +} + +static inline unsigned short queue_max_hw_segments(struct request_queue *q) +{ + return q->max_hw_segments; +} + +static inline unsigned short queue_max_phys_segments(struct request_queue *q) +{ + return q->max_phys_segments; +} + +static inline unsigned int queue_max_segment_size(struct request_queue *q) +{ + return q->max_segment_size; +} + static inline unsigned short queue_logical_block_size(struct request_queue *q) { int retval = 512; diff --git a/mm/bounce.c b/mm/bounce.c index e590272fe7a..8dcd4315e01 100644 --- 
a/mm/bounce.c +++ b/mm/bounce.c @@ -192,7 +192,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, /* * is destination page below bounce pfn? */ - if (page_to_pfn(page) <= q->bounce_pfn) + if (page_to_pfn(page) <= queue_bounce_pfn(q)) continue; /* @@ -284,7 +284,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (q->bounce_pfn >= blk_max_pfn) + if (queue_bounce_pfn(q) >= blk_max_pfn) return; pool = page_pool; } else { -- cgit v1.2.3-70-g09d2 From 025146e13b63483add912706c101fb0fb6f015cc Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:51 -0400 Subject: block: Move queue limits to an embedded struct To accommodate stacking drivers that do not have an associated request queue we're moving the limits to a separate, embedded structure. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 55 +++++++++++++++++++++++++++++++------------------- include/linux/blkdev.h | 44 +++++++++++++++++++++++----------------- 2 files changed, 60 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 0b32f984eed..b0f547cecfb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -179,16 +179,16 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->bounce_pfn = max_low_pfn; + q->limits.bounce_pfn = max_low_pfn; #else if (b_pfn < blk_max_low_pfn) dma = 1; - q->bounce_pfn = b_pfn; + q->limits.bounce_pfn = b_pfn; #endif if (dma) { init_emergency_isa_pool(); q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->bounce_pfn = b_pfn; + q->limits.bounce_pfn = b_pfn; } } EXPORT_SYMBOL(blk_queue_bounce_limit); @@ -211,10 +211,10 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) } if (BLK_DEF_MAX_SECTORS > max_sectors) - q->max_hw_sectors = q->max_sectors = max_sectors; + q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors; else { - q->max_sectors = BLK_DEF_MAX_SECTORS; - q->max_hw_sectors = max_sectors; + q->limits.max_sectors = BLK_DEF_MAX_SECTORS; + q->limits.max_hw_sectors = max_sectors; } } EXPORT_SYMBOL(blk_queue_max_sectors); @@ -222,9 +222,9 @@ EXPORT_SYMBOL(blk_queue_max_sectors); void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) { if (BLK_DEF_MAX_SECTORS > max_sectors) - q->max_hw_sectors = BLK_DEF_MAX_SECTORS; + q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS; else - q->max_hw_sectors = max_sectors; + q->limits.max_hw_sectors = max_sectors; } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -247,7 +247,7 @@ void blk_queue_max_phys_segments(struct request_queue *q, __func__, max_segments); } - q->max_phys_segments = max_segments; + q->limits.max_phys_segments = max_segments; } EXPORT_SYMBOL(blk_queue_max_phys_segments); @@ -271,7 +271,7 @@ void blk_queue_max_hw_segments(struct request_queue *q, __func__, max_segments); } - q->max_hw_segments = max_segments; + q->limits.max_hw_segments = max_segments; } EXPORT_SYMBOL(blk_queue_max_hw_segments); @@ -292,7 +292,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) __func__, max_size); } - q->max_segment_size = max_size; + q->limits.max_segment_size = max_size; } EXPORT_SYMBOL(blk_queue_max_segment_size); @@ -308,7 +308,7 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); **/ void 
blk_queue_logical_block_size(struct request_queue *q, unsigned short size) { - q->logical_block_size = size; + q->limits.logical_block_size = size; } EXPORT_SYMBOL(blk_queue_logical_block_size); @@ -325,14 +325,27 @@ EXPORT_SYMBOL(blk_queue_logical_block_size); void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) { /* zero is "infinity" */ - t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); - t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); - t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); - - t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments); - t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments); - t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - t->logical_block_size = max(t->logical_block_size, b->logical_block_size); + t->limits.max_sectors = min_not_zero(queue_max_sectors(t), + queue_max_sectors(b)); + + t->limits.max_hw_sectors = min_not_zero(queue_max_hw_sectors(t), + queue_max_hw_sectors(b)); + + t->limits.seg_boundary_mask = min_not_zero(queue_segment_boundary(t), + queue_segment_boundary(b)); + + t->limits.max_phys_segments = min_not_zero(queue_max_phys_segments(t), + queue_max_phys_segments(b)); + + t->limits.max_hw_segments = min_not_zero(queue_max_hw_segments(t), + queue_max_hw_segments(b)); + + t->limits.max_segment_size = min_not_zero(queue_max_segment_size(t), + queue_max_segment_size(b)); + + t->limits.logical_block_size = max(queue_logical_block_size(t), + queue_logical_block_size(b)); + if (!t->queue_lock) WARN_ON_ONCE(1); else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { @@ -430,7 +443,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) __func__, mask); } - q->seg_boundary_mask = mask; + q->limits.seg_boundary_mask = mask; } EXPORT_SYMBOL(blk_queue_segment_boundary); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 29b48f7b4ba..b7bb6fdba12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -307,6 +307,21 @@ struct blk_cmd_filter { struct kobject kobj; }; +struct queue_limits { + unsigned long bounce_pfn; + unsigned long seg_boundary_mask; + + unsigned int max_hw_sectors; + unsigned int max_sectors; + unsigned int max_segment_size; + + unsigned short logical_block_size; + unsigned short max_hw_segments; + unsigned short max_phys_segments; + + unsigned char no_cluster; +}; + struct request_queue { /* @@ -358,7 +373,6 @@ struct request_queue /* * queue needs bounce pages for pages above this limit */ - unsigned long bounce_pfn; gfp_t bounce_gfp; /* @@ -387,14 +401,6 @@ struct request_queue unsigned int nr_congestion_off; unsigned int nr_batching; - unsigned int max_sectors; - unsigned int max_hw_sectors; - unsigned short max_phys_segments; - unsigned short max_hw_segments; - unsigned short logical_block_size; - unsigned int max_segment_size; - - unsigned long seg_boundary_mask; void *dma_drain_buffer; unsigned int dma_drain_size; unsigned int dma_pad_mask; @@ -410,6 +416,8 @@ struct request_queue struct timer_list timeout; struct list_head timeout_list; + struct queue_limits limits; + /* * sg stuff */ @@ -991,45 +999,45 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter); static inline unsigned long queue_bounce_pfn(struct request_queue *q) { - return q->bounce_pfn; + return q->limits.bounce_pfn; } static inline unsigned long queue_segment_boundary(struct request_queue *q) { - return q->seg_boundary_mask; 
+ return q->limits.seg_boundary_mask; } static inline unsigned int queue_max_sectors(struct request_queue *q) { - return q->max_sectors; + return q->limits.max_sectors; } static inline unsigned int queue_max_hw_sectors(struct request_queue *q) { - return q->max_hw_sectors; + return q->limits.max_hw_sectors; } static inline unsigned short queue_max_hw_segments(struct request_queue *q) { - return q->max_hw_segments; + return q->limits.max_hw_segments; } static inline unsigned short queue_max_phys_segments(struct request_queue *q) { - return q->max_phys_segments; + return q->limits.max_phys_segments; } static inline unsigned int queue_max_segment_size(struct request_queue *q) { - return q->max_segment_size; + return q->limits.max_segment_size; } static inline unsigned short queue_logical_block_size(struct request_queue *q) { int retval = 512; - if (q && q->logical_block_size) - retval = q->logical_block_size; + if (q && q->limits.logical_block_size) + retval = q->limits.logical_block_size; return retval; } -- cgit v1.2.3-70-g09d2 From c72758f33784e5e2a1a4bb9421ef3e6de8f9fcf3 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:53 -0400 Subject: block: Export I/O topology for block devices and partitions To support devices with physical block sizes bigger than 512 bytes we need to ensure proper alignment. This patch adds support for exposing I/O topology characteristics as devices are stacked. logical_block_size is the smallest unit the device can address. physical_block_size indicates the smallest I/O the device can write without incurring a read-modify-write penalty. The io_min parameter is the smallest preferred I/O size reported by the device. In many cases this is the same as the physical block size. However, the io_min parameter can be scaled up when stacking (RAID5 chunk size > physical block size). The io_opt characteristic indicates the optimal I/O size reported by the device. This is usually the stripe width for arrays. The alignment_offset parameter indicates the number of bytes the start of the device/partition is offset from the device's natural alignment. Partition tools and MD/DM utilities can use this to pad their offsets so filesystems start on proper boundaries. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 59 +++++++++++ block/blk-settings.c | 186 ++++++++++++++++++++++++++++++++++ block/blk-sysfs.c | 33 ++++++ block/genhd.c | 11 ++ fs/partitions/check.c | 10 ++ include/linux/blkdev.h | 47 +++++++++ include/linux/genhd.h | 1 + 7 files changed, 347 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 44f52a4f590..cbbd3e06994 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -60,3 +60,62 @@ Description: Indicates whether the block layer should automatically generate checksums for write requests bound for devices that support receiving integrity metadata. + +What: /sys/block//alignment_offset +Date: April 2009 +Contact: Martin K. Petersen +Description: + Storage devices may report a physical block size that is + bigger than the logical block size (for instance a drive + with 4KB physical sectors exposing 512-byte logical + blocks to the operating system). This parameter + indicates how many bytes the beginning of the device is + offset from the disk's natural alignment. + +What: /sys/block///alignment_offset +Date: April 2009 +Contact: Martin K. 
Petersen +Description: + Storage devices may report a physical block size that is + bigger than the logical block size (for instance a drive + with 4KB physical sectors exposing 512-byte logical + blocks to the operating system). This parameter + indicates how many bytes the beginning of the partition + is offset from the disk's natural alignment. + +What: /sys/block//queue/logical_block_size +Date: May 2009 +Contact: Martin K. Petersen +Description: + This is the smallest unit the storage device can + address. It is typically 512 bytes. + +What: /sys/block//queue/physical_block_size +Date: May 2009 +Contact: Martin K. Petersen +Description: + This is the smallest unit the storage device can write + without resorting to read-modify-write operation. It is + usually the same as the logical block size but may be + bigger. One example is SATA drives with 4KB sectors + that expose a 512-byte logical block size to the + operating system. + +What: /sys/block//queue/minimum_io_size +Date: April 2009 +Contact: Martin K. Petersen +Description: + Storage devices may report a preferred minimum I/O size, + which is the smallest request the device can perform + without incurring a read-modify-write penalty. For disk + drives this is often the physical block size. For RAID + arrays it is often the stripe chunk size. + +What: /sys/block//queue/optimal_io_size +Date: April 2009 +Contact: Martin K. Petersen +Description: + Storage devices may report an optimal I/O size, which is + the device's preferred unit of receiving I/O. This is + rarely reported for disk drives. For RAID devices it is + usually the stripe width or the internal block size. diff --git a/block/blk-settings.c b/block/blk-settings.c index b0f547cecfb..5649f34adb4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -309,9 +309,94 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) { q->limits.logical_block_size = size; + + if (q->limits.physical_block_size < size) + q->limits.physical_block_size = size; + + if (q->limits.io_min < q->limits.physical_block_size) + q->limits.io_min = q->limits.physical_block_size; } EXPORT_SYMBOL(blk_queue_logical_block_size); +/** + * blk_queue_physical_block_size - set physical block size for the queue + * @q: the request queue for the device + * @size: the physical block size, in bytes + * + * Description: + * This should be set to the lowest possible sector size that the + * hardware can operate on without reverting to read-modify-write + * operations. + */ +void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) +{ + q->limits.physical_block_size = size; + + if (q->limits.physical_block_size < q->limits.logical_block_size) + q->limits.physical_block_size = q->limits.logical_block_size; + + if (q->limits.io_min < q->limits.physical_block_size) + q->limits.io_min = q->limits.physical_block_size; +} +EXPORT_SYMBOL(blk_queue_physical_block_size); + +/** + * blk_queue_alignment_offset - set physical block alignment offset + * @q: the request queue for the device + * @alignment: alignment offset in bytes + * + * Description: + * Some devices are naturally misaligned to compensate for things like + * the legacy DOS partition table 63-sector offset. Low-level drivers + * should call this function for devices whose first sector is not + * naturally aligned. 
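 *
 * A worked example with illustrative numbers (hypothetical drive): a
 * disk with 4096-byte physical and 512-byte logical sectors that is
 * shifted so the legacy partition start at LBA 63 comes out aligned
 * has its first naturally aligned byte at LBA 7, i.e. 7 * 512 = 3584
 * bytes into the device. Calling blk_queue_alignment_offset(q, 3584)
 * records 3584 & (4096 - 1) == 3584 as the alignment offset, and LBA
 * 63 at byte 32256 is indeed aligned, since 32256 - 3584 = 28672 =
 * 7 * 4096.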
+ */ +void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) +{ + q->limits.alignment_offset = + offset & (q->limits.physical_block_size - 1); + q->limits.misaligned = 0; +} +EXPORT_SYMBOL(blk_queue_alignment_offset); + +/** + * blk_queue_io_min - set minimum request size for the queue + * @q: the request queue for the device + * @io_min: smallest I/O size in bytes + * + * Description: + * Some devices have an internal block size bigger than the reported + * hardware sector size. This function can be used to signal the + * smallest I/O the device can perform without incurring a performance + * penalty. + */ +void blk_queue_io_min(struct request_queue *q, unsigned int min) +{ + q->limits.io_min = min; + + if (q->limits.io_min < q->limits.logical_block_size) + q->limits.io_min = q->limits.logical_block_size; + + if (q->limits.io_min < q->limits.physical_block_size) + q->limits.io_min = q->limits.physical_block_size; +} +EXPORT_SYMBOL(blk_queue_io_min); + +/** + * blk_queue_io_opt - set optimal request size for the queue + * @q: the request queue for the device + * @io_opt: optimal request size in bytes + * + * Description: + * Drivers can call this function to set the preferred I/O request + * size for devices that report such a value. + */ +void blk_queue_io_opt(struct request_queue *q, unsigned int opt) +{ + q->limits.io_opt = opt; +} +EXPORT_SYMBOL(blk_queue_io_opt); + /* * Returns the minimum that is _not_ zero, unless both are zero. */ @@ -357,6 +442,107 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) } EXPORT_SYMBOL(blk_queue_stack_limits); +/** + * blk_stack_limits - adjust queue_limits for stacked devices + * @t: the stacking driver limits (top) + * @b: the underlying queue limits (bottom) + * @offset: offset to beginning of data within component device + * + * Description: + * Merges two queue_limits structs. Returns 0 if alignment didn't + * change. Returns -1 if adding the bottom device caused + * misalignment. + */ +int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, + sector_t offset) +{ + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); + t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + + t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, + b->seg_boundary_mask); + + t->max_phys_segments = min_not_zero(t->max_phys_segments, + b->max_phys_segments); + + t->max_hw_segments = min_not_zero(t->max_hw_segments, + b->max_hw_segments); + + t->max_segment_size = min_not_zero(t->max_segment_size, + b->max_segment_size); + + t->logical_block_size = max(t->logical_block_size, + b->logical_block_size); + + t->physical_block_size = max(t->physical_block_size, + b->physical_block_size); + + t->io_min = max(t->io_min, b->io_min); + t->no_cluster |= b->no_cluster; + + /* Bottom device offset aligned? */ + if (offset && + (offset & (b->physical_block_size - 1)) != b->alignment_offset) { + t->misaligned = 1; + return -1; + } + + /* If top has no alignment offset, inherit from bottom */ + if (!t->alignment_offset) + t->alignment_offset = + b->alignment_offset & (b->physical_block_size - 1); + + /* Top device aligned on logical block boundary?
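(for instance, with the illustrative numbers above, an alignment_offset of 3584 and a 512-byte logical block size pass this check, since 3584 & 511 == 0, while an offset of, say, 100 bytes would flag the stacked device as misaligned)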
*/ + if (t->alignment_offset & (t->logical_block_size - 1)) { + t->misaligned = 1; + return -1; + } + + return 0; +} + +/** + * disk_stack_limits - adjust queue limits for stacked drivers + * @disk: MD/DM gendisk (top) + * @bdev: the underlying block device (bottom) + * @offset: offset to beginning of data within component device + * + * Description: + * Merges the limits for two queues. This function returns no value; + * if stacking @bdev at @offset leaves the disk misaligned, a warning + * is logged instead. + */ +void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, + sector_t offset) +{ + struct request_queue *t = disk->queue; + struct request_queue *b = bdev_get_queue(bdev); + + offset += get_start_sect(bdev) << 9; + + if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; + + disk_name(disk, 0, top); + bdevname(bdev, bottom); + + printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", + top, bottom); + } + + if (!t->queue_lock) + WARN_ON_ONCE(1); + else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { + unsigned long flags; + + spin_lock_irqsave(t->queue_lock, flags); + if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) + queue_flag_clear(QUEUE_FLAG_CLUSTER, t); + spin_unlock_irqrestore(t->queue_lock, flags); + } +} +EXPORT_SYMBOL(disk_stack_limits); + /** * blk_queue_dma_pad - set pad mask * @q: the request queue for the device diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3ccdadb8e20..9337e17f911 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -105,6 +105,21 @@ static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) return queue_var_show(queue_logical_block_size(q), page); } +static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_physical_block_size(q), page); +} + +static ssize_t queue_io_min_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_io_min(q), page); +} + +static ssize_t queue_io_opt_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_io_opt(q), page); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -257,6 +272,21 @@ static struct queue_sysfs_entry queue_logical_block_size_entry = { .show = queue_logical_block_size_show, }; +static struct queue_sysfs_entry queue_physical_block_size_entry = { + .attr = {.name = "physical_block_size", .mode = S_IRUGO }, + .show = queue_physical_block_size_show, +}; + +static struct queue_sysfs_entry queue_io_min_entry = { + .attr = {.name = "minimum_io_size", .mode = S_IRUGO }, + .show = queue_io_min_show, +}; + +static struct queue_sysfs_entry queue_io_opt_entry = { + .attr = {.name = "optimal_io_size", .mode = S_IRUGO }, + .show = queue_io_opt_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_nonrot_show, @@ -289,6 +319,9 @@ static struct attribute *default_attrs[] = { &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, + &queue_physical_block_size_entry.attr, + &queue_io_min_entry.attr, + &queue_io_opt_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/genhd.c b/block/genhd.c index 1a4916e0173..fe7ccc0a618 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -852,11 +852,21 @@ static ssize_t disk_capability_show(struct device *dev, return sprintf(buf, "%x\n",
disk->flags); } +static ssize_t disk_alignment_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -875,6 +885,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_removable.attr, &dev_attr_ro.attr, &dev_attr_size.attr, + &dev_attr_alignment_offset.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 99e33ef40be..0af36085eb2 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } +ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); +} + ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev, static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = @@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, + &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, @@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; + p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b7bb6fdba12..5e740a135e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -314,11 +314,16 @@ struct queue_limits { unsigned int max_hw_sectors; unsigned int max_sectors; unsigned int max_segment_size; + unsigned int physical_block_size; + unsigned int alignment_offset; + unsigned int io_min; + unsigned int io_opt; unsigned short logical_block_size; unsigned short max_hw_segments; unsigned short max_phys_segments; + unsigned char misaligned; unsigned char no_cluster; }; @@ -911,6 +916,15 @@ extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); +extern void 
blk_queue_physical_block_size(struct request_queue *, unsigned short); +extern void blk_queue_alignment_offset(struct request_queue *q, + unsigned int alignment); +extern void blk_queue_io_min(struct request_queue *q, unsigned int min); +extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, + sector_t offset); +extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, + sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); @@ -1047,6 +1061,39 @@ static inline unsigned short bdev_logical_block_size(struct block_device *bdev) return queue_logical_block_size(bdev_get_queue(bdev)); } +static inline unsigned int queue_physical_block_size(struct request_queue *q) +{ + return q->limits.physical_block_size; +} + +static inline unsigned int queue_io_min(struct request_queue *q) +{ + return q->limits.io_min; +} + +static inline unsigned int queue_io_opt(struct request_queue *q) +{ + return q->limits.io_opt; +} + +static inline int queue_alignment_offset(struct request_queue *q) +{ + if (q && q->limits.misaligned) + return -1; + + if (q && q->limits.alignment_offset) + return q->limits.alignment_offset; + + return 0; +} + +static inline int queue_sector_alignment_offset(struct request_queue *q, + sector_t sector) +{ + return ((sector << 9) - q->limits.alignment_offset) + & (q->limits.io_min - 1); +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a1a28caed23..149fda264c8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -90,6 +90,7 @@ struct disk_stats { struct hd_struct { sector_t start_sect; sector_t nr_sects; + sector_t alignment_offset; struct device __dev; struct kobject *holder_dir; int policy, partno; -- cgit v1.2.3-70-g09d2 From e220d2dcb944c5c488b6855d15ec66d76900514f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:55 +0200 Subject: perf_counter: Fix dynamic irq_period logging We call perf_adjust_freq() from perf_counter_task_tick() which is called under the rq->lock, causing lock recursion. However, it's no longer required to be called under the rq->lock, so remove it from under it. Also, fix up some related comments. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.476197912@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 3 ++- kernel/sched.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2eedae8498d..23ddd29730f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -260,6 +260,7 @@ enum perf_event_type { /* * struct { * struct perf_event_header header; + * u64 time; * u64 irq_period; * }; */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c10055416de..2f410ea2cb3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2559,7 +2559,8 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * + * Log irq_period changes so that analyzing tools can re-normalize the + * event flow.
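 *
 * (Sketch of why this helps: a sample taken while irq_period was p
 * stands for roughly p hardware events, so a tool can weight each
 * sample by the most recently logged period for that counter instead
 * of assuming the period never changed.)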
*/ static void perf_log_period(struct perf_counter *counter, u64 period) diff --git a/kernel/sched.c b/kernel/sched.c index 4c0d58bce6b..ad079f07c9c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4875,9 +4875,10 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); + perf_counter_task_tick(curr, cpu); + #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); -- cgit v1.2.3-70-g09d2 From fccc714b3148ab9741fafc1e90c3876d50df6093 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:56 +0200 Subject: perf_counter: Sanitize counter->mutex s/counter->mutex/counter->child_mutex/ and make sure it's only used to protect child_list. The usage in __perf_counter_exit_task() doesn't appear to be problematic since ctx->mutex also covers anything related to fd tear-down. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.533186528@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++---- kernel/perf_counter.c | 47 ++++++++++++++++++-------------------------- 2 files changed, 22 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 23ddd29730f..4ab8050eb9e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -452,9 +452,6 @@ struct perf_counter { struct perf_counter_context *ctx; struct file *filp; - struct perf_counter *parent; - struct list_head child_list; - /* * These accumulate total time (in nanoseconds) that children * counters have been enabled and running, respectively. @@ -465,7 +462,9 @@ struct perf_counter { /* * Protect attach/detach and child_list: */ - struct mutex mutex; + struct mutex child_mutex; + struct list_head child_list; + struct perf_counter *parent; int oncpu; int cpu; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2f410ea2cb3..679c3b5bb7d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -111,6 +111,10 @@ static void put_ctx(struct perf_counter_context *ctx) } } +/* + * Add a counter to the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -136,7 +140,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) /* * Remove a counter from the lists for its context. - * Must be called with counter->mutex and ctx->mutex held. + * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) @@ -276,7 +280,7 @@ static void __perf_counter_remove_from_context(void *info) /* * Remove the counter from a task's (or a CPU's) list of counters. * - * Must be called with counter->mutex and ctx->mutex held. + * Must be called with ctx->mutex held. * * CPU counters are removed with an smp call. For task counters we only call when the task is on a CPU.
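 *
 * A rough sketch of that dispatch, using names that appear elsewhere
 * in this file (the exact call site lies outside this hunk, so treat
 * this as an illustration rather than a quote):
 *
 *	if (!task) {
 *		smp_call_function_single(counter->cpu,
 *					 __perf_counter_remove_from_context,
 *					 counter, 1);
 *		return;
 *	}
 *	task_oncpu_function_call(task, __perf_counter_remove_from_context,
 *				 counter);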
@@ -1407,11 +1411,7 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; mutex_lock(&ctx->mutex); - mutex_lock(&counter->mutex); - perf_counter_remove_from_context(counter); - - mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); free_counter(counter); @@ -1437,7 +1437,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -1446,7 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) return -EINVAL; @@ -1510,11 +1510,11 @@ static void perf_counter_for_each_child(struct perf_counter *counter, { struct perf_counter *child; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); func(counter); list_for_each_entry(child, &counter->child_list, child_list) func(child); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); } static void perf_counter_for_each(struct perf_counter *counter, @@ -1522,11 +1522,11 @@ static void perf_counter_for_each(struct perf_counter *counter, { struct perf_counter *child; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); perf_counter_for_each_sibling(counter, func); list_for_each_entry(child, &counter->child_list, child_list) perf_counter_for_each_sibling(child, func); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -3106,7 +3106,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, if (!group_leader) group_leader = counter; - mutex_init(&counter->mutex); + mutex_init(&counter->child_mutex); + INIT_LIST_HEAD(&counter->child_list); + INIT_LIST_HEAD(&counter->list_entry); INIT_LIST_HEAD(&counter->event_entry); INIT_LIST_HEAD(&counter->sibling_list); @@ -3114,8 +3116,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); - INIT_LIST_HEAD(&counter->child_list); - counter->cpu = cpu; counter->hw_event = *hw_event; counter->group_leader = group_leader; @@ -3346,10 +3346,9 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link this into the parent counter's child list */ - mutex_lock(&parent_counter->mutex); + mutex_lock(&parent_counter->child_mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); - - mutex_unlock(&parent_counter->mutex); + mutex_unlock(&parent_counter->child_mutex); return child_counter; } @@ -3396,9 +3395,9 @@ static void sync_child_counter(struct perf_counter *child_counter, /* * Remove this counter from the parent's list */ - mutex_lock(&parent_counter->mutex); + mutex_lock(&parent_counter->child_mutex); list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->mutex); + mutex_unlock(&parent_counter->child_mutex); /* * Release the parent counter, if this was the last @@ -3414,17 +3413,9 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; - /* - * Protect against concurrent operations on child_counter - * due its fd getting closed, etc. 
- */ - mutex_lock(&child_counter->mutex); - update_counter_times(child_counter); list_del_counter(child_counter, child_ctx); - mutex_unlock(&child_counter->mutex); - parent_counter = child_counter->parent; /* * It can happen that parent exits first, and has counters -- cgit v1.2.3-70-g09d2 From 082ff5a2767a0679ee543f14883adbafb631ffbe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:00 +0200 Subject: perf_counter: Change pctrl() behaviour Instead of en/dis-abling all counters acting on a particular task, en/dis-able all counters we created. [ v2: fix crash on first counter enable ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.916937244@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 10 +++++ include/linux/perf_counter.h | 3 ++ include/linux/sched.h | 2 + kernel/perf_counter.c | 87 ++++++++++++-------------------------------- 4 files changed, 39 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d87247d2641..353c0ac7723 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,6 +108,15 @@ extern struct group_info init_groups; extern struct cred init_cred; +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk) \ + .perf_counter_mutex = \ + __MUTEX_INITIALIZER(tsk.perf_counter_mutex), \ + .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -171,6 +180,7 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ + INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4ab8050eb9e..4159ee5940f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -469,6 +469,9 @@ struct perf_counter { int oncpu; int cpu; + struct list_head owner_entry; + struct task_struct *owner; + /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9714d450f41..bc9326dcdde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1389,6 +1389,8 @@ struct task_struct { #endif #ifdef CONFIG_PERF_COUNTERS struct perf_counter_context *perf_counter_ctxp; + struct mutex perf_counter_mutex; + struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0e97f896133..4c86a636976 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1076,79 +1076,26 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } -int perf_counter_task_disable(void) +int perf_counter_task_enable(void) { - struct task_struct *curr = current; - struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; - unsigned long flags; - - if (!ctx || !ctx->nr_counters) - return 0; - - local_irq_save(flags); - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - /* - * Disable all the counters: - */ - perf_disable(); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) { - update_group_times(counter); - counter->state
= PERF_COUNTER_STATE_OFF; - } - } - - perf_enable(); - - spin_unlock_irqrestore(&ctx->lock, flags); + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_enable(counter); + mutex_unlock(¤t->perf_counter_mutex); return 0; } -int perf_counter_task_enable(void) +int perf_counter_task_disable(void) { - struct task_struct *curr = current; - struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; - unsigned long flags; - int cpu; - - if (!ctx || !ctx->nr_counters) - return 0; - - local_irq_save(flags); - cpu = smp_processor_id(); - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - /* - * Disable all the counters: - */ - perf_disable(); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state > PERF_COUNTER_STATE_OFF) - continue; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; - counter->hw_event.disabled = 0; - } - perf_enable(); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(curr, cpu); - - local_irq_restore(flags); + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_disable(counter); + mutex_unlock(¤t->perf_counter_mutex); return 0; } @@ -1416,6 +1363,11 @@ static int perf_release(struct inode *inode, struct file *file) perf_counter_remove_from_context(counter); mutex_unlock(&ctx->mutex); + mutex_lock(&counter->owner->perf_counter_mutex); + list_del_init(&counter->owner_entry); + mutex_unlock(&counter->owner->perf_counter_mutex); + put_task_struct(counter->owner); + free_counter(counter); put_context(ctx); @@ -3272,6 +3224,12 @@ SYSCALL_DEFINE5(perf_counter_open, perf_install_in_context(ctx, counter, cpu); mutex_unlock(&ctx->mutex); + counter->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_counter_mutex); + list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); + mutex_unlock(¤t->perf_counter_mutex); + fput_light(counter_file, fput_needed2); out_fput: @@ -3488,6 +3446,9 @@ void perf_counter_init_task(struct task_struct *child) child->perf_counter_ctxp = NULL; + mutex_init(&child->perf_counter_mutex); + INIT_LIST_HEAD(&child->perf_counter_list); + /* * This is executed from the parent task context, so inherit * counters that have been marked for cloning. -- cgit v1.2.3-70-g09d2 From 475c55797323b67435083f6e2eb8ee670f6410ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:01 +0200 Subject: perf_counter: Remove perf_counter_context::nr_enabled now that pctrl() no longer disables other people's counters, remove the PMU cache code that deals with that. 
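For context: from userspace this per-task toggle is reached through prctl(), which ends up in the perf_counter_task_enable()/perf_counter_task_disable() paths reworked above. A minimal sketch (assuming the PR_TASK_PERF_COUNTERS_* prctl constants used by this series; error handling omitted):

	#include <sys/prctl.h>

	/*
	 * With the reworked pctrl() semantics this toggles only the
	 * counters the calling task created, not every counter that
	 * happens to be attached to it.
	 */
	static void count_workload(void (*workload)(void))
	{
		prctl(PR_TASK_PERF_COUNTERS_ENABLE);
		workload();
		prctl(PR_TASK_PERF_COUNTERS_DISABLE);
	}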
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163013.032998331@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 11 +---------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4159ee5940f..2ddf5e3c551 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -516,7 +516,6 @@ struct perf_counter_context { struct list_head event_list; int nr_counters; int nr_active; - int nr_enabled; int is_active; atomic_t refcount; struct task_struct *task; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c86a636976..cb4062559b4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -134,8 +134,6 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - ctx->nr_enabled++; } /* @@ -150,8 +148,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - ctx->nr_enabled--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -406,7 +402,6 @@ static void __perf_counter_disable(void *info) else counter_sched_out(counter, cpuctx, ctx); counter->state = PERF_COUNTER_STATE_OFF; - ctx->nr_enabled--; } spin_unlock_irqrestore(&ctx->lock, flags); @@ -448,7 +443,6 @@ static void perf_counter_disable(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; - ctx->nr_enabled--; } spin_unlock_irq(&ctx->lock); @@ -759,7 +753,6 @@ static void __perf_counter_enable(void *info) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - ctx->nr_enabled++; /* * If the counter is in a group and isn't the group leader, @@ -850,7 +843,6 @@ static void perf_counter_enable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - ctx->nr_enabled++; } out: spin_unlock_irq(&ctx->lock); @@ -910,8 +902,7 @@ static int context_equiv(struct perf_counter_context *ctx1, struct perf_counter_context *ctx2) { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && ctx1->nr_enabled == ctx2->nr_enabled; + && ctx1->parent_gen == ctx2->parent_gen; } /* -- cgit v1.2.3-70-g09d2 From 7df3bb8f59ca8e346bb834006c257cc367c6250a Mon Sep 17 00:00:00 2001 From: Andreas Eversberg Date: Fri, 22 May 2009 11:04:44 +0000 Subject: mISDN: Add watchdog functionality to hfcmulti driver This patch was made by Titus Moldovan and provides IOCTL functions for enabling and disabling the controller's built in watchdog. The use is optional. Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil Signed-off-by: David S. 
Miller --- drivers/isdn/hardware/mISDN/hfc_multi.h | 1 + drivers/isdn/hardware/mISDN/hfcmulti.c | 32 +++++++++++++++++++++++++++++++- include/linux/mISDNif.h | 3 ++- 3 files changed, 34 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/hfc_multi.h b/drivers/isdn/hardware/mISDN/hfc_multi.h index 663b77f578b..5765e196291 100644 --- a/drivers/isdn/hardware/mISDN/hfc_multi.h +++ b/drivers/isdn/hardware/mISDN/hfc_multi.h @@ -62,6 +62,7 @@ struct hfcm_hw { u_char r_sci_msk; u_char r_tx0, r_tx1; u_char a_st_ctrl0[8]; + u_char r_bert_wd_md; timer_t timer; }; diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index 0b28141e43b..ca153de6954 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -4013,11 +4013,41 @@ open_bchannel(struct hfc_multi *hc, struct dchannel *dch, static int channel_dctrl(struct dchannel *dch, struct mISDN_ctrl_req *cq) { + struct hfc_multi *hc = dch->hw; int ret = 0; + int wd_mode, wd_cnt; switch (cq->op) { case MISDN_CTRL_GETOP: - cq->op = 0; + cq->op = MISDN_CTRL_HFC_OP; + break; + case MISDN_CTRL_HFC_WD_INIT: /* init the watchdog */ + wd_cnt = cq->p1 & 0xf; + wd_mode = !!(cq->p1 >> 4); + if (debug & DEBUG_HFCMULTI_MSG) + printk(KERN_DEBUG + "%s: MISDN_CTRL_HFC_WD_INIT mode %s counter 0x%x\n", + __func__, wd_mode ? "AUTO" : "MANUAL", wd_cnt); + /* set the watchdog timer */ + HFC_outb(hc, R_TI_WD, poll_timer | (wd_cnt << 4)); + hc->hw.r_bert_wd_md = (wd_mode ? V_AUTO_WD_RES : 0); + if (hc->ctype == HFC_TYPE_XHFC) + hc->hw.r_bert_wd_md |= 0x40 /* V_WD_EN */; + /* init the watchdog register and reset the counter */ + HFC_outb(hc, R_BERT_WD_MD, hc->hw.r_bert_wd_md | V_WD_RES); + if (test_bit(HFC_CHIP_PLXSD, &hc->chip)) { + /* enable the watchdog output for Speech-Design */ + HFC_outb(hc, R_GPIO_SEL, V_GPIO_SEL7); + HFC_outb(hc, R_GPIO_EN1, V_GPIO_EN15); + HFC_outb(hc, R_GPIO_OUT1, 0); + HFC_outb(hc, R_GPIO_OUT1, V_GPIO_OUT15); + } + break; + case MISDN_CTRL_HFC_WD_RESET: /* reset the watchdog counter */ + if (debug & DEBUG_HFCMULTI_MSG) + printk(KERN_DEBUG "%s: MISDN_CTRL_HFC_WD_RESET\n", + __func__); + HFC_outb(hc, R_BERT_WD_MD, hc->hw.r_bert_wd_md | V_WD_RES); break; default: printk(KERN_WARNING "%s: unknown Op %x\n", diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 5da3d95b27f..cf974593a99 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -362,7 +362,8 @@ clear_channelmap(u_int nr, u_char *map) #define MISDN_CTRL_HFC_RECEIVE_ON 0x4006 #define MISDN_CTRL_HFC_ECHOCAN_ON 0x4007 #define MISDN_CTRL_HFC_ECHOCAN_OFF 0x4008 - +#define MISDN_CTRL_HFC_WD_INIT 0x4009 +#define MISDN_CTRL_HFC_WD_RESET 0x400A /* socket options */ #define MISDN_TIME_STAMP 0x0001 -- cgit v1.2.3-70-g09d2 From 7cfa153dd709f15188fe84b78ae76387841fe17b Mon Sep 17 00:00:00 2001 From: Andreas Eversberg Date: Fri, 22 May 2009 11:04:46 +0000 Subject: mISDN: Echo canceler now gets delay information from hardware Added tx-fifo information for calculation of current delay to sync tx and rx streams for echo canceler. Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil Signed-off-by: David S. 
Miller --- drivers/isdn/hardware/mISDN/hfc_multi.h | 1 + drivers/isdn/hardware/mISDN/hfcmulti.c | 8 +++- drivers/isdn/hardware/mISDN/hfcpci.c | 70 +++++++++++++++++++-------------- drivers/isdn/hardware/mISDN/hfcsusb.c | 4 +- drivers/isdn/mISDN/dsp.h | 2 +- drivers/isdn/mISDN/dsp_core.c | 2 +- drivers/isdn/mISDN/dsp_pipeline.c | 5 ++- drivers/isdn/mISDN/hwchannel.c | 4 +- include/linux/mISDNdsp.h | 3 +- include/linux/mISDNhw.h | 2 +- 10 files changed, 60 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/hfc_multi.h b/drivers/isdn/hardware/mISDN/hfc_multi.h index 5765e196291..c4878cc712c 100644 --- a/drivers/isdn/hardware/mISDN/hfc_multi.h +++ b/drivers/isdn/hardware/mISDN/hfc_multi.h @@ -44,6 +44,7 @@ struct hfc_chan { int conf; /* conference setting of TX slot */ int txpending; /* if there is currently data in */ /* the FIFO 0=no, 1=yes, 2=splloop */ + int Zfill; /* rx-fifo level on last hfcmulti_tx */ int rx_off; /* set to turn fifo receive off */ int coeff_count; /* curren coeff block */ s32 *coeff; /* memory pointer to 8 coeff blocks */ diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index ca153de6954..3a7c26ce12c 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -1945,6 +1945,9 @@ next_frame: "%d!=%d\n", __func__, hc->id + 1, temp, z2); z2 = temp; /* repeat unti Z2 is equal */ } + hc->chan[ch].Zfill = z1 - z2; + if (hc->chan[ch].Zfill < 0) + hc->chan[ch].Zfill += hc->Zlen; Zspace = z2 - z1; if (Zspace <= 0) Zspace += hc->Zlen; @@ -2031,6 +2034,7 @@ next_frame: /* Have to prep the audio data */ hc->write_fifo(hc, d, ii - i); + hc->chan[ch].Zfill += ii - i; *idxp = ii; /* if not all data has been written */ @@ -2226,7 +2230,7 @@ next_frame: if (dch) recv_Dchannel(dch); else - recv_Bchannel(bch); + recv_Bchannel(bch, MISDN_ID_ANY); *sp = skb; again++; goto next_frame; @@ -2258,7 +2262,7 @@ next_frame: "(z1=%04x, z2=%04x) TRANS\n", __func__, hc->id + 1, ch, Zsize, z1, z2); /* only bch is transparent */ - recv_Bchannel(bch); + recv_Bchannel(bch, hc->chan[ch].Zfill); *sp = skb; } } diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index 641a9cd1a53..60dc92562c6 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -452,7 +452,7 @@ hfcpci_empty_bfifo(struct bchannel *bch, struct bzfifo *bz, } bz->za[new_f2].z2 = cpu_to_le16(new_z2); bz->f2 = new_f2; /* next buffer */ - recv_Bchannel(bch); + recv_Bchannel(bch, MISDN_ID_ANY); } } @@ -541,35 +541,45 @@ receive_dmsg(struct hfc_pci *hc) * check for transparent receive data and read max one 'poll' size if avail */ static void -hfcpci_empty_fifo_trans(struct bchannel *bch, struct bzfifo *bz, u_char *bdata) +hfcpci_empty_fifo_trans(struct bchannel *bch, struct bzfifo *rxbz, + struct bzfifo *txbz, u_char *bdata) { - __le16 *z1r, *z2r; - int new_z2, fcnt, maxlen; - u_char *ptr, *ptr1; + __le16 *z1r, *z2r, *z1t, *z2t; + int new_z2, fcnt_rx, fcnt_tx, maxlen; + u_char *ptr, *ptr1; - z1r = &bz->za[MAX_B_FRAMES].z1; /* pointer to z reg */ + z1r = &rxbz->za[MAX_B_FRAMES].z1; /* pointer to z reg */ z2r = z1r + 1; + z1t = &txbz->za[MAX_B_FRAMES].z1; + z2t = z1t + 1; - fcnt = le16_to_cpu(*z1r) - le16_to_cpu(*z2r); - if (!fcnt) + fcnt_rx = le16_to_cpu(*z1r) - le16_to_cpu(*z2r); + if (!fcnt_rx) return; /* no data avail */ - if (fcnt <= 0) - fcnt += B_FIFO_SIZE; /* bytes actually buffered */ - new_z2 = le16_to_cpu(*z2r) + fcnt; /* new 
position in fifo */ + if (fcnt_rx <= 0) + fcnt_rx += B_FIFO_SIZE; /* bytes actually buffered */ + new_z2 = le16_to_cpu(*z2r) + fcnt_rx; /* new position in fifo */ if (new_z2 >= (B_FIFO_SIZE + B_SUB_VAL)) new_z2 -= B_FIFO_SIZE; /* buffer wrap */ - if (fcnt > MAX_DATA_SIZE) { /* flush, if oversized */ + if (fcnt_rx > MAX_DATA_SIZE) { /* flush, if oversized */ *z2r = cpu_to_le16(new_z2); /* new position */ return; } - bch->rx_skb = mI_alloc_skb(fcnt, GFP_ATOMIC); + fcnt_tx = le16_to_cpu(*z2t) - le16_to_cpu(*z1t); + if (fcnt_tx <= 0) + fcnt_tx += B_FIFO_SIZE; + /* fcnt_tx contains available bytes in tx-fifo */ + fcnt_tx = B_FIFO_SIZE - fcnt_tx; + /* remaining bytes to send (bytes in tx-fifo) */ + + bch->rx_skb = mI_alloc_skb(fcnt_rx, GFP_ATOMIC); if (bch->rx_skb) { - ptr = skb_put(bch->rx_skb, fcnt); - if (le16_to_cpu(*z2r) + fcnt <= B_FIFO_SIZE + B_SUB_VAL) - maxlen = fcnt; /* complete transfer */ + ptr = skb_put(bch->rx_skb, fcnt_rx); + if (le16_to_cpu(*z2r) + fcnt_rx <= B_FIFO_SIZE + B_SUB_VAL) + maxlen = fcnt_rx; /* complete transfer */ else maxlen = B_FIFO_SIZE + B_SUB_VAL - le16_to_cpu(*z2r); /* maximum */ @@ -577,14 +587,14 @@ hfcpci_empty_fifo_trans(struct bchannel *bch, struct bzfifo *bz, u_char *bdata) ptr1 = bdata + (le16_to_cpu(*z2r) - B_SUB_VAL); /* start of data */ memcpy(ptr, ptr1, maxlen); /* copy data */ - fcnt -= maxlen; + fcnt_rx -= maxlen; - if (fcnt) { /* rest remaining */ + if (fcnt_rx) { /* rest remaining */ ptr += maxlen; ptr1 = bdata; /* start of buffer */ - memcpy(ptr, ptr1, fcnt); /* rest */ + memcpy(ptr, ptr1, fcnt_rx); /* rest */ } - recv_Bchannel(bch); + recv_Bchannel(bch, fcnt_tx); /* bch, id */ } else printk(KERN_WARNING "HFCPCI: receive out of memory\n"); @@ -600,26 +610,28 @@ main_rec_hfcpci(struct bchannel *bch) struct hfc_pci *hc = bch->hw; int rcnt, real_fifo; int receive = 0, count = 5; - struct bzfifo *bz; + struct bzfifo *txbz, *rxbz; u_char *bdata; struct zt *zp; if ((bch->nr & 2) && (!hc->hw.bswapped)) { - bz = &((union fifo_area *)(hc->hw.fifos))->b_chans.rxbz_b2; + rxbz = &((union fifo_area *)(hc->hw.fifos))->b_chans.rxbz_b2; + txbz = &((union fifo_area *)(hc->hw.fifos))->b_chans.txbz_b2; bdata = ((union fifo_area *)(hc->hw.fifos))->b_chans.rxdat_b2; real_fifo = 1; } else { - bz = &((union fifo_area *)(hc->hw.fifos))->b_chans.rxbz_b1; + rxbz = &((union fifo_area *)(hc->hw.fifos))->b_chans.rxbz_b1; + txbz = &((union fifo_area *)(hc->hw.fifos))->b_chans.txbz_b1; bdata = ((union fifo_area *)(hc->hw.fifos))->b_chans.rxdat_b1; real_fifo = 0; } Begin: count--; - if (bz->f1 != bz->f2) { + if (rxbz->f1 != rxbz->f2) { if (bch->debug & DEBUG_HW_BCHANNEL) printk(KERN_DEBUG "hfcpci rec ch(%x) f1(%d) f2(%d)\n", - bch->nr, bz->f1, bz->f2); - zp = &bz->za[bz->f2]; + bch->nr, rxbz->f1, rxbz->f2); + zp = &rxbz->za[rxbz->f2]; rcnt = le16_to_cpu(zp->z1) - le16_to_cpu(zp->z2); if (rcnt < 0) @@ -630,8 +642,8 @@ Begin: "hfcpci rec ch(%x) z1(%x) z2(%x) cnt(%d)\n", bch->nr, le16_to_cpu(zp->z1), le16_to_cpu(zp->z2), rcnt); - hfcpci_empty_bfifo(bch, bz, bdata, rcnt); - rcnt = bz->f1 - bz->f2; + hfcpci_empty_bfifo(bch, rxbz, bdata, rcnt); + rcnt = rxbz->f1 - rxbz->f2; if (rcnt < 0) rcnt += MAX_B_FRAMES + 1; if (hc->hw.last_bfifo_cnt[real_fifo] > rcnt + 1) { @@ -644,7 +656,7 @@ Begin: else receive = 0; } else if (test_bit(FLG_TRANSPARENT, &bch->Flags)) { - hfcpci_empty_fifo_trans(bch, bz, bdata); + hfcpci_empty_fifo_trans(bch, rxbz, txbz, bdata); return; } else receive = 0; diff --git a/drivers/isdn/hardware/mISDN/hfcsusb.c b/drivers/isdn/hardware/mISDN/hfcsusb.c index 
9c427fb204e..6b7704c41b9 100644 --- a/drivers/isdn/hardware/mISDN/hfcsusb.c +++ b/drivers/isdn/hardware/mISDN/hfcsusb.c @@ -947,7 +947,7 @@ hfcsusb_rx_frame(struct usb_fifo *fifo, __u8 *data, unsigned int len, if (fifo->dch) recv_Dchannel(fifo->dch); if (fifo->bch) - recv_Bchannel(fifo->bch); + recv_Bchannel(fifo->bch, MISDN_ID_ANY); if (fifo->ech) recv_Echannel(fifo->ech, &hw->dch); @@ -969,7 +969,7 @@ hfcsusb_rx_frame(struct usb_fifo *fifo, __u8 *data, unsigned int len, } else { /* deliver transparent data to layer2 */ if (rx_skb->len >= poll) - recv_Bchannel(fifo->bch); + recv_Bchannel(fifo->bch, MISDN_ID_ANY); } spin_unlock(&hw->lock); } diff --git a/drivers/isdn/mISDN/dsp.h b/drivers/isdn/mISDN/dsp.h index 0a4a362b89c..4a1c444d73a 100644 --- a/drivers/isdn/mISDN/dsp.h +++ b/drivers/isdn/mISDN/dsp.h @@ -262,5 +262,5 @@ extern int dsp_pipeline_build(struct dsp_pipeline *pipeline, const char *cfg); extern void dsp_pipeline_process_tx(struct dsp_pipeline *pipeline, u8 *data, int len); extern void dsp_pipeline_process_rx(struct dsp_pipeline *pipeline, u8 *data, - int len); + int len, unsigned int txlen); diff --git a/drivers/isdn/mISDN/dsp_core.c b/drivers/isdn/mISDN/dsp_core.c index 3083338716b..1c49368e0a9 100644 --- a/drivers/isdn/mISDN/dsp_core.c +++ b/drivers/isdn/mISDN/dsp_core.c @@ -710,7 +710,7 @@ dsp_function(struct mISDNchannel *ch, struct sk_buff *skb) /* pipeline */ if (dsp->pipeline.inuse) dsp_pipeline_process_rx(&dsp->pipeline, skb->data, - skb->len); + skb->len, hh->id); /* change volume if requested */ if (dsp->rx_volume) dsp_change_volume(skb, dsp->rx_volume); diff --git a/drivers/isdn/mISDN/dsp_pipeline.c b/drivers/isdn/mISDN/dsp_pipeline.c index 18cf87c113e..ac61f198eb3 100644 --- a/drivers/isdn/mISDN/dsp_pipeline.c +++ b/drivers/isdn/mISDN/dsp_pipeline.c @@ -347,7 +347,8 @@ void dsp_pipeline_process_tx(struct dsp_pipeline *pipeline, u8 *data, int len) entry->elem->process_tx(entry->p, data, len); } -void dsp_pipeline_process_rx(struct dsp_pipeline *pipeline, u8 *data, int len) +void dsp_pipeline_process_rx(struct dsp_pipeline *pipeline, u8 *data, int len, + unsigned int txlen) { struct dsp_pipeline_entry *entry; @@ -356,7 +357,7 @@ void dsp_pipeline_process_rx(struct dsp_pipeline *pipeline, u8 *data, int len) list_for_each_entry_reverse(entry, &pipeline->list, list) if (entry->elem->process_rx) - entry->elem->process_rx(entry->p, data, len); + entry->elem->process_rx(entry->p, data, len, txlen); } diff --git a/drivers/isdn/mISDN/hwchannel.c b/drivers/isdn/mISDN/hwchannel.c index ab1168a110a..0481a0cdf6d 100644 --- a/drivers/isdn/mISDN/hwchannel.c +++ b/drivers/isdn/mISDN/hwchannel.c @@ -185,13 +185,13 @@ recv_Echannel(struct dchannel *ech, struct dchannel *dch) EXPORT_SYMBOL(recv_Echannel); void -recv_Bchannel(struct bchannel *bch) +recv_Bchannel(struct bchannel *bch, unsigned int id) { struct mISDNhead *hh; hh = mISDN_HEAD_P(bch->rx_skb); hh->prim = PH_DATA_IND; - hh->id = MISDN_ID_ANY; + hh->id = id; if (bch->rcount >= 64) { printk(KERN_WARNING "B-channel %p receive queue overflow, " "fushing!\n", bch); diff --git a/include/linux/mISDNdsp.h b/include/linux/mISDNdsp.h index 6b71d2dce50..2c483d45f14 100644 --- a/include/linux/mISDNdsp.h +++ b/include/linux/mISDNdsp.h @@ -12,7 +12,8 @@ struct mISDN_dsp_element { void *(*new)(const char *arg); void (*free)(void *p); void (*process_tx)(void *p, unsigned char *data, int len); - void (*process_rx)(void *p, unsigned char *data, int len); + void (*process_rx)(void *p, unsigned char *data, int len, + unsigned int txlen); int 
num_args; struct mISDN_dsp_element_arg *args; diff --git a/include/linux/mISDNhw.h b/include/linux/mISDNhw.h index 97ffdc1d344..ce900f4c245 100644 --- a/include/linux/mISDNhw.h +++ b/include/linux/mISDNhw.h @@ -185,7 +185,7 @@ extern int dchannel_senddata(struct dchannel *, struct sk_buff *); extern int bchannel_senddata(struct bchannel *, struct sk_buff *); extern void recv_Dchannel(struct dchannel *); extern void recv_Echannel(struct dchannel *, struct dchannel *); -extern void recv_Bchannel(struct bchannel *); +extern void recv_Bchannel(struct bchannel *, unsigned int id); extern void recv_Dchannel_skb(struct dchannel *, struct sk_buff *); extern void recv_Bchannel_skb(struct bchannel *, struct sk_buff *); extern void confirm_Bsend(struct bchannel *bch); -- cgit v1.2.3-70-g09d2 From e73f6b2260daf02793071e5ce06ea87df762920a Mon Sep 17 00:00:00 2001 From: Andreas Eversberg Date: Fri, 22 May 2009 11:04:48 +0000 Subject: mISDN: Added layer-1-hold feature Add IMHOLD_L1 ioctl. The feature will be disabled on closing. Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil Signed-off-by: David S. Miller --- drivers/isdn/mISDN/socket.c | 21 ++++++++++++++++++--- drivers/isdn/mISDN/tei.c | 43 +++++++++++++++++++++++++++++++------------ include/linux/mISDNif.h | 2 ++ 3 files changed, 51 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 508945d1b9c..530f6897736 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -292,7 +292,7 @@ static int data_sock_ioctl_bound(struct sock *sk, unsigned int cmd, void __user *p) { struct mISDN_ctrl_req cq; - int err = -EINVAL, val; + int err = -EINVAL, val[2]; struct mISDNchannel *bchan, *next; lock_sock(sk); @@ -328,12 +328,27 @@ data_sock_ioctl_bound(struct sock *sk, unsigned int cmd, void __user *p) err = -EINVAL; break; } - if (get_user(val, (int __user *)p)) { + val[0] = cmd; + if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, - CONTROL_CHANNEL, &val); + CONTROL_CHANNEL, val); + break; + case IMHOLD_L1: + if (sk->sk_protocol != ISDN_P_LAPD_NT + && sk->sk_protocol != ISDN_P_LAPD_TE) { + err = -EINVAL; + break; + } + val[0] = cmd; + if (get_user(val[1], (int __user *)p)) { + err = -EFAULT; + break; + } + err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, + CONTROL_CHANNEL, val); break; default: err = -EINVAL; diff --git a/drivers/isdn/mISDN/tei.c b/drivers/isdn/mISDN/tei.c index b452dead8fd..c75af762a06 100644 --- a/drivers/isdn/mISDN/tei.c +++ b/drivers/isdn/mISDN/tei.c @@ -122,8 +122,11 @@ da_deactivate(struct FsmInst *fi, int event, void *arg) } read_unlock_irqrestore(&mgr->lock, flags); /* All TEI are inactiv */ - mISDN_FsmAddTimer(&mgr->datimer, DATIMER_VAL, EV_DATIMER, NULL, 1); - mISDN_FsmChangeState(fi, ST_L1_DEACT_PENDING); + if (!test_bit(OPTION_L1_HOLD, &mgr->options)) { + mISDN_FsmAddTimer(&mgr->datimer, DATIMER_VAL, EV_DATIMER, + NULL, 1); + mISDN_FsmChangeState(fi, ST_L1_DEACT_PENDING); + } } static void @@ -132,9 +135,11 @@ da_ui(struct FsmInst *fi, int event, void *arg) struct manager *mgr = fi->userdata; /* restart da timer */ - mISDN_FsmDelTimer(&mgr->datimer, 2); - mISDN_FsmAddTimer(&mgr->datimer, DATIMER_VAL, EV_DATIMER, NULL, 2); - + if (!test_bit(OPTION_L1_HOLD, &mgr->options)) { + mISDN_FsmDelTimer(&mgr->datimer, 2); + mISDN_FsmAddTimer(&mgr->datimer, DATIMER_VAL, EV_DATIMER, + NULL, 2); + } } static void @@ -1103,6 +1108,7 @@ 
free_teimanager(struct manager *mgr) { struct layer2 *l2, *nl2; + test_and_clear_bit(OPTION_L1_HOLD, &mgr->options); if (test_bit(MGR_OPT_NETWORK, &mgr->options)) { /* not locked lock is taken in release tei */ mgr->up = NULL; @@ -1133,13 +1139,26 @@ static int ctrl_teimanager(struct manager *mgr, void *arg) { /* currently we only have one option */ - int clean = *((int *)arg); - - if (clean) - test_and_set_bit(OPTION_L2_CLEANUP, &mgr->options); - else - test_and_clear_bit(OPTION_L2_CLEANUP, &mgr->options); - return 0; + int *val = (int *)arg; + int ret = 0; + + switch (val[0]) { + case IMCLEAR_L2: + if (val[1]) + test_and_set_bit(OPTION_L2_CLEANUP, &mgr->options); + else + test_and_clear_bit(OPTION_L2_CLEANUP, &mgr->options); + break; + case IMHOLD_L1: + if (val[1]) + test_and_set_bit(OPTION_L1_HOLD, &mgr->options); + else + test_and_clear_bit(OPTION_L1_HOLD, &mgr->options); + break; + default: + ret = -EINVAL; + } + return ret; } /* This function does create a L2 for fixed TEI in NT Mode */ diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index cf974593a99..0b28fd1e393 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -229,6 +229,7 @@ #define OPTION_L2_PTP 2 #define OPTION_L2_FIXEDTEI 3 #define OPTION_L2_CLEANUP 4 +#define OPTION_L1_HOLD 5 /* should be in sync with linux/kobject.h:KOBJ_NAME_LEN */ #define MISDN_MAX_IDLEN 20 @@ -317,6 +318,7 @@ struct ph_info { #define IMCTRLREQ _IOR('I', 69, int) #define IMCLEAR_L2 _IOR('I', 70, int) #define IMSETDEVNAME _IOR('I', 71, struct mISDN_devrename) +#define IMHOLD_L1 _IOR('I', 72, int) static inline int test_channelmap(u_int nr, u_char *map) -- cgit v1.2.3-70-g09d2 From db9bb63a1b5b65df41d112a8c21adbbfc6a4ac08 Mon Sep 17 00:00:00 2001 From: Karsten Keil Date: Fri, 22 May 2009 11:04:53 +0000 Subject: mISDN: Add XHFC support for embedded Speech-Design board to hfcmulti New version without emulating arch specific stuff for the other architectures, the special IO and init functions for the 8xx microcontroller are in a separate include file. Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil Signed-off-by: David S. Miller --- drivers/isdn/hardware/mISDN/Kconfig | 11 +- drivers/isdn/hardware/mISDN/hfc_multi.h | 45 ++- drivers/isdn/hardware/mISDN/hfc_multi_8xx.h | 167 +++++++++++ drivers/isdn/hardware/mISDN/hfcmulti.c | 416 +++++++++++++++++++--------- drivers/isdn/mISDN/dsp_cmx.c | 4 + include/linux/mISDNdsp.h | 1 + 6 files changed, 500 insertions(+), 144 deletions(-) create mode 100644 drivers/isdn/hardware/mISDN/hfc_multi_8xx.h (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/Kconfig b/drivers/isdn/hardware/mISDN/Kconfig index fd112ae252c..3024566dd09 100644 --- a/drivers/isdn/hardware/mISDN/Kconfig +++ b/drivers/isdn/hardware/mISDN/Kconfig @@ -13,7 +13,7 @@ config MISDN_HFCPCI config MISDN_HFCMULTI tristate "Support for HFC multiport cards (HFC-4S/8S/E1)" - depends on PCI + depends on PCI || 8xx depends on MISDN help Enable support for cards with Cologne Chip AG's HFC multiport @@ -23,6 +23,15 @@ config MISDN_HFCMULTI * HFC-8S (8 S/T interfaces on one chip) * HFC-E1 (E1 interface for 2Mbit ISDN) +config MISDN_HFCMULTI_8xx + boolean "Support for XHFC embedded board in HFC multiport driver" + depends on MISDN + depends on MISDN_HFCMULTI + depends on 8xx + default 8xx + help + Enable support for the XHFC embedded solution from Speech Design. 
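#
# An illustrative .config fragment that selects the new embedded
# option (symbol names as introduced above; assumes a CONFIG_8xx
# platform build):
#
#	CONFIG_MISDN=y
#	CONFIG_MISDN_HFCMULTI=y
#	CONFIG_MISDN_HFCMULTI_8xx=y
#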
+ config MISDN_HFCUSB tristate "Support for HFC-S USB based TAs" depends on USB diff --git a/drivers/isdn/hardware/mISDN/hfc_multi.h b/drivers/isdn/hardware/mISDN/hfc_multi.h index c4878cc712c..0c773866efc 100644 --- a/drivers/isdn/hardware/mISDN/hfc_multi.h +++ b/drivers/isdn/hardware/mISDN/hfc_multi.h @@ -17,6 +17,16 @@ #define PCI_ENA_REGIO 0x01 #define PCI_ENA_MEMIO 0x02 +#define XHFC_IRQ 4 /* SIU_IRQ2 */ +#define XHFC_MEMBASE 0xFE000000 +#define XHFC_MEMSIZE 0x00001000 +#define XHFC_OFFSET 0x00001000 +#define PA_XHFC_A0 0x0020 /* PA10 */ +#define PB_XHFC_IRQ1 0x00000100 /* PB23 */ +#define PB_XHFC_IRQ2 0x00000200 /* PB22 */ +#define PB_XHFC_IRQ3 0x00000400 /* PB21 */ +#define PB_XHFC_IRQ4 0x00000800 /* PB20 */ + /* * NOTE: some registers are assigned multiple times due to different modes * also registers are assigned differen for HFC-4s/8s and HFC-E1 @@ -81,6 +91,11 @@ struct hfcm_hw { #define HFC_CFG_CRC4 10 /* disable CRC-4 Multiframe mode, */ /* use double frame instead. */ +#define HFC_TYPE_E1 1 /* controller is HFC-E1 */ +#define HFC_TYPE_4S 4 /* controller is HFC-4S */ +#define HFC_TYPE_8S 8 /* controller is HFC-8S */ +#define HFC_TYPE_XHFC 5 /* controller is XHFC */ + #define HFC_CHIP_EXRAM_128 0 /* external ram 128k */ #define HFC_CHIP_EXRAM_512 1 /* external ram 256k */ #define HFC_CHIP_REVISION0 2 /* old fifo handling */ @@ -88,19 +103,22 @@ struct hfcm_hw { #define HFC_CHIP_PCM_MASTER 4 /* PCM is master */ #define HFC_CHIP_RX_SYNC 5 /* disable pll sync for pcm */ #define HFC_CHIP_DTMF 6 /* DTMF decoding is enabled */ -#define HFC_CHIP_ULAW 7 /* ULAW mode */ -#define HFC_CHIP_CLOCK2 8 /* double clock mode */ -#define HFC_CHIP_E1CLOCK_GET 9 /* always get clock from E1 interface */ -#define HFC_CHIP_E1CLOCK_PUT 10 /* always put clock from E1 interface */ -#define HFC_CHIP_WATCHDOG 11 /* whether we should send signals */ +#define HFC_CHIP_CONF 7 /* conference handling is enabled */ +#define HFC_CHIP_ULAW 8 /* ULAW mode */ +#define HFC_CHIP_CLOCK2 9 /* double clock mode */ +#define HFC_CHIP_E1CLOCK_GET 10 /* always get clock from E1 interface */ +#define HFC_CHIP_E1CLOCK_PUT 11 /* always put clock from E1 interface */ +#define HFC_CHIP_WATCHDOG 12 /* whether we should send signals */ /* to the watchdog */ -#define HFC_CHIP_B410P 12 /* whether we have a b410p with echocan in */ +#define HFC_CHIP_B410P 13 /* whether we have a b410p with echocan in */ /* hw */ -#define HFC_CHIP_PLXSD 13 /* whether we have a Speech-Design PLX */ +#define HFC_CHIP_PLXSD 14 /* whether we have a Speech-Design PLX */ +#define HFC_CHIP_EMBSD 15 /* whether we have a SD Embedded board */ #define HFC_IO_MODE_PCIMEM 0x00 /* normal memory mapped IO */ #define HFC_IO_MODE_REGIO 0x01 /* PCI io access */ #define HFC_IO_MODE_PLXSD 0x02 /* access HFC via PLX9030 */ +#define HFC_IO_MODE_EMBSD 0x03 /* direct access */ /* table entry in the PCI devices list */ struct hm_map { @@ -113,6 +131,7 @@ struct hm_map { int opticalsupport; int dip_type; int io_mode; + int irq; }; struct hfc_multi { @@ -120,7 +139,7 @@ struct hfc_multi { struct hm_map *mtyp; int id; int pcm; /* id of pcm bus */ - int type; + int ctype; /* controller type */ int ports; u_int irq; /* irq used by card */ @@ -160,10 +179,16 @@ struct hfc_multi { int len); void (*write_fifo)(struct hfc_multi *hc, u_char *data, int len); - u_long pci_origmembase, plx_origmembase, dsp_origmembase; + u_long pci_origmembase, plx_origmembase; void __iomem *pci_membase; /* PCI memory */ void __iomem *plx_membase; /* PLX memory */ - u_char *dsp_membase; /* DSP on PLX */ + 
u_long xhfc_origmembase; + u_char *xhfc_membase; + u_long *xhfc_memaddr, *xhfc_memdata; +#ifdef CONFIG_MISDN_HFCMULTI_8xx + struct immap *immap; +#endif + u_long pb_irqmsk; /* Portbit mask to check the IRQ line */ u_long pci_iobase; /* PCI IO */ struct hfcm_hw hw; /* remember data of write-only-registers */ diff --git a/drivers/isdn/hardware/mISDN/hfc_multi_8xx.h b/drivers/isdn/hardware/mISDN/hfc_multi_8xx.h new file mode 100644 index 00000000000..45ddced956d --- /dev/null +++ b/drivers/isdn/hardware/mISDN/hfc_multi_8xx.h @@ -0,0 +1,167 @@ +/* + * For License see notice in hfc_multi.c + * + * special IO and init functions for the embedded XHFC board + * from Speech Design + * + */ + +#include + +/* Change this to the value used by your board */ +#ifndef IMAP_ADDR +#define IMAP_ADDR 0xFFF00000 +#endif + +static void +#ifdef HFC_REGISTER_DEBUG +HFC_outb_embsd(struct hfc_multi *hc, u_char reg, u_char val, + const char *function, int line) +#else +HFC_outb_embsd(struct hfc_multi *hc, u_char reg, u_char val) +#endif +{ + hc->immap->im_ioport.iop_padat |= PA_XHFC_A0; + writeb(reg, hc->xhfc_memaddr); + hc->immap->im_ioport.iop_padat &= ~(PA_XHFC_A0); + writeb(val, hc->xhfc_memdata); +} +static u_char +#ifdef HFC_REGISTER_DEBUG +HFC_inb_embsd(struct hfc_multi *hc, u_char reg, const char *function, int line) +#else +HFC_inb_embsd(struct hfc_multi *hc, u_char reg) +#endif +{ + hc->immap->im_ioport.iop_padat |= PA_XHFC_A0; + writeb(reg, hc->xhfc_memaddr); + hc->immap->im_ioport.iop_padat &= ~(PA_XHFC_A0); + return readb(hc->xhfc_memdata); +} +static u_short +#ifdef HFC_REGISTER_DEBUG +HFC_inw_embsd(struct hfc_multi *hc, u_char reg, const char *function, int line) +#else +HFC_inw_embsd(struct hfc_multi *hc, u_char reg) +#endif +{ + hc->immap->im_ioport.iop_padat |= PA_XHFC_A0; + writeb(reg, hc->xhfc_memaddr); + hc->immap->im_ioport.iop_padat &= ~(PA_XHFC_A0); + return readb(hc->xhfc_memdata); +} +static void +#ifdef HFC_REGISTER_DEBUG +HFC_wait_embsd(struct hfc_multi *hc, const char *function, int line) +#else +HFC_wait_embsd(struct hfc_multi *hc) +#endif +{ + hc->immap->im_ioport.iop_padat |= PA_XHFC_A0; + writeb(R_STATUS, hc->xhfc_memaddr); + hc->immap->im_ioport.iop_padat &= ~(PA_XHFC_A0); + while (readb(hc->xhfc_memdata) & V_BUSY) + cpu_relax(); +} + +/* write fifo data (EMBSD) */ +void +write_fifo_embsd(struct hfc_multi *hc, u_char *data, int len) +{ + hc->immap->im_ioport.iop_padat |= PA_XHFC_A0; + *hc->xhfc_memaddr = A_FIFO_DATA0; + hc->immap->im_ioport.iop_padat &= ~(PA_XHFC_A0); + while (len) { + *hc->xhfc_memdata = *data; + data++; + len--; + } +} + +/* read fifo data (EMBSD) */ +void +read_fifo_embsd(struct hfc_multi *hc, u_char *data, int len) +{ + hc->immap->im_ioport.iop_padat |= PA_XHFC_A0; + *hc->xhfc_memaddr = A_FIFO_DATA0; + hc->immap->im_ioport.iop_padat &= ~(PA_XHFC_A0); + while (len) { + *data = (u_char)(*hc->xhfc_memdata); + data++; + len--; + } +} + +static int +setup_embedded(struct hfc_multi *hc, struct hm_map *m) +{ + printk(KERN_INFO + "HFC-multi: card manufacturer: '%s' card name: '%s' clock: %s\n", + m->vendor_name, m->card_name, m->clock2 ? 
"double" : "normal"); + + hc->pci_dev = NULL; + if (m->clock2) + test_and_set_bit(HFC_CHIP_CLOCK2, &hc->chip); + + hc->leds = m->leds; + hc->ledstate = 0xAFFEAFFE; + hc->opticalsupport = m->opticalsupport; + + hc->pci_iobase = 0; + hc->pci_membase = 0; + hc->xhfc_membase = NULL; + hc->xhfc_memaddr = NULL; + hc->xhfc_memdata = NULL; + + /* set memory access methods */ + if (m->io_mode) /* use mode from card config */ + hc->io_mode = m->io_mode; + switch (hc->io_mode) { + case HFC_IO_MODE_EMBSD: + test_and_set_bit(HFC_CHIP_EMBSD, &hc->chip); + hc->slots = 128; /* required */ + /* fall through */ + hc->HFC_outb = HFC_outb_embsd; + hc->HFC_inb = HFC_inb_embsd; + hc->HFC_inw = HFC_inw_embsd; + hc->HFC_wait = HFC_wait_embsd; + hc->read_fifo = read_fifo_embsd; + hc->write_fifo = write_fifo_embsd; + hc->xhfc_origmembase = XHFC_MEMBASE + XHFC_OFFSET * hc->id; + hc->xhfc_membase = (u_char *)ioremap(hc->xhfc_origmembase, + XHFC_MEMSIZE); + if (!hc->xhfc_membase) { + printk(KERN_WARNING + "HFC-multi: failed to remap xhfc address space. " + "(internal error)\n"); + return -EIO; + } + hc->xhfc_memaddr = (u_long *)(hc->xhfc_membase + 4); + hc->xhfc_memdata = (u_long *)(hc->xhfc_membase); + printk(KERN_INFO + "HFC-multi: xhfc_membase:%#lx xhfc_origmembase:%#lx " + "xhfc_memaddr:%#lx xhfc_memdata:%#lx\n", + (u_long)hc->xhfc_membase, hc->xhfc_origmembase, + (u_long)hc->xhfc_memaddr, (u_long)hc->xhfc_memdata); + break; + default: + printk(KERN_WARNING "HFC-multi: Invalid IO mode.\n"); + return -EIO; + } + + /* Prepare the MPC8XX PortA 10 as output (address/data selector) */ + hc->immap = (struct immap *)(IMAP_ADDR); + hc->immap->im_ioport.iop_papar &= ~(PA_XHFC_A0); + hc->immap->im_ioport.iop_paodr &= ~(PA_XHFC_A0); + hc->immap->im_ioport.iop_padir |= PA_XHFC_A0; + + /* Prepare the MPC8xx PortB __X__ as input (ISDN__X__IRQ) */ + hc->pb_irqmsk = (PB_XHFC_IRQ1 << hc->id); + hc->immap->im_cpm.cp_pbpar &= ~(hc->pb_irqmsk); + hc->immap->im_cpm.cp_pbodr &= ~(hc->pb_irqmsk); + hc->immap->im_cpm.cp_pbdir &= ~(hc->pb_irqmsk); + + /* At this point the needed config is done */ + /* fifos are still not enabled */ + return 0; +} diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index bc0d3efeb56..5be4edf631d 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -139,6 +139,10 @@ * Selects interface with clock source for mISDN and applications. * Set to card number starting with 1. Set to -1 to disable. * By default, the first card is used as clock source. + * + * hwid: + * NOTE: only one hwid value must be given once + * Enable special embedded devices with XHFC controllers. 
*/ /* @@ -206,6 +210,11 @@ static int clock; static uint timer; static uint clockdelay_te = CLKDEL_TE; static uint clockdelay_nt = CLKDEL_NT; +#define HWID_NONE 0 +#define HWID_MINIP4 1 +#define HWID_MINIP8 2 +#define HWID_MINIP16 3 +static uint hwid = HWID_NONE; static int HFC_cnt, Port_cnt, PCM_cnt = 99; @@ -223,6 +232,7 @@ module_param_array(pcm, int, NULL, S_IRUGO | S_IWUSR); module_param_array(dslot, int, NULL, S_IRUGO | S_IWUSR); module_param_array(iomode, uint, NULL, S_IRUGO | S_IWUSR); module_param_array(port, uint, NULL, S_IRUGO | S_IWUSR); +module_param(hwid, uint, S_IRUGO | S_IWUSR); /* The hardware ID */ #ifdef HFC_REGISTER_DEBUG #define HFC_outb(hc, reg, val) \ @@ -252,6 +262,10 @@ module_param_array(port, uint, NULL, S_IRUGO | S_IWUSR); #define HFC_wait_nodebug(hc) (hc->HFC_wait_nodebug(hc)) #endif +#ifdef CONFIG_MISDN_HFCMULTI_8xx +#include "hfc_multi_8xx.h" +#endif + /* HFC_IO_MODE_PCIMEM */ static void #ifdef HFC_REGISTER_DEBUG @@ -928,7 +942,7 @@ hfcmulti_resync(struct hfc_multi *locked, struct hfc_multi *newmaster, int rm) writel(pv, plx_acc_32); if (test_bit(HFC_CHIP_PCM_MASTER, &hc->chip)) { pcmmaster = hc; - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { if (debug & DEBUG_HFCMULTI_PLXSD) printk(KERN_DEBUG "Schedule SYNC_I\n"); @@ -949,7 +963,8 @@ hfcmulti_resync(struct hfc_multi *locked, struct hfc_multi *newmaster, int rm) pv |= PLX_SYNC_O_EN; writel(pv, plx_acc_32); /* switch to jatt PLL, if not disabled by RX_SYNC */ - if (hc->type == 1 && !test_bit(HFC_CHIP_RX_SYNC, &hc->chip)) { + if (hc->ctype == HFC_TYPE_E1 + && !test_bit(HFC_CHIP_RX_SYNC, &hc->chip)) { if (debug & DEBUG_HFCMULTI_PLXSD) printk(KERN_DEBUG "Schedule jatt PLL\n"); hc->e1_resync |= 2; /* switch to jatt */ @@ -961,7 +976,7 @@ hfcmulti_resync(struct hfc_multi *locked, struct hfc_multi *newmaster, int rm) printk(KERN_DEBUG "id=%d (0x%p) = PCM master syncronized " "with QUARTZ\n", hc->id, hc); - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { /* Use the crystal clock for the PCM master card */ if (debug & DEBUG_HFCMULTI_PLXSD) @@ -972,7 +987,7 @@ hfcmulti_resync(struct hfc_multi *locked, struct hfc_multi *newmaster, int rm) if (debug & DEBUG_HFCMULTI_PLXSD) printk(KERN_DEBUG "QUARTZ is automatically " - "enabled by HFC-%dS\n", hc->type); + "enabled by HFC-%dS\n", hc->ctype); } plx_acc_32 = hc->plx_membase + PLX_GPIOC; pv = readl(plx_acc_32); @@ -1060,13 +1075,16 @@ release_io_hfcmulti(struct hfc_multi *hc) /* disable memory mapped ports / io ports */ test_and_clear_bit(HFC_CHIP_PLXSD, &hc->chip); /* prevent resync */ - pci_write_config_word(hc->pci_dev, PCI_COMMAND, 0); + if (hc->pci_dev) + pci_write_config_word(hc->pci_dev, PCI_COMMAND, 0); if (hc->pci_membase) iounmap(hc->pci_membase); if (hc->plx_membase) iounmap(hc->plx_membase); if (hc->pci_iobase) release_region(hc->pci_iobase, 8); + if (hc->xhfc_membase) + iounmap((void *)hc->xhfc_membase); if (hc->pci_dev) { pci_disable_device(hc->pci_dev); @@ -1100,8 +1118,9 @@ init_chip(struct hfc_multi *hc) /* revision check */ if (debug & DEBUG_HFCMULTI_INIT) printk(KERN_DEBUG "%s: entered\n", __func__); - val = HFC_inb(hc, R_CHIP_ID)>>4; - if (val != 0x8 && val != 0xc && val != 0xe) { + val = HFC_inb(hc, R_CHIP_ID); + if ((val>>4) != 0x8 && (val>>4) != 0xc && (val>>4) != 0xe + && (val>>1) != 0x31) { printk(KERN_INFO "HFC_multi: unknown CHIP_ID:%x\n", (u_int)val); err = -EIO; goto out; @@ -1109,8 +1128,9 @@ init_chip(struct hfc_multi *hc) rev = HFC_inb(hc, R_CHIP_RV); printk(KERN_INFO "HFC_multi: detected HFC with chip ID=0x%lx 
revision=%ld%s\n", - val, rev, (rev == 0) ? " (old FIFO handling)" : ""); - if (rev == 0) { + val, rev, (rev == 0 && (hc->ctype != HFC_TYPE_XHFC)) ? + " (old FIFO handling)" : ""); + if (hc->ctype != HFC_TYPE_XHFC && rev == 0) { test_and_set_bit(HFC_CHIP_REVISION0, &hc->chip); printk(KERN_WARNING "HFC_multi: NOTE: Your chip is revision 0, " @@ -1152,6 +1172,12 @@ init_chip(struct hfc_multi *hc) hc->Zlen = 8000; hc->DTMFbase = 0x2000; } + if (hc->ctype == HFC_TYPE_XHFC) { + hc->Flen = 0x8; + hc->Zmin = 0x0; + hc->Zlen = 64; + hc->DTMFbase = 0x0; + } hc->max_trans = poll << 1; if (hc->max_trans > hc->Zlen) hc->max_trans = hc->Zlen; @@ -1211,6 +1237,9 @@ init_chip(struct hfc_multi *hc) hc->hw.r_pcm_md0 = V_F0_LEN; /* shift clock for DSP */ } + if (test_bit(HFC_CHIP_EMBSD, &hc->chip)) + hc->hw.r_pcm_md0 = V_F0_LEN; /* shift clock for DSP */ + /* we only want the real Z2 read-pointer for revision > 0 */ if (!test_bit(HFC_CHIP_REVISION0, &hc->chip)) hc->hw.r_ram_sz |= V_FZ_MD; @@ -1234,15 +1263,24 @@ init_chip(struct hfc_multi *hc) /* soft reset */ HFC_outb(hc, R_CTRL, hc->hw.r_ctrl); - HFC_outb(hc, R_RAM_SZ, hc->hw.r_ram_sz); + if (hc->ctype == HFC_TYPE_XHFC) + HFC_outb(hc, 0x0C /* R_FIFO_THRES */, + 0x11 /* 16 Bytes TX/RX */); + else + HFC_outb(hc, R_RAM_SZ, hc->hw.r_ram_sz); HFC_outb(hc, R_FIFO_MD, 0); - hc->hw.r_cirm = V_SRES | V_HFCRES | V_PCMRES | V_STRES | V_RLD_EPR; + if (hc->ctype == HFC_TYPE_XHFC) + hc->hw.r_cirm = V_SRES | V_HFCRES | V_PCMRES | V_STRES; + else + hc->hw.r_cirm = V_SRES | V_HFCRES | V_PCMRES | V_STRES + | V_RLD_EPR; HFC_outb(hc, R_CIRM, hc->hw.r_cirm); udelay(100); hc->hw.r_cirm = 0; HFC_outb(hc, R_CIRM, hc->hw.r_cirm); udelay(100); - HFC_outb(hc, R_RAM_SZ, hc->hw.r_ram_sz); + if (hc->ctype != HFC_TYPE_XHFC) + HFC_outb(hc, R_RAM_SZ, hc->hw.r_ram_sz); /* Speech Design PLX bridge pcm and sync mode */ if (test_bit(HFC_CHIP_PLXSD, &hc->chip)) { @@ -1278,13 +1316,16 @@ init_chip(struct hfc_multi *hc) HFC_outb(hc, R_PCM_MD0, hc->hw.r_pcm_md0 | 0xa0); if (test_bit(HFC_CHIP_PLXSD, &hc->chip)) HFC_outb(hc, R_PCM_MD2, V_SYNC_SRC); /* sync via SYNC_I / O */ + else if (test_bit(HFC_CHIP_EMBSD, &hc->chip)) + HFC_outb(hc, R_PCM_MD2, 0x10); /* V_C2O_EN */ else HFC_outb(hc, R_PCM_MD2, 0x00); /* sync from interface */ HFC_outb(hc, R_PCM_MD0, hc->hw.r_pcm_md0 | 0x00); for (i = 0; i < 256; i++) { HFC_outb_nodebug(hc, R_SLOT, i); HFC_outb_nodebug(hc, A_SL_CFG, 0); - HFC_outb_nodebug(hc, A_CONF, 0); + if (hc->ctype != HFC_TYPE_XHFC) + HFC_outb_nodebug(hc, A_CONF, 0); hc->slot_owner[i] = -1; } @@ -1296,6 +1337,9 @@ init_chip(struct hfc_multi *hc) HFC_outb(hc, R_BRG_PCM_CFG, V_PCM_CLK); } + if (test_bit(HFC_CHIP_EMBSD, &hc->chip)) + HFC_outb(hc, 0x02 /* R_CLK_CFG */, 0x40 /* V_CLKO_OFF */); + /* B410P GPIO */ if (test_bit(HFC_CHIP_B410P, &hc->chip)) { printk(KERN_NOTICE "Setting GPIOs\n"); @@ -1424,7 +1468,7 @@ controller_fail: hc->hw.r_irqmsk_misc |= V_TI_IRQMSK; /* set E1 state machine IRQ */ - if (hc->type == 1) + if (hc->ctype == HFC_TYPE_E1) hc->hw.r_irqmsk_misc |= V_STA_IRQMSK; /* set DTMF detection */ @@ -1444,7 +1488,8 @@ controller_fail: r_conf_en = V_CONF_EN | V_ULAW; else r_conf_en = V_CONF_EN; - HFC_outb(hc, R_CONF_EN, r_conf_en); + if (hc->ctype != HFC_TYPE_XHFC) + HFC_outb(hc, R_CONF_EN, r_conf_en); /* setting leds */ switch (hc->leds) { @@ -1468,16 +1513,23 @@ controller_fail: break; } + if (test_bit(HFC_CHIP_EMBSD, &hc->chip)) { + hc->hw.r_st_sync = 0x10; /* V_AUTO_SYNCI */ + HFC_outb(hc, R_ST_SYNC, hc->hw.r_st_sync); + } + /* set master clock */ if (hc->masterclk >= 0) { if 
(debug & DEBUG_HFCMULTI_INIT) printk(KERN_DEBUG "%s: setting ST master clock " "to port %d (0..%d)\n", __func__, hc->masterclk, hc->ports-1); - hc->hw.r_st_sync = hc->masterclk | V_AUTO_SYNC; + hc->hw.r_st_sync |= (hc->masterclk | V_AUTO_SYNC); HFC_outb(hc, R_ST_SYNC, hc->hw.r_st_sync); } + + /* setting misc irq */ HFC_outb(hc, R_IRQMSK_MISC, hc->hw.r_irqmsk_misc); if (debug & DEBUG_HFCMULTI_INIT) @@ -1929,7 +1981,7 @@ next_frame: Fspace = 1; } /* one frame only for ST D-channels, to allow resending */ - if (hc->type != 1 && dch) { + if (hc->ctype != HFC_TYPE_E1 && dch) { if (f1 != f2) Fspace = 0; } @@ -1971,12 +2023,22 @@ next_frame: "slot_tx %d\n", __func__, ch, slot_tx); /* connect slot */ - HFC_outb(hc, A_CON_HDLC, 0xc0 | 0x00 | - V_HDLC_TRP | V_IFF); + if (hc->ctype == HFC_TYPE_XHFC) + HFC_outb(hc, A_CON_HDLC, 0xc0 + | 0x07 << 2 | V_HDLC_TRP | V_IFF); + /* Enable FIFO, no interrupt */ + else + HFC_outb(hc, A_CON_HDLC, 0xc0 | 0x00 | + V_HDLC_TRP | V_IFF); HFC_outb_nodebug(hc, R_FIFO, ch<<1 | 1); HFC_wait_nodebug(hc); - HFC_outb(hc, A_CON_HDLC, 0xc0 | 0x00 | - V_HDLC_TRP | V_IFF); + if (hc->ctype == HFC_TYPE_XHFC) + HFC_outb(hc, A_CON_HDLC, 0xc0 + | 0x07 << 2 | V_HDLC_TRP | V_IFF); + /* Enable FIFO, no interrupt */ + else + HFC_outb(hc, A_CON_HDLC, 0xc0 | 0x00 | + V_HDLC_TRP | V_IFF); HFC_outb_nodebug(hc, R_FIFO, ch<<1); HFC_wait_nodebug(hc); } @@ -2004,10 +2066,22 @@ next_frame: "FIFO data: channel %d slot_tx %d\n", __func__, ch, slot_tx); /* disconnect slot */ - HFC_outb(hc, A_CON_HDLC, 0x80 | 0x00 | V_HDLC_TRP | V_IFF); + if (hc->ctype == HFC_TYPE_XHFC) + HFC_outb(hc, A_CON_HDLC, 0x80 + | 0x07 << 2 | V_HDLC_TRP | V_IFF); + /* Enable FIFO, no interrupt */ + else + HFC_outb(hc, A_CON_HDLC, 0x80 | 0x00 | + V_HDLC_TRP | V_IFF); HFC_outb_nodebug(hc, R_FIFO, ch<<1 | 1); HFC_wait_nodebug(hc); - HFC_outb(hc, A_CON_HDLC, 0x80 | 0x00 | V_HDLC_TRP | V_IFF); + if (hc->ctype == HFC_TYPE_XHFC) + HFC_outb(hc, A_CON_HDLC, 0x80 + | 0x07 << 2 | V_HDLC_TRP | V_IFF); + /* Enable FIFO, no interrupt */ + else + HFC_outb(hc, A_CON_HDLC, 0x80 | 0x00 | + V_HDLC_TRP | V_IFF); HFC_outb_nodebug(hc, R_FIFO, ch<<1); HFC_wait_nodebug(hc); } @@ -2327,7 +2401,7 @@ handle_timer_irq(struct hfc_multi *hc) spin_unlock_irqrestore(&HFClock, flags); } - if (hc->type != 1 || hc->e1_state == 1) + if (hc->ctype != HFC_TYPE_E1 || hc->e1_state == 1) for (ch = 0; ch <= 31; ch++) { if (hc->created[hc->chan[ch].port]) { hfcmulti_tx(hc, ch); @@ -2350,7 +2424,7 @@ handle_timer_irq(struct hfc_multi *hc) } } } - if (hc->type == 1 && hc->created[0]) { + if (hc->ctype == HFC_TYPE_E1 && hc->created[0]) { dch = hc->chan[hc->dslot].dch; if (test_bit(HFC_CFG_REPORT_LOS, &hc->chan[hc->dslot].cfg)) { /* LOS */ @@ -2610,7 +2684,10 @@ hfcmulti_interrupt(int intno, void *dev_id) "card %d, this is no bug.\n", hc->id + 1, irqsem); irqsem = hc->id + 1; #endif - +#ifdef CONFIG_MISDN_HFCMULTI_8xx + if (hc->immap->im_cpm.cp_pbdat & hc->pb_irqmsk) + goto irq_notforus; +#endif if (test_bit(HFC_CHIP_PLXSD, &hc->chip)) { spin_lock_irqsave(&plx_lock, flags); plx_acc = hc->plx_membase + PLX_INTCSR; @@ -2650,7 +2727,7 @@ hfcmulti_interrupt(int intno, void *dev_id) } hc->irqcnt++; if (r_irq_statech) { - if (hc->type != 1) + if (hc->ctype != HFC_TYPE_E1) ph_state_irq(hc, r_irq_statech); } if (status & V_EXT_IRQSTA) @@ -2664,7 +2741,7 @@ hfcmulti_interrupt(int intno, void *dev_id) r_irq_misc = HFC_inb_nodebug(hc, R_IRQ_MISC); r_irq_misc &= hc->hw.r_irqmsk_misc; /* ignore disabled irqs */ if (r_irq_misc & V_STA_IRQ) { - if (hc->type == 1) { + if (hc->ctype == 
HFC_TYPE_E1) { /* state machine */ dch = hc->chan[hc->dslot].dch; e1_syncsta = HFC_inb_nodebug(hc, R_SYNC_STA); @@ -2786,7 +2863,8 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, if (hc->slot_owner[oslot_tx<<1] == ch) { HFC_outb(hc, R_SLOT, oslot_tx << 1); HFC_outb(hc, A_SL_CFG, 0); - HFC_outb(hc, A_CONF, 0); + if (hc->ctype != HFC_TYPE_XHFC) + HFC_outb(hc, A_CONF, 0); hc->slot_owner[oslot_tx<<1] = -1; } else { if (debug & DEBUG_HFCMULTI_MODE) @@ -2839,7 +2917,9 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, flow_tx, routing, conf); HFC_outb(hc, R_SLOT, slot_tx << 1); HFC_outb(hc, A_SL_CFG, (ch<<1) | routing); - HFC_outb(hc, A_CONF, (conf < 0) ? 0 : (conf | V_CONF_SL)); + if (hc->ctype != HFC_TYPE_XHFC) + HFC_outb(hc, A_CONF, + (conf < 0) ? 0 : (conf | V_CONF_SL)); hc->slot_owner[slot_tx << 1] = ch; hc->chan[ch].slot_tx = slot_tx; hc->chan[ch].bank_tx = bank_tx; @@ -2889,7 +2969,7 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, HFC_outb(hc, A_IRQ_MSK, 0); HFC_outb(hc, R_INC_RES_FIFO, V_RES_F); HFC_wait(hc); - if (hc->chan[ch].bch && hc->type != 1) { + if (hc->chan[ch].bch && hc->ctype != HFC_TYPE_E1) { hc->hw.a_st_ctrl0[hc->chan[ch].port] &= ((ch & 0x3) == 0)? ~V_B1_EN: ~V_B2_EN; HFC_outb(hc, R_ST_SEL, hc->chan[ch].port); @@ -2965,8 +3045,13 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, /* enable TX fifo */ HFC_outb(hc, R_FIFO, ch << 1); HFC_wait(hc); - HFC_outb(hc, A_CON_HDLC, flow_tx | 0x00 | - V_HDLC_TRP | V_IFF); + if (hc->ctype == HFC_TYPE_XHFC) + HFC_outb(hc, A_CON_HDLC, flow_tx | 0x07 << 2 | + V_HDLC_TRP | V_IFF); + /* Enable FIFO, no interrupt */ + else + HFC_outb(hc, A_CON_HDLC, flow_tx | 0x00 | + V_HDLC_TRP | V_IFF); HFC_outb(hc, A_SUBCH_CFG, 0); HFC_outb(hc, A_IRQ_MSK, 0); HFC_outb(hc, R_INC_RES_FIFO, V_RES_F); @@ -2976,13 +3061,19 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, /* enable RX fifo */ HFC_outb(hc, R_FIFO, (ch<<1)|1); HFC_wait(hc); - HFC_outb(hc, A_CON_HDLC, flow_rx | 0x00 | V_HDLC_TRP); + if (hc->ctype == HFC_TYPE_XHFC) + HFC_outb(hc, A_CON_HDLC, flow_rx | 0x07 << 2 | + V_HDLC_TRP); + /* Enable FIFO, no interrupt*/ + else + HFC_outb(hc, A_CON_HDLC, flow_rx | 0x00 | + V_HDLC_TRP); HFC_outb(hc, A_SUBCH_CFG, 0); HFC_outb(hc, A_IRQ_MSK, 0); HFC_outb(hc, R_INC_RES_FIFO, V_RES_F); HFC_wait(hc); } - if (hc->type != 1) { + if (hc->ctype != HFC_TYPE_E1) { hc->hw.a_st_ctrl0[hc->chan[ch].port] |= ((ch & 0x3) == 0) ? 
V_B1_EN : V_B2_EN; HFC_outb(hc, R_ST_SEL, hc->chan[ch].port); @@ -3003,7 +3094,7 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, /* enable TX fifo */ HFC_outb(hc, R_FIFO, ch<<1); HFC_wait(hc); - if (hc->type == 1 || hc->chan[ch].bch) { + if (hc->ctype == HFC_TYPE_E1 || hc->chan[ch].bch) { /* E1 or B-channel */ HFC_outb(hc, A_CON_HDLC, flow_tx | 0x04); HFC_outb(hc, A_SUBCH_CFG, 0); @@ -3019,7 +3110,7 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, HFC_outb(hc, R_FIFO, (ch<<1)|1); HFC_wait(hc); HFC_outb(hc, A_CON_HDLC, flow_rx | 0x04); - if (hc->type == 1 || hc->chan[ch].bch) + if (hc->ctype == HFC_TYPE_E1 || hc->chan[ch].bch) HFC_outb(hc, A_SUBCH_CFG, 0); /* full 8 bits */ else HFC_outb(hc, A_SUBCH_CFG, 2); /* 2 bits dchannel */ @@ -3028,7 +3119,7 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, HFC_wait(hc); if (hc->chan[ch].bch) { test_and_set_bit(FLG_HDLC, &hc->chan[ch].bch->Flags); - if (hc->type != 1) { + if (hc->ctype != HFC_TYPE_E1) { hc->hw.a_st_ctrl0[hc->chan[ch].port] |= ((ch&0x3) == 0) ? V_B1_EN : V_B2_EN; HFC_outb(hc, R_ST_SEL, hc->chan[ch].port); @@ -3108,7 +3199,7 @@ hfcm_l1callback(struct dchannel *dch, u_int cmd) case HW_RESET_REQ: /* start activation */ spin_lock_irqsave(&hc->lock, flags); - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { if (debug & DEBUG_HFCMULTI_MSG) printk(KERN_DEBUG "%s: HW_RESET_REQ no BRI\n", @@ -3129,7 +3220,7 @@ hfcm_l1callback(struct dchannel *dch, u_int cmd) case HW_DEACT_REQ: /* start deactivation */ spin_lock_irqsave(&hc->lock, flags); - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { if (debug & DEBUG_HFCMULTI_MSG) printk(KERN_DEBUG "%s: HW_DEACT_REQ no BRI\n", @@ -3163,7 +3254,7 @@ hfcm_l1callback(struct dchannel *dch, u_int cmd) break; case HW_POWERUP_REQ: spin_lock_irqsave(&hc->lock, flags); - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { if (debug & DEBUG_HFCMULTI_MSG) printk(KERN_DEBUG "%s: HW_POWERUP_REQ no BRI\n", @@ -3240,7 +3331,7 @@ handle_dmsg(struct mISDNchannel *ch, struct sk_buff *skb) __func__, hc->chan[dch->slot].port, hc->ports-1); /* start activation */ - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { ph_state_change(dch); if (debug & DEBUG_HFCMULTI_STATE) printk(KERN_DEBUG @@ -3273,7 +3364,7 @@ handle_dmsg(struct mISDNchannel *ch, struct sk_buff *skb) __func__, hc->chan[dch->slot].port, hc->ports-1); /* start deactivation */ - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { if (debug & DEBUG_HFCMULTI_MSG) printk(KERN_DEBUG "%s: PH_DEACTIVATE no BRI\n", @@ -3493,6 +3584,8 @@ channel_bctrl(struct bchannel *bch, struct mISDN_ctrl_req *cq) features->hfc_id = hc->id; if (test_bit(HFC_CHIP_DTMF, &hc->chip)) features->hfc_dtmf = 1; + if (test_bit(HFC_CHIP_CONF, &hc->chip)) + features->hfc_conf = 1; features->hfc_loops = 0; if (test_bit(HFC_CHIP_B410P, &hc->chip)) { features->hfc_echocanhw = 1; @@ -3630,7 +3723,7 @@ ph_state_change(struct dchannel *dch) hc = dch->hw; ch = dch->slot; - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { if (dch->dev.D.protocol == ISDN_P_TE_E1) { if (debug & DEBUG_HFCMULTI_STATE) printk(KERN_DEBUG @@ -3755,7 +3848,7 @@ hfcmulti_initmode(struct dchannel *dch) if (debug & DEBUG_HFCMULTI_INIT) printk(KERN_DEBUG "%s: entered\n", __func__); - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { hc->chan[hc->dslot].slot_tx = -1; hc->chan[hc->dslot].slot_rx = -1; hc->chan[hc->dslot].conf = -1; @@ -3904,6 +3997,11 @@ hfcmulti_initmode(struct dchannel *dch) } if (!test_bit(HFC_CFG_NONCAP_TX, 
&hc->chan[i].cfg)) hc->hw.a_st_ctrl0[pt] |= V_TX_LI; + if (hc->ctype == HFC_TYPE_XHFC) { + hc->hw.a_st_ctrl0[pt] |= 0x40 /* V_ST_PU_CTRL */; + HFC_outb(hc, 0x35 /* A_ST_CTRL3 */, + 0x7c << 1 /* V_ST_PULSE */); + } /* line setup */ HFC_outb(hc, A_ST_CTRL0, hc->hw.a_st_ctrl0[pt]); /* disable E-channel */ @@ -3990,7 +4088,7 @@ open_bchannel(struct hfc_multi *hc, struct dchannel *dch, return -EINVAL; if (rq->protocol == ISDN_P_NONE) return -EINVAL; - if (hc->type == 1) + if (hc->ctype == HFC_TYPE_E1) ch = rq->adr.channel; else ch = (rq->adr.channel - 1) + (dch->slot - 2); @@ -4081,7 +4179,7 @@ hfcm_dctrl(struct mISDNchannel *ch, u_int cmd, void *arg) switch (rq->protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: - if (hc->type == 1) { + if (hc->ctype == HFC_TYPE_E1) { err = -EINVAL; break; } @@ -4089,7 +4187,7 @@ hfcm_dctrl(struct mISDNchannel *ch, u_int cmd, void *arg) break; case ISDN_P_TE_E1: case ISDN_P_NT_E1: - if (hc->type != 1) { + if (hc->ctype != HFC_TYPE_E1) { err = -EINVAL; break; } @@ -4156,13 +4254,13 @@ init_card(struct hfc_multi *hc) disable_hwirq(hc); spin_unlock_irqrestore(&hc->lock, flags); - if (request_irq(hc->pci_dev->irq, hfcmulti_interrupt, IRQF_SHARED, + if (request_irq(hc->irq, hfcmulti_interrupt, IRQF_SHARED, "HFC-multi", hc)) { printk(KERN_WARNING "mISDN: Could not get interrupt %d.\n", - hc->pci_dev->irq); + hc->irq); + hc->irq = 0; return -EIO; } - hc->irq = hc->pci_dev->irq; if (test_bit(HFC_CHIP_PLXSD, &hc->chip)) { spin_lock_irqsave(&plx_lock, plx_flags); @@ -4269,6 +4367,10 @@ setup_pci(struct hfc_multi *hc, struct pci_dev *pdev, hc->ledstate = 0xAFFEAFFE; hc->opticalsupport = m->opticalsupport; + hc->pci_iobase = 0; + hc->pci_membase = NULL; + hc->plx_membase = NULL; + /* set memory access methods */ if (m->io_mode) /* use mode from card config */ hc->io_mode = m->io_mode; @@ -4276,44 +4378,12 @@ setup_pci(struct hfc_multi *hc, struct pci_dev *pdev, case HFC_IO_MODE_PLXSD: test_and_set_bit(HFC_CHIP_PLXSD, &hc->chip); hc->slots = 128; /* required */ - /* fall through */ - case HFC_IO_MODE_PCIMEM: hc->HFC_outb = HFC_outb_pcimem; hc->HFC_inb = HFC_inb_pcimem; hc->HFC_inw = HFC_inw_pcimem; hc->HFC_wait = HFC_wait_pcimem; hc->read_fifo = read_fifo_pcimem; hc->write_fifo = write_fifo_pcimem; - break; - case HFC_IO_MODE_REGIO: - hc->HFC_outb = HFC_outb_regio; - hc->HFC_inb = HFC_inb_regio; - hc->HFC_inw = HFC_inw_regio; - hc->HFC_wait = HFC_wait_regio; - hc->read_fifo = read_fifo_regio; - hc->write_fifo = write_fifo_regio; - break; - default: - printk(KERN_WARNING "HFC-multi: Invalid IO mode.\n"); - pci_disable_device(hc->pci_dev); - return -EIO; - } - hc->HFC_outb_nodebug = hc->HFC_outb; - hc->HFC_inb_nodebug = hc->HFC_inb; - hc->HFC_inw_nodebug = hc->HFC_inw; - hc->HFC_wait_nodebug = hc->HFC_wait; -#ifdef HFC_REGISTER_DEBUG - hc->HFC_outb = HFC_outb_debug; - hc->HFC_inb = HFC_inb_debug; - hc->HFC_inw = HFC_inw_debug; - hc->HFC_wait = HFC_wait_debug; -#endif - hc->pci_iobase = 0; - hc->pci_membase = NULL; - hc->plx_membase = NULL; - - switch (hc->io_mode) { - case HFC_IO_MODE_PLXSD: hc->plx_origmembase = hc->pci_dev->resource[0].start; /* MEMBASE 1 is PLX PCI Bridge */ @@ -4361,6 +4431,12 @@ setup_pci(struct hfc_multi *hc, struct pci_dev *pdev, pci_write_config_word(hc->pci_dev, PCI_COMMAND, PCI_ENA_MEMIO); break; case HFC_IO_MODE_PCIMEM: + hc->HFC_outb = HFC_outb_pcimem; + hc->HFC_inb = HFC_inb_pcimem; + hc->HFC_inw = HFC_inw_pcimem; + hc->HFC_wait = HFC_wait_pcimem; + hc->read_fifo = read_fifo_pcimem; + hc->write_fifo = write_fifo_pcimem; hc->pci_origmembase = 
hc->pci_dev->resource[1].start; if (!hc->pci_origmembase) { printk(KERN_WARNING @@ -4377,12 +4453,18 @@ setup_pci(struct hfc_multi *hc, struct pci_dev *pdev, pci_disable_device(hc->pci_dev); return -EIO; } - printk(KERN_INFO "card %d: defined at MEMBASE %#lx (%#lx) IRQ %d " - "HZ %d leds-type %d\n", hc->id, (u_long)hc->pci_membase, + printk(KERN_INFO "card %d: defined at MEMBASE %#lx (%#lx) IRQ " + "%d HZ %d leds-type %d\n", hc->id, (u_long)hc->pci_membase, hc->pci_origmembase, hc->pci_dev->irq, HZ, hc->leds); pci_write_config_word(hc->pci_dev, PCI_COMMAND, PCI_ENA_MEMIO); break; case HFC_IO_MODE_REGIO: + hc->HFC_outb = HFC_outb_regio; + hc->HFC_inb = HFC_inb_regio; + hc->HFC_inw = HFC_inw_regio; + hc->HFC_wait = HFC_wait_regio; + hc->read_fifo = read_fifo_regio; + hc->write_fifo = write_fifo_regio; hc->pci_iobase = (u_int) hc->pci_dev->resource[0].start; if (!hc->pci_iobase) { printk(KERN_WARNING @@ -4464,7 +4546,7 @@ release_port(struct hfc_multi *hc, struct dchannel *dch) dch->timer.function = NULL; } - if (hc->type == 1) { /* E1 */ + if (hc->ctype == HFC_TYPE_E1) { /* E1 */ /* remove sync */ if (test_bit(HFC_CHIP_PLXSD, &hc->chip)) { hc->syncronized = 0; @@ -4863,9 +4945,15 @@ init_multi_port(struct hfc_multi *hc, int pt) test_and_set_bit(HFC_CFG_DIS_ECHANNEL, &hc->chan[i + 2].cfg); } - snprintf(name, MISDN_MAX_IDLEN - 1, "hfc-%ds.%d-%d", - hc->type, HFC_cnt + 1, pt + 1); - ret = mISDN_register_device(&dch->dev, &hc->pci_dev->dev, name); + if (hc->ctype == HFC_TYPE_XHFC) { + snprintf(name, MISDN_MAX_IDLEN - 1, "xhfc.%d-%d", + HFC_cnt + 1, pt + 1); + ret = mISDN_register_device(&dch->dev, NULL, name); + } else { + snprintf(name, MISDN_MAX_IDLEN - 1, "hfc-%ds.%d-%d", + hc->ctype, HFC_cnt + 1, pt + 1); + ret = mISDN_register_device(&dch->dev, &hc->pci_dev->dev, name); + } if (ret) goto free_chan; hc->created[pt] = 1; @@ -4876,9 +4964,9 @@ free_chan: } static int -hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) +hfcmulti_init(struct hm_map *m, struct pci_dev *pdev, + const struct pci_device_id *ent) { - struct hm_map *m = (struct hm_map *)ent->driver_data; int ret_err = 0; int pt; struct hfc_multi *hc; @@ -4913,16 +5001,18 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) } spin_lock_init(&hc->lock); hc->mtyp = m; - hc->type = m->type; + hc->ctype = m->type; hc->ports = m->ports; hc->id = HFC_cnt; hc->pcm = pcm[HFC_cnt]; hc->io_mode = iomode[HFC_cnt]; - if (dslot[HFC_cnt] < 0 && hc->type == 1) { + if (dslot[HFC_cnt] < 0 && hc->ctype == HFC_TYPE_E1) { hc->dslot = 0; printk(KERN_INFO "HFC-E1 card has disabled D-channel, but " "31 B-channels\n"); - } if (dslot[HFC_cnt] > 0 && dslot[HFC_cnt] < 32 && hc->type == 1) { + } + if (dslot[HFC_cnt] > 0 && dslot[HFC_cnt] < 32 + && hc->ctype == HFC_TYPE_E1) { hc->dslot = dslot[HFC_cnt]; printk(KERN_INFO "HFC-E1 card has alternating D-channel on " "time slot %d\n", dslot[HFC_cnt]); @@ -4944,8 +5034,11 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) for (i = 0; i < (poll >> 1); i++) hc->silence_data[i] = hc->silence; - if (!(type[HFC_cnt] & 0x200)) - test_and_set_bit(HFC_CHIP_DTMF, &hc->chip); + if (hc->ctype != HFC_TYPE_XHFC) { + if (!(type[HFC_cnt] & 0x200)) + test_and_set_bit(HFC_CHIP_DTMF, &hc->chip); + test_and_set_bit(HFC_CHIP_CONF, &hc->chip); + } if (type[HFC_cnt] & 0x800) test_and_set_bit(HFC_CHIP_PCM_SLAVE, &hc->chip); @@ -4969,8 +5062,18 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) printk(KERN_NOTICE "Watchdog enabled\n"); } - /* setup pci, hc->slots 
may change due to PLXSD */ - ret_err = setup_pci(hc, pdev, ent); + if (pdev && ent) + /* setup pci, hc->slots may change due to PLXSD */ + ret_err = setup_pci(hc, pdev, ent); + else +#ifdef CONFIG_MISDN_HFCMULTI_8xx + ret_err = setup_embedded(hc, m); +#else + { + printk(KERN_WARNING "Embedded IO Mode not selected\n"); + ret_err = -EIO; + } +#endif if (ret_err) { if (hc == syncmaster) syncmaster = NULL; @@ -4978,7 +5081,17 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) return ret_err; } - /* crate channels */ + hc->HFC_outb_nodebug = hc->HFC_outb; + hc->HFC_inb_nodebug = hc->HFC_inb; + hc->HFC_inw_nodebug = hc->HFC_inw; + hc->HFC_wait_nodebug = hc->HFC_wait; +#ifdef HFC_REGISTER_DEBUG + hc->HFC_outb = HFC_outb_debug; + hc->HFC_inb = HFC_inb_debug; + hc->HFC_inw = HFC_inw_debug; + hc->HFC_wait = HFC_wait_debug; +#endif + /* create channels */ for (pt = 0; pt < hc->ports; pt++) { if (Port_cnt >= MAX_PORTS) { printk(KERN_ERR "too many ports (max=%d).\n", @@ -4986,7 +5099,7 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) ret_err = -EINVAL; goto free_card; } - if (hc->type == 1) + if (hc->ctype == HFC_TYPE_E1) ret_err = init_e1_port(hc, m); else ret_err = init_multi_port(hc, pt); @@ -5070,6 +5183,7 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) hc->iclock = mISDN_register_clock("HFCMulti", 0, clockctl, hc); /* initialize hardware */ + hc->irq = (m->irq) ? : hc->pci_dev->irq; ret_err = init_card(hc); if (ret_err) { printk(KERN_ERR "init card returns %d\n", ret_err); @@ -5120,45 +5234,47 @@ static void __devexit hfc_remove_pci(struct pci_dev *pdev) #define VENDOR_PRIM "PrimuX" static const struct hm_map hfcm_map[] = { -/*0*/ {VENDOR_BN, "HFC-1S Card (mini PCI)", 4, 1, 1, 3, 0, DIP_4S, 0}, -/*1*/ {VENDOR_BN, "HFC-2S Card", 4, 2, 1, 3, 0, DIP_4S, 0}, -/*2*/ {VENDOR_BN, "HFC-2S Card (mini PCI)", 4, 2, 1, 3, 0, DIP_4S, 0}, -/*3*/ {VENDOR_BN, "HFC-4S Card", 4, 4, 1, 2, 0, DIP_4S, 0}, -/*4*/ {VENDOR_BN, "HFC-4S Card (mini PCI)", 4, 4, 1, 2, 0, 0, 0}, -/*5*/ {VENDOR_CCD, "HFC-4S Eval (old)", 4, 4, 0, 0, 0, 0, 0}, -/*6*/ {VENDOR_CCD, "HFC-4S IOB4ST", 4, 4, 1, 2, 0, DIP_4S, 0}, -/*7*/ {VENDOR_CCD, "HFC-4S", 4, 4, 1, 2, 0, 0, 0}, -/*8*/ {VENDOR_DIG, "HFC-4S Card", 4, 4, 0, 2, 0, 0, HFC_IO_MODE_REGIO}, -/*9*/ {VENDOR_CCD, "HFC-4S Swyx 4xS0 SX2 QuadBri", 4, 4, 1, 2, 0, 0, 0}, -/*10*/ {VENDOR_JH, "HFC-4S (junghanns 2.0)", 4, 4, 1, 2, 0, 0, 0}, -/*11*/ {VENDOR_PRIM, "HFC-2S Primux Card", 4, 2, 0, 0, 0, 0, 0}, - -/*12*/ {VENDOR_BN, "HFC-8S Card", 8, 8, 1, 0, 0, 0, 0}, +/*0*/ {VENDOR_BN, "HFC-1S Card (mini PCI)", 4, 1, 1, 3, 0, DIP_4S, 0, 0}, +/*1*/ {VENDOR_BN, "HFC-2S Card", 4, 2, 1, 3, 0, DIP_4S, 0, 0}, +/*2*/ {VENDOR_BN, "HFC-2S Card (mini PCI)", 4, 2, 1, 3, 0, DIP_4S, 0, 0}, +/*3*/ {VENDOR_BN, "HFC-4S Card", 4, 4, 1, 2, 0, DIP_4S, 0, 0}, +/*4*/ {VENDOR_BN, "HFC-4S Card (mini PCI)", 4, 4, 1, 2, 0, 0, 0, 0}, +/*5*/ {VENDOR_CCD, "HFC-4S Eval (old)", 4, 4, 0, 0, 0, 0, 0, 0}, +/*6*/ {VENDOR_CCD, "HFC-4S IOB4ST", 4, 4, 1, 2, 0, DIP_4S, 0, 0}, +/*7*/ {VENDOR_CCD, "HFC-4S", 4, 4, 1, 2, 0, 0, 0, 0}, +/*8*/ {VENDOR_DIG, "HFC-4S Card", 4, 4, 0, 2, 0, 0, HFC_IO_MODE_REGIO, 0}, +/*9*/ {VENDOR_CCD, "HFC-4S Swyx 4xS0 SX2 QuadBri", 4, 4, 1, 2, 0, 0, 0, 0}, +/*10*/ {VENDOR_JH, "HFC-4S (junghanns 2.0)", 4, 4, 1, 2, 0, 0, 0, 0}, +/*11*/ {VENDOR_PRIM, "HFC-2S Primux Card", 4, 2, 0, 0, 0, 0, 0, 0}, + +/*12*/ {VENDOR_BN, "HFC-8S Card", 8, 8, 1, 0, 0, 0, 0, 0}, /*13*/ {VENDOR_BN, "HFC-8S Card (+)", 8, 8, 1, 8, 0, DIP_8S, - HFC_IO_MODE_REGIO}, -/*14*/ 
{VENDOR_CCD, "HFC-8S Eval (old)", 8, 8, 0, 0, 0, 0, 0}, -/*15*/ {VENDOR_CCD, "HFC-8S IOB4ST Recording", 8, 8, 1, 0, 0, 0, 0}, + HFC_IO_MODE_REGIO, 0}, +/*14*/ {VENDOR_CCD, "HFC-8S Eval (old)", 8, 8, 0, 0, 0, 0, 0, 0}, +/*15*/ {VENDOR_CCD, "HFC-8S IOB4ST Recording", 8, 8, 1, 0, 0, 0, 0, 0}, -/*16*/ {VENDOR_CCD, "HFC-8S IOB8ST", 8, 8, 1, 0, 0, 0, 0}, -/*17*/ {VENDOR_CCD, "HFC-8S", 8, 8, 1, 0, 0, 0, 0}, -/*18*/ {VENDOR_CCD, "HFC-8S", 8, 8, 1, 0, 0, 0, 0}, +/*16*/ {VENDOR_CCD, "HFC-8S IOB8ST", 8, 8, 1, 0, 0, 0, 0, 0}, +/*17*/ {VENDOR_CCD, "HFC-8S", 8, 8, 1, 0, 0, 0, 0, 0}, +/*18*/ {VENDOR_CCD, "HFC-8S", 8, 8, 1, 0, 0, 0, 0, 0}, -/*19*/ {VENDOR_BN, "HFC-E1 Card", 1, 1, 0, 1, 0, DIP_E1, 0}, -/*20*/ {VENDOR_BN, "HFC-E1 Card (mini PCI)", 1, 1, 0, 1, 0, 0, 0}, -/*21*/ {VENDOR_BN, "HFC-E1+ Card (Dual)", 1, 1, 0, 1, 0, DIP_E1, 0}, -/*22*/ {VENDOR_BN, "HFC-E1 Card (Dual)", 1, 1, 0, 1, 0, DIP_E1, 0}, +/*19*/ {VENDOR_BN, "HFC-E1 Card", 1, 1, 0, 1, 0, DIP_E1, 0, 0}, +/*20*/ {VENDOR_BN, "HFC-E1 Card (mini PCI)", 1, 1, 0, 1, 0, 0, 0, 0}, +/*21*/ {VENDOR_BN, "HFC-E1+ Card (Dual)", 1, 1, 0, 1, 0, DIP_E1, 0, 0}, +/*22*/ {VENDOR_BN, "HFC-E1 Card (Dual)", 1, 1, 0, 1, 0, DIP_E1, 0, 0}, -/*23*/ {VENDOR_CCD, "HFC-E1 Eval (old)", 1, 1, 0, 0, 0, 0, 0}, -/*24*/ {VENDOR_CCD, "HFC-E1 IOB1E1", 1, 1, 0, 1, 0, 0, 0}, -/*25*/ {VENDOR_CCD, "HFC-E1", 1, 1, 0, 1, 0, 0, 0}, +/*23*/ {VENDOR_CCD, "HFC-E1 Eval (old)", 1, 1, 0, 0, 0, 0, 0, 0}, +/*24*/ {VENDOR_CCD, "HFC-E1 IOB1E1", 1, 1, 0, 1, 0, 0, 0, 0}, +/*25*/ {VENDOR_CCD, "HFC-E1", 1, 1, 0, 1, 0, 0, 0, 0}, /*26*/ {VENDOR_CCD, "HFC-4S Speech Design", 4, 4, 0, 0, 0, 0, - HFC_IO_MODE_PLXSD}, + HFC_IO_MODE_PLXSD, 0}, /*27*/ {VENDOR_CCD, "HFC-E1 Speech Design", 1, 1, 0, 0, 0, 0, - HFC_IO_MODE_PLXSD}, -/*28*/ {VENDOR_CCD, "HFC-4S OpenVox", 4, 4, 1, 0, 0, 0, 0}, -/*29*/ {VENDOR_CCD, "HFC-2S OpenVox", 4, 2, 1, 0, 0, 0, 0}, -/*30*/ {VENDOR_CCD, "HFC-8S OpenVox", 8, 8, 1, 0, 0, 0, 0}, + HFC_IO_MODE_PLXSD, 0}, +/*28*/ {VENDOR_CCD, "HFC-4S OpenVox", 4, 4, 1, 0, 0, 0, 0, 0}, +/*29*/ {VENDOR_CCD, "HFC-2S OpenVox", 4, 2, 1, 0, 0, 0, 0, 0}, +/*30*/ {VENDOR_CCD, "HFC-8S OpenVox", 8, 8, 1, 0, 0, 0, 0, 0}, +/*31*/ {VENDOR_CCD, "XHFC-4S Speech Design", 5, 4, 0, 0, 0, 0, + HFC_IO_MODE_EMBSD, XHFC_IRQ}, }; #undef H @@ -5265,7 +5381,7 @@ hfcmulti_probe(struct pci_dev *pdev, const struct pci_device_id *ent) "Please contact the driver maintainer for support.\n"); return -ENODEV; } - ret = hfcmulti_init(pdev, ent); + ret = hfcmulti_init(m, pdev, ent); if (ret) return ret; HFC_cnt++; @@ -5295,6 +5411,8 @@ static int __init HFCmulti_init(void) { int err; + int i, xhfc = 0; + struct hm_map m; printk(KERN_INFO "mISDN: HFC-multi driver %s\n", HFC_MULTI_VERSION); @@ -5342,11 +5460,43 @@ HFCmulti_init(void) if (!clock) clock = 1; + /* Register the embedded devices. 
+ * This should be done before the PCI cards registration */ + switch (hwid) { + case HWID_MINIP4: + xhfc = 1; + m = hfcm_map[31]; + break; + case HWID_MINIP8: + xhfc = 2; + m = hfcm_map[31]; + break; + case HWID_MINIP16: + xhfc = 4; + m = hfcm_map[31]; + break; + default: + xhfc = 0; + } + + for (i = 0; i < xhfc; ++i) { + err = hfcmulti_init(&m, NULL, NULL); + if (err) { + printk(KERN_ERR "error registering embedded driver: " + "%x\n", err); + return -err; + } + HFC_cnt++; + printk(KERN_INFO "%d devices registered\n", HFC_cnt); + } + + /* Register the PCI cards */ err = pci_register_driver(&hfcmultipci_driver); if (err < 0) { printk(KERN_ERR "error registering pci driver: %x\n", err); return err; } + return 0; } diff --git a/drivers/isdn/mISDN/dsp_cmx.c b/drivers/isdn/mISDN/dsp_cmx.c index d19b4f6d7d8..05866184ba2 100644 --- a/drivers/isdn/mISDN/dsp_cmx.c +++ b/drivers/isdn/mISDN/dsp_cmx.c @@ -947,6 +947,10 @@ conf_software: if (current_conf >= 0) { join_members: list_for_each_entry(member, &conf->mlist, list) { + /* if no conference engine on our chip, change to + * software */ + if (!member->dsp->features.hfc_conf) + goto conf_software; /* in case of hdlc, change to software */ if (member->dsp->hdlc) goto conf_software; diff --git a/include/linux/mISDNdsp.h b/include/linux/mISDNdsp.h index 2c483d45f14..41d1eeb9b3b 100644 --- a/include/linux/mISDNdsp.h +++ b/include/linux/mISDNdsp.h @@ -25,6 +25,7 @@ extern void mISDN_dsp_element_unregister(struct mISDN_dsp_element *elem); struct dsp_features { int hfc_id; /* unique id to identify the chip (or -1) */ int hfc_dtmf; /* set if HFCmulti card supports dtmf */ + int hfc_conf; /* set if HFCmulti card supports conferences */ int hfc_loops; /* set if card supports tone loops */ int hfc_echocanhw; /* set if card supports echocancelation*/ int pcm_id; /* unique id to identify the pcm bus (or -1) */ -- cgit v1.2.3-70-g09d2 From 7245a2fe3c10ed7c2e9b1c8a83af5919c0cc0a89 Mon Sep 17 00:00:00 2001 From: Andreas Eversberg Date: Fri, 22 May 2009 11:04:55 +0000 Subject: mISDN: Add PCI ID for Junghanns 8S card new id for HFC-8S Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil Signed-off-by: David S. 
Miller --- drivers/isdn/hardware/mISDN/hfcmulti.c | 3 +++ include/linux/pci_ids.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index 5be4edf631d..50e9f4d88f4 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -5275,6 +5275,7 @@ static const struct hm_map hfcm_map[] = { /*30*/ {VENDOR_CCD, "HFC-8S OpenVox", 8, 8, 1, 0, 0, 0, 0, 0}, /*31*/ {VENDOR_CCD, "XHFC-4S Speech Design", 5, 4, 0, 0, 0, 0, HFC_IO_MODE_EMBSD, XHFC_IRQ}, +/*32*/ {VENDOR_JH, "HFC-8S (junghanns)", 8, 8, 1, 0, 0, 0, 0, 0}, }; #undef H @@ -5328,6 +5329,8 @@ static struct pci_device_id hfmultipci_ids[] __devinitdata = { PCI_SUBDEVICE_ID_CCD_HFC8S, 0, 0, H(18)}, /* 8S */ { PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_HFC8S, PCI_VENDOR_ID_CCD, PCI_SUBDEVICE_ID_CCD_OV8S, 0, 0, H(30)}, /* OpenVox 8 */ + { PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_HFC8S, PCI_VENDOR_ID_CCD, + PCI_SUBDEVICE_ID_CCD_JH8S, 0, 0, H(32)}, /* Junghanns 8S */ /* Cards with HFC-E1 Chip */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 3038ac25491..4bdff984ac9 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1914,6 +1914,7 @@ #define PCI_SUBDEVICE_ID_CCD_SWYX4S 0xB540 #define PCI_SUBDEVICE_ID_CCD_JH4S20 0xB550 #define PCI_SUBDEVICE_ID_CCD_IOB8ST_1 0xB552 +#define PCI_SUBDEVICE_ID_CCD_JH8S 0xB55B #define PCI_SUBDEVICE_ID_CCD_BN4S 0xB560 #define PCI_SUBDEVICE_ID_CCD_BN8S 0xB562 #define PCI_SUBDEVICE_ID_CCD_BNE1 0xB563 -- cgit v1.2.3-70-g09d2 From eac74af9b547e29c9634ed5eff4d514349e73310 Mon Sep 17 00:00:00 2001 From: Karsten Keil Date: Fri, 22 May 2009 11:04:56 +0000 Subject: mISDN: Cleanup debug messages This patch makes debug printks use KERN_DEBUG and also fixes some coding-style issues. Signed-off-by: Karsten Keil Signed-off-by: David S.
Miller --- drivers/isdn/hardware/mISDN/hfcmulti.c | 159 +++++++++++++++++---------------- drivers/isdn/hardware/mISDN/hfcpci.c | 28 +++--- drivers/isdn/mISDN/dsp_audio.c | 5 +- drivers/isdn/mISDN/dsp_cmx.c | 7 +- drivers/isdn/mISDN/dsp_core.c | 14 +-- drivers/isdn/mISDN/dsp_ecdis.h | 2 +- drivers/isdn/mISDN/dsp_tones.c | 23 +++-- drivers/isdn/mISDN/l1oip_core.c | 30 +++---- drivers/isdn/mISDN/socket.c | 2 +- drivers/isdn/mISDN/tei.c | 1 - drivers/isdn/mISDN/timerdev.c | 2 +- include/linux/mISDNif.h | 14 +-- 12 files changed, 148 insertions(+), 139 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index 50e9f4d88f4..d60f5b7a41d 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -275,7 +275,7 @@ HFC_outb_pcimem(struct hfc_multi *hc, u_char reg, u_char val, HFC_outb_pcimem(struct hfc_multi *hc, u_char reg, u_char val) #endif { - writeb(val, (hc->pci_membase)+reg); + writeb(val, hc->pci_membase + reg); } static u_char #ifdef HFC_REGISTER_DEBUG @@ -284,7 +284,7 @@ HFC_inb_pcimem(struct hfc_multi *hc, u_char reg, const char *function, int line) HFC_inb_pcimem(struct hfc_multi *hc, u_char reg) #endif { - return readb((hc->pci_membase)+reg); + return readb(hc->pci_membase + reg); } static u_short #ifdef HFC_REGISTER_DEBUG @@ -293,7 +293,7 @@ HFC_inw_pcimem(struct hfc_multi *hc, u_char reg, const char *function, int line) HFC_inw_pcimem(struct hfc_multi *hc, u_char reg) #endif { - return readw((hc->pci_membase)+reg); + return readw(hc->pci_membase + reg); } static void #ifdef HFC_REGISTER_DEBUG @@ -302,7 +302,8 @@ HFC_wait_pcimem(struct hfc_multi *hc, const char *function, int line) HFC_wait_pcimem(struct hfc_multi *hc) #endif { - while (readb((hc->pci_membase)+R_STATUS) & V_BUSY); + while (readb(hc->pci_membase + R_STATUS) & V_BUSY) + cpu_relax(); } /* HFC_IO_MODE_REGIO */ @@ -314,7 +315,7 @@ HFC_outb_regio(struct hfc_multi *hc, u_char reg, u_char val, HFC_outb_regio(struct hfc_multi *hc, u_char reg, u_char val) #endif { - outb(reg, (hc->pci_iobase)+4); + outb(reg, hc->pci_iobase + 4); outb(val, hc->pci_iobase); } static u_char @@ -324,7 +325,7 @@ HFC_inb_regio(struct hfc_multi *hc, u_char reg, const char *function, int line) HFC_inb_regio(struct hfc_multi *hc, u_char reg) #endif { - outb(reg, (hc->pci_iobase)+4); + outb(reg, hc->pci_iobase + 4); return inb(hc->pci_iobase); } static u_short @@ -334,7 +335,7 @@ HFC_inw_regio(struct hfc_multi *hc, u_char reg, const char *function, int line) HFC_inw_regio(struct hfc_multi *hc, u_char reg) #endif { - outb(reg, (hc->pci_iobase)+4); + outb(reg, hc->pci_iobase + 4); return inw(hc->pci_iobase); } static void @@ -344,8 +345,9 @@ HFC_wait_regio(struct hfc_multi *hc, const char *function, int line) HFC_wait_regio(struct hfc_multi *hc) #endif { - outb(R_STATUS, (hc->pci_iobase)+4); - while (inb(hc->pci_iobase) & V_BUSY); + outb(R_STATUS, hc->pci_iobase + 4); + while (inb(hc->pci_iobase) & V_BUSY) + cpu_relax(); } #ifdef HFC_REGISTER_DEBUG @@ -364,14 +366,14 @@ HFC_outb_debug(struct hfc_multi *hc, u_char reg, u_char val, if (regname[0] == '\0') strcpy(regname, "register"); - bits[7] = '0'+(!!(val&1)); - bits[6] = '0'+(!!(val&2)); - bits[5] = '0'+(!!(val&4)); - bits[4] = '0'+(!!(val&8)); - bits[3] = '0'+(!!(val&16)); - bits[2] = '0'+(!!(val&32)); - bits[1] = '0'+(!!(val&64)); - bits[0] = '0'+(!!(val&128)); + bits[7] = '0' + (!!(val & 1)); + bits[6] = '0' + (!!(val & 2)); + bits[5] = '0' + (!!(val & 4)); + bits[4] = '0' + (!!(val 
& 8)); + bits[3] = '0' + (!!(val & 16)); + bits[2] = '0' + (!!(val & 32)); + bits[1] = '0' + (!!(val & 64)); + bits[0] = '0' + (!!(val & 128)); printk(KERN_DEBUG "HFC_outb(chip %d, %02x=%s, 0x%02x=%s); in %s() line %d\n", hc->id, reg, regname, val, bits, function, line); @@ -394,14 +396,14 @@ HFC_inb_debug(struct hfc_multi *hc, u_char reg, const char *function, int line) if (regname[0] == '\0') strcpy(regname, "register"); - bits[7] = '0'+(!!(val&1)); - bits[6] = '0'+(!!(val&2)); - bits[5] = '0'+(!!(val&4)); - bits[4] = '0'+(!!(val&8)); - bits[3] = '0'+(!!(val&16)); - bits[2] = '0'+(!!(val&32)); - bits[1] = '0'+(!!(val&64)); - bits[0] = '0'+(!!(val&128)); + bits[7] = '0' + (!!(val & 1)); + bits[6] = '0' + (!!(val & 2)); + bits[5] = '0' + (!!(val & 4)); + bits[4] = '0' + (!!(val & 8)); + bits[3] = '0' + (!!(val & 16)); + bits[2] = '0' + (!!(val & 32)); + bits[1] = '0' + (!!(val & 64)); + bits[0] = '0' + (!!(val & 128)); printk(KERN_DEBUG "HFC_inb(chip %d, %02x=%s) = 0x%02x=%s; in %s() line %d\n", hc->id, reg, regname, val, bits, function, line); @@ -481,6 +483,7 @@ write_fifo_pcimem(struct hfc_multi *hc, u_char *data, int len) len--; } } + /* read fifo data (REGIO) */ static void read_fifo_regio(struct hfc_multi *hc, u_char *data, int len) @@ -526,7 +529,6 @@ read_fifo_pcimem(struct hfc_multi *hc, u_char *data, int len) } } - static void enable_hwirq(struct hfc_multi *hc) { @@ -1011,7 +1013,7 @@ plxsd_checksync(struct hfc_multi *hc, int rm) if (hc->syncronized) { if (syncmaster == NULL) { if (debug & DEBUG_HFCMULTI_PLXSD) - printk(KERN_WARNING "%s: GOT sync on card %d" + printk(KERN_DEBUG "%s: GOT sync on card %d" " (id=%d)\n", __func__, hc->id + 1, hc->id); hfcmulti_resync(hc, hc, rm); @@ -1019,7 +1021,7 @@ plxsd_checksync(struct hfc_multi *hc, int rm) } else { if (syncmaster == hc) { if (debug & DEBUG_HFCMULTI_PLXSD) - printk(KERN_WARNING "%s: LOST sync on card %d" + printk(KERN_DEBUG "%s: LOST sync on card %d" " (id=%d)\n", __func__, hc->id + 1, hc->id); hfcmulti_resync(hc, NULL, rm); @@ -1068,7 +1070,7 @@ release_io_hfcmulti(struct hfc_multi *hc) pv &= ~PLX_DSP_RES_N; writel(pv, plx_acc_32); if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: PCM off: PLX_GPIO=%x\n", + printk(KERN_DEBUG "%s: PCM off: PLX_GPIO=%x\n", __func__, pv); spin_unlock_irqrestore(&plx_lock, plx_flags); } @@ -1119,8 +1121,8 @@ init_chip(struct hfc_multi *hc) if (debug & DEBUG_HFCMULTI_INIT) printk(KERN_DEBUG "%s: entered\n", __func__); val = HFC_inb(hc, R_CHIP_ID); - if ((val>>4) != 0x8 && (val>>4) != 0xc && (val>>4) != 0xe - && (val>>1) != 0x31) { + if ((val >> 4) != 0x8 && (val >> 4) != 0xc && (val >> 4) != 0xe && + (val >> 1) != 0x31) { printk(KERN_INFO "HFC_multi: unknown CHIP_ID:%x\n", (u_int)val); err = -EIO; goto out; @@ -1202,7 +1204,7 @@ init_chip(struct hfc_multi *hc) writel(pv, plx_acc_32); spin_unlock_irqrestore(&plx_lock, plx_flags); if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: slave/term: PLX_GPIO=%x\n", + printk(KERN_DEBUG "%s: slave/term: PLX_GPIO=%x\n", __func__, pv); /* * If we are the 3rd PLXSD card or higher, we must turn @@ -1230,8 +1232,9 @@ init_chip(struct hfc_multi *hc) writel(pv, plx_acc_32); spin_unlock_irqrestore(&plx_lock, plx_flags); if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: term off: PLX_GPIO=%x\n", - __func__, pv); + printk(KERN_DEBUG + "%s: term off: PLX_GPIO=%x\n", + __func__, pv); } spin_unlock_irqrestore(&HFClock, hfc_flags); hc->hw.r_pcm_md0 = V_F0_LEN; /* shift clock for DSP */ @@ -1292,13 +1295,13 @@ init_chip(struct hfc_multi *hc) pv 
|= PLX_MASTER_EN | PLX_SLAVE_EN_N; pv |= PLX_SYNC_O_EN; if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: master: PLX_GPIO=%x\n", + printk(KERN_DEBUG "%s: master: PLX_GPIO=%x\n", __func__, pv); } else { pv &= ~(PLX_MASTER_EN | PLX_SLAVE_EN_N); pv &= ~PLX_SYNC_O_EN; if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: slave: PLX_GPIO=%x\n", + printk(KERN_DEBUG "%s: slave: PLX_GPIO=%x\n", __func__, pv); } writel(pv, plx_acc_32); @@ -1410,8 +1413,8 @@ controller_fail: writel(pv, plx_acc_32); spin_unlock_irqrestore(&plx_lock, plx_flags); if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: master: PLX_GPIO" - "=%x\n", __func__, pv); + printk(KERN_DEBUG "%s: master: " + "PLX_GPIO=%x\n", __func__, pv); } hc->hw.r_pcm_md0 |= V_PCM_MD; HFC_outb(hc, R_PCM_MD0, hc->hw.r_pcm_md0 | 0x00); @@ -1445,7 +1448,7 @@ controller_fail: writel(pv, plx_acc_32); spin_unlock_irqrestore(&plx_lock, plx_flags); if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: reset off: PLX_GPIO=%x\n", + printk(KERN_DEBUG "%s: reset off: PLX_GPIO=%x\n", __func__, pv); } @@ -1878,7 +1881,7 @@ hfcmulti_dtmf(struct hfc_multi *hc) hc->chan[ch].coeff_count = 0; skb = mI_alloc_skb(512, GFP_ATOMIC); if (!skb) { - printk(KERN_WARNING "%s: No memory for skb\n", + printk(KERN_DEBUG "%s: No memory for skb\n", __func__); continue; } @@ -2104,7 +2107,7 @@ next_frame: printk(KERN_DEBUG "%s(card %d): fifo(%d) has %d bytes space " "left (z1=%04x, z2=%04x) sending %d of %d bytes %s\n", __func__, hc->id + 1, ch, Zspace, z1, z2, ii-i, len-i, - temp ? "HDLC":"TRANS"); + temp ? "HDLC" : "TRANS"); /* Have to prep the audio data */ hc->write_fifo(hc, d, ii - i); @@ -2780,13 +2783,13 @@ hfcmulti_interrupt(int intno, void *dev_id) handle_timer_irq(hc); } - if (r_irq_misc & V_DTMF_IRQ) { + if (r_irq_misc & V_DTMF_IRQ) hfcmulti_dtmf(hc); - } + if (r_irq_misc & V_IRQ_PROC) { static int irq_proc_cnt; if (!irq_proc_cnt++) - printk(KERN_WARNING "%s: got V_IRQ_PROC -" + printk(KERN_DEBUG "%s: got V_IRQ_PROC -" " this should not happen\n", __func__); } @@ -2936,7 +2939,7 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, else flow_rx = 0xc0; /* ST->(FIFO,PCM) */ /* put on slot */ - routing = bank_rx?0x80:0xc0; /* reversed */ + routing = bank_rx ? 0x80 : 0xc0; /* reversed */ if (conf >= 0 || bank_rx > 1) routing = 0x40; /* loop */ if (debug & DEBUG_HFCMULTI_MODE) @@ -2971,7 +2974,7 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, HFC_wait(hc); if (hc->chan[ch].bch && hc->ctype != HFC_TYPE_E1) { hc->hw.a_st_ctrl0[hc->chan[ch].port] &= - ((ch & 0x3) == 0)? ~V_B1_EN: ~V_B2_EN; + ((ch & 0x3) == 0) ? 
~V_B1_EN : ~V_B2_EN; HFC_outb(hc, R_ST_SEL, hc->chan[ch].port); /* undocumented: delay after R_ST_SEL */ udelay(1); @@ -3505,9 +3508,9 @@ handle_bmsg(struct mISDNchannel *ch, struct sk_buff *skb) switch (hh->id) { case HFC_SPL_LOOP_ON: /* set sample loop */ if (debug & DEBUG_HFCMULTI_MSG) - printk(KERN_DEBUG - "%s: HFC_SPL_LOOP_ON (len = %d)\n", - __func__, skb->len); + printk(KERN_DEBUG + "%s: HFC_SPL_LOOP_ON (len = %d)\n", + __func__, skb->len); ret = 0; break; case HFC_SPL_LOOP_OFF: /* set silence */ @@ -3716,8 +3719,7 @@ ph_state_change(struct dchannel *dch) int ch, i; if (!dch) { - printk(KERN_WARNING "%s: ERROR given dch is NULL\n", - __func__); + printk(KERN_WARNING "%s: ERROR given dch is NULL\n", __func__); return; } hc = dch->hw; @@ -3738,14 +3740,15 @@ ph_state_change(struct dchannel *dch) switch (dch->state) { case (1): if (hc->e1_state != 1) { - for (i = 1; i <= 31; i++) { - /* reset fifos on e1 activation */ - HFC_outb_nodebug(hc, R_FIFO, (i << 1) | 1); - HFC_wait_nodebug(hc); - HFC_outb_nodebug(hc, - R_INC_RES_FIFO, V_RES_F); - HFC_wait_nodebug(hc); - } + for (i = 1; i <= 31; i++) { + /* reset fifos on e1 activation */ + HFC_outb_nodebug(hc, R_FIFO, + (i << 1) | 1); + HFC_wait_nodebug(hc); + HFC_outb_nodebug(hc, R_INC_RES_FIFO, + V_RES_F); + HFC_wait_nodebug(hc); + } } test_and_set_bit(FLG_ACTIVE, &dch->Flags); _queue_data(&dch->dev.D, PH_ACTIVATE_IND, @@ -4045,12 +4048,12 @@ open_dchannel(struct hfc_multi *hc, struct dchannel *dch, return -EINVAL; if ((dch->dev.D.protocol != ISDN_P_NONE) && (dch->dev.D.protocol != rq->protocol)) { - if (debug & DEBUG_HFCMULTI_MODE) - printk(KERN_WARNING "%s: change protocol %x to %x\n", - __func__, dch->dev.D.protocol, rq->protocol); + if (debug & DEBUG_HFCMULTI_MODE) + printk(KERN_DEBUG "%s: change protocol %x to %x\n", + __func__, dch->dev.D.protocol, rq->protocol); } - if ((dch->dev.D.protocol == ISDN_P_TE_S0) - && (rq->protocol != ISDN_P_TE_S0)) + if ((dch->dev.D.protocol == ISDN_P_TE_S0) && + (rq->protocol != ISDN_P_TE_S0)) l1_event(dch->l1, CLOSE_CHANNEL); if (dch->dev.D.protocol != rq->protocol) { if (rq->protocol == ISDN_P_TE_S0) { @@ -4127,9 +4130,9 @@ channel_dctrl(struct dchannel *dch, struct mISDN_ctrl_req *cq) wd_cnt = cq->p1 & 0xf; wd_mode = !!(cq->p1 >> 4); if (debug & DEBUG_HFCMULTI_MSG) - printk(KERN_DEBUG - "%s: MISDN_CTRL_HFC_WD_INIT mode %s counter 0x%x\n", - __func__, wd_mode ? "AUTO" : "MANUAL", wd_cnt); + printk(KERN_DEBUG "%s: MISDN_CTRL_HFC_WD_INIT mode %s" + ", counter 0x%x\n", __func__, + wd_mode ? "AUTO" : "MANUAL", wd_cnt); /* set the watchdog timer */ HFC_outb(hc, R_TI_WD, poll_timer | (wd_cnt << 4)); hc->hw.r_bert_wd_md = (wd_mode ? 
V_AUTO_WD_RES : 0); @@ -4139,8 +4142,8 @@ channel_dctrl(struct dchannel *dch, struct mISDN_ctrl_req *cq) HFC_outb(hc, R_BERT_WD_MD, hc->hw.r_bert_wd_md | V_WD_RES); if (test_bit(HFC_CHIP_PLXSD, &hc->chip)) { /* enable the watchdog output for Speech-Design */ - HFC_outb(hc, R_GPIO_SEL, V_GPIO_SEL7); - HFC_outb(hc, R_GPIO_EN1, V_GPIO_EN15); + HFC_outb(hc, R_GPIO_SEL, V_GPIO_SEL7); + HFC_outb(hc, R_GPIO_EN1, V_GPIO_EN15); HFC_outb(hc, R_GPIO_OUT1, 0); HFC_outb(hc, R_GPIO_OUT1, V_GPIO_OUT15); } @@ -4319,7 +4322,7 @@ error: } if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: free irq %d\n", __func__, hc->irq); + printk(KERN_DEBUG "%s: free irq %d\n", __func__, hc->irq); if (hc->irq) { free_irq(hc->irq, hc); hc->irq = 0; @@ -4624,7 +4627,7 @@ release_card(struct hfc_multi *hc) int ch; if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: release card (%d) entered\n", + printk(KERN_DEBUG "%s: release card (%d) entered\n", __func__, hc->id); /* unregister clock source */ @@ -4653,7 +4656,7 @@ release_card(struct hfc_multi *hc) /* release hardware & irq */ if (hc->irq) { if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: free irq %d\n", + printk(KERN_DEBUG "%s: free irq %d\n", __func__, hc->irq); free_irq(hc->irq, hc); hc->irq = 0; @@ -4662,17 +4665,17 @@ release_card(struct hfc_multi *hc) release_io_hfcmulti(hc); if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: remove instance from list\n", + printk(KERN_DEBUG "%s: remove instance from list\n", __func__); list_del(&hc->list); if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: delete instance\n", __func__); + printk(KERN_DEBUG "%s: delete instance\n", __func__); if (hc == syncmaster) syncmaster = NULL; kfree(hc); if (debug & DEBUG_HFCMULTI_INIT) - printk(KERN_WARNING "%s: card successfully removed\n", + printk(KERN_DEBUG "%s: card successfully removed\n", __func__); } @@ -4695,7 +4698,7 @@ init_e1_port(struct hfc_multi *hc, struct hm_map *m) (1 << (ISDN_P_B_HDLC & ISDN_P_B_MASK)); dch->dev.D.send = handle_dmsg; dch->dev.D.ctrl = hfcm_dctrl; - dch->dev.nrbchan = (hc->dslot)?30:31; + dch->dev.nrbchan = (hc->dslot) ? 
30 : 31; dch->slot = hc->dslot; hc->chan[hc->dslot].dch = dch; hc->chan[hc->dslot].port = 0; @@ -4937,7 +4940,7 @@ init_multi_port(struct hfc_multi *hc, int pt) } /* disable E-channel */ if (port[Port_cnt] & 0x004) { - if (debug & DEBUG_HFCMULTI_INIT) + if (debug & DEBUG_HFCMULTI_INIT) printk(KERN_DEBUG "%s: PROTOCOL disable E-channel: " "card(%d) port(%d)\n", @@ -5222,7 +5225,7 @@ static void __devexit hfc_remove_pci(struct pci_dev *pdev) spin_unlock_irqrestore(&HFClock, flags); } else { if (debug) - printk(KERN_WARNING "%s: drvdata allready removed\n", + printk(KERN_DEBUG "%s: drvdata allready removed\n", __func__); } } diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index 60dc92562c6..776afc8c927 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -257,7 +257,7 @@ reset_hfcpci(struct hfc_pci *hc) Write_hfc(hc, HFCPCI_INT_M1, hc->hw.int_m1); /* Clear already pending ints */ - if (Read_hfc(hc, HFCPCI_INT_S1)); + val = Read_hfc(hc, HFCPCI_INT_S1); /* set NT/TE mode */ hfcpci_setmode(hc); @@ -499,7 +499,8 @@ receive_dmsg(struct hfc_pci *hc) df->f2 = ((df->f2 + 1) & MAX_D_FRAMES) | (MAX_D_FRAMES + 1); /* next buffer */ df->za[df->f2 & D_FREG_MASK].z2 = - cpu_to_le16((le16_to_cpu(zp->z2) + rcnt) & (D_FIFO_SIZE - 1)); + cpu_to_le16((le16_to_cpu(zp->z2) + rcnt) & + (D_FIFO_SIZE - 1)); } else { dch->rx_skb = mI_alloc_skb(rcnt - 3, GFP_ATOMIC); if (!dch->rx_skb) { @@ -966,6 +967,7 @@ static void ph_state_nt(struct dchannel *dch) { struct hfc_pci *hc = dch->hw; + u_char val; if (dch->debug) printk(KERN_DEBUG "%s: NT newstate %x\n", @@ -979,7 +981,7 @@ ph_state_nt(struct dchannel *dch) hc->hw.int_m1 &= ~HFCPCI_INTS_TIMER; Write_hfc(hc, HFCPCI_INT_M1, hc->hw.int_m1); /* Clear already pending ints */ - if (Read_hfc(hc, HFCPCI_INT_S1)); + val = Read_hfc(hc, HFCPCI_INT_S1); Write_hfc(hc, HFCPCI_STATES, 4 | HFCPCI_LOAD_STATE); udelay(10); Write_hfc(hc, HFCPCI_STATES, 4); @@ -1268,8 +1270,7 @@ mode_hfcpci(struct bchannel *bch, int bc, int protocol) rx_slot = (bc>>8) & 0xff; tx_slot = (bc>>16) & 0xff; bc = bc & 0xff; - } else if (test_bit(HFC_CFG_PCM, &hc->cfg) && - (protocol > ISDN_P_NONE)) + } else if (test_bit(HFC_CFG_PCM, &hc->cfg) && (protocol > ISDN_P_NONE)) printk(KERN_WARNING "%s: no pcm channel id but HFC_CFG_PCM\n", __func__); if (hc->chanlimit > 1) { @@ -1327,8 +1328,8 @@ mode_hfcpci(struct bchannel *bch, int bc, int protocol) case (ISDN_P_B_RAW): bch->state = protocol; bch->nr = bc; - hfcpci_clear_fifo_rx(hc, (fifo2 & 2)?1:0); - hfcpci_clear_fifo_tx(hc, (fifo2 & 2)?1:0); + hfcpci_clear_fifo_rx(hc, (fifo2 & 2) ? 1 : 0); + hfcpci_clear_fifo_tx(hc, (fifo2 & 2) ? 1 : 0); if (bc & 2) { hc->hw.sctrl |= SCTRL_B2_ENA; hc->hw.sctrl_r |= SCTRL_B2_ENA; @@ -1362,8 +1363,8 @@ mode_hfcpci(struct bchannel *bch, int bc, int protocol) case (ISDN_P_B_HDLC): bch->state = protocol; bch->nr = bc; - hfcpci_clear_fifo_rx(hc, (fifo2 & 2)?1:0); - hfcpci_clear_fifo_tx(hc, (fifo2 & 2)?1:0); + hfcpci_clear_fifo_rx(hc, (fifo2 & 2) ? 1 : 0); + hfcpci_clear_fifo_tx(hc, (fifo2 & 2) ? 1 : 0); if (bc & 2) { hc->hw.sctrl |= SCTRL_B2_ENA; hc->hw.sctrl_r |= SCTRL_B2_ENA; @@ -1457,7 +1458,7 @@ set_hfcpci_rxtest(struct bchannel *bch, int protocol, int chan) switch (protocol) { case (ISDN_P_B_RAW): bch->state = protocol; - hfcpci_clear_fifo_rx(hc, (chan & 2)?1:0); + hfcpci_clear_fifo_rx(hc, (chan & 2) ? 
1 : 0); if (chan & 2) { hc->hw.sctrl_r |= SCTRL_B2_ENA; hc->hw.fifo_en |= HFCPCI_FIFOEN_B2RX; @@ -1482,7 +1483,7 @@ set_hfcpci_rxtest(struct bchannel *bch, int protocol, int chan) break; case (ISDN_P_B_HDLC): bch->state = protocol; - hfcpci_clear_fifo_rx(hc, (chan & 2)?1:0); + hfcpci_clear_fifo_rx(hc, (chan & 2) ? 1 : 0); if (chan & 2) { hc->hw.sctrl_r |= SCTRL_B2_ENA; hc->hw.last_bfifo_cnt[1] = 0; @@ -2047,7 +2048,8 @@ setup_hw(struct hfc_pci *hc) printk(KERN_WARNING "HFC-PCI: No IRQ for PCI card found\n"); return 1; } - hc->hw.pci_io = (char __iomem *)(unsigned long)hc->pdev->resource[1].start; + hc->hw.pci_io = + (char __iomem *)(unsigned long)hc->pdev->resource[1].start; if (!hc->hw.pci_io) { printk(KERN_WARNING "HFC-PCI: No IO-Mem for PCI card found\n"); @@ -2289,7 +2291,7 @@ hfc_remove_pci(struct pci_dev *pdev) release_card(card); else if (debug) - printk(KERN_WARNING "%s: drvdata already removed\n", + printk(KERN_DEBUG "%s: drvdata already removed\n", __func__); } diff --git a/drivers/isdn/mISDN/dsp_audio.c b/drivers/isdn/mISDN/dsp_audio.c index de3795e3f43..9c7c6451bf3 100644 --- a/drivers/isdn/mISDN/dsp_audio.c +++ b/drivers/isdn/mISDN/dsp_audio.c @@ -210,9 +210,8 @@ dsp_audio_generate_seven(void) j = 0; for (k = 0; k < 256; k++) { if (dsp_audio_alaw_to_s32[k] - < dsp_audio_alaw_to_s32[i]) { - j++; - } + < dsp_audio_alaw_to_s32[i]) + j++; } sorted_alaw[j] = i; } diff --git a/drivers/isdn/mISDN/dsp_cmx.c b/drivers/isdn/mISDN/dsp_cmx.c index 05866184ba2..9c7c0d1ba55 100644 --- a/drivers/isdn/mISDN/dsp_cmx.c +++ b/drivers/isdn/mISDN/dsp_cmx.c @@ -238,7 +238,7 @@ dsp_cmx_add_conf_member(struct dsp *dsp, struct dsp_conf *conf) member = kzalloc(sizeof(struct dsp_conf_member), GFP_ATOMIC); if (!member) { - printk(KERN_ERR "kmalloc struct dsp_conf_member failed\n"); + printk(KERN_ERR "kzalloc struct dsp_conf_member failed\n"); return -ENOMEM; } member->dsp = dsp; @@ -317,7 +317,7 @@ static struct dsp_conf conf = kzalloc(sizeof(struct dsp_conf), GFP_ATOMIC); if (!conf) { - printk(KERN_ERR "kmalloc struct dsp_conf failed\n"); + printk(KERN_ERR "kzalloc struct dsp_conf failed\n"); return NULL; } INIT_LIST_HEAD(&conf->mlist); @@ -1389,7 +1389,8 @@ dsp_cmx_send_member(struct dsp *dsp, int len, s32 *c, int members) while (r != rr && t != tt) { #ifdef CMX_TX_DEBUG if (strlen(debugbuf) < 48) - sprintf(debugbuf+strlen(debugbuf), " %02x", p[t]); + sprintf(debugbuf+strlen(debugbuf), " %02x", + p[t]); #endif *d++ = p[t]; /* write tx_buff */ t = (t+1) & CMX_BUFF_MASK; diff --git a/drivers/isdn/mISDN/dsp_core.c b/drivers/isdn/mISDN/dsp_core.c index a0e0af81eb2..c35750647c6 100644 --- a/drivers/isdn/mISDN/dsp_core.c +++ b/drivers/isdn/mISDN/dsp_core.c @@ -280,7 +280,7 @@ dsp_fill_empty(struct dsp *dsp) static int dsp_control_req(struct dsp *dsp, struct mISDNhead *hh, struct sk_buff *skb) { - struct sk_buff *nskb; + struct sk_buff *nskb; int ret = 0; int cont; u8 *data; @@ -558,7 +558,7 @@ tone_off: dsp->pipeline.inuse = 1; dsp_cmx_hardware(dsp->conf, dsp); ret = dsp_pipeline_build(&dsp->pipeline, - len > 0 ? (char *)data : NULL); + len > 0 ? data : NULL); dsp_cmx_hardware(dsp->conf, dsp); dsp_rx_off(dsp); } @@ -720,7 +720,7 @@ dsp_function(struct mISDNchannel *ch, struct sk_buff *skb) /* check if dtmf soft decoding is turned on */ if (dsp->dtmf.software) { digits = dsp_dtmf_goertzel_decode(dsp, skb->data, - skb->len, (dsp_options&DSP_OPT_ULAW)?1:0); + skb->len, (dsp_options&DSP_OPT_ULAW) ? 
1 : 0); } /* we need to process receive data if software */ if (dsp->conf && dsp->conf->software) { @@ -952,7 +952,7 @@ dsp_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) int err = 0; if (debug & DEBUG_DSP_CTRL) - printk(KERN_DEBUG "%s:(%x)\n", __func__, cmd); + printk(KERN_DEBUG "%s:(%x)\n", __func__, cmd); switch (cmd) { case OPEN_CHANNEL: @@ -1175,9 +1175,9 @@ static int dsp_init(void) /* init conversion tables */ dsp_audio_generate_law_tables(); - dsp_silence = (dsp_options&DSP_OPT_ULAW)?0xff:0x2a; - dsp_audio_law_to_s32 = (dsp_options&DSP_OPT_ULAW)?dsp_audio_ulaw_to_s32: - dsp_audio_alaw_to_s32; + dsp_silence = (dsp_options&DSP_OPT_ULAW) ? 0xff : 0x2a; + dsp_audio_law_to_s32 = (dsp_options&DSP_OPT_ULAW) ? + dsp_audio_ulaw_to_s32 : dsp_audio_alaw_to_s32; dsp_audio_generate_s2law_table(); dsp_audio_generate_seven(); dsp_audio_generate_mix_table(); diff --git a/drivers/isdn/mISDN/dsp_ecdis.h b/drivers/isdn/mISDN/dsp_ecdis.h index 8a20af43308..21dbd153ee2 100644 --- a/drivers/isdn/mISDN/dsp_ecdis.h +++ b/drivers/isdn/mISDN/dsp_ecdis.h @@ -91,7 +91,7 @@ int16_t amp) && det->tone_cycle_duration <= 475*8) { det->good_cycles++; if (det->good_cycles > 2) - det->hit = TRUE; + det->hit = TRUE; } det->tone_cycle_duration = 0; } diff --git a/drivers/isdn/mISDN/dsp_tones.c b/drivers/isdn/mISDN/dsp_tones.c index 7a9af66f4b1..1debf53670d 100644 --- a/drivers/isdn/mISDN/dsp_tones.c +++ b/drivers/isdn/mISDN/dsp_tones.c @@ -253,18 +253,24 @@ static struct pattern { {8000, 0, 0, 0, 0, 0, 0, 0, 0, 0} }, {TONE_GERMAN_DIALPBX, - {DATA_GA, DATA_S, DATA_GA, DATA_S, DATA_GA, DATA_S, NULL, NULL, NULL, NULL}, - {SIZE_GA, SIZE_S, SIZE_GA, SIZE_S, SIZE_GA, SIZE_S, NULL, NULL, NULL, NULL}, + {DATA_GA, DATA_S, DATA_GA, DATA_S, DATA_GA, DATA_S, NULL, NULL, NULL, + NULL}, + {SIZE_GA, SIZE_S, SIZE_GA, SIZE_S, SIZE_GA, SIZE_S, NULL, NULL, NULL, + NULL}, {2000, 2000, 2000, 2000, 2000, 12000, 0, 0, 0, 0} }, {TONE_GERMAN_OLDDIALPBX, - {DATA_GO, DATA_S, DATA_GO, DATA_S, DATA_GO, DATA_S, NULL, NULL, NULL, NULL}, - {SIZE_GO, SIZE_S, SIZE_GO, SIZE_S, SIZE_GO, SIZE_S, NULL, NULL, NULL, NULL}, + {DATA_GO, DATA_S, DATA_GO, DATA_S, DATA_GO, DATA_S, NULL, NULL, NULL, + NULL}, + {SIZE_GO, SIZE_S, SIZE_GO, SIZE_S, SIZE_GO, SIZE_S, NULL, NULL, NULL, + NULL}, {2000, 2000, 2000, 2000, 2000, 12000, 0, 0, 0, 0} }, {TONE_AMERICAN_DIALPBX, - {DATA_DT, DATA_S, DATA_DT, DATA_S, DATA_DT, DATA_S, NULL, NULL, NULL, NULL}, - {SIZE_DT, SIZE_S, SIZE_DT, SIZE_S, SIZE_DT, SIZE_S, NULL, NULL, NULL, NULL}, + {DATA_DT, DATA_S, DATA_DT, DATA_S, DATA_DT, DATA_S, NULL, NULL, NULL, + NULL}, + {SIZE_DT, SIZE_S, SIZE_DT, SIZE_S, SIZE_DT, SIZE_S, NULL, NULL, NULL, + NULL}, {2000, 2000, 2000, 2000, 2000, 12000, 0, 0, 0, 0} }, {TONE_GERMAN_RINGING, @@ -434,7 +440,7 @@ dsp_tone_hw_message(struct dsp *dsp, u8 *sample, int len) /* unlocking is not required, because we don't expect a response */ nskb = _alloc_mISDN_skb(PH_CONTROL_REQ, - (len)?HFC_SPL_LOOP_ON:HFC_SPL_LOOP_OFF, len, sample, + (len) ? 
HFC_SPL_LOOP_ON : HFC_SPL_LOOP_OFF, len, sample, GFP_ATOMIC); if (nskb) { if (dsp->ch.peer) { @@ -498,8 +504,7 @@ dsp_tone(struct dsp *dsp, int tone) /* we turn off the tone */ if (!tone) { - if (dsp->features.hfc_loops) - if (timer_pending(&tonet->tl)) + if (dsp->features.hfc_loops && timer_pending(&tonet->tl)) del_timer(&tonet->tl); if (dsp->features.hfc_loops) dsp_tone_hw_message(dsp, NULL, 0); diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index ea3c3aa2004..d9cf83b17e3 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -308,8 +308,8 @@ l1oip_socket_send(struct l1oip *hc, u8 localcodec, u8 channel, u32 chanmask, /* assemble frame */ *p++ = (L1OIP_VERSION<<6) /* version and coding */ - | (hc->pri?0x20:0x00) /* type */ - | (hc->id?0x10:0x00) /* id */ + | (hc->pri ? 0x20 : 0x00) /* type */ + | (hc->id ? 0x10 : 0x00) /* id */ | localcodec; if (hc->id) { *p++ = hc->id>>24; /* id */ @@ -317,7 +317,7 @@ l1oip_socket_send(struct l1oip *hc, u8 localcodec, u8 channel, u32 chanmask, *p++ = hc->id>>8; *p++ = hc->id; } - *p++ = (multi == 1)?0x80:0x00 + channel; /* m-flag, channel */ + *p++ = (multi == 1) ? 0x80 : 0x00 + channel; /* m-flag, channel */ if (multi == 1) *p++ = len; /* length */ *p++ = timebase>>8; /* time base */ @@ -401,12 +401,12 @@ l1oip_socket_recv(struct l1oip *hc, u8 remotecodec, u8 channel, u16 timebase, } /* prepare message */ - nskb = mI_alloc_skb((remotecodec == 3)?(len<<1):len, GFP_ATOMIC); + nskb = mI_alloc_skb((remotecodec == 3) ? (len<<1) : len, GFP_ATOMIC); if (!nskb) { printk(KERN_ERR "%s: No mem for skb.\n", __func__); return; } - p = skb_put(nskb, (remotecodec == 3)?(len<<1):len); + p = skb_put(nskb, (remotecodec == 3) ? (len<<1) : len); if (remotecodec == 1 && ulaw) l1oip_alaw_to_ulaw(buf, len, p); @@ -458,7 +458,7 @@ l1oip_socket_recv(struct l1oip *hc, u8 remotecodec, u8 channel, u16 timebase, hc->chan[channel].disorder_flag ^= 1; if (nskb) #endif - queue_ch_frame(&bch->ch, PH_DATA_IND, rx_counter, nskb); + queue_ch_frame(&bch->ch, PH_DATA_IND, rx_counter, nskb); } } @@ -749,8 +749,8 @@ l1oip_socket_thread(void *data) l1oip_socket_parse(hc, &sin_rx, recvbuf, recvlen); } else { if (debug & DEBUG_L1OIP_SOCKET) - printk(KERN_WARNING "%s: broken pipe on socket\n", - __func__); + printk(KERN_WARNING + "%s: broken pipe on socket\n", __func__); } } @@ -925,7 +925,7 @@ handle_dmsg(struct mISDNchannel *ch, struct sk_buff *skb) p = skb->data; l = skb->len; while (l) { - ll = (l < L1OIP_MAX_PERFRAME)?l:L1OIP_MAX_PERFRAME; + ll = (l < L1OIP_MAX_PERFRAME) ? l : L1OIP_MAX_PERFRAME; l1oip_socket_send(hc, 0, dch->slot, 0, hc->chan[dch->slot].tx_counter++, p, ll); p += ll; @@ -1173,7 +1173,7 @@ handle_bmsg(struct mISDNchannel *ch, struct sk_buff *skb) p = skb->data; l = skb->len; while (l) { - ll = (l < L1OIP_MAX_PERFRAME)?l:L1OIP_MAX_PERFRAME; + ll = (l < L1OIP_MAX_PERFRAME) ? l : L1OIP_MAX_PERFRAME; l1oip_socket_send(hc, hc->codec, bch->slot, 0, hc->chan[bch->slot].tx_counter, p, ll); hc->chan[bch->slot].tx_counter += ll; @@ -1331,8 +1331,8 @@ init_card(struct l1oip *hc, int pri, int bundle) spin_lock_init(&hc->socket_lock); hc->idx = l1oip_cnt; hc->pri = pri; - hc->d_idx = pri?16:3; - hc->b_num = pri?30:2; + hc->d_idx = pri ? 16 : 3; + hc->b_num = pri ? 
30 : 2; hc->bundle = bundle; if (hc->pri) sprintf(hc->name, "l1oip-e1.%d", l1oip_cnt + 1); @@ -1517,9 +1517,9 @@ l1oip_init(void) if (debug & DEBUG_L1OIP_INIT) printk(KERN_DEBUG "%s: interface %d is %s with %s.\n", - __func__, l1oip_cnt, pri?"PRI":"BRI", - bundle?"bundled IP packet for all B-channels" - :"seperate IP packets for every B-channel"); + __func__, l1oip_cnt, pri ? "PRI" : "BRI", + bundle ? "bundled IP packet for all B-channels" : + "seperate IP packets for every B-channel"); hc = kzalloc(sizeof(struct l1oip), GFP_ATOMIC); if (!hc) { diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 2a2c30a9438..c36f5213745 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -222,7 +222,7 @@ mISDN_sock_sendmsg(struct kiocb *iocb, struct socket *sock, } else { /* use default for L2 messages */ if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) - mISDN_HEAD_ID(skb) = _pms(sk)->ch.nr; + mISDN_HEAD_ID(skb) = _pms(sk)->ch.nr; } if (*debug & DEBUG_SOCKET) diff --git a/drivers/isdn/mISDN/tei.c b/drivers/isdn/mISDN/tei.c index 778b660f067..bfcdd97df95 100644 --- a/drivers/isdn/mISDN/tei.c +++ b/drivers/isdn/mISDN/tei.c @@ -872,7 +872,6 @@ ph_data_ind(struct manager *mgr, struct sk_buff *skb) __func__, skb->len); goto done; } - if (*debug & DEBUG_L2_TEI) if ((skb->data[0] >> 2) != TEI_SAPI) /* not for us */ goto done; diff --git a/drivers/isdn/mISDN/timerdev.c b/drivers/isdn/mISDN/timerdev.c index bbd99d3282c..5b7e9bf514f 100644 --- a/drivers/isdn/mISDN/timerdev.c +++ b/drivers/isdn/mISDN/timerdev.c @@ -259,7 +259,7 @@ mISDN_ioctl(struct inode *inode, struct file *filep, unsigned int cmd, return ret; } -static struct file_operations mISDN_fops = { +static const struct file_operations mISDN_fops = { .read = mISDN_read, .poll = mISDN_poll, .ioctl = mISDN_ioctl, diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 0b28fd1e393..45100b39a7c 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -292,19 +292,19 @@ struct mISDN_devrename { /* MPH_INFORMATION_REQ payload */ struct ph_info_ch { - __u32 protocol; - __u64 Flags; + __u32 protocol; + __u64 Flags; }; struct ph_info_dch { - struct ph_info_ch ch; - __u16 state; - __u16 num_bch; + struct ph_info_ch ch; + __u16 state; + __u16 num_bch; }; struct ph_info { - struct ph_info_dch dch; - struct ph_info_ch bch[]; + struct ph_info_dch dch; + struct ph_info_ch bch[]; }; /* timer device ioctl */ -- cgit v1.2.3-70-g09d2 From daebafed7fef54fcc73d2d01431122cfd578d1e0 Mon Sep 17 00:00:00 2001 From: Andreas Eversberg Date: Mon, 25 May 2009 00:56:56 -0700 Subject: mISDN: Added PCI ID for new Junghanns.net Single E1 cards. The new ID is validated by Cologne Chip. LEDs control is also supported. Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil Signed-off-by: David S. 
Miller --- drivers/isdn/hardware/mISDN/hfcmulti.c | 4 ++++ include/linux/pci_ids.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index d60f5b7a41d..e1dab30aed3 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -5357,6 +5357,10 @@ static struct pci_device_id hfmultipci_ids[] __devinitdata = { PCI_SUBDEVICE_ID_CCD_SPD4S, 0, 0, H(26)}, /* PLX PCI Bridge */ { PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9030, PCI_VENDOR_ID_CCD, PCI_SUBDEVICE_ID_CCD_SPDE1, 0, 0, H(27)}, /* PLX PCI Bridge */ + + { PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_HFCE1, PCI_VENDOR_ID_CCD, + PCI_SUBDEVICE_ID_CCD_JHSE1, 0, 0, H(25)}, /* Junghanns E1 */ + { PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_HFC4S, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, { PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_HFC8S, PCI_ANY_ID, PCI_ANY_ID, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4bdff984ac9..12db06cf0e2 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1914,6 +1914,7 @@ #define PCI_SUBDEVICE_ID_CCD_SWYX4S 0xB540 #define PCI_SUBDEVICE_ID_CCD_JH4S20 0xB550 #define PCI_SUBDEVICE_ID_CCD_IOB8ST_1 0xB552 +#define PCI_SUBDEVICE_ID_CCD_JHSE1 0xB553 #define PCI_SUBDEVICE_ID_CCD_JH8S 0xB55B #define PCI_SUBDEVICE_ID_CCD_BN4S 0xB560 #define PCI_SUBDEVICE_ID_CCD_BN8S 0xB562 -- cgit v1.2.3-70-g09d2 From e3804cbebb67887879102925961d41b503f7fbe3 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Mon, 25 May 2009 01:53:53 -0700 Subject: net: remove COMPAT_NET_DEV_OPS All drivers are already converted to the new net_device_ops API and nobody uses the old API anymore. Signed-off-by: Alexander Beregalov Signed-off-by: David S. Miller --- drivers/net/Kconfig | 9 --------- include/linux/netdevice.h | 38 ----------------------------- net/802/fddi.c | 4 ---- net/802/hippi.c | 5 ----- net/8021q/vlan_dev.c | 1 - net/appletalk/dev.c | 11 ----------- net/core/dev.c | 50 ----------------------------------------- net/ethernet/eth.c | 5 ----- 8 files changed, 123 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 0fb446e047e..efa659f0fb5 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -25,15 +25,6 @@ menuconfig NETDEVICES # that for each of the symbols. if NETDEVICES -config COMPAT_NET_DEV_OPS - default y - bool "Enable older network device API compatibility" - ---help--- - This option enables kernel compatibility with older network devices - that do not use net_device_ops interface. - - If unsure, say Y.
- config IFB tristate "Intermediate Functional Block support" depends on NET_CLS_ACT diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f8574e76b74..ae3c2099a04 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -901,44 +901,6 @@ struct net_device /* max exchange id for FCoE LRO by ddp */ unsigned int fcoe_ddp_xid; #endif - -#ifdef CONFIG_COMPAT_NET_DEV_OPS - struct { - int (*init)(struct net_device *dev); - void (*uninit)(struct net_device *dev); - int (*open)(struct net_device *dev); - int (*stop)(struct net_device *dev); - int (*hard_start_xmit) (struct sk_buff *skb, - struct net_device *dev); - u16 (*select_queue)(struct net_device *dev, - struct sk_buff *skb); - void (*change_rx_flags)(struct net_device *dev, - int flags); - void (*set_rx_mode)(struct net_device *dev); - void (*set_multicast_list)(struct net_device *dev); - int (*set_mac_address)(struct net_device *dev, - void *addr); - int (*validate_addr)(struct net_device *dev); - int (*do_ioctl)(struct net_device *dev, - struct ifreq *ifr, int cmd); - int (*set_config)(struct net_device *dev, - struct ifmap *map); - int (*change_mtu)(struct net_device *dev, int new_mtu); - int (*neigh_setup)(struct net_device *dev, - struct neigh_parms *); - void (*tx_timeout) (struct net_device *dev); - struct net_device_stats* (*get_stats)(struct net_device *dev); - void (*vlan_rx_register)(struct net_device *dev, - struct vlan_group *grp); - void (*vlan_rx_add_vid)(struct net_device *dev, - unsigned short vid); - void (*vlan_rx_kill_vid)(struct net_device *dev, - unsigned short vid); -#ifdef CONFIG_NET_POLL_CONTROLLER - void (*poll_controller)(struct net_device *dev); -#endif - }; -#endif }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/net/802/fddi.c b/net/802/fddi.c index 539e6064e6d..3ef0ab0a543 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -185,10 +185,6 @@ static const struct header_ops fddi_header_ops = { static void fddi_setup(struct net_device *dev) { dev->header_ops = &fddi_header_ops; -#ifdef CONFIG_COMPAT_NET_DEV_OPS - dev->change_mtu = fddi_change_mtu, -#endif - dev->type = ARPHRD_FDDI; dev->hard_header_len = FDDI_K_SNAP_HLEN+3; /* Assume 802.2 SNAP hdr len + 3 pad bytes */ dev->mtu = FDDI_K_SNAP_DLEN; /* Assume max payload of 802.2 SNAP frame */ diff --git a/net/802/hippi.c b/net/802/hippi.c index 313b9ebf92e..cd3e8e92952 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -193,11 +193,6 @@ static const struct header_ops hippi_header_ops = { static void hippi_setup(struct net_device *dev) { -#ifdef CONFIG_COMPAT_NET_DEV_OPS - dev->change_mtu = hippi_change_mtu; - dev->set_mac_address = hippi_mac_addr; - dev->neigh_setup = hippi_neigh_setup_dev; -#endif dev->header_ops = &hippi_header_ops; /* diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ff7572ac548..1e2ad4c7c59 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -644,7 +644,6 @@ static int vlan_dev_init(struct net_device *dev) dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; dev->netdev_ops = &vlan_netdev_ops; } - netdev_resync_ops(dev); if (is_vlan_dev(real_dev)) subclass = 1; diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c index 72277d70c98..6c8016f6186 100644 --- a/net/appletalk/dev.c +++ b/net/appletalk/dev.c @@ -9,21 +9,10 @@ #include #include -#ifdef CONFIG_COMPAT_NET_DEV_OPS -static int ltalk_change_mtu(struct net_device *dev, int mtu) -{ - return -EINVAL; -} -#endif - static void ltalk_setup(struct net_device *dev) { /* Fill in the fields of the 
device structure with localtalk-generic values. */ -#ifdef CONFIG_COMPAT_NET_DEV_OPS - dev->change_mtu = ltalk_change_mtu; -#endif - dev->type = ARPHRD_LOCALTLK; dev->hard_header_len = LTALK_HLEN; dev->mtu = LTALK_MTU; diff --git a/net/core/dev.c b/net/core/dev.c index 3942266d1f6..241613f6dd2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4580,39 +4580,6 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); -/* Some devices need to (re-)set their netdev_ops inside - * ->init() or similar. If that happens, we have to setup - * the compat pointers again. - */ -void netdev_resync_ops(struct net_device *dev) -{ -#ifdef CONFIG_COMPAT_NET_DEV_OPS - const struct net_device_ops *ops = dev->netdev_ops; - - dev->init = ops->ndo_init; - dev->uninit = ops->ndo_uninit; - dev->open = ops->ndo_open; - dev->change_rx_flags = ops->ndo_change_rx_flags; - dev->set_rx_mode = ops->ndo_set_rx_mode; - dev->set_multicast_list = ops->ndo_set_multicast_list; - dev->set_mac_address = ops->ndo_set_mac_address; - dev->validate_addr = ops->ndo_validate_addr; - dev->do_ioctl = ops->ndo_do_ioctl; - dev->set_config = ops->ndo_set_config; - dev->change_mtu = ops->ndo_change_mtu; - dev->neigh_setup = ops->ndo_neigh_setup; - dev->tx_timeout = ops->ndo_tx_timeout; - dev->get_stats = ops->ndo_get_stats; - dev->vlan_rx_register = ops->ndo_vlan_rx_register; - dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; - dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; -#ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = ops->ndo_poll_controller; -#endif -#endif -} -EXPORT_SYMBOL(netdev_resync_ops); - /** * register_netdevice - register a network device * @dev: device to register @@ -4652,23 +4619,6 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; -#ifdef CONFIG_COMPAT_NET_DEV_OPS - /* Netdevice_ops API compatibility support. - * This is temporary until all network devices are converted. - */ - if (dev->netdev_ops) { - netdev_resync_ops(dev); - } else { - char drivername[64]; - pr_info("%s (%s): not using net_device_ops yet\n", - dev->name, netdev_drivername(dev, drivername, 64)); - - /* This works only because net_device_ops and the - compatibility structure are the same. */ - dev->netdev_ops = (void *) &(dev->init); - } -#endif - /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 280352aba40..5a883affecd 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -337,11 +337,6 @@ const struct header_ops eth_header_ops ____cacheline_aligned = { void ether_setup(struct net_device *dev) { dev->header_ops = ð_header_ops; -#ifdef CONFIG_COMPAT_NET_DEV_OPS - dev->change_mtu = eth_change_mtu; - dev->set_mac_address = eth_mac_addr; - dev->validate_addr = eth_validate_addr; -#endif dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; -- cgit v1.2.3-70-g09d2 From e527ea312f31e88a7fa5472b71db71c565b0d44f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:25 +0200 Subject: perf_counter: Remove unused ABI bits extra_config_len isn't used for anything, remove it. 
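For illustration only (the struct and function names below are hypothetical, not part of the patch): dropping the __u32 stays ABI-compatible here because a __u32 reserved word takes its place and the trailing __u64 reserved words are renumbered, so the layout and sizeof() of the user-visible struct are unchanged. A minimal sketch of a guard for that invariant:

#include <linux/kernel.h>	/* BUILD_BUG_ON() */
#include <linux/types.h>

/* Hypothetical stand-ins for the old and new tail of the ABI struct. */
struct tail_before {
	__u32	extra_config_len;
	__u32	wakeup_events;
	__u64	__reserved_2;
	__u64	__reserved_3;
};

struct tail_after {
	__u32	wakeup_events;
	__u32	__reserved_2;
	__u64	__reserved_3;
	__u64	__reserved_4;
};

static inline void assert_abi_size_unchanged(void)
{
	/* Both tails occupy 24 bytes, so the syscall ABI size is stable. */
	BUILD_BUG_ON(sizeof(struct tail_before) != sizeof(struct tail_after));
}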
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.116035832@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2ddf5e3c551..b1f2bac09f9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -154,11 +154,11 @@ struct perf_counter_hw_event { __reserved_1 : 51; - __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ + __u32 __reserved_2; - __u64 __reserved_2; __u64 __reserved_3; + __u64 __reserved_4; }; /* -- cgit v1.2.3-70-g09d2 From 6ab423e0eaca827fbd201ca4ae7d4f8573a366b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:27 +0200 Subject: perf_counter: Propagate inheritance failures down the fork() path Fail fork() when we fail inheritance for some reason (-ENOMEM most likely). Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.324656474@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- kernel/fork.c | 6 +++++- kernel/perf_counter.c | 20 ++++++++++++-------- 3 files changed, 19 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b1f2bac09f9..d3e85de9bf1 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -566,7 +566,7 @@ extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern void perf_counter_init_task(struct task_struct *child); +extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); @@ -631,7 +631,7 @@ perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline void perf_counter_init_task(struct task_struct *child) { } +static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } diff --git a/kernel/fork.c b/kernel/fork.c index 675e01e9072..c07c3335cea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1095,7 +1095,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. 
*/ sched_fork(p, clone_flags); - perf_counter_init_task(p); + + retval = perf_counter_init_task(p); + if (retval) + goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; @@ -1295,6 +1298,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: + perf_counter_exit_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 217dbcce2eb..7a7a144870e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3434,18 +3434,23 @@ again: /* * Initialize the perf_counter context in task_struct */ -void perf_counter_init_task(struct task_struct *child) +int perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; struct perf_counter *counter; struct task_struct *parent = current; int inherited_all = 1; + int ret = 0; child->perf_counter_ctxp = NULL; mutex_init(&child->perf_counter_mutex); INIT_LIST_HEAD(&child->perf_counter_list); + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) + return 0; + /* * This is executed from the parent task context, so inherit * counters that have been marked for cloning. @@ -3454,11 +3459,7 @@ void perf_counter_init_task(struct task_struct *child) child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); if (!child_ctx) - return; - - parent_ctx = parent->perf_counter_ctxp; - if (likely(!parent_ctx || !parent_ctx->nr_counters)) - return; + return -ENOMEM; __perf_counter_init_context(child_ctx, child); child->perf_counter_ctxp = child_ctx; @@ -3482,8 +3483,9 @@ void perf_counter_init_task(struct task_struct *child) continue; } - if (inherit_group(counter, parent, - parent_ctx, child, child_ctx)) { + ret = inherit_group(counter, parent, parent_ctx, + child, child_ctx); + if (ret) { inherited_all = 0; break; } @@ -3505,6 +3507,8 @@ void perf_counter_init_task(struct task_struct *child) } mutex_unlock(&parent_ctx->mutex); + + return ret; } static void __cpuinit perf_counter_init_cpu(int cpu) -- cgit v1.2.3-70-g09d2 From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:04 +0200 Subject: perf_counter: x86: Remove interrupt throttle remove the x86 specific interrupt throttle Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.616671838@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/cpu/perf_counter.c | 47 ++++---------------------------------- include/linux/perf_counter.h | 2 -- 3 files changed, 5 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b4f64402a82..89b63b5fad3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); - - perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c14437faf5d..8c8177f859f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -718,11 +718,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } -/* - * Maximum interrupt frequency of 100KHz per CPU - */ -#define PERFMON_MAX_INTERRUPTS 
(100000/HZ) - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -775,15 +770,14 @@ again: if (status) goto again; - if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) - perf_enable(); + perf_enable(); return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu, idx, throttle = 0, handled = 0; + int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; @@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { - throttle = 1; - __perf_disable(); - cpuc->enabled = 0; - barrier(); - } - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int disable = 0; - if (!test_bit(idx, cpuc->active_mask)) continue; @@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) hwc = &counter->hw; if (counter->hw_event.nmi != nmi) - goto next; + continue; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - goto next; + continue; /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - disable = perf_counter_overflow(counter, nmi, regs, 0); - -next: - if (disable || throttle) + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void perf_counter_unthrottle(void) -{ - struct cpu_hw_counters *cpuc; - - if (!x86_pmu_initialized()) - return; - - cpuc = &__get_cpu_var(cpu_hw_counters); - if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - /* - * Clear them before re-enabling irqs/NMIs again: - */ - cpuc->interrupts = 0; - perf_enable(); - } else { - cpuc->interrupts = 0; - } -} - void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d3e85de9bf1..0c160be2078 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -570,7 +570,6 @@ extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); -extern void perf_counter_unthrottle(void); extern void __perf_disable(void); extern bool __perf_enable(void); extern void perf_disable(void); @@ -635,7 +634,6 @@ static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -static inline void perf_counter_unthrottle(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } -- cgit v1.2.3-70-g09d2 From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:05 +0200 Subject: perf_counter: Generic per counter interrupt throttle Introduce a generic per counter interrupt throttle. This uses the perf_counter_overflow() quick disable to throttle a specific counter when it is going too fast, provided that a pmu->unthrottle() method is implemented which can undo the quick disable. Power needs to implement both the quick disable and the unthrottle method.
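In outline, the throttle behaves as in the sketch below, condensed from the perf_counter_overflow() and perf_adjust_freq() hunks of this patch; the two helper names are made up for illustration, and locking as well as the fallback for pmus without an unthrottle() method are omitted:

/* Interrupt side: count interrupts and, past the per-second limit,
 * mark the counter throttled and request the quick disable. */
static int throttle_on_overflow(struct perf_counter *counter)
{
	if (counter->hw.interrupts == MAX_INTERRUPTS)
		return 1;			/* already throttled */

	counter->hw.interrupts++;
	if (HZ * counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
		counter->hw.interrupts = MAX_INTERRUPTS;
		perf_log_throttle(counter, 0);	/* log throttle-on */
		return 1;			/* quick disable the counter */
	}
	return 0;
}

/* Tick side: re-enable a throttled counter via pmu->unthrottle() and
 * restart interrupt accounting just below the limit. */
static void unthrottle_on_tick(struct perf_counter *counter)
{
	if (counter->hw.interrupts == MAX_INTERRUPTS) {
		perf_log_throttle(counter, 1);	/* log throttle-off */
		counter->pmu->unthrottle(counter);
		counter->hw.interrupts = 2 * sysctl_perf_counter_limit / HZ;
	}
}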
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 13 +++++++++ include/linux/perf_counter.h | 11 +++++++ kernel/perf_counter.c | 59 +++++++++++++++++++++++++++++++++++--- kernel/sysctl.c | 8 ++++++ 4 files changed, 87 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8c8177f859f..c4b543d1a86 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -623,6 +623,18 @@ try_generic: return 0; } +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->counters[hwc->idx] != counter)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; @@ -1038,6 +1050,7 @@ static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, }; const struct pmu *hw_perf_counter_init(struct perf_counter *counter) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0c160be2078..e3a7585d3e4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -266,6 +266,15 @@ enum perf_event_type { */ PERF_EVENT_PERIOD = 4, + /* + * struct { + * struct perf_event_header header; + * u64 time; + * }; + */ + PERF_EVENT_THROTTLE = 5, + PERF_EVENT_UNTHROTTLE = 6, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -367,6 +376,7 @@ struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); + void (*unthrottle) (struct perf_counter *counter); }; /** @@ -613,6 +623,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; extern int sysctl_perf_counter_mlock; +extern int sysctl_perf_counter_limit; extern void perf_counter_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 14b1fe98483..ec9c4007a7f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -46,6 +46,7 @@ static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ +int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1066,12 +1067,15 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 irq_period; + u64 interrupts, irq_period; u64 events, period; s64 delta; @@ -1080,10 +1084,19 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; + interrupts = counter->hw.interrupts; + 
counter->hw.interrupts = 0; + + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(counter, 1); + counter->pmu->unthrottle(counter); + interrupts = 2*sysctl_perf_counter_limit/HZ; + } + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) continue; - events = HZ * counter->hw.interrupts * counter->hw.irq_period; + events = HZ * interrupts * counter->hw.irq_period; period = div64_u64(events, counter->hw_event.irq_freq); delta = (s64)(1 + period - counter->hw.irq_period); @@ -1097,7 +1110,6 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) perf_log_period(counter, irq_period); counter->hw.irq_period = irq_period; - counter->hw.interrupts = 0; } spin_unlock(&ctx->lock); } @@ -2543,6 +2555,35 @@ static void perf_log_period(struct perf_counter *counter, u64 period) perf_output_end(&handle); } +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_counter *counter, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + } throttle_event = { + .header = { + .type = PERF_EVENT_THROTTLE + 1, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = sched_clock(), + }; + + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + /* * Generic counter overflow handling. */ @@ -2551,9 +2592,19 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); + int throttle = counter->pmu->unthrottle != NULL; int ret = 0; - counter->hw.interrupts++; + if (!throttle) { + counter->hw.interrupts++; + } else if (counter->hw.interrupts != MAX_INTERRUPTS) { + counter->hw.interrupts++; + if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { + counter->hw.interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } /* * XXX event_limit might not quite work as expected on inherited diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3cb1849f598..0c4bf863afa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -930,6 +930,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_int_limit", + .data = &sysctl_perf_counter_limit, + .maxlen = sizeof(sysctl_perf_counter_limit), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3-70-g09d2 From 0127c3ea082ee9f1034789b978dfc7fd83254617 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 22:03:26 +0200 Subject: perf_counter: fix warning & lockup - remove bogus warning - fix wakeup from NMI path lockup - also fix up whitespace noise in perf_counter.h Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 78 ++++++++++++++++++++++---------------------- kernel/perf_counter.c | 4 +-- 2 files changed, 40 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e3a7585d3e4..2b16ed37b74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,7 +73,7 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -#define __PERF_COUNTER_MASK(name) \ +#define 
__PERF_COUNTER_MASK(name) \ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ PERF_COUNTER_##name##_SHIFT) @@ -98,14 +98,14 @@ enum sw_event_ids { * in the overflow packets. */ enum perf_counter_record_format { - PERF_RECORD_IP = 1U << 0, - PERF_RECORD_TID = 1U << 1, - PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_ADDR = 1U << 3, - PERF_RECORD_GROUP = 1U << 4, - PERF_RECORD_CALLCHAIN = 1U << 5, - PERF_RECORD_CONFIG = 1U << 6, - PERF_RECORD_CPU = 1U << 7, + PERF_RECORD_IP = 1U << 0, + PERF_RECORD_TID = 1U << 1, + PERF_RECORD_TIME = 1U << 2, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, + PERF_RECORD_CONFIG = 1U << 6, + PERF_RECORD_CPU = 1U << 7, }; /* @@ -235,13 +235,13 @@ enum perf_event_type { * correlate userspace IPs to code. They have the following structure: * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u32 pid, tid; - * u64 addr; - * u64 len; - * u64 pgoff; - * char filename[]; + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; * }; */ PERF_EVENT_MMAP = 1, @@ -249,27 +249,27 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u32 pid, tid; - * char comm[]; + * u32 pid, tid; + * char comm[]; * }; */ PERF_EVENT_COMM = 3, /* * struct { - * struct perf_event_header header; - * u64 time; - * u64 irq_period; + * struct perf_event_header header; + * u64 time; + * u64 irq_period; * }; */ PERF_EVENT_PERIOD = 4, /* * struct { - * struct perf_event_header header; - * u64 time; + * struct perf_event_header header; + * u64 time; * }; */ PERF_EVENT_THROTTLE = 5, @@ -280,23 +280,23 @@ enum perf_event_type { * will be PERF_RECORD_* * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * { u64 ip; } && PERF_RECORD_IP - * { u32 pid, tid; } && PERF_RECORD_TID - * { u64 time; } && PERF_RECORD_TIME - * { u64 addr; } && PERF_RECORD_ADDR - * { u64 config; } && PERF_RECORD_CONFIG - * { u32 cpu, res; } && PERF_RECORD_CPU + * { u64 ip; } && PERF_RECORD_IP + * { u32 pid, tid; } && PERF_RECORD_TID + * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR + * { u64 config; } && PERF_RECORD_CONFIG + * { u32 cpu, res; } && PERF_RECORD_CPU * - * { u64 nr; - * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP + * { u64 nr; + * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP * - * { u16 nr, - * hv, - * kernel, - * user; - * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN + * { u16 nr, + * hv, + * kernel, + * user; + * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN * }; */ }; @@ -406,7 +406,7 @@ struct perf_mmap_data { atomic_t wakeup; /* needs a wakeup */ struct perf_counter_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[0]; }; struct perf_pending_entry { @@ -422,7 +422,7 @@ struct perf_counter { struct list_head list_entry; struct list_head event_entry; struct list_head sibling_list; - int nr_siblings; + int nr_siblings; struct perf_counter *group_leader; const struct pmu *pmu; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ec9c4007a7f..070f92d3232 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2576,7 +2576,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) .time = sched_clock(), }; - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0); + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); if (ret) return; @@ -3449,8 +3449,6 @@ void perf_counter_exit_task(struct 
task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - WARN_ON_ONCE(child != current); - child_ctx = child->perf_counter_ctxp; if (likely(!child_ctx)) -- cgit v1.2.3-70-g09d2 From 08baf561083bc27a953aa087dd8a664bb2b88e8e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2009 22:58:01 -0700 Subject: net: txq_trans_update() helper We would like to get rid of netdev->trans_start = jiffies; which nearly all net drivers have to set in their start_xmit() function, and use txq->trans_start instead. This can be done generically in core network, as suggested by David. Some devices (particularly loopback) don't need a trans_start update, because they don't have a transmit watchdog. We could add a new device flag, or rely on the fact that txq->trans_start can be updated if txq->xmit_lock_owner is different from -1. Use a helper function to hide our choice. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ net/core/dev.c | 3 +++ net/core/netpoll.c | 5 ++++- net/core/pktgen.c | 1 + net/sched/sch_teql.c | 1 + 5 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ae3c2099a04..586b71f0358 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1674,6 +1674,12 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) spin_unlock_bh(&txq->_xmit_lock); } +static inline void txq_trans_update(struct netdev_queue *txq) +{ + if (txq->xmit_lock_owner != -1) + txq->trans_start = jiffies; +} + /** * netif_tx_lock - grab network device transmit lock * @dev: network device diff --git a/net/core/dev.c b/net/core/dev.c index 241613f6dd2..5eb3e48ab31 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1698,6 +1698,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->dst = NULL; } rc = ops->ndo_start_xmit(skb, dev); + if (rc == 0) + txq_trans_update(txq); /* * TODO: if skb_orphan() was called by * dev->hard_start_xmit() (for example, the unmodified @@ -1727,6 +1729,7 @@ gso: skb->next = nskb; return rc; } + txq_trans_update(txq); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 67b4f3e3d4a..7ab31a7576a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -302,8 +302,11 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (__netif_tx_trylock(txq)) { - if (!netif_tx_queue_stopped(txq)) + if (!netif_tx_queue_stopped(txq)) { status = ops->ndo_start_xmit(skb, dev); + if (status == NETDEV_TX_OK) + txq_trans_update(txq); + } __netif_tx_unlock(txq); if (status == NETDEV_TX_OK) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0666a827bc6..b8ccd3c88d6 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3438,6 +3438,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) retry_now: ret = (*xmit)(pkt_dev->skb, odev); if (likely(ret == NETDEV_TX_OK)) { + txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 428a5ef5b94..a886496bdc3 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -308,6 +308,7 @@ restart: if (!netif_tx_queue_stopped(slave_txq) && !netif_tx_queue_frozen(slave_txq) && slave_ops->ndo_start_xmit(skb, slave) == 0) { + txq_trans_update(slave_txq);
__netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); -- cgit v1.2.3-70-g09d2 From b90cf6681f4f6263920616e7ca2fd09130e4143a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 5 Apr 2009 08:23:44 -0700 Subject: [MTD] Remove option for add_mtd_partitions() to not register partitions. This breaks the dilnetpc map driver, but it could be fixed not to use that option. We want to simplify the partition handling, and this is a step towards that. Remove superfluous 'index' field from private struct mtd_part too, while we're at it. Signed-off-by: David Woodhouse --- drivers/mtd/mtdpart.c | 18 ++++-------------- include/linux/mtd/partitions.h | 1 - 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 63d1cd2c17b..349fcbe5cc0 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -27,9 +27,7 @@ struct mtd_part { struct mtd_info mtd; struct mtd_info *master; uint64_t offset; - int index; struct list_head list; - int registered; }; /* @@ -321,8 +319,7 @@ int del_mtd_partitions(struct mtd_info *master) list_for_each_entry_safe(slave, next, &mtd_partitions, list) if (slave->master == master) { list_del(&slave->list); - if (slave->registered) - del_mtd_device(&slave->mtd); + del_mtd_device(&slave->mtd); kfree(slave); } @@ -412,7 +409,6 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->mtd.erase = part_erase; slave->master = master; slave->offset = part->offset; - slave->index = partno; if (slave->offset == MTDPART_OFS_APPEND) slave->offset = cur_offset; @@ -500,15 +496,9 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, } out_register: - if (part->mtdp) { - /* store the object pointer (caller may or may not register it*/ - *part->mtdp = &slave->mtd; - slave->registered = 0; - } else { - /* register our partition */ - add_mtd_device(&slave->mtd); - slave->registered = 1; - } + /* register our partition */ + add_mtd_device(&slave->mtd); + return slave; } diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index 7535a74083b..af6dcb992bc 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -40,7 +40,6 @@ struct mtd_partition { uint64_t offset; /* offset within the master MTD space */ uint32_t mask_flags; /* master MTD flags to mask out for this partition */ struct nand_ecclayout *ecclayout; /* out of band layout for this partition (NAND only)*/ - struct mtd_info **mtdp; /* pointer to store the MTD object */ }; #define MTDPART_OFS_NXTBLK (-2) -- cgit v1.2.3-70-g09d2 From be74b73a57645cc253d881ab0c1014eb64b9cf22 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 26 May 2009 20:25:22 +0200 Subject: tracing: add __print_flags for events Developers have been asking for the ability in the ftrace event tracer to display names of bits in a flags variable. Instead of printing out c2, it would be easier to read FOO|BAR|GOO, assuming that FOO is bit 1, BAR is bit 6 and GOO is bit 7. Some examples where this would be useful are the state flags in a context switch, kmalloc flags, and even permission flags when accessing files. [ v2 changes include: Frederic Weisbecker's idea of using a mask instead of bits, thus we can output GFP_KERNEL instead of GFP_WAIT|GFP_IO|GFP_FS. Li Zefan's idea of allowing the caller of __print_flags to add their own delimiter (or no delimiter) so that for file permissions we can get rwx instead of r|w|x.
] [ v3 changes: Christoph Hellwig's idea of using an array instead of va_args. ] [ Impact: better displaying of flags in trace output ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 13 ++++++++++++- include/trace/ftrace.h | 14 ++++++++++++++ kernel/trace/trace_output.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bae51ddfabd..4b58cf1a11c 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -3,12 +3,23 @@ #include #include - +#include struct trace_array; struct tracer; struct dentry; +DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq); + +struct trace_print_flags { + unsigned long mask; + const char *name; +}; + +const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array); + /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b5ff2e8229e..22c94719c56 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -87,6 +87,7 @@ * struct trace_seq *s = &iter->seq; * struct ftrace_raw_ *field; <-- defined in stage 1 * struct trace_entry *entry; + * struct trace_seq *p; * int ret; * * entry = iter->ent; @@ -98,7 +99,9 @@ * * field = (typeof(field))entry; * + * p = get_cpu_var(ftrace_event_seq); * ret = trace_seq_printf(s, "\n"); + * put_cpu(); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -119,6 +122,14 @@ #undef __get_str #define __get_str(field) ((char *)__entry + __entry->__str_loc_##field) +#undef __print_flags +#define __print_flags(flag, delim, flag_array...) 
\ + ({ \ + static const struct trace_print_flags flags[] = \ + { flag_array, { -1, NULL }}; \ + ftrace_print_flags_seq(p, delim, flag, flags); \ + }) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ @@ -127,6 +138,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##call *field; \ struct trace_entry *entry; \ + struct trace_seq *p; \ int ret; \ \ entry = iter->ent; \ @@ -138,7 +150,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ \ field = (typeof(field))entry; \ \ + p = &get_cpu_var(ftrace_event_seq); \ ret = trace_seq_printf(s, #call ": " print); \ + put_cpu(); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7136420603a..a4840c260c8 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -15,6 +15,9 @@ #define EVENT_HASHSIZE 128 static DECLARE_RWSEM(trace_event_mutex); + +DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); + static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -212,6 +215,42 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 0; } +const char * +ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array) +{ + unsigned long mask; + const char *str; + int i; + + trace_seq_init(p); + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%lx", flags); + } + + trace_seq_putc(p, 0); + + return p->buffer; +} + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.3-70-g09d2 From 0f4fc29dd68dfab9c6ddd5d087d34a5b6818cb00 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 May 2009 19:21:47 -0400 Subject: tracing: add __print_symbolic to trace events This patch adds __print_symbolic which is similar to __print_flags but works for an enumeration type instead. That is, there is only a one-to-one mapping between the values and the symbols. When a match is made, it is printed; otherwise the hex value is output. [ Impact: add interface for showing symbol names in events ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 3 +++ include/trace/ftrace.h | 8 ++++++++ kernel/trace/trace_output.c | 25 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4b58cf1a11c..bbf40f624fc 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -20,6 +20,9 @@ const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); +const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array); + /* * The trace entry - the most basic unit of tracing.
This is what * is printed in the end as a single line in the trace output, such as: diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 22c94719c56..87fc227c6fb 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -130,6 +130,14 @@ ftrace_print_flags_seq(p, delim, flag, flags); \ }) +#undef __print_symbolic +#define __print_symbolic(value, symbol_array...) \ + ({ \ + static const struct trace_print_flags symbols[] = \ + { symbol_array, { -1, NULL }}; \ + ftrace_print_symbols_seq(p, value, symbols); \ + }) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a4840c260c8..c12d95db2f5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -251,6 +251,31 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, return p->buffer; } +const char * +ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) +{ + int i; + + trace_seq_init(p); + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (!p->len) + trace_seq_printf(p, "0x%lx", val); + + trace_seq_putc(p, 0); + + return p->buffer; +} + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.3-70-g09d2 From 8e401eccd3a62fb57f117bb09b7c1fc70ab19e8c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 26 May 2009 21:16:25 -0700 Subject: phy: Eliminate references to BUS_ID_SIZE. Just use the constant 20 to keep things working. If someone is so motivated, this can be converted over to dynamic strings. I tried and it's a lot of work. But for now this is good enough. Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index c216e4e503b..b1368b8f657 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -79,7 +79,7 @@ typedef enum { * Need to be a little smaller than phydev->dev.bus_id to leave room * for the ":%02x" */ -#define MII_BUS_ID_SIZE (BUS_ID_SIZE - 3) +#define MII_BUS_ID_SIZE (20 - 3) /* * The Bus class for PHYs. Devices which provide access to @@ -407,7 +407,7 @@ struct phy_driver { /* A Structure for boards to register fixups with the PHY Lib */ struct phy_fixup { struct list_head list; - char bus_id[BUS_ID_SIZE]; + char bus_id[20]; u32 phy_uid; u32 phy_uid_mask; int (*run)(struct phy_device *phydev); -- cgit v1.2.3-70-g09d2 From 78a478d0efd9e86e5345b436e130497b4e5846e8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:21 +0000 Subject: gro: Inline skb_gro_header and cache frag0 virtual address The function skb_gro_header is called four times per packet which quickly adds up at 10Gb/s. This patch inlines it to allow better optimisations. Some architectures perform multiplication for page_address, which is done by each skb_gro_header invocation. This patch caches that value in skb->cb to avoid the unnecessary multiplications. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 22 +++++++++++++++------- net/core/dev.c | 27 ++++++++++++--------------- 2 files changed, 27 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 586b71f0358..61890ed0bcf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1008,6 +1008,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, void netif_napi_del(struct napi_struct *napi); struct napi_gro_cb { + /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ + void *frag0; + /* This indicates where we are processing relative to skb->data. */ int data_offset; @@ -1107,9 +1110,9 @@ extern int dev_restart(struct net_device *dev); #ifdef CONFIG_NETPOLL_TRAP extern int netpoll_trap(void); #endif -extern void *skb_gro_header(struct sk_buff *skb, unsigned int hlen); extern int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb); +extern void skb_gro_reset_offset(struct sk_buff *skb); static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { @@ -1126,23 +1129,28 @@ static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) NAPI_GRO_CB(skb)->data_offset += len; } -static inline void skb_gro_reset_offset(struct sk_buff *skb) +static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) { - NAPI_GRO_CB(skb)->data_offset = 0; + unsigned int offset = skb_gro_offset(skb); + + hlen += offset; + if (!NAPI_GRO_CB(skb)->frag0 || + unlikely(skb_shinfo(skb)->frags[0].size + skb_headlen(skb) < hlen)) + return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; + + return NAPI_GRO_CB(skb)->frag0 + offset; } static inline void *skb_gro_mac_header(struct sk_buff *skb) { return skb_headlen(skb) ? skb_mac_header(skb) : - page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset; + NAPI_GRO_CB(skb)->frag0; } static inline void *skb_gro_network_header(struct sk_buff *skb) { return skb_headlen(skb) ? skb_network_header(skb) : - page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + skb_network_offset(skb); + NAPI_GRO_CB(skb)->frag0 + skb_network_offset(skb); } static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 5eb3e48ab31..bdb1a738193 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2390,21 +2390,6 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); -void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) -{ - unsigned int offset = skb_gro_offset(skb); - - hlen += offset; - if (unlikely(skb_headlen(skb) || - skb_shinfo(skb)->frags[0].size < hlen || - PageHighMem(skb_shinfo(skb)->frags[0].page))) - return pskb_may_pull(skb, hlen) ? 
skb->data + offset : NULL; - - return page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + offset; -} -EXPORT_SYMBOL(skb_gro_header); - int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; @@ -2520,6 +2505,18 @@ int napi_skb_finish(int ret, struct sk_buff *skb) } EXPORT_SYMBOL(napi_skb_finish); +void skb_gro_reset_offset(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + + if (!skb_headlen(skb) && !PageHighMem(skb_shinfo(skb)->frags[0].page)) + NAPI_GRO_CB(skb)->frag0 = + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; +} +EXPORT_SYMBOL(skb_gro_reset_offset); + int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); -- cgit v1.2.3-70-g09d2 From 78d3fd0b7de844a6dad56e9620fc9d2271b32ab9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:23 +0000 Subject: gro: Only use skb_gro_header for completely non-linear packets Currently skb_gro_header is used for packets which put the hardware header in skb->data with the rest in frags. Since the drivers that need this optimisation all provide completely non-linear packets, we can gain extra optimisations by only performing the frag0 optimisation for completely non-linear packets. In particular, we can simply test frag0 (instead of skb_headlen) to see whether the optimisation is in force. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 ++++++----- net/core/dev.c | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 61890ed0bcf..d1afc3b6485 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1135,22 +1135,23 @@ static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) hlen += offset; if (!NAPI_GRO_CB(skb)->frag0 || - unlikely(skb_shinfo(skb)->frags[0].size + skb_headlen(skb) < hlen)) + unlikely(skb_shinfo(skb)->frags[0].size < hlen)) { + NAPI_GRO_CB(skb)->frag0 = NULL; return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; + } return NAPI_GRO_CB(skb)->frag0 + offset; } static inline void *skb_gro_mac_header(struct sk_buff *skb) { - return skb_headlen(skb) ? skb_mac_header(skb) : - NAPI_GRO_CB(skb)->frag0; + return NAPI_GRO_CB(skb)->frag0 ?: skb_mac_header(skb); } static inline void *skb_gro_network_header(struct sk_buff *skb) { - return skb_headlen(skb) ? 
skb_network_header(skb) : - NAPI_GRO_CB(skb)->frag0 + skb_network_offset(skb); + return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) + + skb_network_offset(skb); } static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index bdb1a738193..f9d90c56b6f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2510,7 +2510,8 @@ void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; - if (!skb_headlen(skb) && !PageHighMem(skb_shinfo(skb)->frags[0].page)) + if (skb->mac_header == skb->tail && + !PageHighMem(skb_shinfo(skb)->frags[0].page)) NAPI_GRO_CB(skb)->frag0 = page_address(skb_shinfo(skb)->frags[0].page) + skb_shinfo(skb)->frags[0].page_offset; -- cgit v1.2.3-70-g09d2 From 7489594cb249aeb178287c9a43a9e4f366044259 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:27 +0000 Subject: gro: Optimise length comparison in skb_gro_header By caching frag0_len, we can avoid checking both frag0 and the length separately in skb_gro_header. This helps as skb_gro_header is called four times per packet which amounts to a few million times at 10Gb/s. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++-- net/core/dev.c | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d1afc3b6485..2e44a049be0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1011,6 +1011,9 @@ struct napi_gro_cb { /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ void *frag0; + /* Length of frag0. */ + unsigned int frag0_len; + /* This indicates where we are processing relative to skb->data. */ int data_offset; @@ -1134,9 +1137,9 @@ static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) unsigned int offset = skb_gro_offset(skb); hlen += offset; - if (!NAPI_GRO_CB(skb)->frag0 || - unlikely(skb_shinfo(skb)->frags[0].size < hlen)) { + if (NAPI_GRO_CB(skb)->frag0_len < hlen) { NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; } diff --git a/net/core/dev.c b/net/core/dev.c index f9d90c56b6f..b1722a2d1fb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2509,12 +2509,15 @@ void skb_gro_reset_offset(struct sk_buff *skb) { NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; if (skb->mac_header == skb->tail && - !PageHighMem(skb_shinfo(skb)->frags[0].page)) + !PageHighMem(skb_shinfo(skb)->frags[0].page)) { NAPI_GRO_CB(skb)->frag0 = page_address(skb_shinfo(skb)->frags[0].page) + skb_shinfo(skb)->frags[0].page_offset; + NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; + } } EXPORT_SYMBOL(skb_gro_reset_offset); -- cgit v1.2.3-70-g09d2 From a5b1cf288d4200506ab62fbb86cc81ace948a306 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:28 +0000 Subject: gro: Avoid unnecessary comparison after skb_gro_header For the overwhelming majority of cases, skb_gro_header's return value cannot be NULL. Yet we must check it because of its current form. This patch splits it up into multiple functions in order to avoid this. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 23 ++++++++++++++--------- net/core/dev.c | 17 ++++++++++++----- net/ipv4/af_inet.c | 13 ++++++++++--- net/ipv4/tcp.c | 22 ++++++++++++++++------ net/ipv6/af_inet6.c | 13 ++++++++++--- 5 files changed, 62 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2e44a049be0..371ece521e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1132,18 +1132,23 @@ static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) NAPI_GRO_CB(skb)->data_offset += len; } -static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) +static inline void *skb_gro_header_fast(struct sk_buff *skb, + unsigned int offset) { - unsigned int offset = skb_gro_offset(skb); + return NAPI_GRO_CB(skb)->frag0 + offset; +} - hlen += offset; - if (NAPI_GRO_CB(skb)->frag0_len < hlen) { - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; - return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; - } +static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) +{ + return NAPI_GRO_CB(skb)->frag0_len < hlen; +} - return NAPI_GRO_CB(skb)->frag0 + offset; +static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, + unsigned int offset) +{ + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; } static inline void *skb_gro_mac_header(struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index b1722a2d1fb..cd29e613bc5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2590,17 +2590,24 @@ struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; struct ethhdr *eth; + unsigned int hlen; + unsigned int off; napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb); - eth = skb_gro_header(skb, sizeof(*eth)); - if (!eth) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; + off = skb_gro_offset(skb); + hlen = off + sizeof(*eth); + eth = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + eth = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!eth)) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } } skb_gro_pull(skb, sizeof(*eth)); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 170689681aa..644cc553531 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1246,13 +1246,20 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff **pp = NULL; struct sk_buff *p; struct iphdr *iph; + unsigned int hlen; + unsigned int off; int flush = 1; int proto; int id; - iph = skb_gro_header(skb, sizeof(*iph)); - if (unlikely(!iph)) - goto out; + off = skb_gro_offset(skb); + hlen = off + sizeof(*iph); + iph = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + iph = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!iph)) + goto out; + } proto = iph->protocol & (MAX_INET_PROTOS - 1); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 68342d43189..c3dcec5efea 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2518,20 +2518,30 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) unsigned int thlen; unsigned int flags; unsigned int mss = 1; + unsigned int hlen; + unsigned int off; int flush = 1; int i; - th = skb_gro_header(skb, sizeof(*th)); - if (unlikely(!th)) - goto out; + off = skb_gro_offset(skb); + hlen = off + sizeof(*th); + th = skb_gro_header_fast(skb, off); 
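+	/* th points into frag0 here (the fast path); the check below falls
+	 * back to the slow path, which may pull the header into the linear
+	 * area via pskb_may_pull(), whenever the full header is not
+	 * available in frag0.
+	 */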
+ if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + goto out; + } thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; - th = skb_gro_header(skb, thlen); - if (unlikely(!th)) - goto out; + hlen = off + thlen; + if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + goto out; + } skb_gro_pull(skb, thlen); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 61f55386a23..b6215be0963 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -817,13 +817,20 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; + unsigned int hlen; + unsigned int off; int flush = 1; int proto; __wsum csum; - iph = skb_gro_header(skb, sizeof(*iph)); - if (unlikely(!iph)) - goto out; + off = skb_gro_offset(skb); + hlen = off + sizeof(*iph); + iph = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + iph = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!iph)) + goto out; + } skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); -- cgit v1.2.3-70-g09d2 From cbf806dd9302f3ff27ba496dae474b9da6b58873 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 May 2009 06:22:58 -0700 Subject: Input: ucb1400 - move static function from header into core It is a little too large for a static inline. The ts driver is currently the only mainline user, but Marek Vasut claims that there is a battery driver in an ARM tree which also needs this function. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Dmitry Torokhov --- drivers/mfd/ucb1400_core.c | 20 ++++++++++++++++++++ include/linux/ucb1400.h | 23 ++++------------------- 2 files changed, 24 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/ucb1400_core.c b/drivers/mfd/ucb1400_core.c index 178159e264c..78c2135c5de 100644 --- a/drivers/mfd/ucb1400_core.c +++ b/drivers/mfd/ucb1400_core.c @@ -23,6 +23,26 @@ #include #include +unsigned int ucb1400_adc_read(struct snd_ac97 *ac97, u16 adc_channel, + int adcsync) +{ + unsigned int val; + + if (adcsync) + adc_channel |= UCB_ADC_SYNC_ENA; + + ucb1400_reg_write(ac97, UCB_ADC_CR, UCB_ADC_ENA | adc_channel); + ucb1400_reg_write(ac97, UCB_ADC_CR, UCB_ADC_ENA | adc_channel | + UCB_ADC_START); + + while (!((val = ucb1400_reg_read(ac97, UCB_ADC_DATA)) + & UCB_ADC_DAT_VALID)) + schedule_timeout_uninterruptible(1); + + return val & UCB_ADC_DAT_MASK; +} +EXPORT_SYMBOL_GPL(ucb1400_adc_read); + static int ucb1400_core_probe(struct device *dev) { int err; diff --git a/include/linux/ucb1400.h b/include/linux/ucb1400.h index 970473bf8d5..ed889f4168f 100644 --- a/include/linux/ucb1400.h +++ b/include/linux/ucb1400.h @@ -134,28 +134,13 @@ static inline void ucb1400_adc_enable(struct snd_ac97 *ac97) ucb1400_reg_write(ac97, UCB_ADC_CR, UCB_ADC_ENA); } -static unsigned int ucb1400_adc_read(struct snd_ac97 *ac97, u16 adc_channel, - int adcsync) -{ - unsigned int val; - - if (adcsync) - adc_channel |= UCB_ADC_SYNC_ENA; - - ucb1400_reg_write(ac97, UCB_ADC_CR, UCB_ADC_ENA | adc_channel); - ucb1400_reg_write(ac97, UCB_ADC_CR, UCB_ADC_ENA | adc_channel | - UCB_ADC_START); - - while (!((val = ucb1400_reg_read(ac97, UCB_ADC_DATA)) - & UCB_ADC_DAT_VALID)) - schedule_timeout_uninterruptible(1); - - return val & UCB_ADC_DAT_MASK; -} - static inline void ucb1400_adc_disable(struct snd_ac97 *ac97) { ucb1400_reg_write(ac97, UCB_ADC_CR, 0); } + +unsigned int ucb1400_adc_read(struct
snd_ac97 *ac97, u16 adc_channel, + int adcsync); + #endif -- cgit v1.2.3-70-g09d2 From a17c859849402315613a0015ac8fbf101acf0cc1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 27 May 2009 17:50:35 +0200 Subject: netfilter: conntrack: add support for DCCP handshake sequence to ctnetlink This patch adds CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ that exposes the u64 handshake sequence number to user-space. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink_conntrack.h | 1 + include/net/netlink.h | 9 +++++++++ net/netfilter/nf_conntrack_proto_dccp.c | 7 +++++++ 3 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 1a865e48b8e..ed4ef8d0b11 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -101,6 +101,7 @@ enum ctattr_protoinfo_dccp { CTA_PROTOINFO_DCCP_UNSPEC, CTA_PROTOINFO_DCCP_STATE, CTA_PROTOINFO_DCCP_ROLE, + CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, __CTA_PROTOINFO_DCCP_MAX, }; #define CTA_PROTOINFO_DCCP_MAX (__CTA_PROTOINFO_DCCP_MAX - 1) diff --git a/include/net/netlink.h b/include/net/netlink.h index eddb50289d6..007bdb07dab 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -939,6 +939,15 @@ static inline u64 nla_get_u64(const struct nlattr *nla) return tmp; } +/** + * nla_get_be64 - return payload of __be64 attribute + * @nla: __be64 netlink attribute + */ +static inline __be64 nla_get_be64(const struct nlattr *nla) +{ + return *(__be64 *) nla_data(nla); +} + /** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 8e757dd5339..11801c43c8c 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -635,6 +635,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE, ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]); + NLA_PUT_BE64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, + cpu_to_be64(ct->proto.dccp.handshake_seq)); nla_nest_end(skb, nest_parms); read_unlock_bh(&dccp_lock); return 0; @@ -647,6 +649,7 @@ nla_put_failure: static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, }; static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) @@ -679,6 +682,10 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; } + if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { + ct->proto.dccp.handshake_seq = + be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); + } write_unlock_bh(&dccp_lock); return 0; } -- cgit v1.2.3-70-g09d2 From fca4217c5bab31019b5247e977673c9fcc385f6b Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Wed, 1 Apr 2009 07:28:13 +1100 Subject: knfsd: reply cache cleanups Make REQHASH() an inline function. Rename hash_list to cache_hash. Fix an obsolete comment. Signed-off-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfscache.c | 29 +++++++++++++++++++---------- include/linux/nfsd/cache.h | 3 +-- 2 files changed, 20 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 5bfc2ac60d5..6f0aa4989c6 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -29,15 +29,24 @@ */ #define CACHESIZE 1024 #define HASHSIZE 64 -#define REQHASH(xid) (((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1)) -static struct hlist_head * hash_list; +static struct hlist_head * cache_hash; static struct list_head lru_head; static int cache_disabled = 1; +/* + * Calculate the hash index from an XID. + */ +static inline u32 request_hash(u32 xid) +{ + u32 h = xid; + h ^= (xid >> 24); + return h & (HASHSIZE-1); +} + static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); -/* +/* * locking for the reply cache: * A cache entry is "single use" if c_state == RC_INPROG * Otherwise, it when accessing _prev or _next, the lock must be held. @@ -62,8 +71,8 @@ int nfsd_reply_cache_init(void) i--; } - hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); - if (!hash_list) + cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); + if (!cache_hash) goto out_nomem; cache_disabled = 0; @@ -88,8 +97,8 @@ void nfsd_reply_cache_shutdown(void) cache_disabled = 1; - kfree (hash_list); - hash_list = NULL; + kfree (cache_hash); + cache_hash = NULL; } /* @@ -108,7 +117,7 @@ static void hash_refile(struct svc_cacherep *rp) { hlist_del_init(&rp->c_hash); - hlist_add_head(&rp->c_hash, hash_list + REQHASH(rp->c_xid)); + hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid)); } /* @@ -138,7 +147,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type) spin_lock(&cache_lock); rtn = RC_DOIT; - rh = &hash_list[REQHASH(xid)]; + rh = &cache_hash[request_hash(xid)]; hlist_for_each_entry(rp, hn, rh, c_hash) { if (rp->c_state != RC_UNUSED && xid == rp->c_xid && proc == rp->c_proc && @@ -264,7 +273,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); len >>= 2; - + /* Don't cache excessive amounts of data and XDR failures */ if (!statp || len > (256 >> 2)) { rp->c_state = RC_UNUSED; diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h index 5bccaab8105..3a3f58934f5 100644 --- a/include/linux/nfsd/cache.h +++ b/include/linux/nfsd/cache.h @@ -14,8 +14,7 @@ #include /* - * Representation of a reply cache entry. The first two members *must* - * be hash_next and hash_prev. + * Representation of a reply cache entry. */ struct svc_cacherep { struct hlist_node c_hash; -- cgit v1.2.3-70-g09d2 From ab6bf42e2339580b5d87746d0ff4da4b1578b03e Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 27 May 2009 14:38:34 -0700 Subject: mlx4_core: Add module parameter for number of MTTs per segment The current MTT allocator uses kmalloc() to allocate a buffer for its buddy allocator, and thus is limited in the amount of MTT segments that it can control. As a result, the size of memory that can be registered is limited too. This patch uses a module parameter to control the number of MTT entries that each segment represents, allowing more memory to be registered with the same number of segments. 
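The pattern here is the standard log2-valued module parameter with load-time validation. As a rough, self-contained sketch of that pattern (the parameter name mirrors the patch; the module and function names are illustrative, not the driver's own):

#include <linux/module.h>

/* Default of 2^3 = 8 entries mirrors MLX4_MTT_ENTRY_PER_SEG (8 in the driver). */
static int log_mtts_per_seg = 3;
module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)");

static int __init example_verify_params(void)
{
	/* Reject out-of-range values at load time instead of failing later. */
	if (log_mtts_per_seg < 1 || log_mtts_per_seg > 5) {
		printk(KERN_WARNING "example: bad log_mtts_per_seg: %d\n",
		       log_mtts_per_seg);
		return -1;
	}
	return 0;
}

Once validated, the value is expanded exactly once (dev->caps.mtts_per_seg = 1 << log_mtts_per_seg in the diff below), and every former use of the compile-time constant reads the capability field instead.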
Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/net/mlx4/main.c | 14 ++++++++++++-- drivers/net/mlx4/mr.c | 6 +++--- drivers/net/mlx4/profile.c | 2 +- include/linux/mlx4/device.h | 1 + 4 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 30bea968969..018348c0119 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -100,6 +100,10 @@ module_param_named(use_prio, use_prio, bool, 0444); MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports " "(0/1, default 0)"); +static int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)"); + int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) { @@ -203,12 +207,13 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.max_cqes = dev_cap->max_cq_sz - 1; dev->caps.reserved_cqs = dev_cap->reserved_cqs; dev->caps.reserved_eqs = dev_cap->reserved_eqs; + dev->caps.mtts_per_seg = 1 << log_mtts_per_seg; dev->caps.reserved_mtts = DIV_ROUND_UP(dev_cap->reserved_mtts, - MLX4_MTT_ENTRY_PER_SEG); + dev->caps.mtts_per_seg); dev->caps.reserved_mrws = dev_cap->reserved_mrws; dev->caps.reserved_uars = dev_cap->reserved_uars; dev->caps.reserved_pds = dev_cap->reserved_pds; - dev->caps.mtt_entry_sz = MLX4_MTT_ENTRY_PER_SEG * dev_cap->mtt_entry_sz; + dev->caps.mtt_entry_sz = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz; dev->caps.max_msg_sz = dev_cap->max_msg_sz; dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1); dev->caps.flags = dev_cap->flags; @@ -1304,6 +1309,11 @@ static int __init mlx4_verify_params(void) return -1; } + if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) { + printk(KERN_WARNING "mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); + return -1; + } + return 0; } diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 0caf74cae8b..3b8973d1993 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -209,7 +209,7 @@ int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, } else mtt->page_shift = page_shift; - for (mtt->order = 0, i = MLX4_MTT_ENTRY_PER_SEG; i < npages; i <<= 1) + for (mtt->order = 0, i = dev->caps.mtts_per_seg; i < npages; i <<= 1) ++mtt->order; mtt->first_seg = mlx4_alloc_mtt_range(dev, mtt->order); @@ -350,7 +350,7 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | MLX4_MPT_PD_FLAG_RAE); mpt_entry->mtt_sz = cpu_to_be32((1 << mr->mtt.order) * - MLX4_MTT_ENTRY_PER_SEG); + dev->caps.mtts_per_seg); } else { mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); } @@ -391,7 +391,7 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, (start_index + npages - 1) / (PAGE_SIZE / sizeof (u64))) return -EINVAL; - if (start_index & (MLX4_MTT_ENTRY_PER_SEG - 1)) + if (start_index & (dev->caps.mtts_per_seg - 1)) return -EINVAL; mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->first_seg + diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c index cebdf3243ca..bd22df95adf 100644 --- a/drivers/net/mlx4/profile.c +++ b/drivers/net/mlx4/profile.c @@ -98,7 +98,7 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_EQ].size = dev_cap->eqc_entry_sz; profile[MLX4_RES_DMPT].size = dev_cap->dmpt_entry_sz; 
profile[MLX4_RES_CMPT].size = dev_cap->cmpt_entry_sz; - profile[MLX4_RES_MTT].size = MLX4_MTT_ENTRY_PER_SEG * dev_cap->mtt_entry_sz; + profile[MLX4_RES_MTT].size = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz; profile[MLX4_RES_MCG].size = MLX4_MGM_ENTRY_SIZE; profile[MLX4_RES_QP].num = request->num_qp; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 3aff8a6a389..ce7cc6c7bcb 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -210,6 +210,7 @@ struct mlx4_caps { int num_comp_vectors; int num_mpts; int num_mtt_segs; + int mtts_per_seg; int fmr_reserved_mtts; int reserved_mtts; int reserved_mrws; -- cgit v1.2.3-70-g09d2 From 1ce8e7b57b3a4527ef83da1c5c7bd8a6b9d87b56 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2009 04:42:37 +0000 Subject: net: ALIGN/PTR_ALIGN cleanup in alloc_netdev_mq()/netdev_priv() Use ALIGN() and PTR_ALIGN() macros instead of handcoding them. Get rid of NETDEV_ALIGN_CONST ugly define Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +---- net/core/dev.c | 9 ++++----- net/mac80211/main.c | 8 ++------ 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 371ece521e5..14efce33c00 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -905,7 +905,6 @@ struct net_device #define to_net_dev(d) container_of(d, struct net_device, dev) #define NETDEV_ALIGN 32 -#define NETDEV_ALIGN_CONST (NETDEV_ALIGN - 1) static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, @@ -976,9 +975,7 @@ static inline bool netdev_uses_trailer_tags(struct net_device *dev) */ static inline void *netdev_priv(const struct net_device *dev) { - return (char *)dev + ((sizeof(struct net_device) - + NETDEV_ALIGN_CONST) - & ~NETDEV_ALIGN_CONST); + return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN); } /* Set the sysfs physical device reference for the network logical device diff --git a/net/core/dev.c b/net/core/dev.c index ed4550fd9ec..32ceee17896 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4988,18 +4988,18 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, struct netdev_queue *tx; struct net_device *dev; size_t alloc_size; - void *p; + struct net_device *p; BUG_ON(strlen(name) >= sizeof(dev->name)); alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ - alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; + alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); alloc_size += sizeof_priv; } /* ensure 32-byte alignment of whole construct */ - alloc_size += NETDEV_ALIGN_CONST; + alloc_size += NETDEV_ALIGN - 1; p = kzalloc(alloc_size, GFP_KERNEL); if (!p) { @@ -5014,8 +5014,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, goto free_p; } - dev = (struct net_device *) - (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); + dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; if (dev_addr_init(dev)) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 6b7e92eaab4..e37770ced53 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -735,9 +735,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, * +-------------------------+ * */ - priv_size = ((sizeof(struct ieee80211_local) + - NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST) + - priv_data_len; + priv_size = ALIGN(sizeof(*local), 
NETDEV_ALIGN) + priv_data_len; wiphy = wiphy_new(&mac80211_config_ops, priv_size); @@ -754,9 +752,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, local->hw.wiphy = wiphy; - local->hw.priv = (char *)local + - ((sizeof(struct ieee80211_local) + - NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); + local->hw.priv = (char *)local + ALIGN(sizeof(*local), NETDEV_ALIGN); BUG_ON(!ops->tx); BUG_ON(!ops->start); -- cgit v1.2.3-70-g09d2 From 385a154cac8763284f65cdfa25f6796c9eb1ca21 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 27 May 2009 15:48:07 -0700 Subject: net: correct a comment for the final #endif Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 14efce33c00..8e03b06e638 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1975,4 +1975,4 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev) } #endif /* __KERNEL__ */ -#endif /* _LINUX_DEV_H */ +#endif /* _LINUX_NETDEVICE_H */ -- cgit v1.2.3-70-g09d2 From d3e78ee3d015dac1794433abb6403b6fc8e70e10 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 May 2009 11:41:50 +0200 Subject: perf_counter: Fix perf_counter_init_task() on !CONFIG_PERF_COUNTERS Pointed out by compiler warnings: tip/include/linux/perf_counter.h:644: warning: no return statement in function returning non-void Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b16ed37b74..a65ddc58051 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -641,7 +641,7 @@ perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline int perf_counter_init_task(struct task_struct *child) { } +static inline int perf_counter_init_task(struct task_struct *child) { return 0; } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -- cgit v1.2.3-70-g09d2 From c93f7669098eb97c5376e5396e3dfb734c17df4f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 28 May 2009 22:18:17 +1000 Subject: perf_counter: Fix race in attaching counters to tasks and exiting Commit 564c2b21 ("perf_counter: Optimize context switch between identical inherited contexts") introduced a race where it is possible that a counter being attached to a task could get attached to the wrong task, if the task is one that has inherited its context from another task via fork. This happens because the optimized context switch could switch the context to another task after find_get_context has read task->perf_counter_ctxp. In fact, it's possible that the context could then get freed, if the other task then exits. This fixes the problem by protecting both the context switch and the critical code in find_get_context with spinlocks. The context switch locks the cxt->lock of both the outgoing and incoming contexts before swapping them. 
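In outline, the swap step looks like this (condensed from the kernel/perf_counter.c diff further down in this patch):

	spin_lock(&ctx->lock);
	spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
	if (context_equiv(ctx, next_ctx)) {
		/* Both locks held: neither context can be swapped under us. */
		task->perf_counter_ctxp = next_ctx;
		next->perf_counter_ctxp = ctx;
		ctx->task = next;
		next_ctx->task = task;
	}
	spin_unlock(&next_ctx->lock);
	spin_unlock(&ctx->lock);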
That means that once code such as find_get_context has obtained the spinlock for the context associated with a task, the context can't get swapped to another task. However, the context may have been swapped in the interval between reading task->perf_counter_ctxp and getting the lock, so it is necessary to check and retry. To make sure that none of the contexts being looked at in find_get_context can get freed, this changes the context freeing code to use RCU. Thus an rcu_read_lock() is sufficient to ensure that no contexts can get freed. This part of the patch is lifted from a patch posted by Peter Zijlstra. This also adds a check to make sure that we can't add a counter to a task that is exiting. There is also a race between perf_counter_exit_task and find_get_context; this solves the race by moving the get_ctx that was in perf_counter_alloc into the locked region in find_get_context, so that once find_get_context has got the context for a task, it won't get freed even if the task calls perf_counter_exit_task. It doesn't matter if new top-level (non-inherited) counters get attached to the context after perf_counter_exit_task has detached the context from the task. They will just stay there and never get scheduled in until the counters' fds get closed, and then perf_release will remove them from the context and eventually free the context. With this, we are now doing the unclone in find_get_context rather than when a counter was added to or removed from a context (actually, we were missing the unclone_ctx() call when adding a counter to a context). We don't need to unclone when removing a counter from a context because we have no way to remove a counter from a cloned context. This also takes out the smp_wmb() in find_get_context, which Peter Zijlstra pointed out was unnecessary because the cmpxchg implies a full barrier anyway. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Mike Galbraith Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18974.33033.667187.273886@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 +- kernel/perf_counter.c | 205 ++++++++++++++++++++++++++++++------------- 2 files changed, 148 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a65ddc58051..717bf3b59ba 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -541,8 +541,9 @@ struct perf_counter_context { * been cloned (inherited) from a common ancestor. 
*/ struct perf_counter_context *parent_ctx; - u32 parent_gen; - u32 generation; + u64 parent_gen; + u64 generation; + struct rcu_head rcu_head; }; /** diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 367299f91aa..52e5a15321d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx) atomic_inc(&ctx->refcount); } +static void free_ctx(struct rcu_head *head) +{ + struct perf_counter_context *ctx; + + ctx = container_of(head, struct perf_counter_context, rcu_head); + kfree(ctx); +} + static void put_ctx(struct perf_counter_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - kfree(ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); } } @@ -211,22 +221,6 @@ group_sched_out(struct perf_counter *group_counter, cpuctx->exclusive = 0; } -/* - * Mark this context as not being a clone of another. - * Called when counters are added to or removed from this context. - * We also increment our generation number so that anything that - * was cloned from this context before this will not match anything - * cloned from this context after this. - */ -static void unclone_ctx(struct perf_counter_context *ctx) -{ - ++ctx->generation; - if (!ctx->parent_ctx) - return; - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; -} - /* * Cross CPU call to remove a performance counter * @@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info) * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_counter_exit_task, it's OK because the + * context has been detached from its task. */ static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; - unclone_ctx(ctx); if (!task) { /* * Per cpu counters are removed via an smp call and @@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info) /* * Disable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_counter_for_each_child or perf_counter_for_each because they + * hold the top-level counter's child_mutex, so any descendant that + * goes to exit will block in sync_child_counter. + * When called from perf_pending_counter it's OK because counter->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_counter_task_sched_out for this context. */ static void perf_counter_disable(struct perf_counter *counter) { @@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info) /* * Enable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each as described + * for perf_counter_disable. 
*/ static void perf_counter_enable(struct perf_counter *counter) { @@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task, struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = task->perf_counter_ctxp; struct perf_counter_context *next_ctx; + struct perf_counter_context *parent; struct pt_regs *regs; + int do_switch = 1; regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); @@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task, return; update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_counter_ctxp; - if (next_ctx && context_equiv(ctx, next_ctx)) { - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - return; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); } + rcu_read_unlock(); - __perf_counter_sched_out(ctx, cpuctx); - - cpuctx->task_ctx = NULL; + if (do_switch) { + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } } static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) @@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx, ctx->task = task; } -static void put_context(struct perf_counter_context *ctx) -{ - if (ctx->task) - put_task_struct(ctx->task); -} - static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; - struct perf_counter_context *tctx; + struct perf_counter_context *parent_ctx; struct task_struct *task; + int err; /* * If cpu is not a wildcard then this is a percpu counter: @@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &cpuctx->ctx; + get_ctx(ctx); return ctx; } @@ -1265,37 +1300,79 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); + /* + * Can't attach counters to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + /* Reuse ptrace permission checks for now. */ - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_task_struct(task); - return ERR_PTR(-EACCES); + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry_lock: + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_counter_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_counter_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. 
If we locked the right context, then it + * can't get swapped on us any more and we can + * unclone it if necessary. + * Once it's not a clone things will be stable. + */ + spin_lock_irq(&ctx->lock); + if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + parent_ctx = ctx->parent_ctx; + if (parent_ctx) { + put_ctx(parent_ctx); + ctx->parent_ctx = NULL; /* no longer a clone */ + } + ++ctx->generation; + /* + * Get an extra reference before dropping the lock so that + * this context won't get freed if the task exits. + */ + get_ctx(ctx); + spin_unlock_irq(&ctx->lock); } + rcu_read_unlock(); - ctx = task->perf_counter_ctxp; if (!ctx) { ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!ctx) { - put_task_struct(task); - return ERR_PTR(-ENOMEM); - } + err = -ENOMEM; + if (!ctx) + goto errout; __perf_counter_init_context(ctx, task); - /* - * Make sure other cpus see correct values for *ctx - * once task->perf_counter_ctxp is visible to them. - */ - smp_wmb(); - tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); - if (tctx) { + get_ctx(ctx); + if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { /* * We raced with some other task; use * the context they set. */ kfree(ctx); - ctx = tctx; + goto retry_lock; } + get_task_struct(task); } + put_task_struct(task); return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); } static void free_counter_rcu(struct rcu_head *head) @@ -1303,7 +1380,6 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); - put_ctx(counter->ctx); kfree(counter); } @@ -1324,6 +1400,7 @@ static void free_counter(struct perf_counter *counter) if (counter->destroy) counter->destroy(counter); + put_ctx(counter->ctx); call_rcu(&counter->rcu_head, free_counter_rcu); } @@ -1347,7 +1424,6 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(counter->owner); free_counter(counter); - put_context(ctx); return 0; } @@ -1437,6 +1513,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter, mutex_unlock(&ctx->mutex); } +/* + * Holding the top-level counter's child_mutex means that any + * descendant process that has inherited this counter will block + * in sync_child_counter if it goes to exit, thus satisfying the + * task existence requirements of perf_counter_enable/disable. + */ static void perf_counter_for_each_child(struct perf_counter *counter, void (*func)(struct perf_counter *)) { @@ -3124,8 +3206,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->ctx = ctx; counter->oncpu = -1; - get_ctx(ctx); - counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; @@ -3290,7 +3370,7 @@ err_free_put_context: kfree(counter); err_put_context: - put_context(ctx); + put_ctx(ctx); goto out_fput; } @@ -3322,6 +3402,7 @@ inherit_counter(struct perf_counter *parent_counter, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) return child_counter; + get_ctx(child_ctx); /* * Make the child state follow the state of the parent counter, @@ -3439,11 +3520,6 @@ __perf_counter_exit_task(struct task_struct *child, /* * When a child task exits, feed back counter values to parent counters. - * - * Note: we may be running in child context, but the PID is not hashed - * anymore so new counters will not be added. - * (XXX not sure that is true when we get called from flush_old_exec. 
- -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { @@ -3458,7 +3534,15 @@ void perf_counter_exit_task(struct task_struct *child) local_irq_save(flags); __perf_counter_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_counter_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; + spin_unlock(&child_ctx->lock); local_irq_restore(flags); mutex_lock(&child_ctx->mutex); @@ -3513,6 +3597,7 @@ int perf_counter_init_task(struct task_struct *child) __perf_counter_init_context(child_ctx, child); child->perf_counter_ctxp = child_ctx; + get_task_struct(child); /* * Lock the parent list. No need to lock the child - not PID -- cgit v1.2.3-70-g09d2 From 7f0333eb2f98bbfece4fbfe21076d0a3e49f0bb0 Mon Sep 17 00:00:00 2001 From: Paulius Zaleckas Date: Wed, 13 May 2009 06:20:29 -0700 Subject: wimax: Add netlink interface to get device state The WiMAX connection manager / daemon has to know what the current state of the device is. Previously it was only possible to get a notification when the state changed. Note: by mistake, the new generic netlink number for WIMAX_GNL_OP_STATE_GET was declared by inserting it into the existing list of API calls rather than appending; thus, it would have broken the existing API. Fixed by Inaky Perez-Gonzalez by moving it to the tail, where we add to the interface rather than modify it. Thanks to Stephen Hemminger for catching this. Signed-off-by: Paulius Zaleckas --- include/linux/wimax.h | 5 +++ net/wimax/Makefile | 1 + net/wimax/debug-levels.h | 1 + net/wimax/debugfs.c | 1 + net/wimax/op-state-get.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ net/wimax/stack.c | 5 ++- 6 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 net/wimax/op-state-get.c (limited to 'include/linux') diff --git a/include/linux/wimax.h b/include/linux/wimax.h index c89de7f4e5b..2f7a6b77d6b 100644 --- a/include/linux/wimax.h +++ b/include/linux/wimax.h @@ -78,6 +78,7 @@ enum { WIMAX_GNL_OP_RFKILL, /* Run wimax_rfkill() */ WIMAX_GNL_OP_RESET, /* Run wimax_reset() */ WIMAX_GNL_RE_STATE_CHANGE, /* Report: status change */ + WIMAX_GNL_OP_STATE_GET, /* Request for current state */ }; @@ -113,6 +114,10 @@ enum { WIMAX_GNL_RESET_IFIDX = 1, }; +/* Attributes for wimax_state_get() */ +enum { + WIMAX_GNL_STGET_IFIDX = 1, +}; /* * Attributes for the Report State Change diff --git a/net/wimax/Makefile b/net/wimax/Makefile index 5b80b941c2c..8f1510d0cc2 100644 --- a/net/wimax/Makefile +++ b/net/wimax/Makefile @@ -6,6 +6,7 @@ wimax-y := \ op-msg.o \ op-reset.o \ op-rfkill.o \ + op-state-get.o \ stack.o wimax-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/net/wimax/debug-levels.h b/net/wimax/debug-levels.h index 1c29123a3aa..0975adba6b7 100644 --- a/net/wimax/debug-levels.h +++ b/net/wimax/debug-levels.h @@ -36,6 +36,7 @@ enum d_module { D_SUBMODULE_DECLARE(op_msg), D_SUBMODULE_DECLARE(op_reset), D_SUBMODULE_DECLARE(op_rfkill), + D_SUBMODULE_DECLARE(op_state_get), D_SUBMODULE_DECLARE(stack), }; diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c index 94d216a4640..6c9bedb7431 100644 --- a/net/wimax/debugfs.c +++ b/net/wimax/debugfs.c @@ -61,6 +61,7 @@ int wimax_debugfs_add(struct wimax_dev *wimax_dev) __debugfs_register("wimax_dl_", op_msg, dentry); __debugfs_register("wimax_dl_", op_reset, dentry); __debugfs_register("wimax_dl_", op_rfkill, dentry); + __debugfs_register("wimax_dl_", op_state_get, dentry);
__debugfs_register("wimax_dl_", stack, dentry); result = 0; out: diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c new file mode 100644 index 00000000000..a76b8fcb056 --- /dev/null +++ b/net/wimax/op-state-get.c @@ -0,0 +1,86 @@ +/* + * Linux WiMAX + * Implement and export a method for getting a WiMAX device current state + * + * Copyright (C) 2009 Paulius Zaleckas + * + * Based on previous WiMAX core work by: + * Copyright (C) 2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include +#include +#include +#include +#include "wimax-internal.h" + +#define D_SUBMODULE op_state_get +#include "debug-levels.h" + + +static const +struct nla_policy wimax_gnl_state_get_policy[WIMAX_GNL_ATTR_MAX + 1] = { + [WIMAX_GNL_STGET_IFIDX] = { + .type = NLA_U32, + }, +}; + + +/* + * Exporting to user space over generic netlink + * + * Parse the state get command from user space, return a combination + * value that describe the current state. + * + * No attributes. + */ +static +int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info) +{ + int result, ifindex; + struct wimax_dev *wimax_dev; + struct device *dev; + + d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); + result = -ENODEV; + if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) { + printk(KERN_ERR "WIMAX_GNL_OP_STATE_GET: can't find IFIDX " + "attribute\n"); + goto error_no_wimax_dev; + } + ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]); + wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); + if (wimax_dev == NULL) + goto error_no_wimax_dev; + dev = wimax_dev_to_dev(wimax_dev); + /* Execute the operation and send the result back to user space */ + result = wimax_state_get(wimax_dev); + dev_put(wimax_dev->net_dev); +error_no_wimax_dev: + d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); + return result; +} + + +struct genl_ops wimax_gnl_state_get = { + .cmd = WIMAX_GNL_OP_STATE_GET, + .flags = GENL_ADMIN_PERM, + .policy = wimax_gnl_state_get_policy, + .doit = wimax_gnl_doit_state_get, + .dumpit = NULL, +}; diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 933e1422b09..79fb7d7c640 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -402,13 +402,15 @@ EXPORT_SYMBOL_GPL(wimax_dev_init); extern struct genl_ops wimax_gnl_msg_from_user, wimax_gnl_reset, - wimax_gnl_rfkill; + wimax_gnl_rfkill, + wimax_gnl_state_get; static struct genl_ops *wimax_gnl_ops[] = { &wimax_gnl_msg_from_user, &wimax_gnl_reset, &wimax_gnl_rfkill, + &wimax_gnl_state_get, }; @@ -533,6 +535,7 @@ struct d_level D_LEVEL[] = { D_SUBMODULE_DEFINE(op_msg), D_SUBMODULE_DEFINE(op_reset), D_SUBMODULE_DEFINE(op_rfkill), + D_SUBMODULE_DEFINE(op_state_get), D_SUBMODULE_DEFINE(stack), }; size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); -- cgit v1.2.3-70-g09d2 From 7481806dcfd07e9a636155554f6f4b4fbd976381 Mon Sep 17 00:00:00 2001 From: Inaky Perez-Gonzalez Date: Fri, 
22 May 2009 00:41:11 -0700 Subject: wimax: a new API call was added, increment minor protocol version number As the 'state_get' API call was added, we need to increase the minor protocol version number so applications that depend on it can check for its presence. Signed-off-by: Inaky Perez-Gonzalez --- include/linux/wimax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wimax.h b/include/linux/wimax.h index 2f7a6b77d6b..4fdcc563551 100644 --- a/include/linux/wimax.h +++ b/include/linux/wimax.h @@ -59,7 +59,7 @@ enum { * M - Major: change if removing or modifying an existing call. * m - minor: change when adding a new call */ - WIMAX_GNL_VERSION = 00, + WIMAX_GNL_VERSION = 01, /* Generic NetLink attributes */ WIMAX_GNL_ATTR_INVALID = 0x00, WIMAX_GNL_ATTR_MAX = 10, -- cgit v1.2.3-70-g09d2 From 5d4e039b2cb1ca4de9774344ea7b61ad7fa1b0a1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 28 May 2009 01:05:00 +0000 Subject: bonding: allow bond in mode balance-alb to work properly in bridge -try4.3 [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3 (updated) changes v4.2 -> v4.3 - memcpy the address always, not just when it differs from master->dev_addr - compare_ether_addr_64bits() is not used, so there is no direct need to make a new header file (I think it would be good to have the bond stuff in a separate file anyway). changes v4.1 -> v4.2 - use the skb->pkt_type == PACKET_HOST compare rather than comparing the skb dest addr against skb->dev->dev_addr The problem is described in the following bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=487763 Basically here's what's going on. In every mode, the bonding interface uses the same mac address for all enslaved devices (except fail_over_mac). Only balance-alb will simultaneously use multiple MAC addresses across different slaves. When you put this kind of bond device into a bridge, it will only add one of its mac addresses into a hash list of mac addresses, say X. This mac address is marked as local. But this bonding interface also has mac address Y. Now when a packet arrives with destination address Y, this address is not marked as local and the packet looks like it needs to be forwarded. This packet is then lost, which is wrong. Notice that interfaces can be added to and removed from the bond while it is in the bridge. *** When the multiple-addresses-per-bridge-port approach failed to solve this issue due to STP, I started to think of another way to solve this. I returned to the previous solution but tweaked it. This patch solves the situation in the bonding code without touching the bridge code. For every incoming frame to the bonding device, the destination address is compared to the current address of the slave device from which the packet came. If these two match, the destination address is replaced by the mac address of the master. This address is known to the bridge, so the packet is delivered properly. Note that the comparison is not made directly; skb->pkt_type == PACKET_HOST is used instead. This is "set" previously in eth_type_trans(). I verified experimentally that this works as well as searching through the slave list (v4 of this patch). Jirka Signed-off-by: Jiri Pirko Signed-off-by: Eric Dumazet Signed-off-by: Andy Gospodarek Signed-off-by: David S.
--- include/linux/netdevice.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8e03b06e638..1eaf5ae14fe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1917,6 +1917,16 @@ static inline void netif_set_gso_max_size(struct net_device *dev, dev->gso_max_size = size; } +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, + struct net_device *master) +{ + if (skb->pkt_type == PACKET_HOST) { + u16 *dest = (u16 *) eth_hdr(skb)->h_dest; + + memcpy(dest, master->dev_addr, ETH_ALEN); + } +} + /* On bonding slaves other than the currently active slave, suppress * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and * ARP on active-backup slaves with arp_validate enabled. @@ -1930,6 +1940,14 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) if (master->priv_flags & IFF_MASTER_ARPMON) dev->last_rx = jiffies; + if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) { + /* Do address unmangle. The local destination address + * will always be the one the master has. Provides the right + * functionality in a bridge. + */ + skb_bond_set_mac_by_master(skb, master); + } + if (dev->priv_flags & IFF_SLAVE_INACTIVE) { if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && skb->protocol == __cpu_to_be16(ETH_P_ARP)) -- cgit v1.2.3-70-g09d2 From bbbee90829304d156c12b171c0ac7e6e1aba8b90 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2009 14:25:58 +0200 Subject: perf_counter: Amend cleanup in fork() fail When fork() fails we cannot use perf_counter_exit_task(), since it assumes it is operating on current. Write a new helper that cleans up unused/clean contexts. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar
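To make the exit/free distinction concrete, here is an editorial sketch of the two cleanup paths (abridged; labels and error handling shortened, not literal kernel code):

	/* task exit: the child ran, so counter values must be synced
	 * back into the parent counters; assumes the exiting task
	 */
	perf_counter_exit_task(tsk);

	/* fork() failure: the child never ran, so its inherited context
	 * is unexposed and unused and can simply be freed
	 */
	retval = perf_counter_init_task(p);	/* inherit parent's counters */
	if (retval)
		goto bad_fork_cleanup_policy;
	/* ... */
bad_fork_cleanup_policy:
	perf_counter_free_task(p);		/* p != current; free, don't exit */

The new helper only unlinks each never-used counter from its parent's child_list, drops the file reference and frees it; no state needs to be synced back.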
--- include/linux/perf_counter.h | 2 ++ kernel/fork.c | 2 +- kernel/perf_counter.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 43 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 717bf3b59ba..519a41bba24 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -579,6 +579,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); +extern void perf_counter_free_task(struct task_struct *task); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void __perf_disable(void); @@ -644,6 +645,7 @@ static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline int perf_counter_init_task(struct task_struct *child) { return 0; } static inline void perf_counter_exit_task(struct task_struct *child) { } +static inline void perf_counter_free_task(struct task_struct *task) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_disable(void) { } diff --git a/kernel/fork.c b/kernel/fork.c index c07c3335cea..23bf757ed32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1298,7 +1298,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_exit_task(p); + perf_counter_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0c000d305e0..79c3f26541d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3538,8 +3538,7 @@ static void sync_child_counter(struct perf_counter *child_counter, } static void -__perf_counter_exit_task(struct task_struct *child, - struct perf_counter *child_counter, +__perf_counter_exit_task(struct perf_counter *child_counter, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; @@ -3605,7 +3604,7 @@ void perf_counter_exit_task(struct task_struct *child) again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) - __perf_counter_exit_task(child, child_counter, child_ctx); + __perf_counter_exit_task(child_counter, child_ctx); /* * If the last counter was a group counter, it will have appended all @@ -3620,6 +3619,44 @@ again: put_ctx(child_ctx); } +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of failure. + */ +void perf_counter_free_task(struct task_struct *task) +{ + struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter *counter, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { + struct perf_counter *parent = counter->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&counter->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_counter(counter, ctx); + free_counter(counter); + } + + if (!list_empty(&ctx->counter_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + /* * Initialize the perf_counter context in task_struct */ -- cgit v1.2.3-70-g09d2 From ccffad25b5136958d4769ed6de5e87992dd9c65c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 22 May 2009 23:22:17 +0000 Subject: net: convert unicast addr list This patch converts the unicast address list to a standard list_head, using the previously introduced struct netdev_hw_addr. It also relaxes the locking. The original spinlock (still used for multicast addresses) is not needed and is no longer used for protection of this list. All reading and writing takes place under rtnl (with no changes). I also removed the possibility of specifying the length of the address when adding or deleting a unicast address; it's always dev->addr_len. The conversion especially touched the e1000 and ixgbe code, where the change is not so trivial. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- drivers/net/bnx2.c | 13 ++- drivers/net/e1000/e1000_main.c | 24 +++-- drivers/net/ixgbe/ixgbe_common.c | 14 +-- drivers/net/ixgbe/ixgbe_common.h | 4 +- drivers/net/ixgbe/ixgbe_main.c | 6 +- drivers/net/ixgbe/ixgbe_type.h | 4 +- drivers/net/macvlan.c | 11 ++- drivers/net/mv643xx_eth.c | 11 ++- drivers/net/niu.c | 7 +- drivers/net/virtio_net.c | 7 +- drivers/s390/net/qeth_l2_main.c | 6 +- drivers/scsi/fcoe/fcoe.c | 16 ++-- include/linux/netdevice.h | 18 ++-- net/8021q/vlan.c | 4 +- net/8021q/vlan_dev.c | 10 +- net/core/dev.c | 195 +++++++++++++++++++++++++++------------ net/dsa/slave.c | 10 +- net/packet/af_packet.c | 4 +- 18 files changed, 227 insertions(+), 137 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 83ee0f53f2d..f53017250e0 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "bnx2.h" #include "bnx2_fw.h" @@ -3310,7 +3311,7 @@ bnx2_set_rx_mode(struct net_device *dev) { struct bnx2 *bp = netdev_priv(dev); u32 rx_mode, sort_mode; - struct dev_addr_list *uc_ptr; + struct netdev_hw_addr *ha; int i; if (!netif_running(dev)) @@ -3369,21 +3370,19 @@ bnx2_set_rx_mode(struct net_device *dev) sort_mode |= BNX2_RPM_SORT_USER0_MC_HSH_EN; } - uc_ptr = NULL; if (dev->uc_count > BNX2_MAX_UNICAST_ADDRESSES) { rx_mode |= BNX2_EMAC_RX_MODE_PROMISCUOUS; sort_mode |= BNX2_RPM_SORT_USER0_PROM_EN | BNX2_RPM_SORT_USER0_PROM_VLAN; } else if (!(dev->flags & IFF_PROMISC)) { - uc_ptr = dev->uc_list; - /* Add all entries into to the match filter list */ - for (i = 0; i < dev->uc_count; i++) { - bnx2_set_mac_addr(bp, uc_ptr->da_addr, + i = 0; + list_for_each_entry(ha, &dev->uc_list, list) { + bnx2_set_mac_addr(bp, ha->addr, i + BNX2_START_UNICAST_ADDRESS_INDEX); sort_mode |= (1 << (i + BNX2_START_UNICAST_ADDRESS_INDEX)); - uc_ptr = uc_ptr->next; + i++; } } diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 79fe1ee3da5..74667e52143 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2330,7 +2330,8 @@ static void e1000_set_rx_mode(struct net_device *netdev) { struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; - struct dev_addr_list *uc_ptr; + struct netdev_hw_addr *ha; + bool use_uc = false; struct dev_addr_list *mc_ptr; u32 rctl; u32 hash_value; @@ -2369,12 +2370,11 @@ static void e1000_set_rx_mode(struct net_device *netdev) rctl |= E1000_RCTL_VFE; } - uc_ptr = NULL; if (netdev->uc_count > rar_entries - 1) { rctl |= E1000_RCTL_UPE; } else if (!(netdev->flags & IFF_PROMISC)) { rctl &= ~E1000_RCTL_UPE; - uc_ptr = netdev->uc_list; + use_uc = true; } ew32(RCTL, rctl); @@ -2392,13 +2392,20 @@ static void e1000_set_rx_mode(struct net_device *netdev) * if there are not 14 addresses, go ahead and clear the filters * -- with 82571 controllers only 0-13 entries are filled here */ + i = 1; + if (use_uc) + list_for_each_entry(ha, &netdev->uc_list, list) { + if (i == rar_entries) + break; + e1000_rar_set(hw, ha->addr, i++); + } + + WARN_ON(i == rar_entries); + mc_ptr = netdev->mc_list; - for (i = 1; i < rar_entries; i++) { - if (uc_ptr) { - e1000_rar_set(hw, uc_ptr->da_addr, i); - uc_ptr = uc_ptr->next; - } else if (mc_ptr) { + for (; i < rar_entries; i++) { + if (mc_ptr) { e1000_rar_set(hw, mc_ptr->da_addr, i); mc_ptr = mc_ptr->next; } else { @@ -2408,7 +2415,6 @@ static void e1000_set_rx_mode(struct net_device *netdev) E1000_WRITE_FLUSH(); } } - WARN_ON(uc_ptr != NULL); /* load any 
remaining addresses into the hash table */ diff --git a/drivers/net/ixgbe/ixgbe_common.c b/drivers/net/ixgbe/ixgbe_common.c index 0cc3c47cb45..6f79409270a 100644 --- a/drivers/net/ixgbe/ixgbe_common.c +++ b/drivers/net/ixgbe/ixgbe_common.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include "ixgbe.h" #include "ixgbe_common.h" @@ -1356,15 +1358,14 @@ static void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq) * Drivers using secondary unicast addresses must set user_set_promisc when * manually putting the device into promiscuous mode. **/ -s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, u8 *addr_list, - u32 addr_count, ixgbe_mc_addr_itr next) +s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, + struct list_head *uc_list) { - u8 *addr; u32 i; u32 old_promisc_setting = hw->addr_ctrl.overflow_promisc; u32 uc_addr_in_use; u32 fctrl; - u32 vmdq; + struct netdev_hw_addr *ha; /* * Clear accounting of old secondary address list, @@ -1382,10 +1383,9 @@ s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, u8 *addr_list, } /* Add the new addresses */ - for (i = 0; i < addr_count; i++) { + list_for_each_entry(ha, uc_list, list) { hw_dbg(hw, " Adding the secondary addresses:\n"); - addr = next(hw, &addr_list, &vmdq); - ixgbe_add_uc_addr(hw, addr, vmdq); + ixgbe_add_uc_addr(hw, ha->addr, 0); } if (hw->addr_ctrl.overflow_promisc) { diff --git a/drivers/net/ixgbe/ixgbe_common.h b/drivers/net/ixgbe/ixgbe_common.h index dd260890ad0..b2a4b2c99c4 100644 --- a/drivers/net/ixgbe/ixgbe_common.h +++ b/drivers/net/ixgbe/ixgbe_common.h @@ -59,8 +59,8 @@ s32 ixgbe_init_rx_addrs_generic(struct ixgbe_hw *hw); s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr func); -s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, u8 *addr_list, - u32 addr_count, ixgbe_mc_addr_itr func); +s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, + struct list_head *uc_list); s32 ixgbe_enable_mc_generic(struct ixgbe_hw *hw); s32 ixgbe_disable_mc_generic(struct ixgbe_hw *hw); s32 ixgbe_enable_rx_dma_generic(struct ixgbe_hw *hw, u32 regval); diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 924aa5ed02c..de70a2df9ae 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -2181,11 +2181,7 @@ static void ixgbe_set_rx_mode(struct net_device *netdev) IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctrl); /* reprogram secondary unicast list */ - addr_count = netdev->uc_count; - if (addr_count) - addr_list = netdev->uc_list->dmi_addr; - hw->mac.ops.update_uc_addr_list(hw, addr_list, addr_count, - ixgbe_addr_list_itr); + hw->mac.ops.update_uc_addr_list(hw, &netdev->uc_list); /* reprogram multicast list */ addr_count = netdev->mc_count; diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h index df1f7034c28..a8a8243d8fd 100644 --- a/drivers/net/ixgbe/ixgbe_type.h +++ b/drivers/net/ixgbe/ixgbe_type.h @@ -30,6 +30,7 @@ #include #include +#include /* Vendor ID */ #define IXGBE_INTEL_VENDOR_ID 0x8086 @@ -2223,8 +2224,7 @@ struct ixgbe_mac_operations { s32 (*set_vmdq)(struct ixgbe_hw *, u32, u32); s32 (*clear_vmdq)(struct ixgbe_hw *, u32, u32); s32 (*init_rx_addrs)(struct ixgbe_hw *); - s32 (*update_uc_addr_list)(struct ixgbe_hw *, u8 *, u32, - ixgbe_mc_addr_itr); + s32 (*update_uc_addr_list)(struct ixgbe_hw *, struct list_head *); s32 (*update_mc_addr_list)(struct ixgbe_hw *, u8 *, u32, ixgbe_mc_addr_itr); s32 (*enable_mc)(struct ixgbe_hw *); diff 
--git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d5334b41e4b..021d9941c29 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -232,7 +232,7 @@ static int macvlan_open(struct net_device *dev) if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; - err = dev_unicast_add(lowerdev, dev->dev_addr, ETH_ALEN); + err = dev_unicast_add(lowerdev, dev->dev_addr); if (err < 0) goto out; if (dev->flags & IFF_ALLMULTI) { @@ -244,7 +244,7 @@ static int macvlan_open(struct net_device *dev) return 0; del_unicast: - dev_unicast_delete(lowerdev, dev->dev_addr, ETH_ALEN); + dev_unicast_delete(lowerdev, dev->dev_addr); out: return err; } @@ -258,7 +258,7 @@ static int macvlan_stop(struct net_device *dev) if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); - dev_unicast_delete(lowerdev, dev->dev_addr, ETH_ALEN); + dev_unicast_delete(lowerdev, dev->dev_addr); macvlan_hash_del(vlan); return 0; @@ -282,10 +282,11 @@ static int macvlan_set_mac_address(struct net_device *dev, void *p) if (macvlan_addr_busy(vlan->port, addr->sa_data)) return -EBUSY; - if ((err = dev_unicast_add(lowerdev, addr->sa_data, ETH_ALEN))) + err = dev_unicast_add(lowerdev, addr->sa_data); + if (err) return err; - dev_unicast_delete(lowerdev, dev->dev_addr, ETH_ALEN); + dev_unicast_delete(lowerdev, dev->dev_addr); macvlan_hash_change_addr(vlan, addr->sa_data); } diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index 1361ddc8d31..b4e18a58cb1 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -55,6 +55,7 @@ #include #include #include +#include static char mv643xx_eth_driver_name[] = "mv643xx_eth"; static char mv643xx_eth_driver_version[] = "1.4"; @@ -1721,20 +1722,20 @@ static void uc_addr_set(struct mv643xx_eth_private *mp, unsigned char *addr) static u32 uc_addr_filter_mask(struct net_device *dev) { - struct dev_addr_list *uc_ptr; + struct netdev_hw_addr *ha; u32 nibbles; if (dev->flags & IFF_PROMISC) return 0; nibbles = 1 << (dev->dev_addr[5] & 0x0f); - for (uc_ptr = dev->uc_list; uc_ptr != NULL; uc_ptr = uc_ptr->next) { - if (memcmp(dev->dev_addr, uc_ptr->da_addr, 5)) + list_for_each_entry(ha, &dev->uc_list, list) { + if (memcmp(dev->dev_addr, ha->addr, 5)) return 0; - if ((dev->dev_addr[5] ^ uc_ptr->da_addr[5]) & 0xf0) + if ((dev->dev_addr[5] ^ ha->addr[5]) & 0xf0) return 0; - nibbles |= 1 << (uc_ptr->da_addr[5] & 0x0f); + nibbles |= 1 << (ha->addr[5] & 0x0f); } return nibbles; diff --git a/drivers/net/niu.c b/drivers/net/niu.c index edac3a0b02d..fa61a12c5e1 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -6362,6 +6363,7 @@ static void niu_set_rx_mode(struct net_device *dev) struct niu *np = netdev_priv(dev); int i, alt_cnt, err; struct dev_addr_list *addr; + struct netdev_hw_addr *ha; unsigned long flags; u16 hash[16] = { 0, }; @@ -6383,9 +6385,8 @@ static void niu_set_rx_mode(struct net_device *dev) if (alt_cnt) { int index = 0; - for (addr = dev->uc_list; addr; addr = addr->next) { - err = niu_set_alt_mac(np, index, - addr->da_addr); + list_for_each_entry(ha, &dev->uc_list, list) { + err = niu_set_alt_mac(np, index, ha->addr); if (err) printk(KERN_WARNING PFX "%s: Error %d " "adding alt mac %d\n", diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 6cc5bcd34fb..0c9ca67f66e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -680,6 +680,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) u8 promisc, allmulti; struct virtio_net_ctrl_mac 
*mac_data; struct dev_addr_list *addr; + struct netdev_hw_addr *ha; void *buf; int i; @@ -718,9 +719,9 @@ static void virtnet_set_rx_mode(struct net_device *dev) /* Store the unicast list and count in the front of the buffer */ mac_data->entries = dev->uc_count; - addr = dev->uc_list; - for (i = 0; i < dev->uc_count; i++, addr = addr->next) - memcpy(&mac_data->macs[i][0], addr->da_addr, ETH_ALEN); + i = 0; + list_for_each_entry(ha, &dev->uc_list, list) + memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); sg_set_buf(&sg[0], mac_data, sizeof(mac_data->entries) + (dev->uc_count * ETH_ALEN)); diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 9ca6bab7c9b..ecd3d06c0d5 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "qeth_core.h" @@ -640,6 +641,7 @@ static void qeth_l2_set_multicast_list(struct net_device *dev) { struct qeth_card *card = dev->ml_priv; struct dev_addr_list *dm; + struct netdev_hw_addr *ha; if (card->info.type == QETH_CARD_TYPE_OSN) return ; @@ -653,8 +655,8 @@ static void qeth_l2_set_multicast_list(struct net_device *dev) for (dm = dev->mc_list; dm; dm = dm->next) qeth_l2_add_mc(card, dm->da_addr, 0); - for (dm = dev->uc_list; dm; dm = dm->next) - qeth_l2_add_mc(card, dm->da_addr, 1); + list_for_each_entry(ha, &dev->uc_list, list) + qeth_l2_add_mc(card, ha->addr, 1); spin_unlock_bh(&card->mclock); if (!qeth_adp_supported(card, IPA_SETADP_SET_PROMISC_MODE)) diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index ce33f107b0a..f791348871f 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -182,8 +182,8 @@ static void fcoe_update_src_mac(struct fcoe_ctlr *fip, u8 *old, u8 *new) fc = fcoe_from_ctlr(fip); rtnl_lock(); if (!is_zero_ether_addr(old)) - dev_unicast_delete(fc->real_dev, old, ETH_ALEN); - dev_unicast_add(fc->real_dev, new, ETH_ALEN); + dev_unicast_delete(fc->real_dev, old); + dev_unicast_add(fc->real_dev, new); rtnl_unlock(); } @@ -233,13 +233,11 @@ void fcoe_netdev_cleanup(struct fcoe_softc *fc) /* Delete secondary MAC addresses */ rtnl_lock(); memcpy(flogi_maddr, (u8[6]) FC_FCOE_FLOGI_MAC, ETH_ALEN); - dev_unicast_delete(fc->real_dev, flogi_maddr, ETH_ALEN); + dev_unicast_delete(fc->real_dev, flogi_maddr); if (!is_zero_ether_addr(fc->ctlr.data_src_addr)) - dev_unicast_delete(fc->real_dev, - fc->ctlr.data_src_addr, ETH_ALEN); + dev_unicast_delete(fc->real_dev, fc->ctlr.data_src_addr); if (fc->ctlr.spma) - dev_unicast_delete(fc->real_dev, - fc->ctlr.ctl_src_addr, ETH_ALEN); + dev_unicast_delete(fc->real_dev, fc->ctlr.ctl_src_addr); dev_mc_delete(fc->real_dev, FIP_ALL_ENODE_MACS, ETH_ALEN, 0); rtnl_unlock(); } @@ -347,9 +345,9 @@ static int fcoe_netdev_config(struct fc_lport *lp, struct net_device *netdev) */ rtnl_lock(); memcpy(flogi_maddr, (u8[6]) FC_FCOE_FLOGI_MAC, ETH_ALEN); - dev_unicast_add(fc->real_dev, flogi_maddr, ETH_ALEN); + dev_unicast_add(fc->real_dev, flogi_maddr); if (fc->ctlr.spma) - dev_unicast_add(fc->real_dev, fc->ctlr.ctl_src_addr, ETH_ALEN); + dev_unicast_add(fc->real_dev, fc->ctlr.ctl_src_addr); dev_mc_add(fc->real_dev, FIP_ALL_ENODE_MACS, ETH_ALEN, 0); rtnl_unlock(); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1eaf5ae14fe..bbfabf3012b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -215,9 +215,12 @@ struct netdev_hw_addr { struct list_head list; unsigned char addr[MAX_ADDR_LEN]; unsigned char type; -#define NETDEV_HW_ADDR_T_LAN 1 
-#define NETDEV_HW_ADDR_T_SAN 2 -#define NETDEV_HW_ADDR_T_SLAVE 3 +#define NETDEV_HW_ADDR_T_LAN 1 +#define NETDEV_HW_ADDR_T_SAN 2 +#define NETDEV_HW_ADDR_T_SLAVE 3 +#define NETDEV_HW_ADDR_T_UNICAST 4 + int refcount; + bool synced; struct rcu_head rcu_head; }; @@ -773,10 +776,11 @@ struct net_device unsigned char addr_len; /* hardware address length */ unsigned short dev_id; /* for shared network cards */ - spinlock_t addr_list_lock; - struct dev_addr_list *uc_list; /* Secondary unicast mac addresses */ + struct list_head uc_list; /* Secondary unicast mac + addresses */ int uc_count; /* Number of installed ucasts */ int uc_promisc; + spinlock_t addr_list_lock; struct dev_addr_list *mc_list; /* Multicast mac addresses */ int mc_count; /* Number of installed mcasts */ unsigned int promiscuity; @@ -1836,8 +1840,8 @@ extern int dev_addr_del_multiple(struct net_device *to_dev, /* Functions used for secondary unicast and multicast support */ extern void dev_set_rx_mode(struct net_device *dev); extern void __dev_set_rx_mode(struct net_device *dev); -extern int dev_unicast_delete(struct net_device *dev, void *addr, int alen); -extern int dev_unicast_add(struct net_device *dev, void *addr, int alen); +extern int dev_unicast_delete(struct net_device *dev, void *addr); +extern int dev_unicast_add(struct net_device *dev, void *addr); extern int dev_unicast_sync(struct net_device *to, struct net_device *from); extern void dev_unicast_unsync(struct net_device *to, struct net_device *from); extern int dev_mc_delete(struct net_device *dev, void *addr, int alen, int all); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index d1e10546eb8..714e1c3536b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -378,13 +378,13 @@ static void vlan_sync_address(struct net_device *dev, * the new address */ if (compare_ether_addr(vlandev->dev_addr, vlan->real_dev_addr) && !compare_ether_addr(vlandev->dev_addr, dev->dev_addr)) - dev_unicast_delete(dev, vlandev->dev_addr, ETH_ALEN); + dev_unicast_delete(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (!compare_ether_addr(vlandev->dev_addr, vlan->real_dev_addr) && compare_ether_addr(vlandev->dev_addr, dev->dev_addr)) - dev_unicast_add(dev, vlandev->dev_addr, ETH_ALEN); + dev_unicast_add(dev, vlandev->dev_addr); memcpy(vlan->real_dev_addr, dev->dev_addr, ETH_ALEN); } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 1e2ad4c7c59..96bad8f233e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -441,7 +441,7 @@ static int vlan_dev_open(struct net_device *dev) return -ENETDOWN; if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) { - err = dev_unicast_add(real_dev, dev->dev_addr, ETH_ALEN); + err = dev_unicast_add(real_dev, dev->dev_addr); if (err < 0) goto out; } @@ -470,7 +470,7 @@ clear_allmulti: dev_set_allmulti(real_dev, -1); del_unicast: if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) - dev_unicast_delete(real_dev, dev->dev_addr, ETH_ALEN); + dev_unicast_delete(real_dev, dev->dev_addr); out: netif_carrier_off(dev); return err; @@ -492,7 +492,7 @@ static int vlan_dev_stop(struct net_device *dev) dev_set_promiscuity(real_dev, -1); if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) - dev_unicast_delete(real_dev, dev->dev_addr, dev->addr_len); + dev_unicast_delete(real_dev, dev->dev_addr); netif_carrier_off(dev); return 0; @@ -511,13 +511,13 @@ static int vlan_dev_set_mac_address(struct net_device *dev, void *p) goto out; if 
(compare_ether_addr(addr->sa_data, real_dev->dev_addr)) { - err = dev_unicast_add(real_dev, addr->sa_data, ETH_ALEN); + err = dev_unicast_add(real_dev, addr->sa_data); if (err < 0) return err; } if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) - dev_unicast_delete(real_dev, dev->dev_addr, ETH_ALEN); + dev_unicast_delete(real_dev, dev->dev_addr); out: memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); diff --git a/net/core/dev.c b/net/core/dev.c index 32ceee17896..e2fcc5f1017 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3473,8 +3473,9 @@ void dev_set_rx_mode(struct net_device *dev) /* hw addresses list handling functions */ -static int __hw_addr_add(struct list_head *list, unsigned char *addr, - int addr_len, unsigned char addr_type) +static int __hw_addr_add(struct list_head *list, int *delta, + unsigned char *addr, int addr_len, + unsigned char addr_type) { struct netdev_hw_addr *ha; int alloc_size; @@ -3482,6 +3483,15 @@ static int __hw_addr_add(struct list_head *list, unsigned char *addr, if (addr_len > MAX_ADDR_LEN) return -EINVAL; + list_for_each_entry(ha, list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + ha->type == addr_type) { + ha->refcount++; + return 0; + } + } + + alloc_size = sizeof(*ha); if (alloc_size < L1_CACHE_BYTES) alloc_size = L1_CACHE_BYTES; @@ -3490,7 +3500,11 @@ static int __hw_addr_add(struct list_head *list, unsigned char *addr, return -ENOMEM; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; + ha->refcount = 1; + ha->synced = false; list_add_tail_rcu(&ha->list, list); + if (delta) + (*delta)++; return 0; } @@ -3502,29 +3516,30 @@ static void ha_rcu_free(struct rcu_head *head) kfree(ha); } -static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr, - int addr_len, unsigned char addr_type, - int ignore_index) +static int __hw_addr_del(struct list_head *list, int *delta, + unsigned char *addr, int addr_len, + unsigned char addr_type) { struct netdev_hw_addr *ha; - int i = 0; list_for_each_entry(ha, list, list) { - if (i++ != ignore_index && - !memcmp(ha->addr, addr, addr_len) && + if (!memcmp(ha->addr, addr, addr_len) && (ha->type == addr_type || !addr_type)) { + if (--ha->refcount) + return 0; list_del_rcu(&ha->list); call_rcu(&ha->rcu_head, ha_rcu_free); + if (delta) + (*delta)--; return 0; } } return -ENOENT; } -static int __hw_addr_add_multiple_ii(struct list_head *to_list, - struct list_head *from_list, - int addr_len, unsigned char addr_type, - int ignore_index) +static int __hw_addr_add_multiple(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int addr_len, + unsigned char addr_type) { int err; struct netdev_hw_addr *ha, *ha2; @@ -3532,7 +3547,8 @@ static int __hw_addr_add_multiple_ii(struct list_head *to_list, list_for_each_entry(ha, from_list, list) { type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); + err = __hw_addr_add(to_list, to_delta, ha->addr, + addr_len, type); if (err) goto unroll; } @@ -3543,27 +3559,69 @@ unroll: if (ha2 == ha) break; type = addr_type ? 
addr_type : ha2->type; - __hw_addr_del_ii(to_list, ha2->addr, addr_len, type, - ignore_index); + __hw_addr_del(to_list, to_delta, ha2->addr, + addr_len, type); } return err; } -static void __hw_addr_del_multiple_ii(struct list_head *to_list, - struct list_head *from_list, - int addr_len, unsigned char addr_type, - int ignore_index) +static void __hw_addr_del_multiple(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int addr_len, + unsigned char addr_type) { struct netdev_hw_addr *ha; unsigned char type; list_for_each_entry(ha, from_list, list) { type = addr_type ? addr_type : ha->type; - __hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type, - ignore_index); + __hw_addr_del(to_list, to_delta, ha->addr, + addr_len, addr_type); + } +} + +static int __hw_addr_sync(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int *from_delta, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, from_list, list) { + if (!ha->synced) { + err = __hw_addr_add(to_list, to_delta, ha->addr, + addr_len, ha->type); + if (err) + break; + ha->synced = true; + ha->refcount++; + } else if (ha->refcount == 1) { + __hw_addr_del(to_list, to_delta, ha->addr, + addr_len, ha->type); + __hw_addr_del(from_list, from_delta, ha->addr, + addr_len, ha->type); + } } + return err; } +static void __hw_addr_unsync(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int *from_delta, + int addr_len) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, from_list, list) { + if (ha->synced) { + __hw_addr_del(to_list, to_delta, ha->addr, + addr_len, ha->type); + ha->synced = false; + __hw_addr_del(from_list, from_delta, ha->addr, + addr_len, ha->type); + } + } +} + + static void __hw_addr_flush(struct list_head *list) { struct netdev_hw_addr *ha, *tmp; @@ -3594,7 +3652,7 @@ static int dev_addr_init(struct net_device *dev) INIT_LIST_HEAD(&dev->dev_addr_list); memset(addr, 0, sizeof(*addr)); - err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr), + err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, sizeof(*addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* @@ -3626,7 +3684,7 @@ int dev_addr_add(struct net_device *dev, unsigned char *addr, ASSERT_RTNL(); - err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len, + err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); @@ -3649,11 +3707,20 @@ int dev_addr_del(struct net_device *dev, unsigned char *addr, unsigned char addr_type) { int err; + struct netdev_hw_addr *ha; ASSERT_RTNL(); - err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, - addr_type, 0); + /* + * We can not remove the first address from the list because + * dev->dev_addr points to that. 
+ */ + ha = list_first_entry(&dev->dev_addr_list, struct netdev_hw_addr, list); + if (ha->addr == dev->dev_addr && ha->refcount == 1) + return -ENOENT; + + err = __hw_addr_del(&dev->dev_addr_list, NULL, addr, dev->addr_len, + addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; @@ -3680,9 +3747,9 @@ int dev_addr_add_multiple(struct net_device *to_dev, if (from_dev->addr_len != to_dev->addr_len) return -EINVAL; - err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list, - &from_dev->dev_addr_list, - to_dev->addr_len, addr_type, 0); + err = __hw_addr_add_multiple(&to_dev->dev_addr_list, NULL, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); return err; @@ -3707,9 +3774,9 @@ int dev_addr_del_multiple(struct net_device *to_dev, if (from_dev->addr_len != to_dev->addr_len) return -EINVAL; - __hw_addr_del_multiple_ii(&to_dev->dev_addr_list, - &from_dev->dev_addr_list, - to_dev->addr_len, addr_type, 0); + __hw_addr_del_multiple(&to_dev->dev_addr_list, NULL, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type); call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); return 0; } @@ -3779,24 +3846,22 @@ int __dev_addr_add(struct dev_addr_list **list, int *count, * dev_unicast_delete - Release secondary unicast address. * @dev: device * @addr: address to delete - * @alen: length of @addr * * Release reference to a secondary unicast address and remove it * from the device if the reference count drops to zero. * * The caller must hold the rtnl_mutex. */ -int dev_unicast_delete(struct net_device *dev, void *addr, int alen) +int dev_unicast_delete(struct net_device *dev, void *addr) { int err; ASSERT_RTNL(); - netif_addr_lock_bh(dev); - err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); + err = __hw_addr_del(&dev->uc_list, &dev->uc_count, addr, + dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_delete); @@ -3805,24 +3870,22 @@ EXPORT_SYMBOL(dev_unicast_delete); * dev_unicast_add - add a secondary unicast address * @dev: device * @addr: address to add - * @alen: length of @addr * * Add a secondary unicast address to the device or increase * the reference count if it already exists. * * The caller must hold the rtnl_mutex. */ -int dev_unicast_add(struct net_device *dev, void *addr, int alen) +int dev_unicast_add(struct net_device *dev, void *addr) { int err; ASSERT_RTNL(); - netif_addr_lock_bh(dev); - err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); + err = __hw_addr_add(&dev->uc_list, &dev->uc_count, addr, + dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_add); @@ -3879,8 +3942,7 @@ void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, * @from: source device * * Add newly added addresses to the destination device and release - * addresses that have no users left. The source device must be - * locked by netif_tx_lock_bh. + * addresses that have no users left. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. 
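As an editorial illustration of that calling convention (not code from this patch; the example_ names are hypothetical), a layered driver would now use the pair roughly like this:

	static void example_set_rx_mode(struct net_device *dev)
	{
		/* lower device this example driver sits on top of (assumed) */
		struct net_device *lowerdev = example_priv(dev)->lowerdev;

		/* Both helpers now ASSERT_RTNL() instead of taking the
		 * address-list spinlocks; on success dev_unicast_sync()
		 * also refreshes the lower device's RX filter.
		 */
		if (dev_unicast_sync(lowerdev, dev))
			return;		/* -EINVAL if addr_len differs */
	}

	static int example_stop(struct net_device *dev)
	{
		struct net_device *lowerdev = example_priv(dev)->lowerdev;

		/* release the addresses we synced earlier */
		dev_unicast_unsync(lowerdev, dev);
		return 0;
	}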
@@ -3889,12 +3951,15 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from) { int err = 0; - netif_addr_lock_bh(to); - err = __dev_addr_sync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count); + ASSERT_RTNL(); + + if (to->addr_len != from->addr_len) + return -EINVAL; + + err = __hw_addr_sync(&to->uc_list, &to->uc_count, + &from->uc_list, &from->uc_count, to->addr_len); if (!err) __dev_set_rx_mode(to); - netif_addr_unlock_bh(to); return err; } EXPORT_SYMBOL(dev_unicast_sync); @@ -3910,18 +3975,33 @@ EXPORT_SYMBOL(dev_unicast_sync); */ void dev_unicast_unsync(struct net_device *to, struct net_device *from) { - netif_addr_lock_bh(from); - netif_addr_lock(to); + ASSERT_RTNL(); - __dev_addr_unsync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count); - __dev_set_rx_mode(to); + if (to->addr_len != from->addr_len) + return; - netif_addr_unlock(to); - netif_addr_unlock_bh(from); + __hw_addr_unsync(&to->uc_list, &to->uc_count, + &from->uc_list, &from->uc_count, to->addr_len); + __dev_set_rx_mode(to); } EXPORT_SYMBOL(dev_unicast_unsync); +static void dev_unicast_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->uc_list); + dev->uc_count = 0; +} + +static void dev_unicast_init(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + INIT_LIST_HEAD(&dev->uc_list); +} + + static void __dev_addr_discard(struct dev_addr_list **list) { struct dev_addr_list *tmp; @@ -3940,9 +4020,6 @@ static void dev_addr_discard(struct net_device *dev) { netif_addr_lock_bh(dev); - __dev_addr_discard(&dev->uc_list); - dev->uc_count = 0; - __dev_addr_discard(&dev->mc_list); dev->mc_count = 0; @@ -4535,6 +4612,7 @@ static void rollback_registered(struct net_device *dev) /* * Flush the unicast and multicast chains */ + dev_unicast_flush(dev); dev_addr_discard(dev); if (dev->netdev_ops->ndo_uninit) @@ -5020,6 +5098,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, if (dev_addr_init(dev)) goto free_tx; + dev_unicast_init(dev); + dev_net_set(dev, &init_net); dev->_tx = tx; @@ -5223,6 +5303,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* * Flush the unicast and multicast chains */ + dev_unicast_flush(dev); dev_addr_discard(dev); netdev_unregister_kobject(dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ed131181215..2175e6d5cc8 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -67,7 +67,7 @@ static int dsa_slave_open(struct net_device *dev) return -ENETDOWN; if (compare_ether_addr(dev->dev_addr, master->dev_addr)) { - err = dev_unicast_add(master, dev->dev_addr, ETH_ALEN); + err = dev_unicast_add(master, dev->dev_addr); if (err < 0) goto out; } @@ -90,7 +90,7 @@ clear_allmulti: dev_set_allmulti(master, -1); del_unicast: if (compare_ether_addr(dev->dev_addr, master->dev_addr)) - dev_unicast_delete(master, dev->dev_addr, ETH_ALEN); + dev_unicast_delete(master, dev->dev_addr); out: return err; } @@ -108,7 +108,7 @@ static int dsa_slave_close(struct net_device *dev) dev_set_promiscuity(master, -1); if (compare_ether_addr(dev->dev_addr, master->dev_addr)) - dev_unicast_delete(master, dev->dev_addr, ETH_ALEN); + dev_unicast_delete(master, dev->dev_addr); return 0; } @@ -147,13 +147,13 @@ static int dsa_slave_set_mac_address(struct net_device *dev, void *a) goto out; if (compare_ether_addr(addr->sa_data, master->dev_addr)) { - err = dev_unicast_add(master, addr->sa_data, ETH_ALEN); + err = dev_unicast_add(master, addr->sa_data); if (err < 0) return err; } if 
(compare_ether_addr(dev->dev_addr, master->dev_addr) - dev_unicast_delete(master, dev->dev_addr, ETH_ALEN); + dev_unicast_delete(master, dev->dev_addr); out: memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c7c5d524967..6da9f38ef5c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1582,9 +1582,9 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, break; case PACKET_MR_UNICAST: if (what > 0) - return dev_unicast_add(dev, i->addr, i->alen); + return dev_unicast_add(dev, i->addr); else - return dev_unicast_delete(dev, i->addr, i->alen); + return dev_unicast_delete(dev, i->addr); break; default:; } -- cgit v1.2.3-70-g09d2 From 25346b93ca079080c9cb23331db5c4f6404e8530 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:48:12 +1000 Subject: perf_counter: Provide functions for locking and pinning the context for a task This abstracts out the code for locking the context associated with a task. Because the context might get transferred from one task to another concurrently, we have to check after locking the context that it is still the right context for the task and retry if not. This was open-coded in find_get_context() and perf_counter_init_task(). This adds a further function for pinning the context for a task, i.e. marking it so it can't be transferred to another task. This adds a 'pin_count' field to struct perf_counter_context to indicate that a context is pinned, instead of the previous method of setting the parent_gen count to all 1s. Pinning the context with a pin_count is easier to undo and doesn't require saving the parent_gen value. This also adds a perf_unpin_context() to undo the effect of perf_pin_task_context() and changes perf_counter_init_task to use it. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.34748.755674.596386@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 128 +++++++++++++++++++++++++------------ 2 files changed, 75 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 519a41bba24..81ec79c9f19 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -543,6 +543,7 @@ struct perf_counter_context { struct perf_counter_context *parent_ctx; u64 parent_gen; u64 generation; + int pin_count; struct rcu_head rcu_head; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 79c3f26541d..da8dfef4b47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -122,6 +122,69 @@ static void put_ctx(struct perf_counter_context *ctx) } } +/* + * Get the perf_counter_context for a task and lock it. + * This has to cope with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_counter_context *perf_lock_task_context( + struct task_struct *task, unsigned long *flags) +{ + struct perf_counter_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_counter_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_counter_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed.
Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_counter_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + get_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_counter_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + /* * Add a counter from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -916,7 +979,7 @@ static int context_equiv(struct perf_counter_context *ctx1, { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen - && ctx1->parent_gen != ~0ull; + && !ctx1->pin_count && !ctx2->pin_count; } /* @@ -1271,6 +1334,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) struct perf_counter_context *ctx; struct perf_counter_context *parent_ctx; struct task_struct *task; + unsigned long flags; int err; /* @@ -1323,28 +1387,9 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto errout; - retry_lock: - rcu_read_lock(); retry: - ctx = rcu_dereference(task->perf_counter_ctxp); + ctx = perf_lock_task_context(task, &flags); if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_counter_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more and we can - * unclone it if necessary. - * Once it's not a clone things will be stable. - */ - spin_lock_irq(&ctx->lock); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } parent_ctx = ctx->parent_ctx; if (parent_ctx) { put_ctx(parent_ctx); @@ -1355,9 +1400,8 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) * this context won't get freed if the task exits. */ get_ctx(ctx); - spin_unlock_irq(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } - rcu_read_unlock(); if (!ctx) { ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); @@ -1372,7 +1416,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) * the context they set. 
*/ kfree(ctx); - goto retry_lock; + goto retry; } get_task_struct(task); } @@ -3667,7 +3711,6 @@ int perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; int inherited_all = 1; - u64 cloned_gen; int ret = 0; child->perf_counter_ctxp = NULL; @@ -3693,32 +3736,17 @@ int perf_counter_init_task(struct task_struct *child) get_task_struct(child); /* - * If the parent's context is a clone, temporarily set its - * parent_gen to an impossible value (all 1s) so it won't get - * swapped under us. The rcu_read_lock makes sure that - * parent_ctx continues to exist even if it gets swapped to - * another process and then freed while we are trying to get - * its lock. + * If the parent's context is a clone, pin it so it won't get + * swapped under us. */ - rcu_read_lock(); - retry: - parent_ctx = rcu_dereference(parent->perf_counter_ctxp); + parent_ctx = perf_pin_task_context(parent); + /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time. */ - spin_lock_irq(&parent_ctx->lock); - if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) { - spin_unlock_irq(&parent_ctx->lock); - goto retry; - } - cloned_gen = parent_ctx->parent_gen; - if (parent_ctx->parent_ctx) - parent_ctx->parent_gen = ~0ull; - spin_unlock_irq(&parent_ctx->lock); - rcu_read_unlock(); /* * Lock the parent list. No need to lock the child - not PID @@ -3759,7 +3787,7 @@ int perf_counter_init_task(struct task_struct *child) cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = cloned_gen; + child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; @@ -3769,15 +3797,7 @@ int perf_counter_init_task(struct task_struct *child) mutex_unlock(&parent_ctx->mutex); - /* - * Restore the clone status of the parent. - */ - if (parent_ctx->parent_ctx) { - spin_lock_irq(&parent_ctx->lock); - if (parent_ctx->parent_ctx) - parent_ctx->parent_gen = cloned_gen; - spin_unlock_irq(&parent_ctx->lock); - } + perf_unpin_context(parent_ctx); return ret; } -- cgit v1.2.3-70-g09d2 From d1a277c584d0862dbf51991baea947ea5f2ce6bf Mon Sep 17 00:00:00 2001 From: Wolfgang Grandegger Date: Sat, 30 May 2009 07:55:50 +0000 Subject: can: sja1000: generic OF platform bus driver This patch adds a generic driver for SJA1000 chips on the OpenFirmware platform bus found on embedded PowerPC systems. You need a SJA1000 node definition in your flattened device tree source (DTS) file similar to: can@3,100 { compatible = "nxp,sja1000"; reg = <3 0x100 0x80>; interrupts = <2 0>; interrupt-parent = <&mpic>; nxp,external-clock-frequency = <16000000>; }; See also Documentation/powerpc/dts-bindings/can/sja1000.txt. CC: devicetree-discuss@ozlabs.org Signed-off-by: Wolfgang Grandegger Signed-off-by: David S. 
Miller --- Documentation/powerpc/dts-bindings/can/sja1000.txt | 53 +++++ drivers/net/can/Kconfig | 9 + drivers/net/can/sja1000/Makefile | 1 + drivers/net/can/sja1000/sja1000_of_platform.c | 233 +++++++++++++++++++++ include/linux/can/platform/sja1000.h | 3 + 5 files changed, 299 insertions(+) create mode 100644 Documentation/powerpc/dts-bindings/can/sja1000.txt create mode 100644 drivers/net/can/sja1000/sja1000_of_platform.c (limited to 'include/linux') diff --git a/Documentation/powerpc/dts-bindings/can/sja1000.txt b/Documentation/powerpc/dts-bindings/can/sja1000.txt new file mode 100644 index 00000000000..d6d209ded93 --- /dev/null +++ b/Documentation/powerpc/dts-bindings/can/sja1000.txt @@ -0,0 +1,53 @@ +Memory mapped SJA1000 CAN controller from NXP (formerly Philips) + +Required properties: + +- compatible : should be "nxp,sja1000". + +- reg : should specify the chip select, address offset and size required + to map the registers of the SJA1000. The size is usually 0x80. + +- interrupts: property with a value describing the interrupt source + (number and sensitivity) required for the SJA1000. + +Optional properties: + +- nxp,external-clock-frequency : Frequency of the external oscillator + clock in Hz. Note that the internal clock frequency used by the + SJA1000 is half of that value. If not specified, a default value + of 16000000 (16 MHz) is used. + +- nxp,tx-output-mode : operation mode of the TX output control logic: + <0x0> : bi-phase output mode + <0x1> : normal output mode (default) + <0x2> : test output mode + <0x3> : clock output mode + +- nxp,tx-output-config : TX output pin configuration: + <0x01> : TX0 invert + <0x02> : TX0 pull-down (default) + <0x04> : TX0 pull-up + <0x06> : TX0 push-pull + <0x08> : TX1 invert + <0x10> : TX1 pull-down + <0x20> : TX1 pull-up + <0x30> : TX1 push-pull + +- nxp,clock-out-frequency : clock frequency in Hz on the CLKOUT pin. + If not specified or if the specified value is 0, the CLKOUT pin + will be disabled. + +- nxp,no-comparator-bypass : Allows disabling the CAN input comparator. + +For further information, please refer to the SJA1000 data sheet. + +Examples: + +can@3,100 { + compatible = "nxp,sja1000"; + reg = <3 0x100 0x80>; + interrupts = <2 0>; + interrupt-parent = <&mpic>; + nxp,external-clock-frequency = <16000000>; +}; + diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index cfd6c5a285f..d5e18812bf4 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -51,6 +51,15 @@ config CAN_SJA1000_PLATFORM boards from Phytec (http://www.phytec.de) like the PCM027, PCM038. +config CAN_SJA1000_OF_PLATFORM + depends on CAN_SJA1000 && PPC_OF + tristate "Generic OF Platform Bus based SJA1000 driver" + ---help--- + This driver adds support for the SJA1000 chips connected to + the OpenFirmware "platform bus" found on embedded systems with + OpenFirmware bindings, e.g. if you have a PowerPC based system + you may want to enable this option.
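One subtlety of the binding is worth a worked example: nxp,external-clock-frequency names the external oscillator, while the clock the driver reports to the CAN core is half of that. An editorial sketch mirroring the probe logic shown further below (property values assumed for illustration):

	/* DTS: nxp,external-clock-frequency = <16000000>; (also the default) */
	u32 ext_freq = 16000000;		/* property value, in Hz */
	u32 can_clock = ext_freq / 2;		/* 8000000 Hz used for bit timing */

	/* DTS: nxp,clock-out-frequency = <8000000>; */
	u32 divider = can_clock * 2 / 8000000;	/* = 2 */
	u8 cdr_field = divider / 2 - 1;		/* = 0, i.e. CLKOUT = fosc/2 */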
+ config CAN_EMS_PCI tristate "EMS CPC-PCI and CPC-PCIe Card" depends on PCI && CAN_SJA1000 diff --git a/drivers/net/can/sja1000/Makefile b/drivers/net/can/sja1000/Makefile index d6c631f9e66..9d0c08da273 100644 --- a/drivers/net/can/sja1000/Makefile +++ b/drivers/net/can/sja1000/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_CAN_SJA1000) += sja1000.o obj-$(CONFIG_CAN_SJA1000_PLATFORM) += sja1000_platform.o +obj-$(CONFIG_CAN_SJA1000_OF_PLATFORM) += sja1000_of_platform.o obj-$(CONFIG_CAN_EMS_PCI) += ems_pci.o obj-$(CONFIG_CAN_KVASER_PCI) += kvaser_pci.o diff --git a/drivers/net/can/sja1000/sja1000_of_platform.c b/drivers/net/can/sja1000/sja1000_of_platform.c new file mode 100644 index 00000000000..aa953fb4b8d --- /dev/null +++ b/drivers/net/can/sja1000/sja1000_of_platform.c @@ -0,0 +1,233 @@ +/* + * Driver for SJA1000 CAN controllers on the OpenFirmware platform bus + * + * Copyright (C) 2008-2009 Wolfgang Grandegger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +/* This is a generic driver for SJA1000 chips on the OpenFirmware platform + * bus found on embedded PowerPC systems. You need a SJA1000 CAN node + * definition in your flattened device tree source (DTS) file similar to: + * + * can@3,100 { + * compatible = "nxp,sja1000"; + * reg = <3 0x100 0x80>; + * interrupts = <2 0>; + * interrupt-parent = <&mpic>; + * nxp,external-clock-frequency = <16000000>; + * }; + * + * See "Documentation/powerpc/dts-bindings/can/sja1000.txt" for further + * information. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "sja1000.h" + +#define DRV_NAME "sja1000_of_platform" + +MODULE_AUTHOR("Wolfgang Grandegger "); +MODULE_DESCRIPTION("Socket-CAN driver for SJA1000 on the OF platform bus"); +MODULE_LICENSE("GPL v2"); + +#define SJA1000_OFP_CAN_CLOCK (16000000 / 2) + +#define SJA1000_OFP_OCR OCR_TX0_PULLDOWN +#define SJA1000_OFP_CDR (CDR_CBP | CDR_CLK_OFF) + +static u8 sja1000_ofp_read_reg(const struct sja1000_priv *priv, int reg) +{ + return in_8(priv->reg_base + reg); +} + +static void sja1000_ofp_write_reg(const struct sja1000_priv *priv, + int reg, u8 val) +{ + out_8(priv->reg_base + reg, val); +} + +static int __devexit sja1000_ofp_remove(struct of_device *ofdev) +{ + struct net_device *dev = dev_get_drvdata(&ofdev->dev); + struct sja1000_priv *priv = netdev_priv(dev); + struct device_node *np = ofdev->node; + struct resource res; + + dev_set_drvdata(&ofdev->dev, NULL); + + unregister_sja1000dev(dev); + free_sja1000dev(dev); + iounmap(priv->reg_base); + irq_dispose_mapping(dev->irq); + + of_address_to_resource(np, 0, &res); + release_mem_region(res.start, resource_size(&res)); + + return 0; +} + +static int __devinit sja1000_ofp_probe(struct of_device *ofdev, + const struct of_device_id *id) +{ + struct device_node *np = ofdev->node; + struct net_device *dev; + struct sja1000_priv *priv; + struct resource res; + const u32 *prop; + int err, irq, res_size, prop_size; + void __iomem *base; + + err = of_address_to_resource(np, 0, &res); + if (err) { + dev_err(&ofdev->dev, "invalid address\n"); + return err; + } + + res_size = resource_size(&res); + + if (!request_mem_region(res.start, res_size, DRV_NAME)) { + dev_err(&ofdev->dev, "couldn't request %#x..%#x\n", + res.start, res.end); + return -EBUSY; + } + + base = ioremap_nocache(res.start, res_size); + if (!base) { + dev_err(&ofdev->dev, "couldn't ioremap %#x..%#x\n", + res.start, res.end); + err = -ENOMEM; + goto exit_release_mem; + } + + irq = irq_of_parse_and_map(np, 0); + if (irq == NO_IRQ) { + dev_err(&ofdev->dev, "no irq found\n"); + err = -ENODEV; + goto exit_unmap_mem; + } + + dev = alloc_sja1000dev(0); + if (!dev) { + err = -ENOMEM; + goto exit_dispose_irq; + } + + priv = netdev_priv(dev); + + priv->read_reg = sja1000_ofp_read_reg; + priv->write_reg = sja1000_ofp_write_reg; + + prop = of_get_property(np, "nxp,external-clock-frequency", &prop_size); + if (prop && (prop_size == sizeof(u32))) + priv->can.clock.freq = *prop / 2; + else + priv->can.clock.freq = SJA1000_OFP_CAN_CLOCK; /* default */ + + prop = of_get_property(np, "nxp,tx-output-mode", &prop_size); + if (prop && (prop_size == sizeof(u32))) + priv->ocr |= *prop & OCR_MODE_MASK; + else + priv->ocr |= OCR_MODE_NORMAL; /* default */ + + prop = of_get_property(np, "nxp,tx-output-config", &prop_size); + if (prop && (prop_size == sizeof(u32))) + priv->ocr |= (*prop << OCR_TX_SHIFT) & OCR_TX_MASK; + else + priv->ocr |= OCR_TX0_PULLDOWN; /* default */ + + prop = of_get_property(np, "nxp,clock-out-frequency", &prop_size); + if (prop && (prop_size == sizeof(u32)) && *prop) { + u32 divider = priv->can.clock.freq * 2 / *prop; + + if (divider > 1) + priv->cdr |= divider / 2 - 1; + else + priv->cdr |= CDR_CLKOUT_MASK; + } else { + priv->cdr |= CDR_CLK_OFF; /* default */ + } + + prop = of_get_property(np, "nxp,no-comparator-bypass", NULL); + if (!prop) + priv->cdr |= CDR_CBP; /* default */ + + priv->irq_flags = IRQF_SHARED; + priv->reg_base = base; + + dev->irq = irq; + + dev_info(&ofdev->dev, 
+ "reg_base=0x%p irq=%d clock=%d ocr=0x%02x cdr=0x%02x\n", + priv->reg_base, dev->irq, priv->can.clock.freq, + priv->ocr, priv->cdr); + + dev_set_drvdata(&ofdev->dev, dev); + SET_NETDEV_DEV(dev, &ofdev->dev); + + err = register_sja1000dev(dev); + if (err) { + dev_err(&ofdev->dev, "registering %s failed (err=%d)\n", + DRV_NAME, err); + goto exit_free_sja1000; + } + + return 0; + +exit_free_sja1000: + free_sja1000dev(dev); +exit_dispose_irq: + irq_dispose_mapping(irq); +exit_unmap_mem: + iounmap(base); +exit_release_mem: + release_mem_region(res.start, res_size); + + return err; +} + +static struct of_device_id __devinitdata sja1000_ofp_table[] = { + {.compatible = "nxp,sja1000"}, + {}, +}; + +static struct of_platform_driver sja1000_ofp_driver = { + .owner = THIS_MODULE, + .name = DRV_NAME, + .probe = sja1000_ofp_probe, + .remove = __devexit_p(sja1000_ofp_remove), + .match_table = sja1000_ofp_table, +}; + +static int __init sja1000_ofp_init(void) +{ + return of_register_platform_driver(&sja1000_ofp_driver); +} +module_init(sja1000_ofp_init); + +static void __exit sja1000_ofp_exit(void) +{ + return of_unregister_platform_driver(&sja1000_ofp_driver); +}; +module_exit(sja1000_ofp_exit); diff --git a/include/linux/can/platform/sja1000.h b/include/linux/can/platform/sja1000.h index 37966e630ff..01ee2aeb048 100644 --- a/include/linux/can/platform/sja1000.h +++ b/include/linux/can/platform/sja1000.h @@ -13,6 +13,7 @@ #define OCR_MODE_TEST 0x01 #define OCR_MODE_NORMAL 0x02 #define OCR_MODE_CLOCK 0x03 +#define OCR_MODE_MASK 0x07 #define OCR_TX0_INVERT 0x04 #define OCR_TX0_PULLDOWN 0x08 #define OCR_TX0_PULLUP 0x10 @@ -21,6 +22,8 @@ #define OCR_TX1_PULLDOWN 0x40 #define OCR_TX1_PULLUP 0x80 #define OCR_TX1_PUSHPULL 0xc0 +#define OCR_TX_MASK 0xfc +#define OCR_TX_SHIFT 2 struct sja1000_platform_data { u32 clock; /* CAN bus oscillator frequency in Hz */ -- cgit v1.2.3-70-g09d2 From 56d417b12e57dfe11c9b7ba4bea3882c62a55815 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Mon, 1 Jun 2009 03:07:33 -0700 Subject: IPv6: Add 'autoconf' and 'disable_ipv6' module parameters Add 'autoconf' and 'disable_ipv6' parameters to the IPv6 module. The first controls if IPv6 addresses are autoconfigured from prefixes received in Router Advertisements. The IPv6 loopback (::1) and link-local addresses are still configured. The second controls if IPv6 addresses are desired at all. No IPv6 addresses will be added to any interfaces. Signed-off-by: Brian Haley Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++ Documentation/networking/ipv6.txt | 37 +++++++++++++++ include/linux/ipv6.h | 6 +++ net/ipv6/addrconf.c | 83 ++++++++++++++++++++++++++++++++-- net/ipv6/af_inet6.c | 22 +++++++-- 5 files changed, 145 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 3ffd233c369..8be76235fe6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1057,6 +1057,13 @@ disable_ipv6 - BOOLEAN address. Default: FALSE (enable IPv6 operation) + When this value is changed from 1 to 0 (IPv6 is being enabled), + it will dynamically create a link-local address on the given + interface and start Duplicate Address Detection, if necessary. + + When this value is changed from 0 to 1 (IPv6 is being disabled), + it will dynamically delete all address on the given interface. + accept_dad - INTEGER Whether to accept DAD (Duplicate Address Detection). 
0: Disable DAD diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.txt index 268e5c103dd..9fd7e21296c 100644 --- a/Documentation/networking/ipv6.txt +++ b/Documentation/networking/ipv6.txt @@ -33,3 +33,40 @@ disable A reboot is required to enable IPv6. +autoconf + + Specifies whether to enable IPv6 address autoconfiguration + on all interfaces. This might be used when one does not wish + for addresses to be automatically generated from prefixes + received in Router Advertisements. + + The possible values and their effects are: + + 0 + IPv6 address autoconfiguration is disabled on all interfaces. + + Only the IPv6 loopback address (::1) and link-local addresses + will be added to interfaces. + + 1 + IPv6 address autoconfiguration is enabled on all interfaces. + + This is the default value. + +disable_ipv6 + + Specifies whether to disable IPv6 on all interfaces. + This might be used when no IPv6 addresses are desired. + + The possible values and their effects are: + + 0 + IPv6 is enabled on all interfaces. + + This is the default value. + + 1 + IPv6 is disabled on all interfaces. + + No IPv6 addresses will be added to interfaces. + diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 476d9464ac8..c662efa6828 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -169,6 +169,12 @@ struct ipv6_devconf { __s32 accept_dad; void *sysctl; }; + +struct ipv6_params { + __s32 disable_ipv6; + __s32 autoconf; +}; +extern struct ipv6_params ipv6_defaults; #endif /* index values for the variables in ipv6_devconf */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 31938e5fb22..c3488372f12 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -591,7 +591,6 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, { struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt; - struct net *net = dev_net(idev->dev); int hash; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -608,7 +607,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, goto out2; } - if (idev->cnf.disable_ipv6 || net->ipv6.devconf_all->disable_ipv6) { + if (idev->cnf.disable_ipv6) { err = -EACCES; goto out2; } @@ -1752,6 +1751,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) __u32 prefered_lft; int addr_type; struct inet6_dev *in6_dev; + struct net *net = dev_net(dev); pinfo = (struct prefix_info *) opt; @@ -1809,7 +1809,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) if (addrconf_finite_timeout(rt_expires)) rt_expires *= HZ; - rt = rt6_lookup(dev_net(dev), &pinfo->prefix, NULL, + rt = rt6_lookup(net, &pinfo->prefix, NULL, dev->ifindex, 1); if (rt && addrconf_is_prefix_route(rt)) { @@ -1846,7 +1846,6 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) struct inet6_ifaddr * ifp; struct in6_addr addr; int create = 0, update_lft = 0; - struct net *net = dev_net(dev); if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); @@ -3988,6 +3987,75 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table, return addrconf_fixup_forwarding(table, valp, val); } +static void dev_disable_change(struct inet6_dev *idev) +{ + if (!idev || !idev->dev) + return; + + if (idev->cnf.disable_ipv6) + addrconf_notify(NULL, NETDEV_DOWN, idev->dev); + else + addrconf_notify(NULL, NETDEV_UP, idev->dev); +} + +static void addrconf_disable_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + read_lock(&dev_base_lock); + 
for_each_netdev(net, dev) { + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.disable_ipv6) ^ (!newf); + idev->cnf.disable_ipv6 = newf; + if (changed) + dev_disable_change(idev); + } + rcu_read_unlock(); + } + read_unlock(&dev_base_lock); +} + +static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old) +{ + struct net *net; + + net = (struct net *)table->extra2; + + if (p == &net->ipv6.devconf_dflt->disable_ipv6) + return 0; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (p == &net->ipv6.devconf_all->disable_ipv6) { + __s32 newf = net->ipv6.devconf_all->disable_ipv6; + net->ipv6.devconf_dflt->disable_ipv6 = newf; + addrconf_disable_change(net, newf); + } else if ((!*p) ^ (!old)) + dev_disable_change((struct inet6_dev *)table->extra1); + + rtnl_unlock(); + return 0; +} + +static +int addrconf_sysctl_disable(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write) + ret = addrconf_disable_ipv6(ctl, valp, val); + return ret; +} + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; @@ -4225,7 +4293,8 @@ static struct addrconf_sysctl_table .data = &ipv6_devconf.disable_ipv6, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = addrconf_sysctl_disable, + .strategy = sysctl_intvec, }, { .ctl_name = CTL_UNNUMBERED, @@ -4346,6 +4415,10 @@ static int addrconf_init_net(struct net *net) dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); if (dflt == NULL) goto err_alloc_dflt; + } else { + /* these will be inherited by all namespaces */ + dflt->autoconf = ipv6_defaults.autoconf; + dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; } net->ipv6.devconf_all = all; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b6215be0963..85b3d0036af 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -72,9 +72,21 @@ MODULE_LICENSE("GPL"); static struct list_head inetsw6[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw6_lock); -static int disable_ipv6 = 0; -module_param_named(disable, disable_ipv6, int, 0); -MODULE_PARM_DESC(disable, "Disable IPv6 such that it is non-functional"); +struct ipv6_params ipv6_defaults = { + .disable_ipv6 = 0, + .autoconf = 1, +}; + +static int disable_ipv6_mod = 0; + +module_param_named(disable, disable_ipv6_mod, int, 0444); +MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional"); + +module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444); +MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); + +module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); +MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { @@ -1038,7 +1050,7 @@ static int __init inet6_init(void) for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); - if (disable_ipv6) { + if (disable_ipv6_mod) { printk(KERN_INFO "IPv6: Loaded, but administratively disabled, " "reboot required to enable\n"); @@ -1227,7 +1239,7 @@ module_init(inet6_init); static void __exit inet6_exit(void) { - if (disable_ipv6) + if (disable_ipv6_mod) return; /* First of all disallow new sockets creation. 
*/ -- cgit v1.2.3-70-g09d2 From 22a4f650d686eeaac3629dae1c4294381485efdf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 1 Jun 2009 10:13:37 +0200 Subject: perf_counter: Tidy up style details - whitespace fixlets - make local variable definitions more consistent [ Impact: cleanup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 39 +++++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 81ec79c9f19..0e57d8cc5a3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -562,7 +562,7 @@ struct perf_cpu_context { * * task, softirq, irq, nmi context */ - int recursion[4]; + int recursion[4]; }; #ifdef CONFIG_PERF_COUNTERS diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ff8b4636f84..df319c48c52 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -16,8 +16,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include @@ -65,7 +65,9 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } -int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, + +int __weak +hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { @@ -127,8 +129,8 @@ static void put_ctx(struct perf_counter_context *ctx) * This has to cope with with the fact that until it is locked, * the context could get moved to another task. 
*/ -static struct perf_counter_context *perf_lock_task_context( - struct task_struct *task, unsigned long *flags) +static struct perf_counter_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_counter_context *ctx; @@ -1330,9 +1332,9 @@ __perf_counter_init_context(struct perf_counter_context *ctx, static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; struct perf_counter_context *parent_ctx; + struct perf_counter_context *ctx; + struct perf_cpu_context *cpuctx; struct task_struct *task; unsigned long flags; int err; @@ -1664,8 +1666,8 @@ int perf_counter_task_disable(void) */ void perf_counter_update_userpage(struct perf_counter *counter) { - struct perf_mmap_data *data; struct perf_counter_mmap_page *userpg; + struct perf_mmap_data *data; rcu_read_lock(); data = rcu_dereference(counter->data); @@ -1769,10 +1771,11 @@ fail: static void __perf_mmap_data_free(struct rcu_head *rcu_head) { - struct perf_mmap_data *data = container_of(rcu_head, - struct perf_mmap_data, rcu_head); + struct perf_mmap_data *data; int i; + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) free_page((unsigned long)data->data_pages[i]); @@ -1801,8 +1804,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_counter *counter = vma->vm_file->private_data; WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, - &counter->mmap_mutex)) { + if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { struct user_struct *user = current_user(); atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); @@ -1821,11 +1823,11 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; + unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + unsigned long locked, lock_limit; unsigned long vma_size; unsigned long nr_pages; - unsigned long user_locked, user_lock_limit; - unsigned long locked, lock_limit; long user_extra, extra; int ret = 0; @@ -1900,8 +1902,8 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { - struct perf_counter *counter = filp->private_data; struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_counter *counter = filp->private_data; int retval; mutex_lock(&inode->i_mutex); @@ -2412,8 +2414,8 @@ static void perf_counter_output(struct perf_counter *counter, */ struct perf_comm_event { - struct task_struct *task; - char *comm; + struct task_struct *task; + char *comm; int comm_size; struct { @@ -2932,6 +2934,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { int neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3526,7 +3529,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Make the child state follow the state of the parent counter, * not its hw_event.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en,dis}able_family. + * so we won't race with perf_counter_{en, dis}able_family. 
*/ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) child_counter->state = PERF_COUNTER_STATE_INACTIVE; -- cgit v1.2.3-70-g09d2 From 112f38a7e36e9d688b389507136bf3af3e6d159b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Jun 2009 15:16:05 -0400 Subject: tracing: make trace pipe recognize latency format flag The trace_pipe did not recognize the latency format flag and would produce different output than the trace file. The problem was partly that the trace flags in the iterator were not set, and partly that trace_pipe zeros out part of the iterator (including the flags) in order to use the same routines as the trace file. The iterator's trace_flags cause no problems when they are not zeroed out for trace_pipe. Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bbf40f624fc..5c093ffc655 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -51,6 +51,7 @@ struct trace_iterator { int cpu_file; struct mutex mutex; struct ring_buffer_iter *buffer_iter[NR_CPUS]; + unsigned long iter_flags; /* The below is zeroed out in pipe_read */ struct trace_seq seq; @@ -58,7 +59,6 @@ struct trace_iterator { int cpu; u64 ts; - unsigned long iter_flags; loff_t pos; long idx; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3a8a87d7e9..cae34c69752 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2826,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); + if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + iter->cpu_file = cpu_file; iter->tr = &global_trace; mutex_init(&iter->mutex); -- cgit v1.2.3-70-g09d2 From f771bef98004d9d141b085d987a77d06669d4f4f Mon Sep 17 00:00:00 2001 From: Nivedita Singhvi Date: Thu, 28 May 2009 07:00:46 +0000 Subject: ipv4: New multicast-all socket option After some discussion offline with Christoph Lameter and David Stevens regarding multicast behaviour in Linux, I'm submitting a slightly modified patch from the one Christoph submitted earlier. This patch provides a new socket option IP_MULTICAST_ALL. Default behaviour is _unchanged_ from the current Linux standard: the socket option is set by default to preserve the original behaviour. Sockets wishing to receive data only from multicast groups they join explicitly will need to clear this socket option. Signed-off-by: Nivedita Singhvi Signed-off-by: Christoph Lameter Acked-by: David Stevens Signed-off-by: David S.
Miller --- include/linux/in.h | 1 + include/net/inet_sock.h | 3 ++- net/ipv4/af_inet.c | 1 + net/ipv4/igmp.c | 2 +- net/ipv4/ip_sockglue.c | 11 +++++++++++ 5 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/in.h b/include/linux/in.h index d60122a3a08..cf196da04ec 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -107,6 +107,7 @@ struct in_addr { #define MCAST_JOIN_SOURCE_GROUP 46 #define MCAST_LEAVE_SOURCE_GROUP 47 #define MCAST_MSFILTER 48 +#define IP_MULTICAST_ALL 49 #define MCAST_EXCLUDE 0 #define MCAST_INCLUDE 1 diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index de0ecc71cf0..20a6957af87 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -130,7 +130,8 @@ struct inet_sock { freebind:1, hdrincl:1, mc_loop:1, - transparent:1; + transparent:1, + mc_all:1; int mc_index; __be32 mc_addr; struct ip_mc_socklist *mc_list; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5abee4c9744..d8736217858 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -375,6 +375,7 @@ lookup_protocol: inet->uc_ttl = -1; inet->mc_loop = 1; inet->mc_ttl = 1; + inet->mc_all = 1; inet->mc_index = 0; inet->mc_list = NULL; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9eb6219af61..e6058a50379 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2196,7 +2196,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) break; } if (!pmc) - return 1; + return inet->mc_all; psl = pmc->sflist; if (!psl) return pmc->sfmode == MCAST_EXCLUDE; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 21b0187123d..cb49936856e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -453,6 +453,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<= sizeof(int)) { @@ -898,6 +899,13 @@ mc_msf_out: kfree(gsf); break; } + case IP_MULTICAST_ALL: + if (optlen < 1) + goto e_inval; + if (val != 0 && val != 1) + goto e_inval; + inet->mc_all = val; + break; case IP_ROUTER_ALERT: err = ip_ra_control(sk, val ? 1 : 0, NULL); break; @@ -1151,6 +1159,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, release_sock(sk); return err; } + case IP_MULTICAST_ALL: + val = inet->mc_all; + break; case IP_PKTOPTIONS: { struct msghdr msg; -- cgit v1.2.3-70-g09d2 From 0e0ee1cc33de8f0cc603269b354085dee340afa0 Mon Sep 17 00:00:00 2001 From: Dmitry Pervushin Date: Wed, 29 Apr 2009 19:29:38 +0400 Subject: UBI: add notification API UBI volume notifications provide an API for getting clients notified about volume creation, deletion, re-naming and re-sizing. A client can subscribe to these notifications using 'ubi_register_volume_notifier()' and cancel the subscription using 'ubi_unregister_volume_notifier()'. When UBI volumes change, a blocking notifier is called. Clients can also request "added" events for all volumes that existed before the client subscribed to the notifications. If we use notifications instead of calling functions like 'ubi_gluebi_xxx()', we can make the MTD emulation layer more flexible: build it as a separate module and load/unload it on demand.
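As an illustration (not part of this patch), a minimal notification client built only on the API added below might look like the following sketch; the function and module names here are made up, while ubi_register_volume_notifier(), ubi_unregister_volume_notifier(), struct ubi_notification and the UBI_VOLUME_* constants come from the patch:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/mtd/ubi.h>

/* Hypothetical client: log volume add/remove events. */
static int my_ubi_notify(struct notifier_block *nb,
			 unsigned long ntype, void *data)
{
	struct ubi_notification *nt = data;

	switch (ntype) {
	case UBI_VOLUME_ADDED:
		pr_info("ubi%d: volume %d added\n",
			nt->di.ubi_num, nt->vi.vol_id);
		break;
	case UBI_VOLUME_REMOVED:
		pr_info("ubi%d: volume %d removed\n",
			nt->di.ubi_num, nt->vi.vol_id);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_ubi_nb = {
	.notifier_call	= my_ubi_notify,
};

static int __init my_client_init(void)
{
	/* 0: also get "added" events for volumes that already exist */
	return ubi_register_volume_notifier(&my_ubi_nb, 0);
}
module_init(my_client_init);

static void __exit my_client_exit(void)
{
	ubi_unregister_volume_notifier(&my_ubi_nb);
}
module_exit(my_client_exit);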
[Artem: many cleanups, rework locking, add "updated" event, provide device/volume info in notifiers] Signed-off-by: Dmitry Pervushin Signed-off-by: Artem Bityutskiy --- drivers/mtd/ubi/build.c | 100 +++++++++++++++++++++++++++++++++++++++++--- drivers/mtd/ubi/cdev.c | 1 + drivers/mtd/ubi/kapi.c | 108 +++++++++++++++++++++++++++++++++++++++++------- drivers/mtd/ubi/ubi.h | 12 ++++++ drivers/mtd/ubi/vmt.c | 4 ++ include/linux/mtd/ubi.h | 37 +++++++++++++++++ 6 files changed, 241 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index d3da6668266..964a99d48bc 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -121,6 +121,94 @@ static struct device_attribute dev_bgt_enabled = static struct device_attribute dev_mtd_num = __ATTR(mtd_num, S_IRUGO, dev_attribute_show, NULL); +/** + * ubi_volume_notify - send a volume change notification. + * @ubi: UBI device description object + * @vol: volume description object of the changed volume + * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) + * + * This is a helper function which notifies all subscribers about a volume + * change event (creation, removal, re-sizing, re-naming, updating). Returns + * zero in case of success and a negative error code in case of failure. + */ +int ubi_volume_notify(struct ubi_device *ubi, struct ubi_volume *vol, int ntype) +{ + struct ubi_notification nt; + + ubi_do_get_device_info(ubi, &nt.di); + ubi_do_get_volume_info(ubi, vol, &nt.vi); + return blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt); +} + +/** + * ubi_notify_all - send a notification to all volumes. + * @ubi: UBI device description object + * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) + * @nb: the notifier to call + * + * This function walks all volumes of UBI device @ubi and sends the @ntype + * notification for each volume. If @nb is %NULL, then all registered notifiers + * are called, otherwise only the @nb notifier is called. Returns the number of + * sent notifications. + */ +int ubi_notify_all(struct ubi_device *ubi, int ntype, struct notifier_block *nb) +{ + struct ubi_notification nt; + int i, count = 0; + + ubi_do_get_device_info(ubi, &nt.di); + + mutex_lock(&ubi->device_mutex); + for (i = 0; i < ubi->vtbl_slots; i++) { + /* + * Since the @ubi->device is locked, and we are not going to + * change @ubi->volumes, we do not have to lock + * @ubi->volumes_lock. + */ + if (!ubi->volumes[i]) + continue; + + ubi_do_get_volume_info(ubi, ubi->volumes[i], &nt.vi); + if (nb) + nb->notifier_call(nb, ntype, &nt); + else + blocking_notifier_call_chain(&ubi_notifiers, ntype, + &nt); + count += 1; + } + mutex_unlock(&ubi->device_mutex); + + return count; +} + +/** + * ubi_enumerate_volumes - send "add" notification for all existing volumes. + * @nb: the notifier to call + * + * This function walks all UBI devices and volumes and sends the + * %UBI_VOLUME_ADDED notification for each volume. If @nb is %NULL, then all + * registered notifiers are called, otherwise only the @nb notifier is called. + * Returns the number of sent notifications. + */ +int ubi_enumerate_volumes(struct notifier_block *nb) +{ + int i, count = 0; + + /* + * Since the @ubi_devices_mutex is locked, and we are not going to + * change @ubi_devices, we do not have to lock @ubi_devices_lock. 
+ */ + for (i = 0; i < UBI_MAX_DEVICES; i++) { + struct ubi_device *ubi = ubi_devices[i]; + + if (!ubi) + continue; + count += ubi_notify_all(ubi, UBI_VOLUME_ADDED, nb); + } + + return count; +} + /** * ubi_get_device - get UBI device. * @ubi_num: UBI device number @@ -891,6 +979,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset) spin_unlock(&ubi->wl_lock); ubi_devices[ubi_num] = ubi; + ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; out_uif: @@ -933,13 +1022,13 @@ int ubi_detach_mtd_dev(int ubi_num, int anyway) if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) return -EINVAL; - spin_lock(&ubi_devices_lock); - ubi = ubi_devices[ubi_num]; - if (!ubi) { - spin_unlock(&ubi_devices_lock); + ubi = ubi_get_device(ubi_num); + if (!ubi) return -EINVAL; - } + spin_lock(&ubi_devices_lock); + put_device(&ubi->dev); + ubi->ref_count -= 1; if (ubi->ref_count) { if (!anyway) { spin_unlock(&ubi_devices_lock); @@ -953,6 +1042,7 @@ int ubi_detach_mtd_dev(int ubi_num, int anyway) spin_unlock(&ubi_devices_lock); ubi_assert(ubi_num == ubi->ubi_num); + ubi_notify_all(ubi, UBI_VOLUME_REMOVED, NULL); dbg_msg("detaching mtd%d from ubi%d", ubi->mtd->index, ubi_num); /* diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 9a2b217941f..631983615f1 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -396,6 +396,7 @@ static ssize_t vol_cdev_write(struct file *file, const char __user *buf, } vol->checked = 1; ubi_gluebi_updated(vol); + ubi_volume_notify(ubi, vol, UBI_VOLUME_UPDATED); revoke_exclusive(desc, UBI_READWRITE); } diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 2675207c5fe..88a72e9c8be 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -25,6 +25,24 @@ #include #include "ubi.h" +/** + * ubi_do_get_device_info - get information about UBI device. + * @ubi: UBI device description object + * @di: the information is stored here + * + * This function is the same as 'ubi_get_device_info()', but it assumes the UBI + * device is locked and cannot disappear. + */ +void ubi_do_get_device_info(struct ubi_device *ubi, struct ubi_device_info *di) +{ + di->ubi_num = ubi->ubi_num; + di->leb_size = ubi->leb_size; + di->min_io_size = ubi->min_io_size; + di->ro_mode = ubi->ro_mode; + di->cdev = ubi->cdev.dev; +} +EXPORT_SYMBOL_GPL(ubi_do_get_device_info); + /** * ubi_get_device_info - get information about UBI device. * @ubi_num: UBI device number @@ -39,33 +57,24 @@ int ubi_get_device_info(int ubi_num, struct ubi_device_info *di) if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) return -EINVAL; - ubi = ubi_get_device(ubi_num); if (!ubi) return -ENODEV; - - di->ubi_num = ubi->ubi_num; - di->leb_size = ubi->leb_size; - di->min_io_size = ubi->min_io_size; - di->ro_mode = ubi->ro_mode; - di->cdev = ubi->cdev.dev; - + ubi_do_get_device_info(ubi, di); ubi_put_device(ubi); return 0; } EXPORT_SYMBOL_GPL(ubi_get_device_info); /** - * ubi_get_volume_info - get information about UBI volume. - * @desc: volume descriptor + * ubi_do_get_volume_info - get information about UBI volume. 
+ * @ubi: UBI device description object + * @vol: volume description object * @vi: the information is stored here */ -void ubi_get_volume_info(struct ubi_volume_desc *desc, - struct ubi_volume_info *vi) +void ubi_do_get_volume_info(struct ubi_device *ubi, struct ubi_volume *vol, + struct ubi_volume_info *vi) { - const struct ubi_volume *vol = desc->vol; - const struct ubi_device *ubi = vol->ubi; - vi->vol_id = vol->vol_id; vi->ubi_num = ubi->ubi_num; vi->size = vol->reserved_pebs; @@ -79,6 +88,17 @@ void ubi_get_volume_info(struct ubi_volume_desc *desc, vi->name = vol->name; vi->cdev = vol->cdev.dev; } + +/** + * ubi_get_volume_info - get information about UBI volume. + * @desc: volume descriptor + * @vi: the information is stored here + */ +void ubi_get_volume_info(struct ubi_volume_desc *desc, + struct ubi_volume_info *vi) +{ + ubi_do_get_volume_info(desc->vol->ubi, desc->vol, vi); +} EXPORT_SYMBOL_GPL(ubi_get_volume_info); /** @@ -561,7 +581,7 @@ int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum) EXPORT_SYMBOL_GPL(ubi_leb_unmap); /** - * ubi_leb_map - map logical erasblock to a physical eraseblock. + * ubi_leb_map - map logical eraseblock to a physical eraseblock. * @desc: volume descriptor * @lnum: logical eraseblock number * @dtype: expected data type @@ -659,3 +679,59 @@ int ubi_sync(int ubi_num) return 0; } EXPORT_SYMBOL_GPL(ubi_sync); + +BLOCKING_NOTIFIER_HEAD(ubi_notifiers); + +/** + * ubi_register_volume_notifier - register a volume notifier. + * @nb: the notifier description object + * @ignore_existing: if non-zero, do not send "added" notification for all + * already existing volumes + * + * This function registers a volume notifier, which means that + * 'nb->notifier_call()' will be invoked when an UBI volume is created, + * removed, re-sized, re-named, or updated. The first argument of the function + * is the notification type. The second argument is pointer to a + * &struct ubi_notification object which describes the notification event. + * Using UBI API from the volume notifier is prohibited. + * + * This function returns zero in case of success and a negative error code + * in case of failure. + */ +int ubi_register_volume_notifier(struct notifier_block *nb, + int ignore_existing) +{ + int err; + + err = blocking_notifier_chain_register(&ubi_notifiers, nb); + if (err != 0) + return err; + if (ignore_existing) + return 0; + + /* + * We are going to walk all UBI devices and all volumes, and + * notify the user about existing volumes by the %UBI_VOLUME_ADDED + * event. We have to lock the @ubi_devices_mutex to make sure UBI + * devices do not disappear. + */ + mutex_lock(&ubi_devices_mutex); + ubi_enumerate_volumes(nb); + mutex_unlock(&ubi_devices_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(ubi_register_volume_notifier); + +/** + * ubi_unregister_volume_notifier - unregister the volume notifier. + * @nb: the notifier description object + * + * This function unregisters volume notifier @nm and returns zero in case of + * success and a negative error code in case of failure. 
+ */ +int ubi_unregister_volume_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&ubi_notifiers, nb); +} +EXPORT_SYMBOL_GPL(ubi_unregister_volume_notifier); diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 6d929329a8d..86e1a4e0ab0 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -38,6 +38,7 @@ #include #include #include +#include #include "ubi-media.h" #include "scan.h" @@ -483,6 +484,7 @@ extern const struct file_operations ubi_cdev_operations; extern const struct file_operations ubi_vol_cdev_operations; extern struct class *ubi_class; extern struct mutex ubi_devices_mutex; +extern struct blocking_notifier_head ubi_notifiers; /* vtbl.c */ int ubi_change_vtbl_record(struct ubi_device *ubi, int idx, @@ -575,6 +577,16 @@ struct ubi_device *ubi_get_device(int ubi_num); void ubi_put_device(struct ubi_device *ubi); struct ubi_device *ubi_get_by_major(int major); int ubi_major2num(int major); +int ubi_volume_notify(struct ubi_device *ubi, struct ubi_volume *vol, + int ntype); +int ubi_notify_all(struct ubi_device *ubi, int ntype, + struct notifier_block *nb); +int ubi_enumerate_volumes(struct notifier_block *nb); + +/* kapi.c */ +void ubi_do_get_device_info(struct ubi_device *ubi, struct ubi_device_info *di); +void ubi_do_get_volume_info(struct ubi_device *ubi, struct ubi_volume *vol, + struct ubi_volume_info *vi); /* * ubi_rb_for_each_entry - walk an RB-tree. diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 8e8d6fae7a0..e151862a3a9 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -358,6 +358,7 @@ int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req) ubi->vol_count += 1; spin_unlock(&ubi->volumes_lock); + ubi_volume_notify(ubi, vol, UBI_VOLUME_ADDED); if (paranoid_check_volumes(ubi)) dbg_err("check failed while creating volume %d", vol_id); return err; @@ -466,6 +467,7 @@ int ubi_remove_volume(struct ubi_volume_desc *desc, int no_vtbl) ubi->vol_count -= 1; spin_unlock(&ubi->volumes_lock); + ubi_volume_notify(ubi, vol, UBI_VOLUME_REMOVED); if (!no_vtbl && paranoid_check_volumes(ubi)) dbg_err("check failed while removing volume %d", vol_id); @@ -589,6 +591,7 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) (long long)vol->used_ebs * vol->usable_leb_size; } + ubi_volume_notify(ubi, vol, UBI_VOLUME_RESIZED); if (paranoid_check_volumes(ubi)) dbg_err("check failed while re-sizing volume %d", vol_id); return err; @@ -635,6 +638,7 @@ int ubi_rename_volumes(struct ubi_device *ubi, struct list_head *rename_list) vol->name_len = re->new_name_len; memcpy(vol->name, re->new_name, re->new_name_len + 1); spin_unlock(&ubi->volumes_lock); + ubi_volume_notify(ubi, vol, UBI_VOLUME_RENAMED); } } diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index 6316fafe5c2..6913b71d9ab 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -132,6 +132,39 @@ struct ubi_device_info { dev_t cdev; }; +/* + * enum - volume notification types. + * @UBI_VOLUME_ADDED: volume has been added + * @UBI_VOLUME_REMOVED: volume has been removed + * @UBI_VOLUME_RESIZED: volume has been re-sized + * @UBI_VOLUME_RENAMED: volume has been re-named + * @UBI_VOLUME_UPDATED: volume contents have been updated + * + * These constants define which type of event has happened when a volume + * notification function is invoked.
+ */ +enum { + UBI_VOLUME_ADDED, + UBI_VOLUME_REMOVED, + UBI_VOLUME_RESIZED, + UBI_VOLUME_RENAMED, + UBI_VOLUME_UPDATED, +}; + +/* + * struct ubi_notification - UBI notification description structure. + * @di: UBI device description object + * @vi: UBI volume description object + * + * UBI notifiers are called with a pointer to an object of this type. The + * object describes the notification. Namely, it provides a description of the + * UBI device and UBI volume the notification informs about. + */ +struct ubi_notification { + struct ubi_device_info di; + struct ubi_volume_info vi; +}; + /* UBI descriptor given to users when they open UBI volumes */ struct ubi_volume_desc; @@ -141,6 +174,10 @@ void ubi_get_volume_info(struct ubi_volume_desc *desc, struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode); struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name, int mode); +int ubi_register_volume_notifier(struct notifier_block *nb, + int ignore_existing); +int ubi_unregister_volume_notifier(struct notifier_block *nb); + void ubi_close_volume(struct ubi_volume_desc *desc); int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, int len, int check); -- cgit v1.2.3-70-g09d2 From 3f731ca60afc29f5bcdb5fd2a04391466313a9ac Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:52:30 +1000 Subject: perf_counter: Fix cpu migration counter This fixes the cpu migration software counter to count correctly even when contexts get swapped from one task to another. Previously the cpu migration counts reported by perf stat were bogus, ranging from negative to several thousand for a single "lat_ctx 2 8 32" run. With this patch the cpu migration count reported for "lat_ctx 2 8 32" is almost always between 35 and 44. This fixes the problem by adding a call into the perf_counter code from set_task_cpu when tasks are migrated. This enables us to use the generic swcounter code (with some modifications) for the cpu migration counter. This modifies the swcounter code to allow a NULL regs pointer to be passed in to perf_swcounter_ctx_event() etc. The cpu migration counter does this because there isn't necessarily a pt_regs struct for the task available. In this case, the counter will not have interrupt capability - but the migration counter didn't have interrupt capability before, so this is no loss. 
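As a sketch of the convention this patch introduces (not code from the patch itself): any hook that fires in a context without register state can now feed the generic software counter path by passing a NULL regs pointer, just as perf_counter_task_migration() does below. With NULL regs the exclude_user/exclude_kernel filters and the overflow/sampling path are simply skipped:

/*
 * Hypothetical hook following the same pattern as
 * perf_counter_task_migration(); MY_SW_EVENT is a made-up event id.
 */
static void my_event_hook(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);

	/* NULL regs: no user/kernel filtering, no sampling interrupt */
	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
				 MY_SW_EVENT, 1, 1, NULL, 0);
}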
Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.35006.819769.416327@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++ kernel/perf_counter.c | 74 +++++++++++++------------------------------- kernel/sched.c | 1 + 3 files changed, 26 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0e57d8cc5a3..deb9acf9ad2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -615,6 +615,8 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len, extern void perf_counter_comm(struct task_struct *tsk); +extern void perf_counter_task_migration(struct task_struct *task, int cpu); + #define MAX_STACK_DEPTH 255 struct perf_callchain_entry { @@ -668,6 +670,8 @@ perf_counter_munmap(unsigned long addr, unsigned long len, static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } +static inline void perf_counter_task_migration(struct task_struct *task, + int cpu) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8d2653f137e..cd94cf3bf9e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2921,11 +2921,13 @@ static int perf_swcounter_match(struct perf_counter *counter, if (counter->hw_event.config != event_config) return 0; - if (counter->hw_event.exclude_user && user_mode(regs)) - return 0; + if (regs) { + if (counter->hw_event.exclude_user && user_mode(regs)) + return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) - return 0; + if (counter->hw_event.exclude_kernel && !user_mode(regs)) + return 0; + } return 1; } @@ -2935,7 +2937,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) + if (counter->hw.irq_period && !neg && regs) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3151,55 +3153,24 @@ static const struct pmu perf_ops_task_clock = { /* * Software counter: cpu migrations */ - -static inline u64 get_cpu_migrations(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->se.nr_migrations; - return cpu_nr_migrations(smp_processor_id()); -} - -static void cpu_migrations_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_cpu_migrations(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void cpu_migrations_perf_counter_read(struct perf_counter *counter) +void perf_counter_task_migration(struct task_struct *task, int cpu) { - cpu_migrations_perf_counter_update(counter); -} + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx; -static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_cpu_migrations(counter)); - return 0; -} + perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_CPU_MIGRATIONS, + 1, 1, NULL, 0); -static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) -{ - cpu_migrations_perf_counter_update(counter); + ctx = 
perf_pin_task_context(task); + if (ctx) { + perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_CPU_MIGRATIONS, + 1, 1, NULL, 0); + perf_unpin_context(ctx); + } } -static const struct pmu perf_ops_cpu_migrations = { - .enable = cpu_migrations_perf_counter_enable, - .disable = cpu_migrations_perf_counter_disable, - .read = cpu_migrations_perf_counter_read, -}; - #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { @@ -3272,11 +3243,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - pmu = &perf_ops_generic; - break; case PERF_COUNT_CPU_MIGRATIONS: - if (!counter->hw_event.exclude_kernel) - pmu = &perf_ops_cpu_migrations; + pmu = &perf_ops_generic; break; } diff --git a/kernel/sched.c b/kernel/sched.c index 3226cc132e9..8d43347a0c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1977,6 +1977,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif + perf_counter_task_migration(p, new_cpu); } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; -- cgit v1.2.3-70-g09d2 From bf4e0ed3d027ce581be18496036862131b5f32aa Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:53:16 +1000 Subject: perf_counter: Remove unused prev_state field This removes the prev_state field of struct perf_counter since it is now unused. It was only used by the cpu migration counter, which doesn't use it any more. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.35052.915728.626374@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 4 ---- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index deb9acf9ad2..d970fbc16af 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -427,7 +427,6 @@ struct perf_counter { const struct pmu *pmu; enum perf_counter_active_state state; - enum perf_counter_active_state prev_state; atomic64_t count; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index cd94cf3bf9e..fbed4d28ad7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -572,7 +572,6 @@ group_sched_in(struct perf_counter *group_counter, if (ret) return ret < 0 ? 
ret : 0; - group_counter->prev_state = group_counter->state; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -580,7 +579,6 @@ group_sched_in(struct perf_counter *group_counter, * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter->prev_state = counter->state; if (counter_sched_in(counter, cpuctx, ctx, cpu)) { partial_group = counter; goto group_error; @@ -657,7 +655,6 @@ static void add_counter_to_ctx(struct perf_counter *counter, struct perf_counter_context *ctx) { list_add_counter(counter, ctx); - counter->prev_state = PERF_COUNTER_STATE_OFF; counter->tstamp_enabled = ctx->time; counter->tstamp_running = ctx->time; counter->tstamp_stopped = ctx->time; @@ -820,7 +817,6 @@ static void __perf_counter_enable(void *info) ctx->is_active = 1; update_context_time(ctx); - counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; -- cgit v1.2.3-70-g09d2 From 874ab9233eeddb85fd2dd85131c145bde75da39a Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 2 Jun 2009 13:58:56 +0200 Subject: netfilter: nf_ct_tcp: TCP simultaneous open support The patch below adds support for TCP simultaneous open to conntrack. The unused LISTEN state is replaced by a new state (SYN_SENT2) denoting the second SYN, sent from the reply direction in the simultaneous open case. The state table is updated and the function tcp_in_window is modified to handle simultaneous open. The functionality can fairly easily be tested with socat. A sample tcpdump recording: 23:21:34.244733 IP (tos 0x0, ttl 64, id 49224, offset 0, flags [DF], proto TCP (6), length 60) 192.168.0.254.2020 > 192.168.0.1.2020: S, cksum 0xe75f (correct), 3383710133:3383710133(0) win 5840 23:21:34.244783 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40) 192.168.0.1.2020 > 192.168.0.254.2020: R, cksum 0x0253 (correct), 0:0(0) ack 3383710134 win 0 23:21:36.038680 IP (tos 0x0, ttl 64, id 28092, offset 0, flags [DF], proto TCP (6), length 60) 192.168.0.1.2020 > 192.168.0.254.2020: S, cksum 0x704b (correct), 2634546729:2634546729(0) win 5840 23:21:36.038777 IP (tos 0x0, ttl 64, id 49225, offset 0, flags [DF], proto TCP (6), length 60) 192.168.0.254.2020 > 192.168.0.1.2020: S, cksum 0xb179 (correct), 3383710133:3383710133(0) ack 2634546730 win 5840 23:21:36.038847 IP (tos 0x0, ttl 64, id 28093, offset 0, flags [DF], proto TCP (6), length 52) 192.168.0.1.2020 > 192.168.0.254.2020: ., cksum 0xebad (correct), ack 3383710134 win 2920 and the corresponding netlink events: [NEW] tcp 6 120 SYN_SENT src=192.168.0.254 dst=192.168.0.1 sport=2020 dport=2020 [UNREPLIED] src=192.168.0.1 dst=192.168.0.254 sport=2020 dport=2020 [UPDATE] tcp 6 120 LISTEN src=192.168.0.254 dst=192.168.0.1 sport=2020 dport=2020 src=192.168.0.1 dst=192.168.0.254 sport=2020 dport=2020 [UPDATE] tcp 6 60 SYN_RECV src=192.168.0.254 dst=192.168.0.1 sport=2020 dport=2020 src=192.168.0.1 dst=192.168.0.254 sport=2020 dport=2020 [UPDATE] tcp 6 432000 ESTABLISHED src=192.168.0.254 dst=192.168.0.1 sport=2020 dport=2020 src=192.168.0.1 dst=192.168.0.254 sport=2020 dport=2020 [ASSURED] The RST packet was dropped in the raw table, thus it did not reach conntrack. nfnetlink_conntrack is unpatched so it shows the new SYN_SENT2 state as the old unused LISTEN. With TCP simultaneous open support we satisfy REQ-2 in RFC 5382 ;-).
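For reference, a sketch of how conntrack consults the state table below (assuming the helpers of nf_conntrack_proto_tcp.c; this is not code from the patch):

	/* tcp_packet() picks the next state by packet direction,
	 * TCP flag index and current state: */
	new_state = tcp_conntracks[dir][get_conntrack_index(th)][old_state];

With this patch a simultaneous open now walks the table as follows:

	ORIGINAL syn:     sNO -> sSS
	REPLY    syn:     sSS -> sS2   (the new SYN_SENT2 state)
	ORIGINAL synack:  sS2 -> sSR
	ack:              sSR -> sES   (as for a normal open)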
An additional minor correction in this patch: in order to catch uninitialized reply directions, "td_maxwin == 0" is used instead of "td_end == 0", because the former cannot be true except in the uninitialized state, while td_end may accidentally be equal to zero in the middle of a connection. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_tcp.h | 3 +- net/netfilter/nf_conntrack_proto_tcp.c | 98 +++++++++++++++++++----------- 2 files changed, 63 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h index 3066789b972..74c27ca770e 100644 --- a/include/linux/netfilter/nf_conntrack_tcp.h +++ b/include/linux/netfilter/nf_conntrack_tcp.h @@ -15,7 +15,8 @@ enum tcp_conntrack { TCP_CONNTRACK_LAST_ACK, TCP_CONNTRACK_TIME_WAIT, TCP_CONNTRACK_CLOSE, - TCP_CONNTRACK_LISTEN, + TCP_CONNTRACK_LISTEN, /* obsolete */ +#define TCP_CONNTRACK_SYN_SENT2 TCP_CONNTRACK_LISTEN TCP_CONNTRACK_MAX, TCP_CONNTRACK_IGNORE }; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b5ccf2b4b2e..4c7f6f0dae9 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -59,7 +59,7 @@ static const char *const tcp_conntrack_names[] = { "LAST_ACK", "TIME_WAIT", "CLOSE", - "LISTEN" + "SYN_SENT2", }; #define SECS * HZ @@ -82,6 +82,7 @@ static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { [TCP_CONNTRACK_LAST_ACK] = 30 SECS, [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE] = 10 SECS, + [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, }; #define sNO TCP_CONNTRACK_NONE @@ -93,7 +94,7 @@ static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE -#define sLI TCP_CONNTRACK_LISTEN +#define sS2 TCP_CONNTRACK_SYN_SENT2 #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE @@ -123,6 +124,7 @@ enum tcp_bit_set { * * NONE: initial state * SYN_SENT: SYN-only packet seen + * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen @@ -131,26 +133,24 @@ enum tcp_bit_set { * TIME_WAIT: last ACK seen * CLOSE: closed connection (RST) * - * LISTEN state is not used. - * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid * and the receiver may send back a connection * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): - * if they are invalid - * or we do not support the request (simultaneous open) + * if we regard them as truly invalid packets */ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN - * sSR -> sIG Late retransmitted SYN? + * sS2 -> sS2 Late retransmitted SYN + * sSR -> sIG * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. @@ -161,22 +161,30 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sSS Reopened connection (RFC 1122).
* sCL -> sSS */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* - * A SYN/ACK from the client is always invalid: - * - either it tries to set up a simultaneous open, which is - * not supported; - * - or the firewall has just been inserted between the two hosts - * during the session set-up. The SYN will be retransmitted - * by the true client (or it'll time out). + * sNO -> sIV Too late and no reason to do anything + * sSS -> sIV Client can't send SYN and then SYN/ACK + * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open + * sSR -> sIG + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client migth not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. + * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for @@ -187,11 +195,12 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sS2 -> sIV * sSR -> sES Established state is reached. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -200,29 +209,31 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 }, /* * sNO -> sIV Never reached. - * sSS -> sIV Simultaneous open, not supported - * sSR -> sIV Simultaneous open, not supported. - * sES -> sIV Server may not initiate a connection. + * sSS -> sS2 Simultaneous open + * sS2 -> sS2 Retransmitted simultaneous SYN + * sSR -> sIV Invalid SYN packets sent by the server + * sES -> sIV * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV Reopened connection, but server may not do it. * sCL -> sIV */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. + * sS2 -> sSR Simultaneous open * sSR -> sSR Retransmitted SYN/ACK. * sES -> sIG Late retransmitted SYN/ACK? 
* sFW -> sIG Might be SYN/ACK answering ignored SYN @@ -231,10 +242,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sIG * sCL -> sIG */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. + * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. @@ -243,10 +255,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, /* * sSS -> sIG Might be a half-open connection. + * sS2 -> sIG * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -255,8 +268,8 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; @@ -521,13 +534,14 @@ static bool tcp_in_window(const struct nf_conn *ct, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); - if (sender->td_end == 0) { + if (sender->td_maxwin == 0) { /* * Initialize sender data. */ - if (tcph->syn && tcph->ack) { + if (tcph->syn) { /* - * Outgoing SYN-ACK in reply to a SYN. + * SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. */ sender->td_end = sender->td_maxend = end; @@ -543,6 +557,9 @@ static bool tcp_in_window(const struct nf_conn *ct, && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) sender->td_scale = receiver->td_scale = 0; + if (!tcph->ack) + /* Simultaneous open */ + return true; } else { /* * We are in the middle of a connection, @@ -1068,7 +1085,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.tcp.seen[1].td_end = 0; ct->proto.tcp.seen[1].td_maxend = 0; - ct->proto.tcp.seen[1].td_maxwin = 1; + ct->proto.tcp.seen[1].td_maxwin = 0; ct->proto.tcp.seen[1].td_scale = 0; /* tcp_packet will set them */ @@ -1309,6 +1326,13 @@ static struct ctl_table tcp_compat_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "ip_conntrack_tcp_timeout_syn_sent2", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { .procname = "ip_conntrack_tcp_timeout_syn_recv", .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], -- cgit v1.2.3-70-g09d2 From 709e50cf870e61745b39552044aa6c7c38e4f9e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 14:13:15 +0200 Subject: perf_counter: Use PID namespaces properly Stop using task_struct::pid and start using PID namespaces. PIDs will be reported in the PID namespace of the monitoring task at the moment of counter creation. Signed-off-by: Peter Zijlstra Cc: Eric W. 
Biederman Cc: Oleg Nesterov Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ kernel/perf_counter.c | 42 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d970fbc16af..9ec20fc6bd3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -317,6 +317,7 @@ enum perf_event_type { #include #include #include +#include #include struct task_struct; @@ -500,6 +501,8 @@ struct perf_counter { void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; + + struct pid_namespace *ns; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fbed4d28ad7..caa012cfe49 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1432,6 +1432,8 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + if (counter->ns) + put_pid_ns(counter->ns); kfree(counter); } @@ -2267,6 +2269,28 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } +static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_tgid_nr_ns(p, counter->ns); +} + +static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_pid_nr_ns(p, counter->ns); +} + static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { @@ -2303,8 +2327,8 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TID) { /* namespace issues */ - tid_entry.pid = current->group_leader->pid; - tid_entry.tid = current->pid; + tid_entry.pid = perf_counter_pid(counter, current); + tid_entry.tid = perf_counter_tid(counter, current); header.type |= PERF_RECORD_TID; header.size += sizeof(tid_entry); @@ -2432,6 +2456,9 @@ static void perf_counter_comm_output(struct perf_counter *counter, if (ret) return; + comm_event->event.pid = perf_counter_pid(counter, comm_event->task); + comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + perf_output_put(&handle, comm_event->event); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); @@ -2504,8 +2531,6 @@ void perf_counter_comm(struct task_struct *task) .task = task, .event = { .header = { .type = PERF_EVENT_COMM, }, - .pid = task->group_leader->pid, - .tid = task->pid, }, }; @@ -2542,6 +2567,9 @@ static void perf_counter_mmap_output(struct perf_counter *counter, if (ret) return; + mmap_event->event.pid = perf_counter_pid(counter, current); + mmap_event->event.tid = perf_counter_tid(counter, current); + perf_output_put(&handle, mmap_event->event); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); @@ -2641,8 +2669,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, .file = file, .event = { .header = { .type = PERF_EVENT_MMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, .start = addr, .len = len, .pgoff = pgoff, @@ -2664,8 +2690,6 @@ void 
perf_counter_munmap(unsigned long addr, unsigned long len, .file = file, .event = { .header = { .type = PERF_EVENT_MUNMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, .start = addr, .len = len, .pgoff = pgoff, @@ -3445,6 +3469,8 @@ SYSCALL_DEFINE5(perf_counter_open, list_add_tail(&counter->owner_entry, &current->perf_counter_list); mutex_unlock(&current->perf_counter_mutex); + counter->ns = get_pid_ns(current->nsproxy->pid_ns); + fput_light(counter_file, fput_needed2); out_fput: -- cgit v1.2.3-70-g09d2 From 6bfea1984aea86089907caf8974513c2402a3b3d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 2 Jun 2009 20:08:44 +0200 Subject: netfilter: conntrack: remove events flags from userspace exposed file This patch moves the event flags from linux/netfilter/nf_conntrack_common.h to net/netfilter/nf_conntrack_ecache.h. These flags are of no use to userspace. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 69 --------------------------- include/net/netfilter/nf_conntrack_ecache.h | 69 +++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 885cbe28226..a8248ee422b 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -75,75 +75,6 @@ enum ip_conntrack_status { IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), }; -/* Connection tracking event bits */ -enum ip_conntrack_events -{ - /* New conntrack */ - IPCT_NEW_BIT = 0, - IPCT_NEW = (1 << IPCT_NEW_BIT), - - /* Expected connection */ - IPCT_RELATED_BIT = 1, - IPCT_RELATED = (1 << IPCT_RELATED_BIT), - - /* Destroyed conntrack */ - IPCT_DESTROY_BIT = 2, - IPCT_DESTROY = (1 << IPCT_DESTROY_BIT), - - /* Timer has been refreshed */ - IPCT_REFRESH_BIT = 3, - IPCT_REFRESH = (1 << IPCT_REFRESH_BIT), - - /* Status has changed */ - IPCT_STATUS_BIT = 4, - IPCT_STATUS = (1 << IPCT_STATUS_BIT), - - /* Update of protocol info */ - IPCT_PROTOINFO_BIT = 5, - IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT), - - /* Volatile protocol info */ - IPCT_PROTOINFO_VOLATILE_BIT = 6, - IPCT_PROTOINFO_VOLATILE = (1 << IPCT_PROTOINFO_VOLATILE_BIT), - - /* New helper for conntrack */ - IPCT_HELPER_BIT = 7, - IPCT_HELPER = (1 << IPCT_HELPER_BIT), - - /* Update of helper info */ - IPCT_HELPINFO_BIT = 8, - IPCT_HELPINFO = (1 << IPCT_HELPINFO_BIT), - - /* Volatile helper info */ - IPCT_HELPINFO_VOLATILE_BIT = 9, - IPCT_HELPINFO_VOLATILE = (1 << IPCT_HELPINFO_VOLATILE_BIT), - - /* NAT info */ - IPCT_NATINFO_BIT = 10, - IPCT_NATINFO = (1 << IPCT_NATINFO_BIT), - - /* Counter highest bit has been set, unused */ - IPCT_COUNTER_FILLING_BIT = 11, - IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT), - - /* Mark is set */ - IPCT_MARK_BIT = 12, - IPCT_MARK = (1 << IPCT_MARK_BIT), - - /* NAT sequence adjustment */ - IPCT_NATSEQADJ_BIT = 13, - IPCT_NATSEQADJ = (1 << IPCT_NATSEQADJ_BIT), - - /* Secmark is set */ - IPCT_SECMARK_BIT = 14, - IPCT_SECMARK = (1 << IPCT_SECMARK_BIT), -}; - -enum ip_conntrack_expect_events { - IPEXP_NEW_BIT = 0, - IPEXP_NEW = (1 << IPEXP_NEW_BIT), -}; - #ifdef __KERNEL__ struct ip_conntrack_stat { diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 0ff0dc69ca4..892b8cdf7f6 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -11,6 +11,75 @@ #include #include +/* Connection
tracking event bits */ +enum ip_conntrack_events +{ + /* New conntrack */ + IPCT_NEW_BIT = 0, + IPCT_NEW = (1 << IPCT_NEW_BIT), + + /* Expected connection */ + IPCT_RELATED_BIT = 1, + IPCT_RELATED = (1 << IPCT_RELATED_BIT), + + /* Destroyed conntrack */ + IPCT_DESTROY_BIT = 2, + IPCT_DESTROY = (1 << IPCT_DESTROY_BIT), + + /* Timer has been refreshed */ + IPCT_REFRESH_BIT = 3, + IPCT_REFRESH = (1 << IPCT_REFRESH_BIT), + + /* Status has changed */ + IPCT_STATUS_BIT = 4, + IPCT_STATUS = (1 << IPCT_STATUS_BIT), + + /* Update of protocol info */ + IPCT_PROTOINFO_BIT = 5, + IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT), + + /* Volatile protocol info */ + IPCT_PROTOINFO_VOLATILE_BIT = 6, + IPCT_PROTOINFO_VOLATILE = (1 << IPCT_PROTOINFO_VOLATILE_BIT), + + /* New helper for conntrack */ + IPCT_HELPER_BIT = 7, + IPCT_HELPER = (1 << IPCT_HELPER_BIT), + + /* Update of helper info */ + IPCT_HELPINFO_BIT = 8, + IPCT_HELPINFO = (1 << IPCT_HELPINFO_BIT), + + /* Volatile helper info */ + IPCT_HELPINFO_VOLATILE_BIT = 9, + IPCT_HELPINFO_VOLATILE = (1 << IPCT_HELPINFO_VOLATILE_BIT), + + /* NAT info */ + IPCT_NATINFO_BIT = 10, + IPCT_NATINFO = (1 << IPCT_NATINFO_BIT), + + /* Counter highest bit has been set, unused */ + IPCT_COUNTER_FILLING_BIT = 11, + IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT), + + /* Mark is set */ + IPCT_MARK_BIT = 12, + IPCT_MARK = (1 << IPCT_MARK_BIT), + + /* NAT sequence adjustment */ + IPCT_NATSEQADJ_BIT = 13, + IPCT_NATSEQADJ = (1 << IPCT_NATSEQADJ_BIT), + + /* Secmark is set */ + IPCT_SECMARK_BIT = 14, + IPCT_SECMARK = (1 << IPCT_SECMARK_BIT), +}; + +enum ip_conntrack_expect_events { + IPEXP_NEW_BIT = 0, + IPEXP_NEW = (1 << IPEXP_NEW_BIT), +}; + #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_ecache { struct nf_conn *ct; -- cgit v1.2.3-70-g09d2 From 8e5799b1ad2a0567fdfaaf0e91b40efee010f2c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:08:15 +0200 Subject: perf_counter: Add unique counter id Stephan raised the issue that we currently cannot distinguish between similar counters within a group (PERF_RECORD_GROUP uses the config value as identifier). Therefore, generate a new ID for each counter using a global u64 sequence counter. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +++++--- kernel/perf_counter.c | 9 +++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9ec20fc6bd3..4845a214b9e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -114,8 +114,9 @@ enum perf_counter_record_format { * in increasing order of bit value, after the counter value. 
*/ enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1, - PERF_FORMAT_TOTAL_TIME_RUNNING = 2, + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, }; /* @@ -290,7 +291,7 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_RECORD_CPU * * { u64 nr; - * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP + * { u64 id, val; } cnt[nr]; } && PERF_RECORD_GROUP * * { u16 nr, * hv, @@ -503,6 +504,7 @@ struct perf_counter { struct rcu_head rcu_head; struct pid_namespace *ns; + u64 id; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index caa012cfe49..978ecfcc7aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1510,6 +1510,8 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); + if (counter->hw_event.read_format & PERF_FORMAT_ID) + values[n++] = counter->id; mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) @@ -2303,7 +2305,7 @@ static void perf_counter_output(struct perf_counter *counter, u32 pid, tid; } tid_entry; struct { - u64 event; + u64 id; u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; @@ -2416,7 +2418,7 @@ static void perf_counter_output(struct perf_counter *counter, if (sub != counter) sub->pmu->read(sub); - group_entry.event = sub->hw_event.config; + group_entry.id = sub->id; group_entry.counter = atomic64_read(&sub->count); perf_output_put(&handle, group_entry); @@ -3375,6 +3377,8 @@ done: return counter; } +static atomic64_t perf_counter_id; /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * @@ -3470,6 +3474,7 @@ SYSCALL_DEFINE5(perf_counter_open, mutex_unlock(&current->perf_counter_mutex); counter->ns = get_pid_ns(current->nsproxy->pid_ns); + counter->id = atomic64_inc_return(&perf_counter_id); fput_light(counter_file, fput_needed2); -- cgit v1.2.3-70-g09d2 From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:13:03 +0200 Subject: perf_counter: Rename various fields A few renames: s/irq_period/sample_period/ s/irq_freq/sample_freq/ s/PERF_RECORD_/PERF_SAMPLE_/ s/record_type/sample_type/ And change both the new sample_type and read_format to u64.
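As an illustration, a counter configuration under the new names might look like this (a hedged sketch: the values are made up, and the struct keeps its perf_counter_hw_event name until a later patch in this series):

	struct perf_counter_hw_event hw_event = {
		.config		= PERF_COUNT_INSTRUCTIONS,
		.sample_period	= 100000,		/* was .irq_period */
		.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID,
						/* was .record_type, PERF_RECORD_* bits */
		.read_format	= PERF_FORMAT_ID,	/* now __u64, was __u32 */
	};

sample_period and sample_freq remain a union; the .freq bit selects which of the two the kernel interprets.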
Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 12 ++--- arch/x86/kernel/cpu/perf_counter.c | 8 +-- include/linux/perf_counter.h | 32 ++++++------ kernel/perf_counter.c | 104 ++++++++++++++++++------------------- 4 files changed, 78 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index f96d55f55bd..c9633321e7a 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -535,7 +535,7 @@ void hw_perf_enable(void) continue; } val = 0; - if (counter->hw.irq_period) { + if (counter->hw.sample_period) { left = atomic64_read(&counter->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; @@ -749,12 +749,12 @@ static void power_pmu_unthrottle(struct perf_counter *counter) s64 val, left; unsigned long flags; - if (!counter->hw.idx || !counter->hw.irq_period) + if (!counter->hw.idx || !counter->hw.sample_period) return; local_irq_save(flags); perf_disable(); power_pmu_read(counter); - left = counter->hw.irq_period; + left = counter->hw.sample_period; val = 0; if (left < 0x80000000L) val = 0x80000000L - left; @@ -789,7 +789,7 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, if (counter->hw_event.exclude_user || counter->hw_event.exclude_kernel || counter->hw_event.exclude_hv - || counter->hw_event.irq_period) + || counter->hw_event.sample_period) return 0; if (ppmu->limited_pmc_event(ev)) @@ -925,7 +925,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw.irq_period); + atomic64_set(&counter->hw.period_left, counter->hw.sample_period); /* * See if we need to reserve the PMU. 
@@ -958,7 +958,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) static void record_and_restart(struct perf_counter *counter, long val, struct pt_regs *regs, int nmi) { - u64 period = counter->hw.irq_period; + u64 period = counter->hw.sample_period; s64 prev, delta, left; int record = 0; u64 addr, mmcra, sdsync; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 316b0c995f3..ec06aa5e928 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -290,11 +290,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hw_event->nmi = 1; - if (!hwc->irq_period) - hwc->irq_period = x86_pmu.max_period; + if (!hwc->sample_period) + hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->irq_period)); + min(x86_pmu.max_period, hwc->sample_period)); /* * Raw event type provide the config in the event structure @@ -462,7 +462,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->irq_period); + s64 period = min(x86_pmu.max_period, hwc->sample_period); int err; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4845a214b9e..1fcd3cc9385 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -94,18 +94,18 @@ enum sw_event_ids { #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) /* - * Bits that can be set in hw_event.record_type to request information + * Bits that can be set in hw_event.sample_type to request information * in the overflow packets. */ -enum perf_counter_record_format { - PERF_RECORD_IP = 1U << 0, - PERF_RECORD_TID = 1U << 1, - PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_ADDR = 1U << 3, - PERF_RECORD_GROUP = 1U << 4, - PERF_RECORD_CALLCHAIN = 1U << 5, - PERF_RECORD_CONFIG = 1U << 6, - PERF_RECORD_CPU = 1U << 7, +enum perf_counter_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_CONFIG = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, }; /* @@ -132,12 +132,12 @@ struct perf_counter_hw_event { __u64 config; union { - __u64 irq_period; - __u64 irq_freq; + __u64 sample_period; + __u64 sample_freq; }; - __u32 record_type; - __u32 read_format; + __u64 sample_type; + __u64 read_format; __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ @@ -262,7 +262,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; - * u64 irq_period; + * u64 sample_period; * }; */ PERF_EVENT_PERIOD = 4, @@ -363,7 +363,7 @@ struct hw_perf_counter { }; }; atomic64_t prev_count; - u64 irq_period; + u64 sample_period; atomic64_t period_left; u64 interrupts; #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 978ecfcc7aa..5ecd9981c03 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1186,7 +1186,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 interrupts, irq_period; + u64 interrupts, sample_period; u64 events, period; s64 delta; @@ -1204,23 +1204,23 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = 2*sysctl_perf_counter_limit/HZ; } - if (!counter->hw_event.freq || 
!counter->hw_event.irq_freq) + if (!counter->hw_event.freq || !counter->hw_event.sample_freq) continue; - events = HZ * interrupts * counter->hw.irq_period; - period = div64_u64(events, counter->hw_event.irq_freq); + events = HZ * interrupts * counter->hw.sample_period; + period = div64_u64(events, counter->hw_event.sample_freq); - delta = (s64)(1 + period - counter->hw.irq_period); + delta = (s64)(1 + period - counter->hw.sample_period); delta >>= 1; - irq_period = counter->hw.irq_period + delta; + sample_period = counter->hw.sample_period + delta; - if (!irq_period) - irq_period = 1; + if (!sample_period) + sample_period = 1; - perf_log_period(counter, irq_period); + perf_log_period(counter, sample_period); - counter->hw.irq_period = irq_period; + counter->hw.sample_period = sample_period; } spin_unlock(&ctx->lock); } @@ -2297,7 +2297,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int ret; - u64 record_type = counter->hw_event.record_type; + u64 sample_type = counter->hw_event.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2321,61 +2321,61 @@ static void perf_counter_output(struct perf_counter *counter, header.misc = PERF_EVENT_MISC_OVERFLOW; header.misc |= perf_misc_flags(regs); - if (record_type & PERF_RECORD_IP) { + if (sample_type & PERF_SAMPLE_IP) { ip = perf_instruction_pointer(regs); - header.type |= PERF_RECORD_IP; + header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } - if (record_type & PERF_RECORD_TID) { + if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ tid_entry.pid = perf_counter_pid(counter, current); tid_entry.tid = perf_counter_tid(counter, current); - header.type |= PERF_RECORD_TID; + header.type |= PERF_SAMPLE_TID; header.size += sizeof(tid_entry); } - if (record_type & PERF_RECORD_TIME) { + if (sample_type & PERF_SAMPLE_TIME) { /* * Maybe do better on x86 and provide cpu_clock_nmi() */ time = sched_clock(); - header.type |= PERF_RECORD_TIME; + header.type |= PERF_SAMPLE_TIME; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_ADDR) { - header.type |= PERF_RECORD_ADDR; + if (sample_type & PERF_SAMPLE_ADDR) { + header.type |= PERF_SAMPLE_ADDR; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CONFIG) { - header.type |= PERF_RECORD_CONFIG; + if (sample_type & PERF_SAMPLE_CONFIG) { + header.type |= PERF_SAMPLE_CONFIG; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CPU) { - header.type |= PERF_RECORD_CPU; + if (sample_type & PERF_SAMPLE_CPU) { + header.type |= PERF_SAMPLE_CPU; header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); } - if (record_type & PERF_RECORD_GROUP) { - header.type |= PERF_RECORD_GROUP; + if (sample_type & PERF_SAMPLE_GROUP) { + header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } - if (record_type & PERF_RECORD_CALLCHAIN) { + if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - header.type |= PERF_RECORD_CALLCHAIN; + header.type |= PERF_SAMPLE_CALLCHAIN; header.size += callchain_size; } } @@ -2386,28 +2386,28 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, header); - if (record_type & PERF_RECORD_IP) + if (sample_type & PERF_SAMPLE_IP) perf_output_put(&handle, ip); - if (record_type & PERF_RECORD_TID) + if (sample_type & PERF_SAMPLE_TID) perf_output_put(&handle, tid_entry); - if 
(record_type & PERF_RECORD_TIME) + if (sample_type & PERF_SAMPLE_TIME) perf_output_put(&handle, time); - if (record_type & PERF_RECORD_ADDR) + if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, addr); - if (record_type & PERF_RECORD_CONFIG) + if (sample_type & PERF_SAMPLE_CONFIG) perf_output_put(&handle, counter->hw_event.config); - if (record_type & PERF_RECORD_CPU) + if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); /* - * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. + * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. */ - if (record_type & PERF_RECORD_GROUP) { + if (sample_type & PERF_SAMPLE_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2702,7 +2702,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * Log irq_period changes so that analyzing tools can re-normalize the + * Log sample_period changes so that analyzing tools can re-normalize the * event flow. */ @@ -2725,7 +2725,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) .period = period, }; - if (counter->hw.irq_period == period) + if (counter->hw.sample_period == period) return; ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); @@ -2834,7 +2834,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = hwc->sample_period; if (unlikely(left <= -period)) { left = period; @@ -2874,7 +2874,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ret = HRTIMER_NORESTART; } - period = max_t(u64, 10000, counter->hw.irq_period); + period = max_t(u64, 10000, counter->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; @@ -2959,7 +2959,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg && regs) + if (counter->hw.sample_period && !neg && regs) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3080,8 +3080,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, cpu_clock(cpu)); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3092,7 +3092,7 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -3132,8 +3132,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3144,7 +3144,7 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static 
void task_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); task_clock_perf_counter_update(counter, counter->ctx->time); @@ -3223,7 +3223,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.irq_period = counter->hw_event.irq_period; + counter->hw.sample_period = counter->hw_event.sample_period; return &perf_ops_generic; } @@ -3323,15 +3323,15 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->irq_freq) - hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq); + if (hw_event->freq && hw_event->sample_freq) + hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq); else - hwc->irq_period = hw_event->irq_period; + hwc->sample_period = hw_event->sample_period; /* - * we currently do not support PERF_RECORD_GROUP on inherited counters + * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP)) + if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP)) goto done; if (perf_event_raw(hw_event)) { -- cgit v1.2.3-70-g09d2 From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:27:45 +0200 Subject: perf_counter: Remove the last nmi/irq bits IRQ (non-NMI) sampling is not used anymore - remove the last few bits. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ------ include/linux/perf_counter.h | 4 +--- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ec06aa5e928..9e144fbebd2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -284,12 +284,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - /* - * Use NMI events all the time: - */ - hwc->nmi = 1; - hw_event->nmi = 1; - if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1fcd3cc9385..cef9931793f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -140,7 +140,6 @@ struct perf_counter_hw_event { __u64 read_format; __u64 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ @@ -153,7 +152,7 @@ struct perf_counter_hw_event { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 51; + __reserved_1 : 52; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -354,7 +353,6 @@ struct hw_perf_counter { u64 config; unsigned long config_base; unsigned long counter_base; - int nmi; int idx; }; union { /* software */ -- cgit v1.2.3-70-g09d2 From 8e3747c13c39246c7e46def7cf495d9d21d4c5f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:16:02 +0200 Subject: perf_counter: Change data head from u32 to u64 Since some people worried 
that 4G might not be a large enough as an mmap data window, extend it to 64 bit for capable platforms. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 ++++--- kernel/perf_counter.c | 15 ++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index cef9931793f..c046f7d97cf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -212,7 +212,7 @@ struct perf_counter_mmap_page { * User-space reading this value should issue an rmb(), on SMP capable * platforms, after reading this value -- see perf_counter_wakeup(). */ - __u32 data_head; /* head in the data section */ + __u64 data_head; /* head in the data section */ }; #define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) @@ -397,10 +397,11 @@ struct perf_mmap_data { int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ - atomic_t head; /* write position */ atomic_t events; /* event limit */ - atomic_t done_head; /* completed head */ + atomic_long_t head; /* write position */ + atomic_long_t done_head; /* completed head */ + atomic_t lock; /* concurrent writes */ atomic_t wakeup; /* needs a wakeup */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5ecd9981c03..3f11a2bc6c7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2067,8 +2067,8 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_output_handle { struct perf_counter *counter; struct perf_mmap_data *data; - unsigned int offset; - unsigned int head; + unsigned long head; + unsigned long offset; int nmi; int overflow; int locked; @@ -2122,7 +2122,8 @@ static void perf_output_lock(struct perf_output_handle *handle) static void perf_output_unlock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int head, cpu; + unsigned long head; + int cpu; data->done_head = data->head; @@ -2135,7 +2136,7 @@ again: * before we publish the new head, matched by a rmb() in userspace when * reading this position. */ - while ((head = atomic_xchg(&data->done_head, 0))) + while ((head = atomic_long_xchg(&data->done_head, 0))) data->user_page->data_head = head; /* @@ -2148,7 +2149,7 @@ again: /* * Therefore we have to validate we did not indeed do so. */ - if (unlikely(atomic_read(&data->done_head))) { + if (unlikely(atomic_long_read(&data->done_head))) { /* * Since we had it locked, we can lock it again. */ @@ -2195,7 +2196,7 @@ static int perf_output_begin(struct perf_output_handle *handle, do { offset = head = atomic_read(&data->head); head += size; - } while (atomic_cmpxchg(&data->head, offset, head) != offset); + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; handle->head = head; @@ -2246,7 +2247,7 @@ static void perf_output_copy(struct perf_output_handle *handle, * Check we didn't copy past our reservation window, taking the * possible unsigned int wrap into account. 
*/ - WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0); + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); } #define perf_output_put(handle, x) \ -- cgit v1.2.3-70-g09d2 From 08247e31ca79b8f02cce47b7e8120797a8726606 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:46:57 +0200 Subject: perf_counter: Add ioctl for changing the sample period/frequency Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 9 +++++---- kernel/perf_counter.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c046f7d97cf..45bdd3b95d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -164,10 +164,11 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IOW('$', 0, u32) -#define PERF_COUNTER_IOC_DISABLE _IOW('$', 1, u32) -#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) -#define PERF_COUNTER_IOC_RESET _IOW('$', 3, u32) +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) enum perf_counter_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3f11a2bc6c7..abe2f3b6c42 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1604,6 +1604,43 @@ static void perf_counter_for_each(struct perf_counter *counter, mutex_unlock(&counter->child_mutex); } +static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) +{ + struct perf_counter_context *ctx = counter->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!counter->hw_event.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (counter->hw_event.freq) { + if (value > sysctl_perf_counter_limit) { + ret = -EINVAL; + goto unlock; + } + + counter->hw_event.sample_freq = value; + } else { + counter->hw_event.sample_period = value; + counter->hw.sample_period = value; + + perf_log_period(counter, value); + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -1623,6 +1660,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_REFRESH: return perf_counter_refresh(counter, arg); + + case PERF_COUNTER_IOC_PERIOD: + return perf_counter_period(counter, (u64 __user *)arg); + default: return -ENOTTY; } -- cgit v1.2.3-70-g09d2 From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. 
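To make the rename concrete, a hedged sketch of an open call under the new name (the raw syscall(2) invocation, headers and values are illustrative; there is no libc wrapper at this point):

	struct perf_counter_attr attr = {	/* was struct perf_counter_hw_event */
		.config		= PERF_COUNT_INSTRUCTIONS,
		.sample_period	= 100000,
	};
	int fd = syscall(__NR_perf_counter_open, &attr,
			 0,	/* pid: 0 means the calling task */
			 -1,	/* cpu: -1 means any cpu */
			 -1,	/* group_fd: -1 means no group leader */
			 0);	/* flags: must be 0 */

In-kernel, counter->hw_event becomes counter->attr throughout, as the diff below shows.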
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 38 ++++++------ arch/x86/kernel/cpu/perf_counter.c | 16 ++--- include/linux/perf_counter.h | 34 +++++------ include/linux/syscalls.h | 4 +- kernel/perf_counter.c | 116 ++++++++++++++++++------------------- 5 files changed, 104 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index c9633321e7a..ea54686cb78 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -262,13 +262,13 @@ static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], } counter = ctrs[i]; if (first) { - eu = counter->hw_event.exclude_user; - ek = counter->hw_event.exclude_kernel; - eh = counter->hw_event.exclude_hv; + eu = counter->attr.exclude_user; + ek = counter->attr.exclude_kernel; + eh = counter->attr.exclude_hv; first = 0; - } else if (counter->hw_event.exclude_user != eu || - counter->hw_event.exclude_kernel != ek || - counter->hw_event.exclude_hv != eh) { + } else if (counter->attr.exclude_user != eu || + counter->attr.exclude_kernel != ek || + counter->attr.exclude_hv != eh) { return -EAGAIN; } } @@ -483,16 +483,16 @@ void hw_perf_enable(void) /* * Add in MMCR0 freeze bits corresponding to the - * hw_event.exclude_* bits for the first counter. + * attr.exclude_* bits for the first counter. * We have already checked that all counters have the * same values for these bits as the first counter. */ counter = cpuhw->counter[0]; - if (counter->hw_event.exclude_user) + if (counter->attr.exclude_user) cpuhw->mmcr[0] |= MMCR0_FCP; - if (counter->hw_event.exclude_kernel) + if (counter->attr.exclude_kernel) cpuhw->mmcr[0] |= freeze_counters_kernel; - if (counter->hw_event.exclude_hv) + if (counter->attr.exclude_hv) cpuhw->mmcr[0] |= MMCR0_FCHV; /* @@ -786,10 +786,10 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, int n; u64 alt[MAX_EVENT_ALTERNATIVES]; - if (counter->hw_event.exclude_user - || counter->hw_event.exclude_kernel - || counter->hw_event.exclude_hv - || counter->hw_event.sample_period) + if (counter->attr.exclude_user + || counter->attr.exclude_kernel + || counter->attr.exclude_hv + || counter->attr.sample_period) return 0; if (ppmu->limited_pmc_event(ev)) @@ -855,13 +855,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (!perf_event_raw(&counter->hw_event)) { - ev = perf_event_id(&counter->hw_event); + if (!perf_event_raw(&counter->attr)) { + ev = perf_event_id(&counter->attr); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { - ev = perf_event_config(&counter->hw_event); + ev = perf_event_config(&counter->attr); } counter->hw.config_base = ev; counter->hw.idx = 0; @@ -872,7 +872,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) * the user set it to. 
*/ if (!firmware_has_feature(FW_FEATURE_LPAR)) - counter->hw_event.exclude_hv = 0; + counter->attr.exclude_hv = 0; /* * If this is a per-task counter, then we can use @@ -990,7 +990,7 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { addr = 0; - if (counter->hw_event.record_type & PERF_RECORD_ADDR) { + if (counter->attr.record_type & PERF_RECORD_ADDR) { /* * The user wants a data address recorded. * If we're not doing instruction sampling, diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 904571bea71..e16e8c13132 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void) } /* - * Setup the hardware configuration for a given hw_event_type + * Setup the hardware configuration for a given attr_type */ static int __hw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; + struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; int err; @@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Count user and OS events unless requested not to. */ - if (!hw_event->exclude_user) + if (!attr->exclude_user) hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!hw_event->exclude_kernel) + if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; if (!hwc->sample_period) @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); + if (perf_event_raw(attr)) { + hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); } else { - if (perf_event_id(hw_event) >= x86_pmu.max_events) + if (perf_event_id(attr) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(attr)); } counter->destroy = hw_perf_counter_destroy; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 45bdd3b95d3..37d5541d74c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -22,7 +22,7 @@ */ /* - * hw_event.type + * attr.type */ enum perf_event_types { PERF_TYPE_HARDWARE = 0, @@ -37,10 +37,10 @@ enum perf_event_types { }; /* - * Generalized performance counter event types, used by the hw_event.event_id + * Generalized performance counter event types, used by the attr.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum hw_event_ids { +enum attr_ids { /* * Common hardware events, generalized by the kernel: */ @@ -94,7 +94,7 @@ enum sw_event_ids { #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) /* - * Bits that can be set in hw_event.sample_type to request information + * Bits that can be set in attr.sample_type to request information * in the overflow packets. */ enum perf_counter_sample_format { @@ -109,7 +109,7 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in hw_event.read_format to request that + * Bits that can be set in attr.read_format to request that * reads on the counter should return the indicated quantities, * in increasing order of bit value, after the counter value. 
*/ @@ -122,7 +122,7 @@ enum perf_counter_read_format { /* * Hardware event to monitor via a performance monitoring counter: */ -struct perf_counter_hw_event { +struct perf_counter_attr { /* * The MSB of the config word signifies if the rest contains cpu * specific (raw) counter configuration data, if unset, the next @@ -323,25 +323,25 @@ enum perf_event_type { struct task_struct; -static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_raw(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_RAW_MASK; + return attr->config & PERF_COUNTER_RAW_MASK; } -static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_config(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_CONFIG_MASK; + return attr->config & PERF_COUNTER_CONFIG_MASK; } -static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_type(struct perf_counter_attr *attr) { - return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> + return (attr->config & PERF_COUNTER_TYPE_MASK) >> PERF_COUNTER_TYPE_SHIFT; } -static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_id(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_EVENT_MASK; + return attr->config & PERF_COUNTER_EVENT_MASK; } /** @@ -457,7 +457,7 @@ struct perf_counter { u64 tstamp_running; u64 tstamp_stopped; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -605,8 +605,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !perf_event_raw(&counter->hw_event) && - perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->attr) && + perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 79faae950e2..c6c84ad8bd7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -55,7 +55,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; -struct perf_counter_hw_event; +struct perf_counter_attr; #include #include @@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( - const struct perf_counter_hw_event __user *hw_event_uptr, + const struct perf_counter_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index abe2f3b6c42..317cef78a38 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -260,7 +260,7 @@ counter_sched_out(struct perf_counter *counter, if (!is_software_counter(counter)) cpuctx->active_oncpu--; ctx->nr_active--; - if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + if (counter->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -282,7 +282,7 @@ group_sched_out(struct perf_counter *group_counter, list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); - if (group_counter->hw_event.exclusive) + if (group_counter->attr.exclusive) cpuctx->exclusive = 0; } @@ -550,7 +550,7 @@ counter_sched_in(struct perf_counter *counter, cpuctx->active_oncpu++; 
ctx->nr_active++; - if (counter->hw_event.exclusive) + if (counter->attr.exclusive) cpuctx->exclusive = 1; return 0; @@ -642,7 +642,7 @@ static int group_can_go_on(struct perf_counter *counter, * If this group is exclusive and there are already * counters on the CPU, it can't go on. */ - if (counter->hw_event.exclusive && cpuctx->active_oncpu) + if (counter->attr.exclusive && cpuctx->active_oncpu) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -725,7 +725,7 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -849,7 +849,7 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -927,7 +927,7 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) /* * not supported on inherited counters */ - if (counter->hw_event.inherit) + if (counter->attr.inherit) return -EINVAL; atomic_add(refresh, &counter->event_limit); @@ -1094,7 +1094,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->hw_event.pinned) + !counter->attr.pinned) continue; if (counter->cpu != -1 && counter->cpu != cpu) continue; @@ -1122,7 +1122,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * ignore pinned counters since we did them already. */ if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->hw_event.pinned) + counter->attr.pinned) continue; /* @@ -1204,11 +1204,11 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = 2*sysctl_perf_counter_limit/HZ; } - if (!counter->hw_event.freq || !counter->hw_event.sample_freq) + if (!counter->attr.freq || !counter->attr.sample_freq) continue; events = HZ * interrupts * counter->hw.sample_period; - period = div64_u64(events, counter->hw_event.sample_freq); + period = div64_u64(events, counter->attr.sample_freq); delta = (s64)(1 + period - counter->hw.sample_period); delta >>= 1; @@ -1444,11 +1444,11 @@ static void free_counter(struct perf_counter *counter) perf_pending_sync(counter); atomic_dec(&nr_counters); - if (counter->hw_event.mmap) + if (counter->attr.mmap) atomic_dec(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_dec(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_dec(&nr_comm_tracking); if (counter->destroy) @@ -1504,13 +1504,13 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + atomic64_read(&counter->child_total_time_enabled); - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); - if (counter->hw_event.read_format & PERF_FORMAT_ID) + if (counter->attr.read_format & PERF_FORMAT_ID) values[n++] = counter->id; 
mutex_unlock(&counter->child_mutex); @@ -1611,7 +1611,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) int ret = 0; u64 value; - if (!counter->hw_event.sample_period) + if (!counter->attr.sample_period) return -EINVAL; size = copy_from_user(&value, arg, sizeof(value)); @@ -1622,15 +1622,15 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) return -EINVAL; spin_lock_irq(&ctx->lock); - if (counter->hw_event.freq) { + if (counter->attr.freq) { if (value > sysctl_perf_counter_limit) { ret = -EINVAL; goto unlock; } - counter->hw_event.sample_freq = value; + counter->attr.sample_freq = value; } else { - counter->hw_event.sample_period = value; + counter->attr.sample_period = value; counter->hw.sample_period = value; perf_log_period(counter, value); @@ -2299,7 +2299,7 @@ static void perf_output_end(struct perf_output_handle *handle) struct perf_counter *counter = handle->counter; struct perf_mmap_data *data = handle->data; - int wakeup_events = counter->hw_event.wakeup_events; + int wakeup_events = counter->attr.wakeup_events; if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&data->events); @@ -2339,7 +2339,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int ret; - u64 sample_type = counter->hw_event.sample_type; + u64 sample_type = counter->attr.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2441,7 +2441,7 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, addr); if (sample_type & PERF_SAMPLE_CONFIG) - perf_output_put(&handle, counter->hw_event.config); + perf_output_put(&handle, counter->attr.config); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); @@ -2512,7 +2512,7 @@ static void perf_counter_comm_output(struct perf_counter *counter, static int perf_counter_comm_match(struct perf_counter *counter, struct perf_comm_event *comm_event) { - if (counter->hw_event.comm && + if (counter->attr.comm && comm_event->event.header.type == PERF_EVENT_COMM) return 1; @@ -2623,11 +2623,11 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->hw_event.mmap && + if (counter->attr.mmap && mmap_event->event.header.type == PERF_EVENT_MMAP) return 1; - if (counter->hw_event.munmap && + if (counter->attr.munmap && mmap_event->event.header.type == PERF_EVENT_MUNMAP) return 1; @@ -2907,8 +2907,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. 
*/ - if ((counter->hw_event.exclude_kernel || !regs) && - !counter->hw_event.exclude_user) + if ((counter->attr.exclude_kernel || !regs) && + !counter->attr.exclude_user) regs = task_pt_regs(current); if (regs) { @@ -2982,14 +2982,14 @@ static int perf_swcounter_match(struct perf_counter *counter, if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->hw_event.config != event_config) + if (counter->attr.config != event_config) return 0; if (regs) { - if (counter->hw_event.exclude_user && user_mode(regs)) + if (counter->attr.exclude_user && user_mode(regs)) return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) + if (counter->attr.exclude_kernel && !user_mode(regs)) return 0; } @@ -3252,12 +3252,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->hw_event)); + ftrace_profile_disable(perf_event_id(&counter->attr)); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->hw_event); + int event_id = perf_event_id(&counter->attr); int ret; ret = ftrace_profile_enable(event_id); @@ -3265,7 +3265,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.sample_period = counter->hw_event.sample_period; + counter->hw.sample_period = counter->attr.sample_period; return &perf_ops_generic; } @@ -3287,7 +3287,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->hw_event)) { + switch (perf_event_id(&counter->attr)) { case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3319,7 +3319,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, +perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, @@ -3352,36 +3352,36 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); counter->cpu = cpu; - counter->hw_event = *hw_event; + counter->attr = *attr; counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; counter->oncpu = -1; counter->state = PERF_COUNTER_STATE_INACTIVE; - if (hw_event->disabled) + if (attr->disabled) counter->state = PERF_COUNTER_STATE_OFF; pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->sample_freq) - hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq); + if (attr->freq && attr->sample_freq) + hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq); else - hwc->sample_period = hw_event->sample_period; + hwc->sample_period = attr->sample_period; /* * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(hw_event)) { + if (perf_event_raw(attr)) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(hw_event)) { + switch (perf_event_type(attr)) { case PERF_TYPE_HARDWARE: pmu = hw_perf_counter_init(counter); break; @@ -3409,11 +3409,11 @@ done: counter->pmu = pmu; atomic_inc(&nr_counters); - if (counter->hw_event.mmap) + if 
(counter->attr.mmap) atomic_inc(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_inc(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_inc(&nr_comm_tracking); return counter; @@ -3424,17 +3424,17 @@ static atomic64_t perf_counter_id; /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * - * @hw_event_uptr: event type attributes for monitoring/sampling + * @attr_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_hw_event __user *, hw_event_uptr, + const struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct perf_counter_context *ctx; struct file *counter_file = NULL; struct file *group_file = NULL; @@ -3446,7 +3446,7 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) + if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) return -EFAULT; /* @@ -3484,11 +3484,11 @@ SYSCALL_DEFINE5(perf_counter_open, /* * Only a group leader can be exclusive or pinned */ - if (hw_event.exclusive || hw_event.pinned) + if (attr.exclusive || attr.pinned) goto err_put_context; } - counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) @@ -3556,7 +3556,7 @@ inherit_counter(struct perf_counter *parent_counter, if (parent_counter->parent) parent_counter = parent_counter->parent; - child_counter = perf_counter_alloc(&parent_counter->hw_event, + child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) @@ -3565,7 +3565,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, + * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_counter_{en, dis}able_family. */ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) @@ -3582,7 +3582,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * inherit into child's child as well: */ - child_counter->hw_event.inherit = 1; + child_counter->attr.inherit = 1; /* * Get a reference to the parent filp - we will fput it @@ -3838,7 +3838,7 @@ int perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) { + if (!counter->attr.inherit) { inherited_all = 0; continue; } -- cgit v1.2.3-70-g09d2 From a05c0205ba031c01bba33a21bf0a35920eb64833 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 3 Jun 2009 09:33:18 +0200 Subject: block: Fix bounce limit setting in DM blk_queue_bounce_limit() is more than a wrapper about the request queue limits.bounce_pfn variable. Introduce blk_queue_bounce_pfn() which can be called by stacking drivers that wish to set the bounce limit explicitly. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 17 +++++++++++++++++ drivers/md/dm-table.c | 2 +- include/linux/blkdev.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 8d339349289..9acd0b7e802 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -193,6 +193,23 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) } EXPORT_SYMBOL(blk_queue_bounce_limit); +/** + * blk_queue_bounce_pfn - set the bounce buffer limit for queue + * @q: the request queue for the device + * @pfn: max address + * + * Description: + * This function is similar to blk_queue_bounce_limit except it + * neither changes allocation flags, nor does it set up the ISA DMA + * pool. This function should only be used by stacking drivers. + * Hardware drivers should use blk_queue_bounce_limit instead. + */ +void blk_queue_bounce_pfn(struct request_queue *q, u64 pfn) +{ + q->limits.bounce_pfn = pfn; +} +EXPORT_SYMBOL(blk_queue_bounce_pfn); + /** * blk_queue_max_sectors - set max sectors for a request for this queue * @q: the request queue for the device diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e9a73bb242b..3ca1604ddd5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) blk_queue_max_segment_size(q, t->limits.max_segment_size); blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors); blk_queue_segment_boundary(q, t->limits.seg_boundary_mask); - blk_queue_bounce_limit(q, t->limits.bounce_pfn); + blk_queue_bounce_pfn(q, t->limits.bounce_pfn); if (t->limits.no_cluster) queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5e740a135e7..989aa1790f4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -910,6 +910,7 @@ extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); +extern void blk_queue_bounce_pfn(struct request_queue *, u64); extern void blk_queue_max_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); -- cgit v1.2.3-70-g09d2 From e34d5c1a4f9919a81b4ea4591d7383245f35cb8e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 3 Jun 2009 10:32:06 +0200 Subject: netfilter: conntrack: replace notify chain by function pointer This patch removes the notify chain infrastructure and replaces it with a simple function pointer. This issue has been mentioned on the mailing list several times: the use of the notify chain adds too much overhead for something that is only used by ctnetlink. This patch also changes nfnetlink_send(). It seems that gfp_any() returns GFP_KERNEL for user-context requests, like those via ctnetlink, inside an RCU read-side section, which is not valid. Using GFP_KERNEL is also evil since netlink may schedule(); this leads to "scheduling while atomic" bug reports.
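For illustration, a subscriber now registers a callback roughly as follows (a sketch modeled on the ctnetlink side; details abbreviated):

	static int ctnetlink_conntrack_event(unsigned int events,
					     struct nf_ct_event *item)
	{
		/* build a netlink message for item->ct and send it
		 * to userspace via nfnetlink_send() */
		return 0;
	}

	static struct nf_ct_event_notifier ctnl_notifier = {
		.fcn	= ctnetlink_conntrack_event,
	};

	err = nf_conntrack_register_notifier(&ctnl_notifier);

Event delivery then becomes a single indirect call made under rcu_read_lock(), as the nf_conntrack_event_report() hunk below shows, instead of an atomic notifier chain walk.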
Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 2 +- include/net/netfilter/nf_conntrack_ecache.h | 68 ++++++++++++++++------- net/netfilter/nf_conntrack_ecache.c | 83 +++++++++++++++++++++++------ net/netfilter/nf_conntrack_netlink.c | 42 +++++++-------- net/netfilter/nfnetlink.c | 5 +- 5 files changed, 139 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index c600083cbdf..2214e516146 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -75,7 +75,7 @@ extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); extern int nfnetlink_has_listeners(unsigned int group); extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, - int echo); + int echo, gfp_t flags); extern void nfnetlink_set_err(u32 pid, u32 group, int error); extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags); diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 2e17a2d0eb3..1afb907e015 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -6,7 +6,6 @@ #define _NF_CONNTRACK_ECACHE_H #include -#include #include #include #include @@ -69,9 +68,13 @@ struct nf_ct_event { int report; }; -extern struct atomic_notifier_head nf_conntrack_chain; -extern int nf_conntrack_register_notifier(struct notifier_block *nb); -extern int nf_conntrack_unregister_notifier(struct notifier_block *nb); +struct nf_ct_event_notifier { + int (*fcn)(unsigned int events, struct nf_ct_event *item); +}; + +extern struct nf_ct_event_notifier *nf_conntrack_event_cb; +extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb); +extern void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb); extern void nf_ct_deliver_cached_events(const struct nf_conn *ct); extern void __nf_ct_event_cache_init(struct nf_conn *ct); @@ -97,13 +100,23 @@ nf_conntrack_event_report(enum ip_conntrack_events event, u32 pid, int report) { - struct nf_ct_event item = { - .ct = ct, - .pid = pid, - .report = report - }; - if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) - atomic_notifier_call_chain(&nf_conntrack_chain, event, &item); + struct nf_ct_event_notifier *notify; + + rcu_read_lock(); + notify = rcu_dereference(nf_conntrack_event_cb); + if (notify == NULL) + goto out_unlock; + + if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) { + struct nf_ct_event item = { + .ct = ct, + .pid = pid, + .report = report + }; + notify->fcn(event, &item); + } +out_unlock: + rcu_read_unlock(); } static inline void @@ -118,9 +131,13 @@ struct nf_exp_event { int report; }; -extern struct atomic_notifier_head nf_ct_expect_chain; -extern int nf_ct_expect_register_notifier(struct notifier_block *nb); -extern int nf_ct_expect_unregister_notifier(struct notifier_block *nb); +struct nf_exp_event_notifier { + int (*fcn)(unsigned int events, struct nf_exp_event *item); +}; + +extern struct nf_exp_event_notifier *nf_expect_event_cb; +extern int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *nb); +extern void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *nb); static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, @@ -128,12 +145,23 @@ nf_ct_expect_event_report(enum ip_conntrack_expect_events event, u32 pid, int report) { - struct nf_exp_event item = { - .exp = exp, - .pid = pid, - .report = 
report - }; - atomic_notifier_call_chain(&nf_ct_expect_chain, event, &item); + struct nf_exp_event_notifier *notify; + + rcu_read_lock(); + notify = rcu_dereference(nf_expect_event_cb); + if (notify == NULL) + goto out_unlock; + + { + struct nf_exp_event item = { + .exp = exp, + .pid = pid, + .report = report + }; + notify->fcn(event, &item); + } +out_unlock: + rcu_read_unlock(); } static inline void diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index dee4190209c..5516b3e64b4 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -16,24 +16,32 @@ #include #include #include -#include #include #include #include #include -ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain); -EXPORT_SYMBOL_GPL(nf_conntrack_chain); +static DEFINE_MUTEX(nf_ct_ecache_mutex); -ATOMIC_NOTIFIER_HEAD(nf_ct_expect_chain); -EXPORT_SYMBOL_GPL(nf_ct_expect_chain); +struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_event_cb); + +struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly; +EXPORT_SYMBOL_GPL(nf_expect_event_cb); /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ static inline void __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache) { + struct nf_ct_event_notifier *notify; + + rcu_read_lock(); + notify = rcu_dereference(nf_conntrack_event_cb); + if (notify == NULL) + goto out_unlock; + if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct) && ecache->events) { struct nf_ct_event item = { @@ -42,14 +50,15 @@ __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache) .report = 0 }; - atomic_notifier_call_chain(&nf_conntrack_chain, - ecache->events, - &item); + notify->fcn(ecache->events, &item); } ecache->events = 0; nf_ct_put(ecache->ct); ecache->ct = NULL; + +out_unlock: + rcu_read_unlock(); } /* Deliver all cached events for a particular conntrack. 
This is called @@ -111,26 +120,68 @@ void nf_conntrack_ecache_fini(struct net *net) free_percpu(net->ct.ecache); } -int nf_conntrack_register_notifier(struct notifier_block *nb) +int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new) { - return atomic_notifier_chain_register(&nf_conntrack_chain, nb); + int ret = 0; + struct nf_ct_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference(nf_conntrack_event_cb); + if (notify != NULL) { + ret = -EBUSY; + goto out_unlock; + } + rcu_assign_pointer(nf_conntrack_event_cb, new); + mutex_unlock(&nf_ct_ecache_mutex); + return ret; + +out_unlock: + mutex_unlock(&nf_ct_ecache_mutex); + return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); -int nf_conntrack_unregister_notifier(struct notifier_block *nb) +void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new) { - return atomic_notifier_chain_unregister(&nf_conntrack_chain, nb); + struct nf_ct_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference(nf_conntrack_event_cb); + BUG_ON(notify != new); + rcu_assign_pointer(nf_conntrack_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); -int nf_ct_expect_register_notifier(struct notifier_block *nb) +int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new) { - return atomic_notifier_chain_register(&nf_ct_expect_chain, nb); + int ret = 0; + struct nf_exp_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference(nf_expect_event_cb); + if (notify != NULL) { + ret = -EBUSY; + goto out_unlock; + } + rcu_assign_pointer(nf_expect_event_cb, new); + mutex_unlock(&nf_ct_ecache_mutex); + return ret; + +out_unlock: + mutex_unlock(&nf_ct_ecache_mutex); + return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); -int nf_ct_expect_unregister_notifier(struct notifier_block *nb) +void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new) { - return atomic_notifier_chain_unregister(&nf_ct_expect_chain, nb); + struct nf_exp_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference(nf_expect_event_cb); + BUG_ON(notify != new); + rcu_assign_pointer(nf_expect_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); } EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index b1b9e4fb7de..4448b062de0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -454,13 +453,12 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) ; } -static int ctnetlink_conntrack_event(struct notifier_block *this, - unsigned long events, void *ptr) +static int +ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; - struct nf_ct_event *item = (struct nf_ct_event *)ptr; struct nf_conn *ct = item->ct; struct sk_buff *skb; unsigned int type; @@ -468,7 +466,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, /* ignore our fake conntrack entry */ if (ct == &nf_conntrack_untracked) - return NOTIFY_DONE; + return 0; if (events & IPCT_DESTROY) { type = IPCTNL_MSG_CT_DELETE; @@ -481,10 +479,10 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, type = IPCTNL_MSG_CT_NEW; group = NFNLGRP_CONNTRACK_UPDATE; } else - return NOTIFY_DONE; + return 0; 
if (!item->report && !nfnetlink_has_listeners(group)) - return NOTIFY_DONE; + return 0; skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC); if (skb == NULL) @@ -560,8 +558,8 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, rcu_read_unlock(); nlmsg_end(skb, nlh); - nfnetlink_send(skb, item->pid, group, item->report); - return NOTIFY_DONE; + nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC); + return 0; nla_put_failure: rcu_read_unlock(); @@ -570,7 +568,7 @@ nlmsg_failure: kfree_skb(skb); errout: nfnetlink_set_err(0, group, -ENOBUFS); - return NOTIFY_DONE; + return 0; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ @@ -1507,12 +1505,11 @@ nla_put_failure: } #ifdef CONFIG_NF_CONNTRACK_EVENTS -static int ctnetlink_expect_event(struct notifier_block *this, - unsigned long events, void *ptr) +static int +ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - struct nf_exp_event *item = (struct nf_exp_event *)ptr; struct nf_conntrack_expect *exp = item->exp; struct sk_buff *skb; unsigned int type; @@ -1522,11 +1519,11 @@ static int ctnetlink_expect_event(struct notifier_block *this, type = IPCTNL_MSG_EXP_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; } else - return NOTIFY_DONE; + return 0; if (!item->report && !nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW)) - return NOTIFY_DONE; + return 0; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (skb == NULL) @@ -1548,8 +1545,9 @@ static int ctnetlink_expect_event(struct notifier_block *this, rcu_read_unlock(); nlmsg_end(skb, nlh); - nfnetlink_send(skb, item->pid, NFNLGRP_CONNTRACK_EXP_NEW, item->report); - return NOTIFY_DONE; + nfnetlink_send(skb, item->pid, NFNLGRP_CONNTRACK_EXP_NEW, + item->report, GFP_ATOMIC); + return 0; nla_put_failure: rcu_read_unlock(); @@ -1558,7 +1556,7 @@ nlmsg_failure: kfree_skb(skb); errout: nfnetlink_set_err(0, 0, -ENOBUFS); - return NOTIFY_DONE; + return 0; } #endif static int ctnetlink_exp_done(struct netlink_callback *cb) @@ -1864,12 +1862,12 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, } #ifdef CONFIG_NF_CONNTRACK_EVENTS -static struct notifier_block ctnl_notifier = { - .notifier_call = ctnetlink_conntrack_event, +static struct nf_ct_event_notifier ctnl_notifier = { + .fcn = ctnetlink_conntrack_event, }; -static struct notifier_block ctnl_notifier_exp = { - .notifier_call = ctnetlink_expect_event, +static struct nf_exp_event_notifier ctnl_notifier_exp = { + .fcn = ctnetlink_expect_event, }; #endif diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 9dbd5709aad..92761a98837 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -107,9 +107,10 @@ int nfnetlink_has_listeners(unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +int nfnetlink_send(struct sk_buff *skb, u32 pid, + unsigned group, int echo, gfp_t flags) { - return nlmsg_notify(nfnl, skb, pid, group, echo, gfp_any()); + return nlmsg_notify(nfnl, skb, pid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); -- cgit v1.2.3-70-g09d2 From dfbf97f3ac980b69dfbc41c83a208211a38443e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jun 2009 05:13:45 +0000 Subject: net: add _skb_dst opaque field struct sk_buff uses one union to define dst and rtable fields. We want to replace direct access to these pointers by accessors. First patch adds a new "unsigned long _skb_dst;" opaque field in this union. 
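The accessors themselves arrive in the follow-up patches below (skb_rtable() first, then skb_dst(), skb_dst_set() and skb_dst_drop()). As a sketch, the conversion pattern that this opaque field enables looks like:

	/* before this series: direct access to the union members */
	rt = skb->rtable;
	skb->dst = &rt->u.dst;
	dst_release(skb->dst);
	skb->dst = NULL;

	/* after: everything goes through the opaque _skb_dst word */
	rt = skb_rtable(skb);		/* (struct rtable *)skb->_skb_dst */
	skb_dst_set(skb, &rt->u.dst);
	skb_dst_drop(skb);		/* dst_release() plus clearing, in one helper */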
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index aff494ba6a3..d4d7c666ca6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -325,6 +325,7 @@ struct sk_buff { union { struct dst_entry *dst; struct rtable *rtable; + unsigned long _skb_dst; }; #ifdef CONFIG_XFRM struct sec_path *sp; -- cgit v1.2.3-70-g09d2 From 511c3f92ad5b6d9f8f6464be1b4f85f0422be91a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jun 2009 05:14:27 +0000 Subject: net: skb->rtable accessor Define skb_rtable(const struct sk_buff *skb) accessor to get rtable from skb Delete skb->rtable field Setting rtable is not allowed, just set dst instead as rtable is an alias. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/appletalk/ipddp.c | 2 +- include/linux/skbuff.h | 6 +++++- include/net/route.h | 2 +- net/bridge/br_netfilter.c | 25 +++++++++++++--------- net/dccp/ipv4.c | 6 +++--- net/ipv4/arp.c | 4 ++-- net/ipv4/icmp.c | 10 ++++----- net/ipv4/igmp.c | 2 +- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_input.c | 2 +- net/ipv4/ip_options.c | 16 +++++++------- net/ipv4/ip_output.c | 10 ++++----- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/ipmr.c | 6 +++--- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +- net/ipv4/netfilter/nf_nat_helper.c | 4 ++-- net/ipv4/route.c | 37 ++++++++++++++++++++------------- net/ipv4/tcp_ipv4.c | 4 ++-- net/netfilter/nf_conntrack_netbios_ns.c | 2 +- net/sched/em_meta.c | 4 ++-- net/sctp/protocol.c | 8 +++---- 23 files changed, 89 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 9832b757f10..78cea5e80b1 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -115,7 +115,7 @@ static struct net_device * __init ipddp_init(void) */ static int ipddp_xmit(struct sk_buff *skb, struct net_device *dev) { - __be32 paddr = ((struct rtable*)skb->dst)->rt_gateway; + __be32 paddr = skb_rtable(skb)->rt_gateway; struct ddpehdr *ddp; struct ipddp_route *rt; struct atalk_addr *our_addr; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d4d7c666ca6..a3ae3c52583 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -324,7 +324,6 @@ struct sk_buff { union { struct dst_entry *dst; - struct rtable *rtable; unsigned long _skb_dst; }; #ifdef CONFIG_XFRM @@ -427,6 +426,11 @@ extern void skb_dma_unmap(struct device *dev, struct sk_buff *skb, enum dma_data_direction dir); #endif +static inline struct rtable *skb_rtable(const struct sk_buff *skb) +{ + return (struct rtable *)skb->_skb_dst; +} + extern void kfree_skb(struct sk_buff *skb); extern void consume_skb(struct sk_buff *skb); extern void __kfree_skb(struct sk_buff *skb); diff --git a/include/net/route.h b/include/net/route.h index 4e8cae0e584..40f6346ef49 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -210,7 +210,7 @@ static inline struct inet_peer *rt_get_peer(struct rtable *rt) static inline int inet_iif(const struct sk_buff *skb) { - return skb->rtable->rt_iif; + return skb_rtable(skb)->rt_iif; } #endif /* _ROUTE_H */ diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index e4a418fcb35..e0ceb66a9ec 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -228,6 +228,7 @@ int nf_bridge_copy_header(struct sk_buff *skb) static int 
br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct rtable *rt; if (nf_bridge->mask & BRNF_PKT_TYPE) { skb->pkt_type = PACKET_OTHERHOST; @@ -235,12 +236,13 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) } nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; - skb->rtable = bridge_parent_rtable(nf_bridge->physindev); - if (!skb->rtable) { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { kfree_skb(skb); return 0; } - dst_hold(&skb->rtable->u.dst); + dst_hold(&rt->u.dst); + skb->dst = &rt->u.dst; skb->dev = nf_bridge->physindev; nf_bridge_push_encap_header(skb); @@ -338,6 +340,7 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb) struct net_device *dev = skb->dev; struct iphdr *iph = ip_hdr(skb); struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct rtable *rt; int err; if (nf_bridge->mask & BRNF_PKT_TYPE) { @@ -347,7 +350,6 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb) nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; if (dnat_took_place(skb)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { - struct rtable *rt; struct flowi fl = { .nl_u = { .ip4_u = { @@ -404,12 +406,13 @@ bridged_dnat: skb->pkt_type = PACKET_HOST; } } else { - skb->rtable = bridge_parent_rtable(nf_bridge->physindev); - if (!skb->rtable) { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { kfree_skb(skb); return 0; } - dst_hold(&skb->rtable->u.dst); + dst_hold(&rt->u.dst); + skb->dst = &rt->u.dst; } skb->dev = nf_bridge->physindev; @@ -628,9 +631,11 @@ static unsigned int br_nf_local_in(unsigned int hook, struct sk_buff *skb, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - if (skb->rtable && skb->rtable == bridge_parent_rtable(in)) { - dst_release(&skb->rtable->u.dst); - skb->rtable = NULL; + struct rtable *rt = skb_rtable(skb); + + if (rt && rt == bridge_parent_rtable(in)) { + dst_release(&rt->u.dst); + skb->dst = NULL; } return NF_ACCEPT; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d1dd95289b8..2cf48ba0dbb 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -452,7 +452,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt; - struct flowi fl = { .oif = skb->rtable->rt_iif, + struct flowi fl = { .oif = skb_rtable(skb)->rt_iif, .nl_u = { .ip4_u = { .daddr = ip_hdr(skb)->saddr, .saddr = ip_hdr(skb)->daddr, @@ -514,7 +514,7 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) return; - if (rxskb->rtable->rt_type != RTN_LOCAL) + if (skb_rtable(rxskb)->rt_type != RTN_LOCAL) return; dst = dccp_v4_route_skb(net, ctl_sk, rxskb); @@ -567,7 +567,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ - if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) return 0; /* discard, don't send a reset here */ if (dccp_bad_service_code(sk, service)) { diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f11931c1838..816494f271a 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -474,7 +474,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) return 1; } - paddr = skb->rtable->rt_gateway; + paddr = skb_rtable(skb)->rt_gateway; if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) return 0; 
@@ -817,7 +817,7 @@ static int arp_process(struct sk_buff *skb) if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input(skb, tip, sip, 0, dev) == 0) { - rt = skb->rtable; + rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 3f50807237e..94f75efae93 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -356,7 +356,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct ipcm_cookie ipc; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->u.dst.dev); struct sock *sk; struct inet_sock *inet; @@ -416,7 +416,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct iphdr *iph; int room; struct icmp_bxm icmp_param; - struct rtable *rt = skb_in->rtable; + struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; __be32 saddr; u8 tos; @@ -596,7 +596,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) RT_TOS(tos), rt2->u.dst.dev); dst_release(&rt2->u.dst); - rt2 = skb_in->rtable; + rt2 = skb_rtable(skb_in); skb_in->dst = odst; } @@ -926,7 +926,7 @@ static void icmp_address(struct sk_buff *skb) static void icmp_address_reply(struct sk_buff *skb) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct net_device *dev = skb->dev; struct in_device *in_dev; struct in_ifaddr *ifa; @@ -970,7 +970,7 @@ static void icmp_discard(struct sk_buff *skb) int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->u.dst.dev); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e6058a50379..afabd2758b6 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -948,7 +948,7 @@ int igmp_rcv(struct sk_buff *skb) case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ - if (skb->rtable->fl.iif == 0) + if (skb_rtable(skb)->fl.iif == 0) break; /* don't rely on MC router hearing unicast reports */ if (skb->pkt_type == PACKET_MULTICAST || diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index df3fe50bbf0..0761cd9bbd1 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -81,7 +81,7 @@ int ip_forward(struct sk_buff *skb) if (!xfrm4_route_forward(skb)) goto drop; - rt = skb->rtable; + rt = skb_rtable(skb); if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 77436e2732e..85ddad45a91 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -602,7 +602,7 @@ static int ipgre_rcv(struct sk_buff *skb) #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { /* Looped back packet, drop it! 
*/ - if (skb->rtable->fl.iif == 0) + if (skb_rtable(skb)->fl.iif == 0) goto drop; stats->multicast++; skb->pkt_type = PACKET_BROADCAST; @@ -704,7 +704,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (skb->protocol == htons(ETH_P_IP)) { - rt = skb->rtable; + rt = skb_rtable(skb); if ((dst = rt->rt_gateway) == 0) goto tx_error_icmp; } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 40f6206b2aa..cea784b0aa4 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -357,7 +357,7 @@ static int ip_rcv_finish(struct sk_buff *skb) if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; - rt = skb->rtable; + rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, skb->len); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 2c88da6e786..7e1074ffdbd 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -102,7 +102,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) sptr = skb_network_header(skb); dptr = dopt->__data; - daddr = skb->rtable->rt_spec_dst; + daddr = skb_rtable(skb)->rt_spec_dst; if (sopt->rr) { optlen = sptr[sopt->rr+1]; @@ -257,7 +257,7 @@ int ip_options_compile(struct net *net, struct rtable *rt = NULL; if (skb != NULL) { - rt = skb->rtable; + rt = skb_rtable(skb); optptr = (unsigned char *)&(ip_hdr(skb)[1]); } else optptr = opt->__data; @@ -550,7 +550,7 @@ void ip_forward_options(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); unsigned char * optptr; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { @@ -598,7 +598,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) __be32 nexthop; struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = skb_network_header(skb) + opt->srr; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct rtable *rt2; int err; @@ -623,13 +623,13 @@ int ip_options_rcv_srr(struct sk_buff *skb) } memcpy(&nexthop, &optptr[srrptr-1], 4); - rt = skb->rtable; - skb->rtable = NULL; + rt = skb_rtable(skb); + skb->dst = NULL; err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); - rt2 = skb->rtable; + rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { ip_rt_put(rt2); - skb->rtable = rt; + skb->dst = &rt->u.dst; return -EINVAL; } ip_rt_put(rt); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ea19c37ccc0..8d845ebfcca 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -140,7 +140,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options *opt) { struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct iphdr *iph; /* Build the IP header. */ @@ -238,7 +238,7 @@ static int ip_finish_output(struct sk_buff *skb) int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->u.dst.dev; /* @@ -319,7 +319,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. 
*/ - rt = skb->rtable; + rt = skb_rtable(skb); if (rt != NULL) goto packet_routed; @@ -440,7 +440,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) unsigned int mtu, hlen, left, len, ll_rs, pad; int offset; __be16 not_last_frag; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); int err = 0; dev = rt->u.dst.dev; @@ -1362,7 +1362,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } replyopts; struct ipcm_cookie ipc; __be32 daddr; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); if (ip_options_echo(&replyopts.opt, skb)) return; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cb49936856e..fc7993e9061 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -57,7 +57,7 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct in_pktinfo info; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); info.ipi_addr.s_addr = ip_hdr(skb)->daddr; if (rt) { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index bb2f1b17fbf..0c6e7bf18a4 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -416,7 +416,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (!dst) { /* NBMA tunnel */ - if ((rt = skb->rtable) == NULL) { + if ((rt = skb_rtable(skb)) == NULL) { stats->tx_fifo_errors++; goto tx_error; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 13e9dd3012b..69dd058283e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1354,7 +1354,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local if (net->ipv4.vif_table[vif].dev != skb->dev) { int true_vifi; - if (skb->rtable->fl.iif == 0) { + if (skb_rtable(skb)->fl.iif == 0) { /* It is our own packet, looped back. Very complicated situation... @@ -1430,7 +1430,7 @@ int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; struct net *net = dev_net(skb->dev); - int local = skb->rtable->rt_flags&RTCF_LOCAL; + int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; /* Packet is looped back after forward, it should not be forwarded second time, but still can be delivered locally. 
@@ -1646,7 +1646,7 @@ int ipmr_get_route(struct net *net, { int err; struct mfc_cache *cache; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); read_lock(&mrt_lock); cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index f389f60cb10..c0992c75bda 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -72,7 +72,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) return NF_ACCEPT; mr = par->targinfo; - rt = skb->rtable; + rt = skb_rtable(skb); newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); if (!newsrc) { printk("MASQUERADE: %s ate my IP address\n", par->out->name); diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index cf7a42bf982..155c008626c 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -140,7 +140,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct iphdr *iph; struct tcphdr *tcph; int oldlen, datalen; @@ -218,7 +218,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct iphdr *iph; struct udphdr *udph; int datalen, oldlen; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 28205e5bfa9..f20060ac2f0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1064,7 +1064,8 @@ work_done: out: return 0; } -static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) +static int rt_intern_hash(unsigned hash, struct rtable *rt, + struct rtable **rp, struct sk_buff *skb) { struct rtable *rth, **rthp; unsigned long now; @@ -1114,7 +1115,10 @@ restart: spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); - *rp = rth; + if (rp) + *rp = rth; + else + skb->dst = &rth->u.dst; return 0; } @@ -1210,7 +1214,10 @@ restart: rcu_assign_pointer(rt_hash_table[hash].chain, rt); spin_unlock_bh(rt_hash_lock_addr(hash)); - *rp = rt; + if (rp) + *rp = rt; + else + skb->dst = &rt->u.dst; return 0; } @@ -1407,7 +1414,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, &netevent); rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt)) + if (!rt_intern_hash(hash, rt, &rt, NULL)) ip_rt_put(rt); goto do_next; } @@ -1473,7 +1480,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) void ip_rt_send_redirect(struct sk_buff *skb) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct in_device *in_dev = in_dev_get(rt->u.dst.dev); if (!in_dev) @@ -1521,7 +1528,7 @@ out: static int ip_error(struct sk_buff *skb) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); unsigned long now; int code; @@ -1698,7 +1705,7 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); - rt = skb->rtable; + rt = skb_rtable(skb); if (rt) dst_set_expires(&rt->u.dst, 0); } @@ -1858,7 +1865,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, &skb->rtable); + return rt_intern_hash(hash, rth, NULL, skb); e_nobufs: in_dev_put(in_dev); @@ -2019,7 +2026,7 @@ static int ip_mkroute_input(struct sk_buff *skb, /* put 
it into the cache */ hash = rt_hash(daddr, saddr, fl->iif, rt_genid(dev_net(rth->u.dst.dev))); - return rt_intern_hash(hash, rth, &skb->rtable); + return rt_intern_hash(hash, rth, NULL, skb); } /* @@ -2175,7 +2182,7 @@ local_input: } rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, &skb->rtable); + err = rt_intern_hash(hash, rth, NULL, skb); goto done; no_route: @@ -2244,7 +2251,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); - skb->rtable = rth; + skb->dst = &rth->u.dst; return 0; } RT_CACHE_STAT_INC(in_hlist_search); @@ -2420,7 +2427,7 @@ static int ip_mkroute_output(struct rtable **rp, if (err == 0) { hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp); + err = rt_intern_hash(hash, rth, rp, NULL); } return err; @@ -2763,7 +2770,7 @@ static int rt_fill_info(struct net *net, struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; long expires; @@ -2907,7 +2914,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); - rt = skb->rtable; + rt = skb_rtable(skb); if (err == 0 && rt->u.dst.error) err = -rt->u.dst.error; } else { @@ -2927,7 +2934,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err) goto errout_free; - skb->rtable = rt; + skb->dst = &rt->u.dst; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fc79e341628..319c8852644 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -546,7 +546,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (th->rst) return; - if (skb->rtable->rt_type != RTN_LOCAL) + if (skb_rtable(skb)->rt_type != RTN_LOCAL) return; /* Swap the send and the receive. 
*/ @@ -1185,7 +1185,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) #endif /* Never answer to SYNs send to broadcast or multicast */ - if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; /* TW buckets are converted to open requests without diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 8a3875e36ec..497b2224536 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -48,7 +48,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, { struct nf_conntrack_expect *exp; struct iphdr *iph = ip_hdr(skb); - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; __be32 mask = 0; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index fad596bf32d..b6b588bed4e 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -258,10 +258,10 @@ META_COLLECTOR(int_rtclassid) META_COLLECTOR(int_rtiif) { - if (unlikely(skb->rtable == NULL)) + if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else - dst->value = skb->rtable->fl.iif; + dst->value = skb_rtable(skb)->fl.iif; } /************************************************************************** diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 8eb3e61cb70..cb2c50dbd42 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -393,7 +393,7 @@ static int sctp_v4_addr_valid(union sctp_addr *addr, return 0; /* Is this a broadcast address? */ - if (skb && skb->rtable->rt_flags & RTCF_BROADCAST) + if (skb && skb_rtable(skb)->rt_flags & RTCF_BROADCAST) return 0; return 1; @@ -572,7 +572,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk, /* What interface did this skb arrive on? */ static int sctp_v4_skb_iif(const struct sk_buff *skb) { - return skb->rtable->rt_iif; + return skb_rtable(skb)->rt_iif; } /* Was this packet marked by Explicit Congestion Notification? */ @@ -848,8 +848,8 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, skb->len, - &skb->rtable->rt_src, - &skb->rtable->rt_dst); + &skb_rtable(skb)->rt_src, + &skb_rtable(skb)->rt_dst); inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO : IP_PMTUDISC_DONT; -- cgit v1.2.3-70-g09d2 From adf30907d63893e4208dfe3f5c88ae12bc2f25d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jun 2009 05:19:30 +0000 Subject: net: skb->dst accessors Define three accessors to get/set dst attached to a skb struct dst_entry *skb_dst(const struct sk_buff *skb) void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) void skb_dst_drop(struct sk_buff *skb) This one should replace occurrences of : dst_release(skb->dst) skb->dst = NULL; Delete skb->dst field Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 4 +- drivers/infiniband/ulp/ipoib/ipoib_main.c | 30 ++++++------- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 10 ++--- drivers/net/pppol2tp.c | 11 +++-- drivers/s390/net/qeth_core_main.c | 4 +- drivers/s390/net/qeth_l3_main.c | 8 ++-- include/linux/skbuff.h | 13 +++++- include/net/dst.h | 12 ++++-- include/net/inet6_hashtables.h | 2 +- include/net/inet_hashtables.h | 2 +- include/net/ip6_route.h | 2 +- include/net/xfrm.h | 4 +- net/atm/br2684.c | 2 +- net/atm/clip.c | 14 +++--- net/bridge/br_netfilter.c | 18 ++++---- net/core/dev.c | 7 ++- net/core/neighbour.c | 11 +++-- net/core/skbuff.c | 4 +- net/dccp/ipv4.c | 4 +- net/dccp/ipv6.c | 8 ++-- net/dccp/output.c | 2 +- net/decnet/af_decnet.c | 6 ++- net/decnet/dn_neigh.c | 8 ++-- net/decnet/dn_nsp_out.c | 6 +-- net/decnet/dn_route.c | 25 +++++------ net/ipv4/arp.c | 2 +- net/ipv4/icmp.c | 10 ++--- net/ipv4/igmp.c | 4 +- net/ipv4/ip_forward.c | 4 +- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_gre.c | 23 +++++----- net/ipv4/ip_input.c | 6 +-- net/ipv4/ip_options.c | 6 +-- net/ipv4/ip_output.c | 20 ++++----- net/ipv4/ipip.c | 13 +++--- net/ipv4/ipmr.c | 13 +++--- net/ipv4/netfilter.c | 28 ++++++------ net/ipv4/netfilter/ipt_REJECT.c | 7 ++- net/ipv4/netfilter/nf_nat_standalone.c | 7 ++- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 14 +++--- net/ipv4/tcp_ipv4.c | 4 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/udp.c | 4 +- net/ipv4/xfrm4_input.c | 2 +- net/ipv4/xfrm4_mode_tunnel.c | 4 +- net/ipv4/xfrm4_output.c | 6 +-- net/ipv6/exthdrs.c | 40 ++++++++--------- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_input.c | 12 +++--- net/ipv6/ip6_output.c | 60 +++++++++++++------------- net/ipv6/ip6_tunnel.c | 26 ++++++----- net/ipv6/ip6mr.c | 13 +++--- net/ipv6/mcast.c | 17 +++++--- net/ipv6/ndisc.c | 4 +- net/ipv6/netfilter.c | 16 ++++--- net/ipv6/netfilter/ip6t_REJECT.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/reassembly.c | 22 +++++----- net/ipv6/route.c | 12 +++--- net/ipv6/sit.c | 21 +++++---- net/ipv6/tcp_ipv6.c | 8 ++-- net/ipv6/udp.c | 7 ++- net/ipv6/xfrm6_mode_tunnel.c | 4 +- net/ipv6/xfrm6_output.c | 4 +- net/netfilter/ipvs/ip_vs_xmit.c | 48 ++++++++++----------- net/netfilter/nf_conntrack_proto_gre.c | 2 +- net/netfilter/xt_TCPMSS.c | 6 +-- net/netfilter/xt_policy.c | 2 +- net/netfilter/xt_realm.c | 2 +- net/packet/af_packet.c | 6 +-- net/sched/cls_flow.c | 8 ++-- net/sched/cls_route.c | 2 +- net/sched/em_meta.c | 4 +- net/sched/sch_sfq.c | 2 +- net/sched/sch_teql.c | 6 +-- net/sctp/output.c | 6 +-- net/sunrpc/xprtsock.c | 2 +- net/xfrm/xfrm_input.c | 3 +- net/xfrm/xfrm_output.c | 21 ++++----- net/xfrm/xfrm_policy.c | 8 +++- security/selinux/hooks.c | 2 +- security/selinux/xfrm.c | 2 +- 83 files changed, 414 insertions(+), 390 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 4248c313936..181b1f32325 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1394,8 +1394,8 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, struct ipoib_dev_priv *priv = netdev_priv(dev); int e = skb_queue_empty(&priv->cm.skb_queue); - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); skb_queue_tail(&priv->cm.skb_queue, skb); if (e) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 69c6304cc94..e319d91f60a 100644 --- 
a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -561,7 +561,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) struct ipoib_neigh *neigh; unsigned long flags; - neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev); + neigh = ipoib_neigh_alloc(skb_dst(skb)->neighbour, skb->dev); if (!neigh) { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); @@ -570,9 +570,9 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) spin_lock_irqsave(&priv->lock, flags); - path = __path_find(dev, skb->dst->neighbour->ha + 4); + path = __path_find(dev, skb_dst(skb)->neighbour->ha + 4); if (!path) { - path = path_rec_create(dev, skb->dst->neighbour->ha + 4); + path = path_rec_create(dev, skb_dst(skb)->neighbour->ha + 4); if (!path) goto err_path; @@ -605,7 +605,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) goto err_drop; } } else - ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb_dst(skb)->neighbour->ha)); } else { neigh->ah = NULL; @@ -635,15 +635,15 @@ static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(skb->dev); /* Look up path record for unicasts */ - if (skb->dst->neighbour->ha[4] != 0xff) { + if (skb_dst(skb)->neighbour->ha[4] != 0xff) { neigh_add_path(skb, dev); return; } /* Add in the P_Key for multicasts */ - skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; - skb->dst->neighbour->ha[9] = priv->pkey & 0xff; - ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb); + skb_dst(skb)->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; + skb_dst(skb)->neighbour->ha[9] = priv->pkey & 0xff; + ipoib_mcast_send(dev, skb_dst(skb)->neighbour->ha + 4, skb); } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, @@ -708,16 +708,16 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) struct ipoib_neigh *neigh; unsigned long flags; - if (likely(skb->dst && skb->dst->neighbour)) { - if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) { + if (likely(skb_dst(skb) && skb_dst(skb)->neighbour)) { + if (unlikely(!*to_ipoib_neigh(skb_dst(skb)->neighbour))) { ipoib_path_lookup(skb, dev); return NETDEV_TX_OK; } - neigh = *to_ipoib_neigh(skb->dst->neighbour); + neigh = *to_ipoib_neigh(skb_dst(skb)->neighbour); if (unlikely((memcmp(&neigh->dgid.raw, - skb->dst->neighbour->ha + 4, + skb_dst(skb)->neighbour->ha + 4, sizeof(union ib_gid))) || (neigh->dev != dev))) { spin_lock_irqsave(&priv->lock, flags); @@ -743,7 +743,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } } else if (neigh->ah) { - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha)); + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb_dst(skb)->neighbour->ha)); return NETDEV_TX_OK; } @@ -772,7 +772,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n", - skb->dst ? "neigh" : "dst", + skb_dst(skb) ? "neigh" : "dst", be16_to_cpup((__be16 *) skb->data), IPOIB_QPN(phdr->hwaddr), phdr->hwaddr + 4); @@ -817,7 +817,7 @@ static int ipoib_hard_header(struct sk_buff *skb, * destination address onto the front of the skb so we can * figure out where to send the packet later. 
*/ - if ((!skb->dst || !skb->dst->neighbour) && daddr) { + if ((!skb_dst(skb) || !skb_dst(skb)->neighbour) && daddr) { struct ipoib_pseudoheader *phdr = (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 425e31112ed..a0e97532e71 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -261,7 +261,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, skb->dev = dev; - if (!skb->dst || !skb->dst->neighbour) { + if (!skb_dst(skb) || !skb_dst(skb)->neighbour) { /* put pseudoheader back on for next time */ skb_push(skb, sizeof (struct ipoib_pseudoheader)); } @@ -707,10 +707,10 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) out: if (mcast && mcast->ah) { - if (skb->dst && - skb->dst->neighbour && - !*to_ipoib_neigh(skb->dst->neighbour)) { - struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour, + if (skb_dst(skb) && + skb_dst(skb)->neighbour && + !*to_ipoib_neigh(skb_dst(skb)->neighbour)) { + struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb_dst(skb)->neighbour, skb->dev); if (neigh) { diff --git a/drivers/net/pppol2tp.c b/drivers/net/pppol2tp.c index 5981debcde5..e7935d09c89 100644 --- a/drivers/net/pppol2tp.c +++ b/drivers/net/pppol2tp.c @@ -433,8 +433,7 @@ static void pppol2tp_recv_dequeue_skb(struct pppol2tp_session *session, struct s * to the inner packet either */ secpath_reset(skb); - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); nf_reset(skb); po = pppox_sk(session_sock); @@ -976,7 +975,7 @@ static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msgh /* Calculate UDP checksum if configured to do so */ if (sk_tun->sk_no_check == UDP_CSUM_NOXMIT) skb->ip_summed = CHECKSUM_NONE; - else if (!(skb->dst->dev->features & NETIF_F_V4_CSUM)) { + else if (!(skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { skb->ip_summed = CHECKSUM_COMPLETE; csum = skb_checksum(skb, 0, udp_len, 0); uh->check = csum_tcpudp_magic(inet->saddr, inet->daddr, @@ -1172,14 +1171,14 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) nf_reset(skb); /* Get routing info from the tunnel socket */ - dst_release(skb->dst); - skb->dst = dst_clone(__sk_dst_get(sk_tun)); + skb_dst_drop(skb); + skb_dst_set(skb, dst_clone(__sk_dst_get(sk_tun))); pppol2tp_skb_set_owner_w(skb, sk_tun); /* Calculate UDP checksum if configured to do so */ if (sk_tun->sk_no_check == UDP_CSUM_NOXMIT) skb->ip_summed = CHECKSUM_NONE; - else if (!(skb->dst->dev->features & NETIF_F_V4_CSUM)) { + else if (!(skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { skb->ip_summed = CHECKSUM_COMPLETE; csum = skb_checksum(skb, 0, udp_len, 0); uh->check = csum_tcpudp_magic(inet->saddr, inet->daddr, diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 2994aa1ed46..74c49d9a8db 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -2937,8 +2937,8 @@ int qeth_get_cast_type(struct qeth_card *card, struct sk_buff *skb) if (card->info.type == QETH_CARD_TYPE_OSN) return cast_type; - if (skb->dst && skb->dst->neighbour) { - cast_type = skb->dst->neighbour->type; + if (skb_dst(skb) && skb_dst(skb)->neighbour) { + cast_type = skb_dst(skb)->neighbour->type; if ((cast_type == RTN_BROADCAST) || (cast_type == RTN_MULTICAST) || (cast_type == RTN_ANYCAST)) diff --git 
a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index cb64b0b534a..6f2386e9d6e 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -2549,9 +2549,9 @@ static void qeth_l3_fill_header(struct qeth_card *card, struct qeth_hdr *hdr, /* IPv4 */ hdr->hdr.l3.flags = qeth_l3_get_qeth_hdr_flags4(cast_type); memset(hdr->hdr.l3.dest_addr, 0, 12); - if ((skb->dst) && (skb->dst->neighbour)) { + if ((skb_dst(skb)) && (skb_dst(skb)->neighbour)) { *((u32 *) (&hdr->hdr.l3.dest_addr[12])) = - *((u32 *) skb->dst->neighbour->primary_key); + *((u32 *) skb_dst(skb)->neighbour->primary_key); } else { /* fill in destination address used in ip header */ *((u32 *) (&hdr->hdr.l3.dest_addr[12])) = @@ -2562,9 +2562,9 @@ static void qeth_l3_fill_header(struct qeth_card *card, struct qeth_hdr *hdr, hdr->hdr.l3.flags = qeth_l3_get_qeth_hdr_flags6(cast_type); if (card->info.type == QETH_CARD_TYPE_IQD) hdr->hdr.l3.flags &= ~QETH_HDR_PASSTHRU; - if ((skb->dst) && (skb->dst->neighbour)) { + if ((skb_dst(skb)) && (skb_dst(skb)->neighbour)) { memcpy(hdr->hdr.l3.dest_addr, - skb->dst->neighbour->primary_key, 16); + skb_dst(skb)->neighbour->primary_key, 16); } else { /* fill in destination address used in ip header */ memcpy(hdr->hdr.l3.dest_addr, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a3ae3c52583..9ef6eb20247 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -323,7 +323,6 @@ struct sk_buff { struct net_device *dev; union { - struct dst_entry *dst; unsigned long _skb_dst; }; #ifdef CONFIG_XFRM @@ -426,9 +425,19 @@ extern void skb_dma_unmap(struct device *dev, struct sk_buff *skb, enum dma_data_direction dir); #endif +static inline struct dst_entry *skb_dst(const struct sk_buff *skb) +{ + return (struct dst_entry *)skb->_skb_dst; +} + +static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) +{ + skb->_skb_dst = (unsigned long)dst; +} + static inline struct rtable *skb_rtable(const struct sk_buff *skb) { - return (struct rtable *)skb->_skb_dst; + return (struct rtable *)skb_dst(skb); } extern void kfree_skb(struct sk_buff *skb); diff --git a/include/net/dst.h b/include/net/dst.h index 6be3b082a07..7fc409c19b3 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -195,6 +195,12 @@ struct dst_entry * dst_clone(struct dst_entry * dst) } extern void dst_release(struct dst_entry *dst); +static inline void skb_dst_drop(struct sk_buff *skb) +{ + if (skb->_skb_dst) + dst_release(skb_dst(skb)); + skb->_skb_dst = 0UL; +} /* Children define the path of the packet through the * Linux networking. Thus, destinations are stackable. @@ -246,7 +252,7 @@ static inline void dst_negative_advice(struct dst_entry **dst_p) static inline void dst_link_failure(struct sk_buff *skb) { - struct dst_entry * dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops && dst->ops->link_failure) dst->ops->link_failure(skb); } @@ -265,13 +271,13 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) /* Output packet to network from transport. */ static inline int dst_output(struct sk_buff *skb) { - return skb->dst->output(skb); + return skb_dst(skb)->output(skb); } /* Input packet from network to transport. 
*/ static inline int dst_input(struct sk_buff *skb) { - return skb->dst->input(skb); + return skb_dst(skb)->input(skb); } static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index f74665d7bea..22c73a77cd9 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -100,7 +100,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, if (unlikely(sk = skb_steal_sock(skb))) return sk; - else return __inet6_lookup(dev_net(skb->dst->dev), hashinfo, + else return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, &ipv6_hdr(skb)->saddr, sport, &ipv6_hdr(skb)->daddr, ntohs(dport), inet6_iif(skb)); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index a44e2248b2e..d522dcf3031 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -385,7 +385,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, if (unlikely(sk = skb_steal_sock(skb))) return sk; else - return __inet_lookup(dev_net(skb->dst->dev), hashinfo, + return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, iph->saddr, sport, iph->daddr, dport, inet_iif(skb)); } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 5f53db7e4e5..0e1b8aebaff 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -142,7 +142,7 @@ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, static inline int ipv6_unicast_destination(struct sk_buff *skb) { - struct rt6_info *rt = (struct rt6_info *) skb->dst; + struct rt6_info *rt = (struct rt6_info *) skb_dst(skb); return rt->rt6i_flags & RTF_LOCAL; } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2e9f5c0018a..736bca45088 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -994,7 +994,7 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, return __xfrm_policy_check(sk, ndir, skb, family); return (!net->xfrm.policy_count[dir] && !skb->sp) || - (skb->dst->flags & DST_NOPOLICY) || + (skb_dst(skb)->flags & DST_NOPOLICY) || __xfrm_policy_check(sk, ndir, skb, family); } @@ -1048,7 +1048,7 @@ static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) struct net *net = dev_net(skb->dev); return !net->xfrm.policy_count[XFRM_POLICY_OUT] || - (skb->dst->flags & DST_NOXFRM) || + (skb_dst(skb)->flags & DST_NOXFRM) || __xfrm_route_forward(skb, family); } diff --git a/net/atm/br2684.c b/net/atm/br2684.c index bfa8fa9894f..2912665fc58 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -228,7 +228,7 @@ static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev) struct br2684_dev *brdev = BRPRIV(dev); struct br2684_vcc *brvcc; - pr_debug("br2684_start_xmit, skb->dst=%p\n", skb->dst); + pr_debug("br2684_start_xmit, skb_dst(skb)=%p\n", skb_dst(skb)); read_lock(&devs_lock); brvcc = pick_outgoing_vcc(skb, brdev); if (brvcc == NULL) { diff --git a/net/atm/clip.c b/net/atm/clip.c index fb7623c080f..e65a3b1477f 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -369,16 +369,16 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev) unsigned long flags; pr_debug("clip_start_xmit (skb %p)\n", skb); - if (!skb->dst) { - printk(KERN_ERR "clip_start_xmit: skb->dst == NULL\n"); + if (!skb_dst(skb)) { + printk(KERN_ERR "clip_start_xmit: skb_dst(skb) == NULL\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return 0; } - if (!skb->dst->neighbour) { + if 
(!skb_dst(skb)->neighbour) { #if 0 - skb->dst->neighbour = clip_find_neighbour(skb->dst, 1); - if (!skb->dst->neighbour) { + skb_dst(skb)->neighbour = clip_find_neighbour(skb_dst(skb), 1); + if (!skb_dst(skb)->neighbour) { dev_kfree_skb(skb); /* lost that one */ dev->stats.tx_dropped++; return 0; @@ -389,7 +389,7 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev) dev->stats.tx_dropped++; return 0; } - entry = NEIGH2ENTRY(skb->dst->neighbour); + entry = NEIGH2ENTRY(skb_dst(skb)->neighbour); if (!entry->vccs) { if (time_after(jiffies, entry->expires)) { /* should be resolved */ @@ -406,7 +406,7 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev) } pr_debug("neigh %p, vccs %p\n", entry, entry->vccs); ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc; - pr_debug("using neighbour %p, vcc %p\n", skb->dst->neighbour, vcc); + pr_debug("using neighbour %p, vcc %p\n", skb_dst(skb)->neighbour, vcc); if (entry->vccs->encap) { void *here; diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index e0ceb66a9ec..d22f611e400 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -242,7 +242,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) return 0; } dst_hold(&rt->u.dst); - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); skb->dev = nf_bridge->physindev; nf_bridge_push_encap_header(skb); @@ -322,7 +322,7 @@ static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) skb->dev = bridge_parent(skb->dev); if (skb->dev) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); nf_bridge_pull_encap_header(skb); @@ -375,7 +375,7 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb) /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. */ if (((struct dst_entry *)rt)->dev == dev) { - skb->dst = (struct dst_entry *)rt; + skb_dst_set(skb, (struct dst_entry *)rt); goto bridged_dnat; } /* we are sure that forwarding is disabled, so printing @@ -389,7 +389,7 @@ free_skb: kfree_skb(skb); return 0; } else { - if (skb->dst->dev == dev) { + if (skb_dst(skb)->dev == dev) { bridged_dnat: /* Tell br_nf_local_out this is a * bridged frame */ @@ -412,7 +412,7 @@ bridged_dnat: return 0; } dst_hold(&rt->u.dst); - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); } skb->dev = nf_bridge->physindev; @@ -633,10 +633,8 @@ static unsigned int br_nf_local_in(unsigned int hook, struct sk_buff *skb, { struct rtable *rt = skb_rtable(skb); - if (rt && rt == bridge_parent_rtable(in)) { - dst_release(&rt->u.dst); - skb->dst = NULL; - } + if (rt && rt == bridge_parent_rtable(in)) + skb_dst_drop(skb); return NF_ACCEPT; } @@ -851,7 +849,7 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, return NF_ACCEPT; #ifdef CONFIG_NETFILTER_DEBUG - if (skb->dst == NULL) { + if (skb_dst(skb) == NULL) { printk(KERN_INFO "br_netfilter post_routing: skb->dst == NULL\n"); goto print_error; } diff --git a/net/core/dev.c b/net/core/dev.c index e2fcc5f1017..34b49a6a22f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1693,10 +1693,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, * If device doesnt need skb->dst, release it right now while * its hot in this cpu cache */ - if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) && skb->dst) { - dst_release(skb->dst); - skb->dst = NULL; - } + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + rc = ops->ndo_start_xmit(skb, dev); if (rc == 0) txq_trans_update(txq); diff --git a/net/core/neighbour.c 
b/net/core/neighbour.c index a1cbce7fdae..c54229befcf 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1088,8 +1088,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, struct neighbour *n1 = neigh; write_unlock_bh(&neigh->lock); /* On shaper/eql skb->dst->neighbour != neigh :( */ - if (skb->dst && skb->dst->neighbour) - n1 = skb->dst->neighbour; + if (skb_dst(skb) && skb_dst(skb)->neighbour) + n1 = skb_dst(skb)->neighbour; n1->output(skb); write_lock_bh(&neigh->lock); } @@ -1182,7 +1182,7 @@ EXPORT_SYMBOL(neigh_compat_output); int neigh_resolve_output(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh; int rc = 0; @@ -1229,7 +1229,7 @@ EXPORT_SYMBOL(neigh_resolve_output); int neigh_connected_output(struct sk_buff *skb) { int err; - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; @@ -1298,8 +1298,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); mod_timer(&tbl->proxy_timer, sched_next); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8e815e685f2..6adf19ec95c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -381,7 +381,7 @@ static void kfree_skbmem(struct sk_buff *skb) static void skb_release_head_state(struct sk_buff *skb) { - dst_release(skb->dst); + skb_dst_drop(skb); #ifdef CONFIG_XFRM secpath_put(skb->sp); #endif @@ -521,7 +521,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; - new->dst = dst_clone(old->dst); + skb_dst_set(new, dst_clone(skb_dst(old))); #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 2cf48ba0dbb..a0a36c9e6cc 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -507,7 +507,7 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) const struct iphdr *rxiph; struct sk_buff *skb; struct dst_entry *dst; - struct net *net = dev_net(rxskb->dst->dev); + struct net *net = dev_net(skb_dst(rxskb)->dev); struct sock *ctl_sk = net->dccp.v4_ctl_sk; /* Never send a reset in response to a reset. 
*/ @@ -528,7 +528,7 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) rxiph = ip_hdr(rxskb); dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr, rxiph->daddr); - skb->dst = dst_clone(dst); + skb_dst_set(skb, dst_clone(dst)); bh_lock_sock(ctl_sk); err = ip_build_and_send_pkt(skb, ctl_sk, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index b963f35c65f..05ea7440d9e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -314,8 +314,9 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) struct ipv6hdr *rxip6h; struct sk_buff *skb; struct flowi fl; - struct net *net = dev_net(rxskb->dst->dev); + struct net *net = dev_net(skb_dst(rxskb)->dev); struct sock *ctl_sk = net->dccp.v6_ctl_sk; + struct dst_entry *dst; if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) return; @@ -342,8 +343,9 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) security_skb_classify_flow(rxskb, &fl); /* sk = NULL, but it is safe for now. RST socket required. */ - if (!ip6_dst_lookup(ctl_sk, &skb->dst, &fl)) { - if (xfrm_lookup(net, &skb->dst, &fl, NULL, 0) >= 0) { + if (!ip6_dst_lookup(ctl_sk, &dst, &fl)) { + if (xfrm_lookup(net, &dst, &fl, NULL, 0) >= 0) { + skb_dst_set(skb, dst); ip6_xmit(ctl_sk, skb, &fl, NULL, 0); DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); diff --git a/net/dccp/output.c b/net/dccp/output.c index 36bcc00654d..c0e88c16d08 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -350,7 +350,7 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, sk->sk_prot->max_header); - skb->dst = dst_clone(dst); + skb_dst_set(skb, dst_clone(dst)); dreq = dccp_rsk(req); if (inet_rsk(req)->acked) /* increase ISS upon retransmission */ diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index bccb3887773..a5e3a593e47 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1075,6 +1075,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) int err = 0; unsigned char type; long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + struct dst_entry *dst; lock_sock(sk); @@ -1102,8 +1103,9 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) } release_sock(sk); - dst_release(xchg(&newsk->sk_dst_cache, skb->dst)); - skb->dst = NULL; + dst = skb_dst(skb); + dst_release(xchg(&newsk->sk_dst_cache, dst)); + skb_dst_set(skb, NULL); DN_SK(newsk)->state = DN_CR; DN_SK(newsk)->addrrem = cb->src_port; diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 05b5aa05e50..923786bd6d0 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -204,7 +204,7 @@ static void dn_short_error_report(struct neighbour *neigh, struct sk_buff *skb) static int dn_neigh_output_packet(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; @@ -224,7 +224,7 @@ static int dn_neigh_output_packet(struct sk_buff *skb) static int dn_long_output(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3; @@ -270,7 +270,7 @@ static int dn_long_output(struct sk_buff *skb) static int dn_short_output(struct sk_buff *skb) { - 
struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; @@ -313,7 +313,7 @@ static int dn_short_output(struct sk_buff *skb) */ static int dn_phase3_output(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index da04f459337..a65e929ce76 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -85,7 +85,7 @@ static void dn_nsp_send(struct sk_buff *skb) dst = sk_dst_check(sk, 0); if (dst) { try_again: - skb->dst = dst; + skb_dst_set(skb, dst); dst_output(skb); return; } @@ -582,7 +582,7 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, * to be able to send disc packets out which have no socket * associations. */ - skb->dst = dst_clone(dst); + skb_dst_set(skb, dst_clone(dst)); dst_output(skb); } @@ -611,7 +611,7 @@ void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg, int ddl = 0; gfp_t gfp = GFP_ATOMIC; - dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb->dst, ddl, + dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb_dst(skb), ddl, NULL, cb->src_port, cb->dst_port); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 0cc4394117d..1d6ca8a98dc 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -678,7 +678,7 @@ out: static int dn_output(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; struct net_device *dev = dst->dev; struct dn_skb_cb *cb = DN_SKB_CB(skb); @@ -717,7 +717,7 @@ error: static int dn_forward(struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct dn_dev *dn_db = dst->dev->dn_ptr; struct dn_route *rt; struct neighbour *neigh = dst->neighbour; @@ -730,7 +730,7 @@ static int dn_forward(struct sk_buff *skb) goto drop; /* Ensure that we have enough space for headers */ - rt = (struct dn_route *)skb->dst; + rt = (struct dn_route *)skb_dst(skb); header_len = dn_db->use_long ? 
21 : 6; if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+header_len)) goto drop; @@ -1392,7 +1392,8 @@ make_route: goto e_neighbour; hash = dn_hash(rt->fl.fld_src, rt->fl.fld_dst); - dn_insert_route(rt, hash, (struct dn_route **)&skb->dst); + dn_insert_route(rt, hash, &rt); + skb_dst_set(skb, &rt->u.dst); done: if (neigh) @@ -1424,7 +1425,7 @@ static int dn_route_input(struct sk_buff *skb) struct dn_skb_cb *cb = DN_SKB_CB(skb); unsigned hash = dn_hash(cb->src, cb->dst); - if (skb->dst) + if (skb_dst(skb)) return 0; rcu_read_lock(); @@ -1437,7 +1438,7 @@ static int dn_route_input(struct sk_buff *skb) (rt->fl.iif == cb->iif)) { dst_use(&rt->u.dst, jiffies); rcu_read_unlock(); - skb->dst = (struct dst_entry *)rt; + skb_dst_set(skb, (struct dst_entry *)rt); return 0; } } @@ -1449,7 +1450,7 @@ static int dn_route_input(struct sk_buff *skb) static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { - struct dn_route *rt = (struct dn_route *)skb->dst; + struct dn_route *rt = (struct dn_route *)skb_dst(skb); struct rtmsg *r; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); @@ -1554,7 +1555,7 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void err = dn_route_input(skb); local_bh_enable(); memset(cb, 0, sizeof(struct dn_skb_cb)); - rt = (struct dn_route *)skb->dst; + rt = (struct dn_route *)skb_dst(skb); if (!err && -rt->u.dst.error) err = rt->u.dst.error; } else { @@ -1570,7 +1571,7 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void skb->dev = NULL; if (err) goto out_free; - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -1622,15 +1623,15 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) rt = rcu_dereference(rt->u.dst.dn_next), idx++) { if (idx < s_idx) continue; - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); rcu_read_unlock_bh(); goto done; } - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); } rcu_read_unlock_bh(); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 816494f271a..8a3881e28ac 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -468,7 +468,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) __be32 paddr; struct neighbour *n; - if (!skb->dst) { + if (!skb_dst(skb)) { printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); kfree_skb(skb); return 1; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 94f75efae93..97c410e8438 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -591,13 +591,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) goto relookup_failed; /* Ugh! */ - odst = skb_in->dst; + odst = skb_dst(skb_in); err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, RT_TOS(tos), rt2->u.dst.dev); dst_release(&rt2->u.dst); rt2 = skb_rtable(skb_in); - skb_in->dst = odst; + skb_dst_set(skb_in, odst); } if (err) @@ -659,7 +659,7 @@ static void icmp_unreach(struct sk_buff *skb) u32 info = 0; struct net *net; - net = dev_net(skb->dst->dev); + net = dev_net(skb_dst(skb)->dev); /* * Incomplete header ? 
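The hunks above are one mechanical substitution: every read of skb->dst becomes skb_dst(skb), and every plain assignment goes through skb_dst_set(). The accessors themselves are outside this excerpt; a minimal sketch of what they presumably look like, assuming the dst member is simply wrapped in place (whether the field keeps its old name behind the helpers is not shown by this diff):

	/*
	 * Sketch only: the real definitions would live in
	 * include/linux/skbuff.h, and the field name here is an assumption.
	 */
	static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
	{
		return skb->dst;
	}

	static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
	{
		skb->dst = dst;
	}

An inline wrapper like this compiles to the same code as the direct access, but leaves a single grep-able point of control if the member is later renamed or repacked.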
@@ -822,7 +822,7 @@ static void icmp_echo(struct sk_buff *skb) { struct net *net; - net = dev_net(skb->dst->dev); + net = dev_net(skb_dst(skb)->dev); if (!net->ipv4.sysctl_icmp_echo_ignore_all) { struct icmp_bxm icmp_param; @@ -873,7 +873,7 @@ static void icmp_timestamp(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(dev_net(skb->dst->dev), ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); goto out; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index afabd2758b6..01b4284ed69 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -311,7 +311,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); skb->dev = dev; skb_reserve(skb, LL_RESERVED_SPACE(dev)); @@ -659,7 +659,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, return -1; } - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); skb_reserve(skb, LL_RESERVED_SPACE(dev)); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 0761cd9bbd1..a2991bc8e32 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -42,7 +42,7 @@ static int ip_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -123,7 +123,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ - IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7985346653b..1f1b82475ea 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -573,7 +573,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) struct ipq *qp; struct net *net; - net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev); + net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Start by cleaning up the memory. */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 85ddad45a91..44e2a3d2359 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -643,8 +643,7 @@ static int ipgre_rcv(struct sk_buff *skb) stats->rx_packets++; stats->rx_bytes += len; skb->dev = tunnel->dev; - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); nf_reset(skb); skb_reset_network_header(skb); @@ -698,7 +697,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if ((dst = tiph->daddr) == 0) { /* NBMA tunnel */ - if (skb->dst == NULL) { + if (skb_dst(skb) == NULL) { stats->tx_fifo_errors++; goto tx_error; } @@ -712,7 +711,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) else if (skb->protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; int addr_type; - struct neighbour *neigh = skb->dst->neighbour; + struct neighbour *neigh = skb_dst(skb)->neighbour; if (neigh == NULL) goto tx_error; @@ -766,10 +765,10 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (df) mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; else - mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + mtu = skb_dst(skb) ? 
dst_mtu(skb_dst(skb)) : dev->mtu; - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (skb->protocol == htons(ETH_P_IP)) { df |= (old_iph->frag_off&htons(IP_DF)); @@ -783,14 +782,14 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } #ifdef CONFIG_IPV6 else if (skb->protocol == htons(ETH_P_IPV6)) { - struct rt6_info *rt6 = (struct rt6_info *)skb->dst; + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); - if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) { + if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { if ((tunnel->parms.iph.daddr && !ipv4_is_multicast(tunnel->parms.iph.daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; - skb->dst->metrics[RTAX_MTU-1] = mtu; + skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; } } @@ -837,8 +836,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* * Push down and install the IPIP header. diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index cea784b0aa4..490ce20faf3 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -329,7 +329,7 @@ static int ip_rcv_finish(struct sk_buff *skb) * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (skb->dst == NULL) { + if (skb_dst(skb) == NULL) { int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { @@ -344,9 +344,9 @@ static int ip_rcv_finish(struct sk_buff *skb) } #ifdef CONFIG_NET_CLS_ROUTE - if (unlikely(skb->dst->tclassid)) { + if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); - u32 idx = skb->dst->tclassid; + u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 7e1074ffdbd..94bf105ef3c 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -143,7 +143,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) __be32 addr; memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(dev_net(skb->dst->dev), addr) != RTN_LOCAL) { + if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { dopt->ts_needtime = 1; soffset += 8; } @@ -624,12 +624,12 @@ int ip_options_rcv_srr(struct sk_buff *skb) memcpy(&nexthop, &optptr[srrptr-1], 4); rt = skb_rtable(skb); - skb->dst = NULL; + skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { ip_rt_put(rt2); - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); return -EINVAL; } ip_rt_put(rt); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d845ebfcca..3d6167fb2d9 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -95,7 +95,7 @@ int __ip_local_out(struct sk_buff *skb) iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev, + return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, dst_output); } @@ -118,7 +118,7 @@ static int ip_dev_loopback_xmit(struct sk_buff 
*newskb) __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!newskb->dst); + WARN_ON(!skb_dst(newskb)); netif_rx(newskb); return 0; } @@ -176,7 +176,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); @@ -217,14 +217,14 @@ static inline int ip_skb_dst_mtu(struct sk_buff *skb) struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? - skb->dst->dev->mtu : dst_mtu(skb->dst); + skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); } static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ - if (skb->dst->xfrm != NULL) { + if (skb_dst(skb)->xfrm != NULL) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } @@ -296,7 +296,7 @@ int ip_mc_output(struct sk_buff *skb) int ip_output(struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; + struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); @@ -355,7 +355,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) } sk_setup_caps(sk, &rt->u.dst); } - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) @@ -401,8 +401,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - dst_release(to->dst); - to->dst = dst_clone(from->dst); + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); to->dev = from->dev; to->mark = from->mark; @@ -1294,7 +1294,7 @@ int ip_push_pending_frames(struct sock *sk) * on dst refcount */ inet->cork.dst = NULL; - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); if (iph->protocol == IPPROTO_ICMP) icmp_out_count(net, ((struct icmphdr *) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 0c6e7bf18a4..93e2b787da2 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -370,8 +370,7 @@ static int ipip_rcv(struct sk_buff *skb) tunnel->dev->stats.rx_packets++; tunnel->dev->stats.rx_bytes += skb->len; skb->dev = tunnel->dev; - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); nf_reset(skb); ipip_ecn_decapsulate(iph, skb); netif_rx(skb); @@ -447,15 +446,15 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tiph->frag_off) mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); else - mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; if (mtu < 68) { stats->collisions++; ip_rt_put(rt); goto tx_error; } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); df |= (old_iph->frag_off&htons(IP_DF)); @@ -502,8 +501,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* * Push down and install the IPIP header. 
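Several hunks above (the GRE and IPIP receive paths, pneigh_enqueue(), the dn_route dump loop) collapse the open-coded pair dst_release(skb->dst); skb->dst = NULL; and its dst_release(xchg(&skb->dst, NULL)) variant into a single skb_dst_drop(). A plausible sketch of that helper, assuming it merely wraps the old two-step idiom (its real definition is not part of this excerpt):

	static inline void skb_dst_drop(struct sk_buff *skb)
	{
		if (skb_dst(skb)) {
			dst_release(skb_dst(skb));
			skb_dst_set(skb, NULL);
		}
	}

Unlike the xchg() form this sketch is not atomic, which is fine for these call sites: they all run with exclusive ownership of the skb.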
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 69dd058283e..ffd98610446 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -651,7 +651,7 @@ static int ipmr_cache_report(struct net *net, ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; - skb->dst = dst_clone(pkt->dst); + skb_dst_set(skb, dst_clone(skb_dst(pkt))); /* * Add our header @@ -1201,7 +1201,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(iph, skb->dst, NULL); + ip_select_ident(iph, skb_dst(skb), NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1212,7 +1212,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -1290,8 +1290,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) vif->pkt_out++; vif->bytes_out += skb->len; - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. @@ -1543,8 +1543,7 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) skb->protocol = htons(ETH_P_IP); skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); reg_dev->stats.rx_bytes += skb->len; reg_dev->stats.rx_packets++; nf_reset(skb); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index fdf6811c31a..1725dc0ef68 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -12,7 +12,7 @@ /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) { - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi fl = {}; @@ -41,8 +41,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) return -1; /* Drop old route. */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ @@ -50,7 +50,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) if (ip_route_output_key(net, &rt, &fl) != 0) return -1; - odst = skb->dst; + odst = skb_dst(skb); if (ip_route_input(skb, iph->daddr, iph->saddr, RT_TOS(iph->tos), rt->u.dst.dev) != 0) { dst_release(&rt->u.dst); @@ -60,18 +60,22 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) dst_release(odst); } - if (skb->dst->error) + if (skb_dst(skb)->error) return -1; #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - xfrm_decode_session(skb, &fl, AF_INET) == 0) - if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0)) + xfrm_decode_session(skb, &fl, AF_INET) == 0) { + struct dst_entry *dst = skb_dst(skb); + skb_dst_set(skb, NULL); + if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) return -1; + skb_dst_set(skb, dst); + } #endif /* Change in oif may mean change in hh_len. 
*/ - hh_len = skb->dst->dev->hard_header_len; + hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -1; @@ -92,7 +96,7 @@ int ip_xfrm_me_harder(struct sk_buff *skb) if (xfrm_decode_session(skb, &fl, AF_INET) < 0) return -1; - dst = skb->dst; + dst = skb_dst(skb); if (dst->xfrm) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); @@ -100,11 +104,11 @@ int ip_xfrm_me_harder(struct sk_buff *skb) if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) return -1; - dst_release(skb->dst); - skb->dst = dst; + skb_dst_drop(skb); + skb_dst_set(skb, dst); /* Change in oif may mean change in hh_len. */ - hh_len = skb->dst->dev->hard_header_len; + hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -1; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 0b4b6e0ff2b..c93ae44bff2 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -108,17 +108,16 @@ static void send_reset(struct sk_buff *oldskb, int hook) addr_type = RTN_LOCAL; /* ip_route_me_harder expects skb->dst to be set */ - dst_hold(oldskb->dst); - nskb->dst = oldskb->dst; + skb_dst_set(nskb, dst_clone(skb_dst(oldskb))); if (ip_route_me_harder(nskb, addr_type)) goto free_nskb; - niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); + niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); nskb->ip_summed = CHECKSUM_NONE; /* "Never happens" */ - if (nskb->len > dst_mtu(nskb->dst)) + if (nskb->len > dst_mtu(skb_dst(nskb))) goto free_nskb; nf_ct_attach(nskb, oldskb); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index b7dd695691a..5567bd0d075 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -167,10 +167,9 @@ nf_nat_in(unsigned int hooknum, ret = nf_nat_fn(hooknum, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && - daddr != ip_hdr(skb)->daddr) { - dst_release(skb->dst); - skb->dst = NULL; - } + daddr != ip_hdr(skb)->daddr) + skb_dst_drop(skb); + return ret; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index f774651f0a4..3dc9171a272 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -343,7 +343,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); skb_reset_network_header(skb); iph = ip_hdr(skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f20060ac2f0..a849bb15d86 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1118,7 +1118,7 @@ restart: if (rp) *rp = rth; else - skb->dst = &rth->u.dst; + skb_dst_set(skb, &rth->u.dst); return 0; } @@ -1217,7 +1217,7 @@ restart: if (rp) *rp = rt; else - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); return 0; } @@ -2251,7 +2251,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); - skb->dst = &rth->u.dst; + skb_dst_set(skb, &rth->u.dst); return 0; } RT_CACHE_STAT_INC(in_hlist_search); @@ -2934,7 +2934,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err) goto errout_free; - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -2975,15 +2975,15 @@ int 
ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; if (rt_is_expired(rt)) continue; - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); rcu_read_unlock_bh(); goto done; } - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); } rcu_read_unlock_bh(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 319c8852644..5a1ca2698c8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -590,7 +590,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; - net = dev_net(skb->dst->dev); + net = dev_net(skb_dst(skb)->dev); ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); @@ -617,7 +617,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 79c39dc9b01..416fc4c2e7e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2202,7 +2202,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->dst = dst_clone(dst); + skb_dst_set(skb, dst_clone(dst)); mss = dst_metric(dst, RTAX_ADVMSS); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7a1d1ce22e6..8f4158d7c9a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -328,7 +328,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, if (unlikely(sk = skb_steal_sock(skb))) return sk; else - return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport, + return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), udptable); } @@ -1237,7 +1237,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct sock *sk; struct udphdr *uh; unsigned short ulen; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); __be32 saddr, daddr; struct net *net = dev_net(skb->dev); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 4ec2162a437..f9f922a0ba8 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -23,7 +23,7 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) { - if (skb->dst == NULL) { + if (skb_dst(skb) == NULL) { const struct iphdr *iph = ip_hdr(skb); if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 7135279f3f8..3444f3b34ec 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -28,7 +28,7 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb) */ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct iphdr *top_iph; int flags; @@ -41,7 +41,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->ihl = 5; top_iph->version = 4; - top_iph->protocol = 
xfrm_af2proto(skb->dst->ops->family); + top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family); /* DS disclosed */ top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos, diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 8c3180adddb..c908bd99bcb 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -29,7 +29,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; - dst = skb->dst; + dst = skb_dst(skb); mtu = dst_mtu(dst); if (skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -72,7 +72,7 @@ EXPORT_SYMBOL(xfrm4_prepare_output); static int xfrm4_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER - if (!skb->dst->xfrm) { + if (!skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } @@ -87,6 +87,6 @@ static int xfrm4_output_finish(struct sk_buff *skb) int xfrm4_output(struct sk_buff *skb) { return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, - NULL, skb->dst->dev, xfrm4_output_finish, + NULL, skb_dst(skb)->dev, xfrm4_output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 1c7f400a3cf..4aae658e550 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -277,7 +277,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - IP6_INC_STATS_BH(dev_net(skb->dst->dev), ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; @@ -288,7 +288,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) dstbuf = opt->dst1; #endif - dst = dst_clone(skb->dst); + dst = dst_clone(skb_dst(skb)); if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { dst_release(dst); skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; @@ -333,7 +333,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; @@ -343,7 +343,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -358,7 +358,7 @@ looped_back: * processed by own */ if (!addr) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -384,7 +384,7 @@ looped_back: goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; @@ -403,7 +403,7 @@ looped_back: n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - @@ -417,7 +417,7 @@ looped_back: if 
(skb_cloned(skb)) { /* the copy is a forwarded packet */ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; @@ -440,13 +440,13 @@ looped_back: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } - if (!ipv6_chk_home_addr(dev_net(skb->dst->dev), addr)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -458,7 +458,7 @@ looped_back: } if (ipv6_addr_is_multicast(addr)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -468,17 +468,17 @@ looped_back: ipv6_addr_copy(addr, &ipv6_hdr(skb)->daddr); ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &daddr); - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); ip6_route_input(skb); - if (skb->dst->error) { + if (skb_dst(skb)->error) { skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; } - if (skb->dst->dev->flags&IFF_LOOPBACK) { + if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0, skb->dev); @@ -494,7 +494,7 @@ looped_back: return -1; unknown_rh: - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; @@ -552,11 +552,11 @@ void ipv6_exthdrs_exit(void) **********************************/ /* - * Note: we cannot rely on skb->dst before we assign it in ip6_route_input(). + * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). */ static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb) { - return skb->dst ? ip6_dst_idev(skb->dst) : __in6_dev_get(skb->dev); + return skb_dst(skb) ? 
ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev); } /* Router Alert as of RFC 2711 */ @@ -581,7 +581,7 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); u32 pkt_len; - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 3c3732d50c1..cc4797dd832 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -228,7 +228,7 @@ int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) __inet6_csk_dst_store(sk, dst, NULL, NULL); } - skb->dst = dst_clone(dst); + skb_dst_set(skb, dst_clone(dst)); /* Restore final destination back after routing done */ ipv6_addr_copy(&fl.fl6_dst, &np->daddr); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index bc1a920c34a..c3a07d75b5f 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -48,7 +48,7 @@ inline int ip6_rcv_finish( struct sk_buff *skb) { - if (skb->dst == NULL) + if (skb_dst(skb) == NULL) ip6_route_input(skb); return dst_input(skb); @@ -91,7 +91,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt * arrived via the sending interface (ethX), because of the * nature of scoping architecture. --yoshfuji */ - IP6CB(skb)->iif = skb->dst ? ip6_dst_idev(skb->dst)->dev->ifindex : dev->ifindex; + IP6CB(skb)->iif = skb_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; @@ -161,7 +161,7 @@ static int ip6_input_finish(struct sk_buff *skb) int nexthdr, raw; u8 hash; struct inet6_dev *idev; - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); /* * Parse extension headers @@ -169,7 +169,7 @@ static int ip6_input_finish(struct sk_buff *skb) rcu_read_lock(); resubmit: - idev = ip6_dst_idev(skb->dst); + idev = ip6_dst_idev(skb_dst(skb)); if (!pskb_pull(skb, skb_transport_offset(skb))) goto discard; nhoff = IP6CB(skb)->nhoff; @@ -242,8 +242,8 @@ int ip6_mc_input(struct sk_buff *skb) struct ipv6hdr *hdr; int deliver; - IP6_UPD_PO_STATS_BH(dev_net(skb->dst->dev), - ip6_dst_idev(skb->dst), IPSTATS_MIB_INMCAST, + IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev), + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, skb->len); hdr = ipv6_hdr(skb); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 735a2bf4b5f..c8dc8e5a822 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -78,7 +78,7 @@ int __ip6_local_out(struct sk_buff *skb) len = 0; ipv6_hdr(skb)->payload_len = htons(len); - return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev, + return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, dst_output); } @@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(ip6_local_out); static int ip6_output_finish(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); if (dst->hh) return neigh_hh_output(dst->hh, skb); @@ -117,7 +117,7 @@ static int ip6_dev_loopback_xmit(struct sk_buff *newskb) __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!newskb->dst); + WARN_ON(!skb_dst(newskb)); netif_rx(newskb); return 0; @@ -126,7 +126,7 @@ static int ip6_dev_loopback_xmit(struct sk_buff *newskb) static int ip6_output2(struct 
sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); @@ -134,7 +134,7 @@ static int ip6_output2(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL; - struct inet6_dev *idev = ip6_dst_idev(skb->dst); + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) && ((mroute6_socket(dev_net(dev)) && @@ -172,21 +172,21 @@ static inline int ip6_skb_dst_mtu(struct sk_buff *skb) struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ? - skb->dst->dev->mtu : dst_mtu(skb->dst); + skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); } int ip6_output(struct sk_buff *skb) { - struct inet6_dev *idev = ip6_dst_idev(skb->dst); + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); if (unlikely(idev->cnf.disable_ipv6)) { - IP6_INC_STATS(dev_net(skb->dst->dev), idev, + IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || - dst_allfrag(skb->dst)) + dst_allfrag(skb_dst(skb))) return ip6_fragment(skb, ip6_output2); else return ip6_output2(skb); @@ -202,7 +202,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl->fl6_dst; - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr; u8 proto = fl->proto; int seg_len = skb->len; @@ -222,7 +222,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, if (skb_headroom(skb) < head_room) { struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); if (skb2 == NULL) { - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -ENOBUFS; @@ -276,7 +276,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { - IP6_UPD_PO_STATS(net, ip6_dst_idev(skb->dst), + IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, dst_output); @@ -286,7 +286,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } @@ -416,7 +416,7 @@ static inline int ip6_forward_finish(struct sk_buff *skb) int ip6_forward(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); @@ -485,7 +485,7 @@ int ip6_forward(struct sk_buff *skb) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); goto drop; } - dst = skb->dst; + dst = skb_dst(skb); /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. 
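A detail that recurs through these hunks, including the ip6_copy_metadata() hunk just below: skb_dst_set() consumes one reference on the entry. Call sites that keep using their own pointer therefore donate a clone, while call sites handing over a freshly held route drop the old entry and pass the new one directly. Spelled out with two hypothetical helpers (the names are invented for exposition and appear nowhere in the patch):

	/* Caller keeps its own reference; the skb gets a clone. */
	static void attach_shared_dst(struct sk_buff *skb, struct dst_entry *dst)
	{
		skb_dst_set(skb, dst_clone(dst));
	}

	/* Caller transfers its already-held reference to the skb. */
	static void attach_owned_dst(struct sk_buff *skb, struct dst_entry *dst)
	{
		skb_dst_drop(skb);	/* release whatever was attached */
		skb_dst_set(skb, dst);	/* skb now owns the reference */
	}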
@@ -566,8 +566,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - dst_release(to->dst); - to->dst = dst_clone(from->dst); + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); to->dev = from->dev; to->mark = from->mark; @@ -624,7 +624,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; - struct rt6_info *rt = (struct rt6_info*)skb->dst; + struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; @@ -632,7 +632,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) __be32 frag_id = 0; int ptr, offset = 0, err=0; u8 *prevhdr, nexthdr = 0; - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; @@ -644,9 +644,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) * check should be redundant, but it's free.) */ if (!skb->local_df) { - skb->dev = skb->dst->dev; + skb->dev = skb_dst(skb)->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; @@ -696,7 +696,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) *prevhdr = NEXTHDR_FRAGMENT; tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); if (!tmp_hdr) { - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); return -ENOMEM; } @@ -809,7 +809,7 @@ slow_path: if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; @@ -873,16 +873,16 @@ slow_path: if (err) goto fail; - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGCREATES); } - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGOKS); kfree_skb(skb); return err; fail: - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return err; @@ -1516,10 +1516,10 @@ int ip6_push_pending_frames(struct sock *sk) skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { - struct inet6_dev *idev = ip6_dst_idev(skb->dst); + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type); ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); @@ -1545,8 +1545,8 @@ void ip6_flush_pending_frames(struct sock *sk) struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) { - if (skb->dst) - IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb->dst), + if (skb_dst(skb)) + IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); } diff 
--git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index af256d47fd3..404d16a97d5 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -532,8 +532,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!skb2) return 0; - dst_release(skb2->dst); - skb2->dst = NULL; + skb_dst_drop(skb2); + skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); @@ -560,21 +560,21 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, ip_rt_put(rt); goto out; } - skb2->dst = (struct dst_entry *)rt; + skb_dst_set(skb2, (struct dst_entry *)rt); } else { ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || - skb2->dst->dev->type != ARPHRD_TUNNEL) + skb_dst(skb2)->dev->type != ARPHRD_TUNNEL) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { - if (rel_info > dst_mtu(skb2->dst)) + if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb2->dst->ops->update_pmtu(skb2->dst, rel_info); + skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -606,8 +606,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!skb2) return 0; - dst_release(skb2->dst); - skb2->dst = NULL; + skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); @@ -720,8 +719,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, skb->pkt_type = PACKET_HOST; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); skb->dev = t->dev; - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); nf_reset(skb); dscp_ecn_decapsulate(t, ipv6h, skb); @@ -885,8 +883,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (skb->len > mtu) { *pmtu = mtu; err = -EMSGSIZE; @@ -910,8 +908,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, kfree_skb(skb); skb = new_skb; } - dst_release(skb->dst); - skb->dst = dst_clone(dst); + skb_dst_drop(skb); + skb_dst_set(skb, dst_clone(dst)); skb->transport_header = skb->network_header; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 228be551e9c..a35d8fc55b0 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -398,10 +398,9 @@ static int pim6_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; - dst_release(skb->dst); + skb_dst_drop(skb); reg_dev->stats.rx_bytes += skb->len; reg_dev->stats.rx_packets++; - skb->dst = NULL; nf_reset(skb); netif_rx(skb); dev_put(reg_dev); @@ -849,7 +848,7 @@ static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, mifi_t mifi, ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr); ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr); - skb->dst = dst_clone(pkt->dst); + skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -1487,7 +1486,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) static inline int ip6mr_forward2_finish(struct sk_buff *skb) { - IP6_INC_STATS_BH(dev_net(skb->dst->dev), ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); return dst_output(skb); } @@ -1532,8 +1531,8 @@ static int ip6mr_forward2(struct sk_buff *skb, struct mfc6_cache *c, int vifi) if (!dst) goto out_free; - dst_release(skb->dst); - skb->dst = dst; + skb_dst_drop(skb); + 
skb_dst_set(skb, dst); /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally @@ -1722,7 +1721,7 @@ int ip6mr_get_route(struct net *net, { int err; struct mfc6_cache *cache; - struct rt6_info *rt = (struct rt6_info *)skb->dst; + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); read_lock(&mrt_lock); cache = ip6mr_cache_find(net, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 4b48819a5b8..4b264ed40a8 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1448,6 +1448,7 @@ static void mld_sendpack(struct sk_buff *skb) struct net *net = dev_net(skb->dev); int err; struct flowi fl; + struct dst_entry *dst; IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); @@ -1459,9 +1460,9 @@ static void mld_sendpack(struct sk_buff *skb) IPPROTO_ICMPV6, csum_partial(skb_transport_header(skb), mldlen, 0)); - skb->dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr); + dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr); - if (!skb->dst) { + if (!dst) { err = -ENOMEM; goto err_out; } @@ -1470,7 +1471,8 @@ static void mld_sendpack(struct sk_buff *skb) &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); - err = xfrm_lookup(net, &skb->dst, &fl, NULL, 0); + err = xfrm_lookup(net, &dst, &fl, NULL, 0); + skb_dst_set(skb, dst); if (err) goto err_out; @@ -1775,6 +1777,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; struct flowi fl; + struct dst_entry *dst; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; @@ -1828,8 +1831,8 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) idev = in6_dev_get(skb->dev); - skb->dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr); - if (!skb->dst) { + dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr); + if (!dst) { err = -ENOMEM; goto err_out; } @@ -1838,11 +1841,11 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); - err = xfrm_lookup(net, &skb->dst, &fl, NULL, 0); + err = xfrm_lookup(net, &dst, &fl, NULL, 0); if (err) goto err_out; - + skb_dst_set(skb, dst); err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, dst_output); out: diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1d13d996498..9eb68e92cc1 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -530,7 +530,7 @@ void ndisc_send_skb(struct sk_buff *skb, return; } - skb->dst = dst; + skb_dst_set(skb, dst); idev = in6_dev_get(dst->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); @@ -1612,7 +1612,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, len, IPPROTO_ICMPV6, csum_partial(icmph, len, 0)); - buff->dst = dst; + skb_dst_set(buff, dst); idev = in6_dev_get(dst->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 834cea69fb5..d5ed92b1434 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -12,7 +12,7 @@ int ip6_route_me_harder(struct sk_buff *skb) { - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); struct ipv6hdr *iph = ipv6_hdr(skb); struct dst_entry *dst; struct flowi fl = { @@ -28,9 +28,15 @@ int ip6_route_me_harder(struct sk_buff *skb) #ifdef CONFIG_XFRM if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - 
xfrm_decode_session(skb, &fl, AF_INET6) == 0) - if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0)) + xfrm_decode_session(skb, &fl, AF_INET6) == 0) { + struct dst_entry *dst2 = skb_dst(skb); + + if (xfrm_lookup(net, &dst2, &fl, skb->sk, 0)) { + skb_dst_set(skb, NULL); return -1; + } + skb_dst_set(skb, dst2); + } #endif if (dst->error) { @@ -41,9 +47,9 @@ int ip6_route_me_harder(struct sk_buff *skb) } /* Drop old route. */ - dst_release(skb->dst); + skb_dst_drop(skb); - skb->dst = dst; + skb_dst_set(skb, dst); return 0; } EXPORT_SYMBOL(ip6_route_me_harder); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 5a2d0a41694..5a7f00cd15c 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -112,7 +112,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) return; } - nskb->dst = dst; + skb_dst_set(nskb, dst); skb_reserve(nskb, hh_len + dst->header_len); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index e99307fba0b..36a090d87a3 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -625,7 +625,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); skb_put(skb, length); skb_reset_network_header(skb); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e9ac7a12f59..54a387d31e1 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -267,7 +267,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev, *next; struct net_device *dev; int offset, end; - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); if (fq->q.last_in & INET_FRAG_COMPLETE) goto err; @@ -277,7 +277,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((u8 *)&fhdr->frag_off - @@ -310,7 +310,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, /* RFC2460 says always send parameter problem in * this case. -DaveM */ - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, payload_len)); @@ -434,7 +434,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, return -1; err: - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; @@ -576,9 +576,9 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr = ipv6_hdr(skb); - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMREQDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. 
header */ if (hdr->payload_len==0) @@ -595,17 +595,17 @@ static int ipv6_frag_rcv(struct sk_buff *skb) /* It is not a fragmented frame */ skb->transport_header += sizeof(struct frag_hdr); IP6_INC_STATS_BH(net, - ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMOKS); + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); return 1; } if (atomic_read(&net->ipv6.frags.mem) > net->ipv6.frags.high_thresh) - ip6_evictor(net, ip6_dst_idev(skb->dst)); + ip6_evictor(net, ip6_dst_idev(skb_dst(skb))); if ((fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - ip6_dst_idev(skb->dst))) != NULL) { + ip6_dst_idev(skb_dst(skb)))) != NULL) { int ret; spin_lock(&fq->q.lock); @@ -617,12 +617,12 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return ret; } - IP6_INC_STATS_BH(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMFAILS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; fail_hdr: - IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 032a5ec391c..658293ea05b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -800,7 +800,7 @@ void ip6_route_input(struct sk_buff *skb) if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; - skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input); + skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input)); } static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, @@ -911,7 +911,7 @@ static void ip6_link_failure(struct sk_buff *skb) icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); - rt = (struct rt6_info *) skb->dst; + rt = (struct rt6_info *) skb_dst(skb); if (rt) { if (rt->rt6i_flags&RTF_CACHE) { dst_set_expires(&rt->u.dst, 0); @@ -1868,7 +1868,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes) { int type; - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); @@ -1895,7 +1895,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) static int ip6_pkt_discard_out(struct sk_buff *skb) { - skb->dev = skb->dst->dev; + skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } @@ -1908,7 +1908,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) static int ip6_pkt_prohibit_out(struct sk_buff *skb) { - skb->dev = skb->dst->dev; + skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } @@ -2366,7 +2366,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl); - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).pid, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index b3a59bd40f0..68e52308e55 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -575,8 +575,7 @@ static int ipip6_rcv(struct sk_buff *skb) tunnel->dev->stats.rx_packets++; 
tunnel->dev->stats.rx_bytes += skb->len; skb->dev = tunnel->dev; - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); nf_reset(skb); ipip6_ecn_decapsulate(iph, skb); netif_rx(skb); @@ -638,8 +637,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (dev->priv_flags & IFF_ISATAP) { struct neighbour *neigh = NULL; - if (skb->dst) - neigh = skb->dst->neighbour; + if (skb_dst(skb)) + neigh = skb_dst(skb)->neighbour; if (neigh == NULL) { if (net_ratelimit()) @@ -663,8 +662,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (!dst) { struct neighbour *neigh = NULL; - if (skb->dst) - neigh = skb->dst->neighbour; + if (skb_dst(skb)) + neigh = skb_dst(skb)->neighbour; if (neigh == NULL) { if (net_ratelimit()) @@ -714,7 +713,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tiph->frag_off) mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); else - mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; if (mtu < 68) { stats->collisions++; @@ -723,8 +722,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (tunnel->parms.iph.daddr && skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (tunnel->parms.iph.daddr && skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (skb->len > mtu) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); @@ -768,8 +767,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags = 0; - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* * Push down and install the IPIP header. 
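A note on the accessors used throughout this patch: all of the conversions funnel skb route handling through three small helpers. As a reading aid (not part of the patch itself), here is a minimal sketch of what those helpers plausibly look like, assuming skb->dst is replaced by the unsigned long _skb_dst field that appears in the skbuff.h hunk of the follow-up "net: skb cleanup" commit below; the exact in-tree definitions may differ:

/*
 * Reading aid only -- a sketch of the dst accessors the hunks in this
 * patch convert callers to use. Assumes struct sk_buff carries the
 * route as an unsigned long _skb_dst rather than a direct pointer.
 */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
	return (struct dst_entry *)skb->_skb_dst;
}

static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
	/* Takes over the caller's reference on dst. */
	skb->_skb_dst = (unsigned long)dst;
}

static inline void skb_dst_drop(struct sk_buff *skb)
{
	/* Release the attached route, if any, and clear the field. */
	if (skb->_skb_dst)
		dst_release(skb_dst(skb));
	skb->_skb_dst = 0UL;
}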
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ea37741062a..53b6a4192b1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -981,9 +981,10 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, struct tcphdr *th = tcp_hdr(skb), *t1; struct sk_buff *buff; struct flowi fl; - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); + struct dst_entry *dst; __be32 *topt; if (ts) @@ -1052,8 +1053,9 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, * Underlying function will use this to retrieve the network * namespace */ - if (!ip6_dst_lookup(ctl_sk, &buff->dst, &fl)) { - if (xfrm_lookup(net, &buff->dst, &fl, NULL, 0) >= 0) { + if (!ip6_dst_lookup(ctl_sk, &dst, &fl)) { + if (xfrm_lookup(net, &dst, &fl, NULL, 0) >= 0) { + skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl, NULL, 0); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); if (rst) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8905712cfbb..fc333d85472 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -177,10 +177,9 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, if (unlikely(sk = skb_steal_sock(skb))) return sk; - else - return __udp6_lib_lookup(dev_net(skb->dst->dev), &iph->saddr, sport, - &iph->daddr, dport, inet6_iif(skb), - udptable); + return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + udptable); } /* diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index e20529b4c82..3927832227b 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -31,7 +31,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) */ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *top_iph; int dsfield; @@ -45,7 +45,7 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, sizeof(top_iph->flow_lbl)); - top_iph->nexthdr = xfrm_af2proto(skb->dst->ops->family); + top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family); dsfield = XFRM_MODE_SKB_CB(skb)->tos; dsfield = INET_ECN_encapsulate(dsfield, dsfield); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5ee5a031bc9..c4f4eef032a 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -30,7 +30,7 @@ EXPORT_SYMBOL(xfrm6_find_1stfragopt); static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); mtu = dst_mtu(dst); if (mtu < IPV6_MIN_MTU) @@ -90,6 +90,6 @@ static int xfrm6_output_finish(struct sk_buff *skb) int xfrm6_output(struct sk_buff *skb) { - return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dst->dev, + return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb_dst(skb)->dev, xfrm6_output_finish); } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 425ab144f15..5874657af7f 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -260,8 +260,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(ip_hdr(skb)); /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* Another 
hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -324,8 +324,8 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -388,8 +388,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_put; /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) @@ -465,8 +465,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_put; /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) @@ -553,8 +553,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); goto tx_error; } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); df |= (old_iph->frag_off & htons(IP_DF)); @@ -596,8 +596,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* * Push down and install the IPIP header. @@ -665,8 +665,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); goto tx_error; } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); @@ -702,8 +702,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* * Push down and install the IPIP header. 
@@ -775,8 +775,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(ip_hdr(skb)); /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -828,8 +828,8 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -900,8 +900,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_put; /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); ip_vs_nat_icmp(skb, pp, cp, 0); @@ -975,8 +975,8 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_put; /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); ip_vs_nat_icmp_v6(skb, pp, cp, 0); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 117b80112fc..a6d6ec320fb 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -176,7 +176,7 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { - struct net *net = dev_net(skb->dev ? skb->dev : skb->dst->dev); + struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev); const struct gre_hdr_pptp *pgrehdr; struct gre_hdr_pptp _pgrehdr; __be16 srckey; diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 4f3b1f80879..eda64c1cb1e 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -73,11 +73,11 @@ tcpmss_mangle_packet(struct sk_buff *skb, } if (info->mss == XT_TCPMSS_CLAMP_PMTU) { - if (dst_mtu(skb->dst) <= minlen) { + if (dst_mtu(skb_dst(skb)) <= minlen) { if (net_ratelimit()) printk(KERN_ERR "xt_TCPMSS: " "unknown or invalid path-MTU (%u)\n", - dst_mtu(skb->dst)); + dst_mtu(skb_dst(skb))); return -1; } if (in_mtu <= minlen) { @@ -86,7 +86,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, "invalid path-MTU (%u)\n", in_mtu); return -1; } - newmss = min(dst_mtu(skb->dst), in_mtu) - minlen; + newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen; } else newmss = info->mss; diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 328bd20ddd2..4cbfebda8fa 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -86,7 +86,7 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; - const struct dst_entry *dst = skb->dst; + const struct dst_entry *dst = skb_dst(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index 67419287bc7..484d1689bfd 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -25,7 +25,7 @@ static bool realm_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct xt_realm_info *info = par->matchinfo; - const struct dst_entry *dst = skb->dst; + const struct dst_entry *dst = skb_dst(skb); return (info->id == (dst->tclassid & info->mask)) ^ info->invert; } diff --git a/net/packet/af_packet.c 
b/net/packet/af_packet.c index 6da9f38ef5c..4f76e5552d8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -372,8 +372,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct goto oom; /* drop any routing info */ - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); /* drop conntrack reference */ nf_reset(skb); @@ -621,8 +620,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet skb_set_owner_r(skb, sk); skb->dev = NULL; - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); /* drop conntrack reference */ nf_reset(skb); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 0ef4e3065bc..9402a7fd378 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -84,7 +84,7 @@ static u32 flow_get_dst(const struct sk_buff *skb) case htons(ETH_P_IPV6): return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]); default: - return addr_fold(skb->dst) ^ (__force u16)skb->protocol; + return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; } } @@ -163,7 +163,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb) break; } default: - res = addr_fold(skb->dst) ^ (__force u16)skb->protocol; + res = addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; } return res; @@ -251,8 +251,8 @@ fallback: static u32 flow_get_rtclassid(const struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ROUTE - if (skb->dst) - return skb->dst->tclassid; + if (skb_dst(skb)) + return skb_dst(skb)->tclassid; #endif return 0; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index bdf1f4172ee..dd872d5383e 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -137,7 +137,7 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, u32 id, h; int iif, dont_cache = 0; - if ((dst = skb->dst) == NULL) + if ((dst = skb_dst(skb)) == NULL) goto failure; id = dst->tclassid; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index b6b588bed4e..266151ae85a 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -246,11 +246,11 @@ META_COLLECTOR(int_tcindex) META_COLLECTOR(int_rtclassid) { - if (unlikely(skb->dst == NULL)) + if (unlikely(skb_dst(skb) == NULL)) *err = -1; else #ifdef CONFIG_NET_CLS_ROUTE - dst->value = skb->dst->tclassid; + dst->value = skb_dst(skb)->tclassid; #else dst->value = 0; #endif diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 33133d27b53..8706920a6d4 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -149,7 +149,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) break; } default: - h = (unsigned long)skb->dst ^ skb->protocol; + h = (unsigned long)skb_dst(skb) ^ skb->protocol; h2 = (unsigned long)skb->sk; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index a886496bdc3..cb1cb1e76b9 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -222,7 +222,7 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device * { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 0); struct teql_sched_data *q = qdisc_priv(dev_queue->qdisc); - struct neighbour *mn = skb->dst->neighbour; + struct neighbour *mn = skb_dst(skb)->neighbour; struct neighbour *n = q->ncache; if (mn->tbl == NULL) @@ -262,8 +262,8 @@ static inline int teql_resolve(struct sk_buff *skb, return -ENODEV; if (dev->header_ops == NULL || - skb->dst == NULL || - skb->dst->neighbour == NULL) + skb_dst(skb) == NULL || + skb_dst(skb)->neighbour == NULL) return 0; return __teql_resolve(skb, skb_res, dev); } diff --git 
a/net/sctp/output.c b/net/sctp/output.c index f0c91df59d4..b7641144451 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -405,10 +405,10 @@ int sctp_packet_transmit(struct sctp_packet *packet) sctp_assoc_sync_pmtu(asoc); } } - nskb->dst = dst_clone(tp->dst); - if (!nskb->dst) + dst = dst_clone(tp->dst); + skb_dst_set(nskb, dst); + if (!dst) goto no_route; - dst = nskb->dst; /* Build the SCTP header. */ sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e1859614601..6c2d6158655 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -918,7 +918,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS); /* Something worked... */ - dst_confirm(skb->dst); + dst_confirm(skb_dst(skb)); xprt_adjust_cwnd(task, copied); xprt_update_rtt(task); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index b4a13178fb4..e0009c17d80 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -251,8 +251,7 @@ resume: nf_reset(skb); if (decaps) { - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); netif_rx(skb); return 0; } else { diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index c235597ba8d..b9fe13138c0 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -22,7 +22,7 @@ static int xfrm_output2(struct sk_buff *skb); static int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); int nhead = dst->header_len + LL_RESERVED_SPACE(dst->dev) - skb_headroom(skb); int ntail = dst->dev->needed_tailroom - skb_tailroom(skb); @@ -39,7 +39,7 @@ static int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb) static int xfrm_output_one(struct sk_buff *skb, int err) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; struct net *net = xs_net(x); @@ -94,12 +94,13 @@ resume: goto error_nolock; } - if (!(skb->dst = dst_pop(dst))) { + dst = dst_pop(dst); + if (!dst) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); err = -EHOSTUNREACH; goto error_nolock; } - dst = skb->dst; + skb_dst_set(skb, dst); x = dst->xfrm; } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)); @@ -119,16 +120,16 @@ int xfrm_output_resume(struct sk_buff *skb, int err) while (likely((err = xfrm_output_one(skb, err)) == 0)) { nf_reset(skb); - err = skb->dst->ops->local_out(skb); + err = skb_dst(skb)->ops->local_out(skb); if (unlikely(err != 1)) goto out; - if (!skb->dst->xfrm) + if (!skb_dst(skb)->xfrm) return dst_output(skb); - err = nf_hook(skb->dst->ops->family, + err = nf_hook(skb_dst(skb)->ops->family, NF_INET_POST_ROUTING, skb, - NULL, skb->dst->dev, xfrm_output2); + NULL, skb_dst(skb)->dev, xfrm_output2); if (unlikely(err != 1)) goto out; } @@ -179,7 +180,7 @@ static int xfrm_output_gso(struct sk_buff *skb) int xfrm_output(struct sk_buff *skb) { - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); int err; if (skb_is_gso(skb)) @@ -202,7 +203,7 @@ int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_mode *inner_mode; if (x->sel.family == AF_UNSPEC) inner_mode = xfrm_ip2inner_mode(x, - xfrm_af2proto(skb->dst->ops->family)); + xfrm_af2proto(skb_dst(skb)->ops->family)); else inner_mode = x->inner_mode; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9c068ab3a83..cb81ca35b0d 100644 ---
a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2027,6 +2027,8 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct flowi fl; + struct dst_entry *dst; + int res; if (xfrm_decode_session(skb, &fl, family) < 0) { /* XXX: we should have something like FWDHDRERROR here. */ @@ -2034,7 +2036,11 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) return 0; } - return xfrm_lookup(net, &skb->dst, &fl, NULL, 0) == 0; + dst = skb_dst(skb); + + res = xfrm_lookup(net, &dst, &fl, NULL, 0) == 0; + skb_dst_set(skb, dst); + return res; } EXPORT_SYMBOL(__xfrm_route_forward); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 2fcad7c33ea..4bfc6153ad4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4503,7 +4503,7 @@ static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex, * when the packet is on it's final way out. * NOTE: there appear to be some IPv6 multicast cases where skb->dst * is NULL, in this case go ahead and apply access control. */ - if (skb->dst != NULL && skb->dst->xfrm != NULL) + if (skb_dst(skb) != NULL && skb_dst(skb)->xfrm != NULL) return NF_ACCEPT; #endif secmark_active = selinux_secmark_enabled(); diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index c0eb72013d6..72b18452e1a 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -447,7 +447,7 @@ int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, struct dst_entry *dst; int rc = 0; - dst = skb->dst; + dst = skb_dst(skb); if (dst) { struct dst_entry *dst_test; -- cgit v1.2.3-70-g09d2 From e5b9215ef9a274eb9fb65f6aa4602ad82d10a6cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jun 2009 05:20:21 +0000 Subject: net: skb cleanup We can remove the anonymous union now that it has only one field. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9ef6eb20247..7305da92be8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -322,9 +322,7 @@ struct sk_buff { ktime_t tstamp; struct net_device *dev; - union { - unsigned long _skb_dst; - }; + unsigned long _skb_dst; #ifdef CONFIG_XFRM struct sec_path *sp; #endif -- cgit v1.2.3-70-g09d2 From a84db7949eab7a42e715192f62c55c554e195e54 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 7 Apr 2009 15:41:39 +0800 Subject: sctp: fix error cause codes of ADD-IP extension RFC5061 changed the error cause codes for Dynamic Address Reconfiguration as follows: Value Cause Code --------- ---------------- 0x00A0 Request to Delete Last Remaining IP Address 0x00A1 Operation Refused Due to Resource Shortage 0x00A2 Request to Delete Source IP Address 0x00A3 Association Aborted Due to Illegal ASCONF-ACK 0x00A4 Request Refused - No Authorization This patch fixes the error cause codes. Signed-off-by: Wei Yongjun Signed-off-by: Vlad Yasevich --- include/linux/sctp.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index c2731bfe04d..b464b9d3d24 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -487,17 +487,17 @@ typedef enum { * * Value Cause Code * --------- ---------------- - * 0x0100 Request to Delete Last Remaining IP Address. - * 0x0101 Operation Refused Due to Resource Shortage.
- * 0x0102 Request to Delete Source IP Address. - * 0x0103 Association Aborted due to illegal ASCONF-ACK - * 0x0104 Request refused - no authorization. + * 0x00A0 Request to Delete Last Remaining IP Address. + * 0x00A1 Operation Refused Due to Resource Shortage. + * 0x00A2 Request to Delete Source IP Address. + * 0x00A3 Association Aborted due to illegal ASCONF-ACK + * 0x00A4 Request refused - no authorization. */ - SCTP_ERROR_DEL_LAST_IP = cpu_to_be16(0x0100), - SCTP_ERROR_RSRC_LOW = cpu_to_be16(0x0101), - SCTP_ERROR_DEL_SRC_IP = cpu_to_be16(0x0102), - SCTP_ERROR_ASCONF_ACK = cpu_to_be16(0x0103), - SCTP_ERROR_REQ_REFUSED = cpu_to_be16(0x0104), + SCTP_ERROR_DEL_LAST_IP = cpu_to_be16(0x00A0), + SCTP_ERROR_RSRC_LOW = cpu_to_be16(0x00A1), + SCTP_ERROR_DEL_SRC_IP = cpu_to_be16(0x00A2), + SCTP_ERROR_ASCONF_ACK = cpu_to_be16(0x00A3), + SCTP_ERROR_REQ_REFUSED = cpu_to_be16(0x00A4), /* AUTH Section 4. New Error Cause * -- cgit v1.2.3-70-g09d2 From 8fc0fee09245fe740d81aded36f4af6db1054015 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 24 May 2009 16:57:19 +0200 Subject: cfg80211: use key size constants Instead of hardcoding the key length for validation, use the constants Zhu Yi recently added and add one for AES_CMAC too. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 1 + net/wireless/util.c | 11 +++++------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 34de8b21f6d..a9173d5434d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1092,6 +1092,7 @@ enum ieee80211_key_len { WLAN_KEY_LEN_WEP104 = 13, WLAN_KEY_LEN_CCMP = 16, WLAN_KEY_LEN_TKIP = 32, + WLAN_KEY_LEN_AES_CMAC = 16, }; /* diff --git a/net/wireless/util.c b/net/wireless/util.c index d072bff463a..5be9ed7ae1e 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -157,26 +157,25 @@ int cfg80211_validate_key_settings(struct key_params *params, int key_idx, params->cipher != WLAN_CIPHER_SUITE_WEP104) return -EINVAL; - /* TODO: add definitions for the lengths to linux/ieee80211.h */ switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: - if (params->key_len != 5) + if (params->key_len != WLAN_KEY_LEN_WEP40) return -EINVAL; break; case WLAN_CIPHER_SUITE_TKIP: - if (params->key_len != 32) + if (params->key_len != WLAN_KEY_LEN_TKIP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: - if (params->key_len != 16) + if (params->key_len != WLAN_KEY_LEN_CCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP104: - if (params->key_len != 13) + if (params->key_len != WLAN_KEY_LEN_WEP104) return -EINVAL; break; case WLAN_CIPHER_SUITE_AES_CMAC: - if (params->key_len != 16) + if (params->key_len != WLAN_KEY_LEN_AES_CMAC) return -EINVAL; break; default: -- cgit v1.2.3-70-g09d2 From 3b8bcfd5d31ea0fec58681d035544ace707d2536 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 30 May 2009 01:39:53 +0200 Subject: net: introduce pre-up netdev notifier NETDEV_UP is called after the device is set UP, but sometimes it is useful to be able to veto the device UP. Introduce a new NETDEV_PRE_UP notifier that can be used for exactly this. The first use case will be cfg80211 denying interfaces to be set UP if the device is known to be rfkill'ed. Signed-off-by: Johannes Berg Acked-by: David S. Miller Signed-off-by: John W. 
Linville --- include/linux/notifier.h | 1 + net/core/dev.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index b86fa2ffca0..81bc252dc8a 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -198,6 +198,7 @@ static inline int notifier_to_errno(int ret) #define NETDEV_CHANGENAME 0x000A #define NETDEV_FEAT_CHANGE 0x000B #define NETDEV_BONDING_FAILOVER 0x000C +#define NETDEV_PRE_UP 0x000D #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN diff --git a/net/core/dev.c b/net/core/dev.c index 34b49a6a22f..1f38401fc02 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1048,7 +1048,7 @@ void dev_load(struct net *net, const char *name) int dev_open(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; - int ret = 0; + int ret; ASSERT_RTNL(); @@ -1065,6 +1065,11 @@ int dev_open(struct net_device *dev) if (!netif_device_present(dev)) return -ENODEV; + ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = notifier_to_errno(ret); + if (ret) + return ret; + /* * Call device private open method */ -- cgit v1.2.3-70-g09d2 From 19d337dff95cbf76edd3ad95c0cee2732c3e1ec5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jun 2009 13:01:37 +0200 Subject: rfkill: rewrite This patch completely rewrites the rfkill core to address the following deficiencies: * all rfkill drivers need to implement polling where necessary rather than having one central implementation * updating the rfkill state cannot be done from arbitrary contexts, forcing drivers to use schedule_work and requiring lots of code * rfkill drivers need to keep track of soft/hard blocked internally -- the core should do this * the rfkill API has many unexpected quirks, for example being asymmetric wrt. alloc/free and register/unregister * rfkill can call back into a driver from within a function the driver called -- this is prone to deadlocks and generally should be avoided * rfkill-input pointlessly is a separate module * drivers need to #ifdef rfkill functions (unless they want to depend on or select RFKILL) -- rfkill should provide inlines that do nothing if it isn't compiled in * the rfkill structure is not opaque -- drivers need to initialise it correctly (lots of sanity checking code required) -- instead force drivers to pass the right variables to rfkill_alloc() * the documentation is hard to read because it always assumes the reader is completely clueless and contains way TOO MANY CAPS * the rfkill code needlessly uses a lot of locks and atomic operations in locked sections * fix LED trigger to actually change the LED when the radio state changes -- this wasn't done before Tested-by: Alan Jenkins Signed-off-by: Henrique de Moraes Holschuh [thinkpad] Signed-off-by: Johannes Berg Signed-off-by: John W. 
Linville --- Documentation/rfkill.txt | 597 +++---------------- MAINTAINERS | 6 +- arch/arm/mach-pxa/tosa-bt.c | 30 +- arch/arm/mach-pxa/tosa.c | 1 - drivers/net/usb/hso.c | 42 +- drivers/net/wireless/ath/ath9k/ath9k.h | 7 +- drivers/net/wireless/ath/ath9k/main.c | 115 +--- drivers/net/wireless/ath/ath9k/pci.c | 15 - drivers/net/wireless/b43/Kconfig | 2 +- drivers/net/wireless/b43/leds.c | 2 +- drivers/net/wireless/b43/main.c | 4 +- drivers/net/wireless/b43/phy_a.c | 4 +- drivers/net/wireless/b43/phy_common.c | 17 +- drivers/net/wireless/b43/phy_common.h | 4 +- drivers/net/wireless/b43/phy_g.c | 4 +- drivers/net/wireless/b43/phy_lp.c | 2 +- drivers/net/wireless/b43/phy_n.c | 2 +- drivers/net/wireless/b43/rfkill.c | 123 ++-- drivers/net/wireless/b43/rfkill.h | 5 +- drivers/net/wireless/b43legacy/Kconfig | 2 +- drivers/net/wireless/b43legacy/leds.c | 3 +- drivers/net/wireless/b43legacy/rfkill.c | 123 ++-- drivers/net/wireless/b43legacy/rfkill.h | 6 +- drivers/net/wireless/iwlwifi/Kconfig | 5 +- drivers/net/wireless/iwlwifi/iwl-rfkill.c | 69 +-- drivers/net/wireless/iwmc3200wifi/rfkill.c | 39 +- drivers/platform/x86/Kconfig | 14 +- drivers/platform/x86/acer-wmi.c | 50 +- drivers/platform/x86/dell-laptop.c | 101 ++-- drivers/platform/x86/eeepc-laptop.c | 99 +--- drivers/platform/x86/hp-wmi.c | 103 ++-- drivers/platform/x86/sony-laptop.c | 191 +++--- drivers/platform/x86/thinkpad_acpi.c | 873 ++++++++++++++-------------- drivers/platform/x86/toshiba_acpi.c | 159 ++--- include/linux/Kbuild | 1 + include/linux/rfkill.h | 325 ++++++++--- include/net/wimax.h | 8 +- net/rfkill/Kconfig | 21 +- net/rfkill/Makefile | 5 +- net/rfkill/core.c | 896 +++++++++++++++++++++++++++++ net/rfkill/input.c | 342 +++++++++++ net/rfkill/rfkill-input.c | 390 ------------- net/rfkill/rfkill-input.h | 21 - net/rfkill/rfkill.c | 855 --------------------------- net/rfkill/rfkill.h | 27 + net/wimax/Kconfig | 14 - net/wimax/op-rfkill.c | 123 +--- 47 files changed, 2555 insertions(+), 3292 deletions(-) create mode 100644 net/rfkill/core.c create mode 100644 net/rfkill/input.c delete mode 100644 net/rfkill/rfkill-input.c delete mode 100644 net/rfkill/rfkill-input.h delete mode 100644 net/rfkill/rfkill.c create mode 100644 net/rfkill/rfkill.h (limited to 'include/linux') diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt index 40c3a3f1081..de941e309d4 100644 --- a/Documentation/rfkill.txt +++ b/Documentation/rfkill.txt @@ -1,571 +1,130 @@ -rfkill - RF switch subsystem support -==================================== +rfkill - RF kill switch support +=============================== -1 Introduction -2 Implementation details -3 Kernel driver guidelines -3.1 wireless device drivers -3.2 platform/switch drivers -3.3 input device drivers -4 Kernel API -5 Userspace support +1. Introduction +2. Implementation details +3. Kernel driver guidelines +4. Kernel API +5. Userspace support -1. Introduction: +1. Introduction -The rfkill switch subsystem exists to add a generic interface to circuitry that -can enable or disable the signal output of a wireless *transmitter* of any -type. By far, the most common use is to disable radio-frequency transmitters. +The rfkill subsystem provides a generic interface to disabling any radio +transmitter in the system. When a transmitter is blocked, it shall not +radiate any power. -Note that disabling the signal output means that the the transmitter is to be -made to not emit any energy when "blocked". rfkill is not about blocking data -transmissions, it is about blocking energy emission. 
+The subsystem also provides the ability to react on button presses and +disable all transmitters of a certain type (or all). This is intended for +situations where transmitters need to be turned off, for example on +aircraft. -The rfkill subsystem offers support for keys and switches often found on -laptops to enable wireless devices like WiFi and Bluetooth, so that these keys -and switches actually perform an action in all wireless devices of a given type -attached to the system. -The buttons to enable and disable the wireless transmitters are important in -situations where the user is for example using his laptop on a location where -radio-frequency transmitters _must_ be disabled (e.g. airplanes). -Because of this requirement, userspace support for the keys should not be made -mandatory. Because userspace might want to perform some additional smarter -tasks when the key is pressed, rfkill provides userspace the possibility to -take over the task to handle the key events. - -=============================================================================== -2: Implementation details +2. Implementation details The rfkill subsystem is composed of various components: the rfkill class, the rfkill-input module (an input layer handler), and some specific input layer events. -The rfkill class provides kernel drivers with an interface that allows them to -know when they should enable or disable a wireless network device transmitter. -This is enabled by the CONFIG_RFKILL Kconfig option. - -The rfkill class support makes sure userspace will be notified of all state -changes on rfkill devices through uevents. It provides a notification chain -for interested parties in the kernel to also get notified of rfkill state -changes in other drivers. It creates several sysfs entries which can be used -by userspace. See section "Userspace support". - -The rfkill-input module provides the kernel with the ability to implement a -basic response when the user presses a key or button (or toggles a switch) -related to rfkill functionality. It is an in-kernel implementation of default -policy of reacting to rfkill-related input events and neither mandatory nor -required for wireless drivers to operate. It is enabled by the -CONFIG_RFKILL_INPUT Kconfig option. - -rfkill-input is a rfkill-related events input layer handler. This handler will -listen to all rfkill key events and will change the rfkill state of the -wireless devices accordingly. With this option enabled userspace could either -do nothing or simply perform monitoring tasks. - -The rfkill-input module also provides EPO (emergency power-off) functionality -for all wireless transmitters. This function cannot be overridden, and it is -always active. rfkill EPO is related to *_RFKILL_ALL input layer events. - - -Important terms for the rfkill subsystem: - -In order to avoid confusion, we avoid the term "switch" in rfkill when it is -referring to an electronic control circuit that enables or disables a -transmitter. We reserve it for the physical device a human manipulates -(which is an input device, by the way): - -rfkill switch: - - A physical device a human manipulates. Its state can be perceived by - the kernel either directly (through a GPIO pin, ACPI GPE) or by its - effect on a rfkill line of a wireless device. - -rfkill controller: - - A hardware circuit that controls the state of a rfkill line, which a - kernel driver can interact with *to modify* that state (i.e. it has - either write-only or read/write access). 
- -rfkill line: - - An input channel (hardware or software) of a wireless device, which - causes a wireless transmitter to stop emitting energy (BLOCK) when it - is active. Point of view is extremely important here: rfkill lines are - always seen from the PoV of a wireless device (and its driver). - -soft rfkill line/software rfkill line: - - A rfkill line the wireless device driver can directly change the state - of. Related to rfkill_state RFKILL_STATE_SOFT_BLOCKED. - -hard rfkill line/hardware rfkill line: - - A rfkill line that works fully in hardware or firmware, and that cannot - be overridden by the kernel driver. The hardware device or the - firmware just exports its status to the driver, but it is read-only. - Related to rfkill_state RFKILL_STATE_HARD_BLOCKED. - -The enum rfkill_state describes the rfkill state of a transmitter: - -When a rfkill line or rfkill controller is in the RFKILL_STATE_UNBLOCKED state, -the wireless transmitter (radio TX circuit for example) is *enabled*. When the -it is in the RFKILL_STATE_SOFT_BLOCKED or RFKILL_STATE_HARD_BLOCKED, the -wireless transmitter is to be *blocked* from operating. - -RFKILL_STATE_SOFT_BLOCKED indicates that a call to toggle_radio() can change -that state. RFKILL_STATE_HARD_BLOCKED indicates that a call to toggle_radio() -will not be able to change the state and will return with a suitable error if -attempts are made to set the state to RFKILL_STATE_UNBLOCKED. - -RFKILL_STATE_HARD_BLOCKED is used by drivers to signal that the device is -locked in the BLOCKED state by a hardwire rfkill line (typically an input pin -that, when active, forces the transmitter to be disabled) which the driver -CANNOT override. - -Full rfkill functionality requires two different subsystems to cooperate: the -input layer and the rfkill class. The input layer issues *commands* to the -entire system requesting that devices registered to the rfkill class change -state. The way this interaction happens is not complex, but it is not obvious -either: - -Kernel Input layer: - - * Generates KEY_WWAN, KEY_WLAN, KEY_BLUETOOTH, SW_RFKILL_ALL, and - other such events when the user presses certain keys, buttons, or - toggles certain physical switches. - - THE INPUT LAYER IS NEVER USED TO PROPAGATE STATUS, NOTIFICATIONS OR THE - KIND OF STUFF AN ON-SCREEN-DISPLAY APPLICATION WOULD REPORT. It is - used to issue *commands* for the system to change behaviour, and these - commands may or may not be carried out by some kernel driver or - userspace application. It follows that doing user feedback based only - on input events is broken, as there is no guarantee that an input event - will be acted upon. - - Most wireless communication device drivers implementing rfkill - functionality MUST NOT generate these events, and have no reason to - register themselves with the input layer. Doing otherwise is a common - misconception. There is an API to propagate rfkill status change - information, and it is NOT the input layer. - -rfkill class: - - * Calls a hook in a driver to effectively change the wireless - transmitter state; - * Keeps track of the wireless transmitter state (with help from - the driver); - * Generates userspace notifications (uevents) and a call to a - notification chain (kernel) when there is a wireless transmitter - state change; - * Connects a wireless communications driver with the common rfkill - control system, which, for example, allows actions such as - "switch all bluetooth devices offline" to be carried out by - userspace or by rfkill-input. 
- - THE RFKILL CLASS NEVER ISSUES INPUT EVENTS. THE RFKILL CLASS DOES - NOT LISTEN TO INPUT EVENTS. NO DRIVER USING THE RFKILL CLASS SHALL - EVER LISTEN TO, OR ACT ON RFKILL INPUT EVENTS. Doing otherwise is - a layering violation. - - Most wireless data communication drivers in the kernel have just to - implement the rfkill class API to work properly. Interfacing to the - input layer is not often required (and is very often a *bug*) on - wireless drivers. - - Platform drivers often have to attach to the input layer to *issue* - (but never to listen to) rfkill events for rfkill switches, and also to - the rfkill class to export a control interface for the platform rfkill - controllers to the rfkill subsystem. This does NOT mean the rfkill - switch is attached to a rfkill class (doing so is almost always wrong). - It just means the same kernel module is the driver for different - devices (rfkill switches and rfkill controllers). - - -Userspace input handlers (uevents) or kernel input handlers (rfkill-input): - - * Implements the policy of what should happen when one of the input - layer events related to rfkill operation is received. - * Uses the sysfs interface (userspace) or private rfkill API calls - to tell the devices registered with the rfkill class to change - their state (i.e. translates the input layer event into real - action). - - * rfkill-input implements EPO by handling EV_SW SW_RFKILL_ALL 0 - (power off all transmitters) in a special way: it ignores any - overrides and local state cache and forces all transmitters to the - RFKILL_STATE_SOFT_BLOCKED state (including those which are already - supposed to be BLOCKED). - * rfkill EPO will remain active until rfkill-input receives an - EV_SW SW_RFKILL_ALL 1 event. While the EPO is active, transmitters - are locked in the blocked state (rfkill will refuse to unblock them). - * rfkill-input implements different policies that the user can - select for handling EV_SW SW_RFKILL_ALL 1. It will unlock rfkill, - and either do nothing (leave transmitters blocked, but now unlocked), - restore the transmitters to their state before the EPO, or unblock - them all. - -Userspace uevent handler or kernel platform-specific drivers hooked to the -rfkill notifier chain: - - * Taps into the rfkill notifier chain or to KOBJ_CHANGE uevents, - in order to know when a device that is registered with the rfkill - class changes state; - * Issues feedback notifications to the user; - * In the rare platforms where this is required, synthesizes an input - event to command all *OTHER* rfkill devices to also change their - statues when a specific rfkill device changes state. - - -=============================================================================== -3: Kernel driver guidelines - -Remember: point-of-view is everything for a driver that connects to the rfkill -subsystem. All the details below must be measured/perceived from the point of -view of the specific driver being modified. - -The first thing one needs to know is whether his driver should be talking to -the rfkill class or to the input layer. In rare cases (platform drivers), it -could happen that you need to do both, as platform drivers often handle a -variety of devices in the same driver. - -Do not mistake input devices for rfkill controllers. The only type of "rfkill -switch" device that is to be registered with the rfkill class are those -directly controlling the circuits that cause a wireless transmitter to stop -working (or the software equivalent of them), i.e. 
what we call a rfkill -controller. Every other kind of "rfkill switch" is just an input device and -MUST NOT be registered with the rfkill class. - -A driver should register a device with the rfkill class when ALL of the -following conditions are met (they define a rfkill controller): - -1. The device is/controls a data communications wireless transmitter; - -2. The kernel can interact with the hardware/firmware to CHANGE the wireless - transmitter state (block/unblock TX operation); - -3. The transmitter can be made to not emit any energy when "blocked": - rfkill is not about blocking data transmissions, it is about blocking - energy emission; - -A driver should register a device with the input subsystem to issue -rfkill-related events (KEY_WLAN, KEY_BLUETOOTH, KEY_WWAN, KEY_WIMAX, -SW_RFKILL_ALL, etc) when ALL of the folowing conditions are met: - -1. It is directly related to some physical device the user interacts with, to - command the O.S./firmware/hardware to enable/disable a data communications - wireless transmitter. - - Examples of the physical device are: buttons, keys and switches the user - will press/touch/slide/switch to enable or disable the wireless - communication device. - -2. It is NOT slaved to another device, i.e. there is no other device that - issues rfkill-related input events in preference to this one. - - Please refer to the corner cases and examples section for more details. - -When in doubt, do not issue input events. For drivers that should generate -input events in some platforms, but not in others (e.g. b43), the best solution -is to NEVER generate input events in the first place. That work should be -deferred to a platform-specific kernel module (which will know when to generate -events through the rfkill notifier chain) or to userspace. This avoids the -usual maintenance problems with DMI whitelisting. - - -Corner cases and examples: -==================================== - -1. If the device is an input device that, because of hardware or firmware, -causes wireless transmitters to be blocked regardless of the kernel's will, it -is still just an input device, and NOT to be registered with the rfkill class. - -2. If the wireless transmitter switch control is read-only, it is an input -device and not to be registered with the rfkill class (and maybe not to be made -an input layer event source either, see below). - -3. If there is some other device driver *closer* to the actual hardware the -user interacted with (the button/switch/key) to issue an input event, THAT is -the device driver that should be issuing input events. - -E.g: - [RFKILL slider switch] -- [GPIO hardware] -- [WLAN card rf-kill input] - (platform driver) (wireless card driver) - -The user is closer to the RFKILL slide switch plaform driver, so the driver -which must issue input events is the platform driver looking at the GPIO -hardware, and NEVER the wireless card driver (which is just a slave). It is -very likely that there are other leaves than just the WLAN card rf-kill input -(e.g. a bluetooth card, etc)... - -On the other hand, some embedded devices do this: - - [RFKILL slider switch] -- [WLAN card rf-kill input] - (wireless card driver) - -In this situation, the wireless card driver *could* register itself as an input -device and issue rf-kill related input events... but in order to AVOID the need -for DMI whitelisting, the wireless card driver does NOT do it. 
Userspace (HAL) -or a platform driver (that exists only on these embedded devices) will do the -dirty job of issuing the input events. - - -COMMON MISTAKES in kernel drivers, related to rfkill: -==================================== - -1. NEVER confuse input device keys and buttons with input device switches. - - 1a. Switches are always set or reset. They report the current state - (on position or off position). - - 1b. Keys and buttons are either in the pressed or not-pressed state, and - that's it. A "button" that latches down when you press it, and - unlatches when you press it again is in fact a switch as far as input - devices go. - -Add the SW_* events you need for switches, do NOT try to emulate a button using -KEY_* events just because there is no such SW_* event yet. Do NOT try to use, -for example, KEY_BLUETOOTH when you should be using SW_BLUETOOTH instead. - -2. Input device switches (sources of EV_SW events) DO store their current state -(so you *must* initialize it by issuing a gratuitous input layer event on -driver start-up and also when resuming from sleep), and that state CAN be -queried from userspace through IOCTLs. There is no sysfs interface for this, -but that doesn't mean you should break things trying to hook it to the rfkill -class to get a sysfs interface :-) - -3. Do not issue *_RFKILL_ALL events by default, unless you are sure it is the -correct event for your switch/button. These events are emergency power-off -events when they are trying to turn the transmitters off. An example of an -input device which SHOULD generate *_RFKILL_ALL events is the wireless-kill -switch in a laptop which is NOT a hotkey, but a real sliding/rocker switch. -An example of an input device which SHOULD NOT generate *_RFKILL_ALL events by -default, is any sort of hot key that is type-specific (e.g. the one for WLAN). - - -3.1 Guidelines for wireless device drivers ------------------------------------------- - -(in this text, rfkill->foo means the foo field of struct rfkill). - -1. Each independent transmitter in a wireless device (usually there is only one -transmitter per device) should have a SINGLE rfkill class attached to it. - -2. If the device does not have any sort of hardware assistance to allow the -driver to rfkill the device, the driver should emulate it by taking all actions -required to silence the transmitter. - -3. If it is impossible to silence the transmitter (i.e. it still emits energy, -even if it is just in brief pulses, when there is no data to transmit and there -is no hardware support to turn it off) do NOT lie to the users. Do not attach -it to a rfkill class. The rfkill subsystem does not deal with data -transmission, it deals with energy emission. If the transmitter is emitting -energy, it is not blocked in rfkill terms. - -4. It doesn't matter if the device has multiple rfkill input lines affecting -the same transmitter, their combined state is to be exported as a single state -per transmitter (see rule 1). - -This rule exists because users of the rfkill subsystem expect to get (and set, -when possible) the overall transmitter rfkill state, not of a particular rfkill -line. +The rfkill class is provided for kernel drivers to register their radio +transmitter with the kernel, provide methods for turning it on and off and, +optionally, letting the system know about hardware-disabled states that may +be implemented on the device. This code is enabled with the CONFIG_RFKILL +Kconfig option, which drivers can "select". -5. 
The wireless device driver MUST NOT leave the transmitter enabled during -suspend and hibernation unless: +The rfkill class code also notifies userspace of state changes, this is +achieved via uevents. It also provides some sysfs files for userspace to +check the status of radio transmitters. See the "Userspace support" section +below. - 5.1. The transmitter has to be enabled for some sort of functionality - like wake-on-wireless-packet or autonomous packed forwarding in a mesh - network, and that functionality is enabled for this suspend/hibernation - cycle. -AND +The rfkill-input code implements a basic response to rfkill buttons -- it +implements turning on/off all devices of a certain class (or all). - 5.2. The device was not on a user-requested BLOCKED state before - the suspend (i.e. the driver must NOT unblock a device, not even - to support wake-on-wireless-packet or remain in the mesh). +When the device is hard-blocked (either by a call to rfkill_set_hw_state() +or from query_hw_block) set_block() will be invoked but drivers can well +ignore the method call since they can use the return value of the function +rfkill_set_hw_state() to sync the software state instead of keeping track +of calls to set_block(). -In other words, there is absolutely no allowed scenario where a driver can -automatically take action to unblock a rfkill controller (obviously, this deals -with scenarios where soft-blocking or both soft and hard blocking is happening. -Scenarios where hardware rfkill lines are the only ones blocking the -transmitter are outside of this rule, since the wireless device driver does not -control its input hardware rfkill lines in the first place). -6. During resume, rfkill will try to restore its previous state. +The entire functionality is spread over more than one subsystem: -7. After a rfkill class is suspended, it will *not* call rfkill->toggle_radio -until it is resumed. + * The kernel input layer generates KEY_WWAN, KEY_WLAN etc. and + SW_RFKILL_ALL -- when the user presses a button. Drivers for radio + transmitters generally do not register to the input layer, unless the + device really provides an input device (i.e. a button that has no + effect other than generating a button press event) + * The rfkill-input code hooks up to these events and switches the soft-block + of the various radio transmitters, depending on the button type. -Example of a WLAN wireless driver connected to the rfkill subsystem: --------------------------------------------------------------------- + * The rfkill drivers turn off/on their transmitters as requested. -A certain WLAN card has one input pin that causes it to block the transmitter -and makes the status of that input pin available (only for reading!) to the -kernel driver. This is a hard rfkill input line (it cannot be overridden by -the kernel driver). + * The rfkill class will generate userspace notifications (uevents) to tell + userspace what the current state is. -The card also has one PCI register that, if manipulated by the driver, causes -it to block the transmitter. This is a soft rfkill input line. -It has also a thermal protection circuitry that shuts down its transmitter if -the card overheats, and makes the status of that protection available (only for -reading!) to the kernel driver. This is also a hard rfkill input line. -If either one of these rfkill lines are active, the transmitter is blocked by -the hardware and forced offline. +3. 
Kernel driver guidelines -The driver should allocate and attach to its struct device *ONE* instance of -the rfkill class (there is only one transmitter). -It can implement the get_state() hook, and return RFKILL_STATE_HARD_BLOCKED if -either one of its two hard rfkill input lines are active. If the two hard -rfkill lines are inactive, it must return RFKILL_STATE_SOFT_BLOCKED if its soft -rfkill input line is active. Only if none of the rfkill input lines are -active, will it return RFKILL_STATE_UNBLOCKED. +Drivers for radio transmitters normally implement only the rfkill class. +These drivers may not unblock the transmitter based on own decisions, they +should act on information provided by the rfkill class only. -Since the device has a hardware rfkill line, it IS subject to state changes -external to rfkill. Therefore, the driver must make sure that it calls -rfkill_force_state() to keep the status always up-to-date, and it must do a -rfkill_force_state() on resume from sleep. +Platform drivers might implement input devices if the rfkill button is just +that, a button. If that button influences the hardware then you need to +implement an rfkill class instead. This also applies if the platform provides +a way to turn on/off the transmitter(s). -Every time the driver gets a notification from the card that one of its rfkill -lines changed state (polling might be needed on badly designed cards that don't -generate interrupts for such events), it recomputes the rfkill state as per -above, and calls rfkill_force_state() to update it. +During suspend/hibernation, transmitters should only be left enabled when +wake-on wlan or similar functionality requires it and the device wasn't +blocked before suspend/hibernate. Note that it may be necessary to update +the rfkill subsystem's idea of what the current state is at resume time if +the state may have changed over suspend. -The driver should implement the toggle_radio() hook, that: -1. Returns an error if one of the hardware rfkill lines are active, and the -caller asked for RFKILL_STATE_UNBLOCKED. -2. Activates the soft rfkill line if the caller asked for state -RFKILL_STATE_SOFT_BLOCKED. It should do this even if one of the hard rfkill -lines are active, effectively double-blocking the transmitter. - -3. Deactivates the soft rfkill line if none of the hardware rfkill lines are -active and the caller asked for RFKILL_STATE_UNBLOCKED. - -=============================================================================== -4: Kernel API +4. Kernel API To build a driver with rfkill subsystem support, the driver should depend on -(or select) the Kconfig symbol RFKILL; it should _not_ depend on RKFILL_INPUT. +(or select) the Kconfig symbol RFKILL. The hardware the driver talks to may be write-only (where the current state of the hardware is unknown), or read-write (where the hardware can be queried about its current state). -The rfkill class will call the get_state hook of a device every time it needs -to know the *real* current state of the hardware. This can happen often, but -it does not do any polling, so it is not enough on hardware that is subject -to state changes outside of the rfkill subsystem. - -Therefore, calling rfkill_force_state() when a state change happens is -mandatory when the device has a hardware rfkill line, or when something else -like the firmware could cause its state to be changed without going through the -rfkill class. - -Some hardware provides events when its status changes. 
-best for the driver to not provide a get_state hook, and instead register the
-rfkill class *already* with the correct status, and keep it updated using
-rfkill_force_state() when it gets an event from the hardware.
-
-rfkill_force_state() must be used on the device resume handlers to update the
-rfkill status, should there be any chance of the device status changing during
-the sleep.
-
-There is no provision for a statically-allocated rfkill struct. You must
-use rfkill_allocate() to allocate one.
-
-You should:
-	- rfkill_allocate()
-	- modify rfkill fields (flags, name)
-	- modify state to the current hardware state (THIS IS THE ONLY TIME
-	  YOU CAN ACCESS state DIRECTLY)
-	- rfkill_register()
+Calling rfkill_set_hw_state() when a state change happens is required from
+rfkill drivers that control devices that can be hard-blocked unless they also
+assign the poll_hw_block() callback (then the rfkill core will poll the
+device). Don't do this unless you cannot get the event in any other way.
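For instance, a driver whose hardware raises an interrupt when its kill
switch changes could report the change along these lines (a sketch only;
the foo_* helpers are hypothetical, and the usual request_irq() setup is
assumed):

	static irqreturn_t foo_rfkill_isr(int irq, void *data)
	{
		struct foo_priv *foo = data;
		bool hw_blocked = foo_read_kill_switch(foo);

		/*
		 * rfkill_set_hw_state() returns the combined (hardware
		 * plus software) block state, so the radio can be synced
		 * right here instead of tracking set_block() calls.
		 */
		if (rfkill_set_hw_state(foo->rfkill, hw_blocked))
			foo_radio_enable(foo, false);
		else
			foo_radio_enable(foo, true);

		return IRQ_HANDLED;
	}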
-The only way to set a device to the RFKILL_STATE_HARD_BLOCKED state is through
-a suitable return of get_state() or through rfkill_force_state().
-When a device is in the RFKILL_STATE_HARD_BLOCKED state, the only way to switch
-it to a different state is through a suitable return of get_state() or through
-rfkill_force_state().
-If toggle_radio() is called to set a device to state RFKILL_STATE_SOFT_BLOCKED
-when that device is already at the RFKILL_STATE_HARD_BLOCKED state, it should
-not return an error. Instead, it should try to double-block the transmitter,
-so that its state will change from RFKILL_STATE_HARD_BLOCKED to
-RFKILL_STATE_SOFT_BLOCKED should the hardware blocking cease.
-
-Please refer to the source for more documentation.
-
-===============================================================================
-5: Userspace support
-
-rfkill devices issue uevents (with an action of "change"), with the following
-environment variables set:
-
-RFKILL_NAME
-RFKILL_STATE
-RFKILL_TYPE
+5. Userspace support
-The ABI for these variables is defined by the sysfs attributes. It is best
-to take a quick look at the source to make sure of the possible values.
-
-It is expected that HAL will trap those, and bridge them to DBUS, etc. These
-events CAN and SHOULD be used to give feedback to the user about the rfkill
-status of the system.
-
-Input devices may issue events that are related to rfkill. These are the
-various KEY_* events and SW_* events supported by rfkill-input.c.
-
-Userspace may not change the state of an rfkill switch in response to an
-input event, it should refrain from changing states entirely.
-
-Userspace cannot assume it is the only source of control for rfkill switches.
-Their state can change due to firmware actions, direct user actions, and the
-rfkill-input EPO override for *_RFKILL_ALL.
-
-When rfkill-input is not active, userspace must initiate a rfkill status
-change by writing to the "state" attribute in order for anything to happen.
-
-Take particular care to implement EV_SW SW_RFKILL_ALL properly. When that
-switch is set to OFF, *every* rfkill device *MUST* be immediately put into the
-RFKILL_STATE_SOFT_BLOCKED state, no questions asked.
-
-The following sysfs entries will be created:
+The following sysfs entries exist for every rfkill device:
 	name: Name assigned by driver to this key (interface or driver name).
 	type: Name of the key type ("wlan", "bluetooth", etc).
 	state: Current state of the transmitter
 		0: RFKILL_STATE_SOFT_BLOCKED
-			transmitter is forced off, but one can override it
-			by a write to the state attribute;
+			transmitter is turned off by software
 		1: RFKILL_STATE_UNBLOCKED
-			transmiter is NOT forced off, and may operate if
-			all other conditions for such operation are met
-			(such as interface is up and configured, etc);
+			transmitter is (potentially) active
 		2: RFKILL_STATE_HARD_BLOCKED
 			transmitter is forced off by something outside of
-			the driver's control. One cannot set a device to
-			this state through writes to the state attribute;
-	claim: 1: Userspace handles events, 0: Kernel handles events
-
-Both the "state" and "claim" entries are also writable. For the "state" entry
-this means that when 1 or 0 is written, the device rfkill state (if not yet in
-the requested state), will be will be toggled accordingly.
-
-For the "claim" entry writing 1 to it means that the kernel no longer handles
-key events even though RFKILL_INPUT input was enabled. When "claim" has been
-set to 0, userspace should make sure that it listens for the input events or
-check the sysfs "state" entry regularly to correctly perform the required tasks
-when the rkfill key is pressed.
-
-A note about input devices and EV_SW events:
-
-In order to know the current state of an input device switch (like
-SW_RFKILL_ALL), you will need to use an IOCTL. That information is not
-available through sysfs in a generic way at this time, and it is not available
-through the rfkill class AT ALL.
+			the driver's control.
+	claim: 0: Kernel handles events (currently always reads that value)
+
+rfkill devices also issue uevents (with an action of "change"), with the
+following environment variables set:
+
+RFKILL_NAME
+RFKILL_STATE
+RFKILL_TYPE
+
+The contents of these variables correspond to the "name", "state" and
+"type" sysfs files explained above.

diff --git a/MAINTAINERS b/MAINTAINERS
index e18baa410b5..2f6a8fcfb1f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4753,9 +4753,9 @@ S:	Supported
 F:	fs/reiserfs/

 RFKILL
-P:	Ivo van Doorn
-M:	IvDoorn@gmail.com
-L:	netdev@vger.kernel.org
+P:	Johannes Berg
+M:	johannes@sipsolutions.net
+L:	linux-wireless@vger.kernel.org
 S:	Maintained
 F:	Documentation/rfkill.txt
 F:	net/rfkill/

diff --git a/arch/arm/mach-pxa/tosa-bt.c b/arch/arm/mach-pxa/tosa-bt.c
index bde42aa2937..c31e601eb49 100644
--- a/arch/arm/mach-pxa/tosa-bt.c
+++ b/arch/arm/mach-pxa/tosa-bt.c
@@ -35,21 +35,25 @@ static void tosa_bt_off(struct tosa_bt_data *data)
 	gpio_set_value(data->gpio_reset, 0);
 }

-static int tosa_bt_toggle_radio(void *data, enum rfkill_state state)
+static int tosa_bt_set_block(void *data, bool blocked)
 {
-	pr_info("BT_RADIO going: %s\n",
-		state == RFKILL_STATE_UNBLOCKED ? "on" : "off");
+	pr_info("BT_RADIO going: %s\n", blocked ?
"off" : "on"); - if (state == RFKILL_STATE_UNBLOCKED) { + if (!blocked) { pr_info("TOSA_BT: going ON\n"); tosa_bt_on(data); } else { pr_info("TOSA_BT: going OFF\n"); tosa_bt_off(data); } + return 0; } +static const struct rfkill_ops tosa_bt_rfkill_ops = { + .set_block = tosa_bt_set_block, +}; + static int tosa_bt_probe(struct platform_device *dev) { int rc; @@ -70,18 +74,14 @@ static int tosa_bt_probe(struct platform_device *dev) if (rc) goto err_pwr_dir; - rfk = rfkill_allocate(&dev->dev, RFKILL_TYPE_BLUETOOTH); + rfk = rfkill_alloc("tosa-bt", &dev->dev, RFKILL_TYPE_BLUETOOTH, + &tosa_bt_rfkill_ops, data); if (!rfk) { rc = -ENOMEM; goto err_rfk_alloc; } - rfk->name = "tosa-bt"; - rfk->toggle_radio = tosa_bt_toggle_radio; - rfk->data = data; -#ifdef CONFIG_RFKILL_LEDS - rfk->led_trigger.name = "tosa-bt"; -#endif + rfkill_set_led_trigger_name(rfk, "tosa-bt"); rc = rfkill_register(rfk); if (rc) @@ -92,9 +92,7 @@ static int tosa_bt_probe(struct platform_device *dev) return 0; err_rfkill: - if (rfk) - rfkill_free(rfk); - rfk = NULL; + rfkill_destroy(rfk); err_rfk_alloc: tosa_bt_off(data); err_pwr_dir: @@ -113,8 +111,10 @@ static int __devexit tosa_bt_remove(struct platform_device *dev) platform_set_drvdata(dev, NULL); - if (rfk) + if (rfk) { rfkill_unregister(rfk); + rfkill_destroy(rfk); + } rfk = NULL; tosa_bt_off(data); diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c index afac5b6d3d7..58ce807fe44 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index 837135f0390..5ddd8c4f901 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2481,10 +2481,10 @@ static int add_net_device(struct hso_device *hso_dev) return 0; } -static int hso_radio_toggle(void *data, enum rfkill_state state) +static int hso_rfkill_set_block(void *data, bool blocked) { struct hso_device *hso_dev = data; - int enabled = (state == RFKILL_STATE_UNBLOCKED); + int enabled = !blocked; int rv; mutex_lock(&hso_dev->mutex); @@ -2498,6 +2498,10 @@ static int hso_radio_toggle(void *data, enum rfkill_state state) return rv; } +static const struct rfkill_ops hso_rfkill_ops = { + .set_block = hso_rfkill_set_block, +}; + /* Creates and sets up everything for rfkill */ static void hso_create_rfkill(struct hso_device *hso_dev, struct usb_interface *interface) @@ -2506,29 +2510,25 @@ static void hso_create_rfkill(struct hso_device *hso_dev, struct device *dev = &hso_net->net->dev; char *rfkn; - hso_net->rfkill = rfkill_allocate(&interface_to_usbdev(interface)->dev, - RFKILL_TYPE_WWAN); - if (!hso_net->rfkill) { - dev_err(dev, "%s - Out of memory\n", __func__); - return; - } rfkn = kzalloc(20, GFP_KERNEL); - if (!rfkn) { - rfkill_free(hso_net->rfkill); - hso_net->rfkill = NULL; + if (!rfkn) dev_err(dev, "%s - Out of memory\n", __func__); - return; - } + snprintf(rfkn, 20, "hso-%d", interface->altsetting->desc.bInterfaceNumber); - hso_net->rfkill->name = rfkn; - hso_net->rfkill->state = RFKILL_STATE_UNBLOCKED; - hso_net->rfkill->data = hso_dev; - hso_net->rfkill->toggle_radio = hso_radio_toggle; + + hso_net->rfkill = rfkill_alloc(rfkn, + &interface_to_usbdev(interface)->dev, + RFKILL_TYPE_WWAN, + &hso_rfkill_ops, hso_dev); + if (!hso_net->rfkill) { + dev_err(dev, "%s - Out of memory\n", __func__); + kfree(rfkn); + return; + } if (rfkill_register(hso_net->rfkill) < 0) { + rfkill_destroy(hso_net->rfkill); kfree(rfkn); - hso_net->rfkill->name = NULL; - 
rfkill_free(hso_net->rfkill); hso_net->rfkill = NULL; dev_err(dev, "%s - Failed to register rfkill\n", __func__); return; @@ -3165,8 +3165,10 @@ static void hso_free_interface(struct usb_interface *interface) hso_stop_net_device(network_table[i]); cancel_work_sync(&network_table[i]->async_put_intf); cancel_work_sync(&network_table[i]->async_get_intf); - if (rfk) + if (rfk) { rfkill_unregister(rfk); + rfkill_destroy(rfk); + } hso_free_net_device(network_table[i]); } } diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h index 796a3adffea..515880aa211 100644 --- a/drivers/net/wireless/ath/ath9k/ath9k.h +++ b/drivers/net/wireless/ath/ath9k/ath9k.h @@ -460,12 +460,9 @@ struct ath_led { bool registered; }; -/* Rfkill */ -#define ATH_RFKILL_POLL_INTERVAL 2000 /* msecs */ - struct ath_rfkill { struct rfkill *rfkill; - struct delayed_work rfkill_poll; + struct rfkill_ops ops; char rfkill_name[32]; }; @@ -509,8 +506,6 @@ struct ath_rfkill { #define SC_OP_RXFLUSH BIT(7) #define SC_OP_LED_ASSOCIATED BIT(8) #define SC_OP_RFKILL_REGISTERED BIT(9) -#define SC_OP_RFKILL_SW_BLOCKED BIT(10) -#define SC_OP_RFKILL_HW_BLOCKED BIT(11) #define SC_OP_WAIT_FOR_BEACON BIT(12) #define SC_OP_LED_ON BIT(13) #define SC_OP_SCANNING BIT(14) diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index 61da08a1648..f7baa406918 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -1192,120 +1192,69 @@ static bool ath_is_rfkill_set(struct ath_softc *sc) ah->rfkill_polarity; } -/* h/w rfkill poll function */ -static void ath_rfkill_poll(struct work_struct *work) +/* s/w rfkill handlers */ +static int ath_rfkill_set_block(void *data, bool blocked) { - struct ath_softc *sc = container_of(work, struct ath_softc, - rf_kill.rfkill_poll.work); - bool radio_on; - - if (sc->sc_flags & SC_OP_INVALID) - return; - - radio_on = !ath_is_rfkill_set(sc); - - /* - * enable/disable radio only when there is a - * state change in RF switch - */ - if (radio_on == !!(sc->sc_flags & SC_OP_RFKILL_HW_BLOCKED)) { - enum rfkill_state state; - - if (sc->sc_flags & SC_OP_RFKILL_SW_BLOCKED) { - state = radio_on ? 
RFKILL_STATE_SOFT_BLOCKED - : RFKILL_STATE_HARD_BLOCKED; - } else if (radio_on) { - ath_radio_enable(sc); - state = RFKILL_STATE_UNBLOCKED; - } else { - ath_radio_disable(sc); - state = RFKILL_STATE_HARD_BLOCKED; - } - - if (state == RFKILL_STATE_HARD_BLOCKED) - sc->sc_flags |= SC_OP_RFKILL_HW_BLOCKED; - else - sc->sc_flags &= ~SC_OP_RFKILL_HW_BLOCKED; + struct ath_softc *sc = data; - rfkill_force_state(sc->rf_kill.rfkill, state); - } + if (blocked) + ath_radio_disable(sc); + else + ath_radio_enable(sc); - queue_delayed_work(sc->hw->workqueue, &sc->rf_kill.rfkill_poll, - msecs_to_jiffies(ATH_RFKILL_POLL_INTERVAL)); + return 0; } -/* s/w rfkill handler */ -static int ath_sw_toggle_radio(void *data, enum rfkill_state state) +static void ath_rfkill_poll_state(struct rfkill *rfkill, void *data) { struct ath_softc *sc = data; + bool blocked = !!ath_is_rfkill_set(sc); - switch (state) { - case RFKILL_STATE_SOFT_BLOCKED: - if (!(sc->sc_flags & (SC_OP_RFKILL_HW_BLOCKED | - SC_OP_RFKILL_SW_BLOCKED))) - ath_radio_disable(sc); - sc->sc_flags |= SC_OP_RFKILL_SW_BLOCKED; - return 0; - case RFKILL_STATE_UNBLOCKED: - if ((sc->sc_flags & SC_OP_RFKILL_SW_BLOCKED)) { - sc->sc_flags &= ~SC_OP_RFKILL_SW_BLOCKED; - if (sc->sc_flags & SC_OP_RFKILL_HW_BLOCKED) { - DPRINTF(sc, ATH_DBG_FATAL, "Can't turn on the" - "radio as it is disabled by h/w\n"); - return -EPERM; - } - ath_radio_enable(sc); - } - return 0; - default: - return -EINVAL; - } + if (rfkill_set_hw_state(rfkill, blocked)) + ath_radio_disable(sc); + else + ath_radio_enable(sc); } /* Init s/w rfkill */ static int ath_init_sw_rfkill(struct ath_softc *sc) { - sc->rf_kill.rfkill = rfkill_allocate(wiphy_dev(sc->hw->wiphy), - RFKILL_TYPE_WLAN); + sc->rf_kill.ops.set_block = ath_rfkill_set_block; + if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_RFSILENT) + sc->rf_kill.ops.poll = ath_rfkill_poll_state; + + snprintf(sc->rf_kill.rfkill_name, sizeof(sc->rf_kill.rfkill_name), + "ath9k-%s::rfkill", wiphy_name(sc->hw->wiphy)); + + sc->rf_kill.rfkill = rfkill_alloc(sc->rf_kill.rfkill_name, + wiphy_dev(sc->hw->wiphy), + RFKILL_TYPE_WLAN, + &sc->rf_kill.ops, sc); if (!sc->rf_kill.rfkill) { DPRINTF(sc, ATH_DBG_FATAL, "Failed to allocate rfkill\n"); return -ENOMEM; } - snprintf(sc->rf_kill.rfkill_name, sizeof(sc->rf_kill.rfkill_name), - "ath9k-%s::rfkill", wiphy_name(sc->hw->wiphy)); - sc->rf_kill.rfkill->name = sc->rf_kill.rfkill_name; - sc->rf_kill.rfkill->data = sc; - sc->rf_kill.rfkill->toggle_radio = ath_sw_toggle_radio; - sc->rf_kill.rfkill->state = RFKILL_STATE_UNBLOCKED; - return 0; } /* Deinitialize rfkill */ static void ath_deinit_rfkill(struct ath_softc *sc) { - if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_RFSILENT) - cancel_delayed_work_sync(&sc->rf_kill.rfkill_poll); - if (sc->sc_flags & SC_OP_RFKILL_REGISTERED) { rfkill_unregister(sc->rf_kill.rfkill); + rfkill_destroy(sc->rf_kill.rfkill); sc->sc_flags &= ~SC_OP_RFKILL_REGISTERED; - sc->rf_kill.rfkill = NULL; } } static int ath_start_rfkill_poll(struct ath_softc *sc) { - if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_RFSILENT) - queue_delayed_work(sc->hw->workqueue, - &sc->rf_kill.rfkill_poll, 0); - if (!(sc->sc_flags & SC_OP_RFKILL_REGISTERED)) { if (rfkill_register(sc->rf_kill.rfkill)) { DPRINTF(sc, ATH_DBG_FATAL, "Unable to register rfkill\n"); - rfkill_free(sc->rf_kill.rfkill); + rfkill_destroy(sc->rf_kill.rfkill); /* Deinitialize the device */ ath_cleanup(sc); @@ -1678,10 +1627,6 @@ int ath_attach(u16 devid, struct ath_softc *sc) goto error_attach; #if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE) - /* 
Initialze h/w Rfkill */ - if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_RFSILENT) - INIT_DELAYED_WORK(&sc->rf_kill.rfkill_poll, ath_rfkill_poll); - /* Initialize s/w rfkill */ error = ath_init_sw_rfkill(sc); if (error) @@ -2214,10 +2159,8 @@ static void ath9k_stop(struct ieee80211_hw *hw) } else sc->rx.rxlink = NULL; -#if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE) - if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_RFSILENT) - cancel_delayed_work_sync(&sc->rf_kill.rfkill_poll); -#endif + rfkill_pause_polling(sc->rf_kill.rfkill); + /* disable HAL and put h/w to sleep */ ath9k_hw_disable(sc->sc_ah); ath9k_hw_configpcipowersave(sc->sc_ah, 1); diff --git a/drivers/net/wireless/ath/ath9k/pci.c b/drivers/net/wireless/ath/ath9k/pci.c index 168411d322a..ccdf20a2e9b 100644 --- a/drivers/net/wireless/ath/ath9k/pci.c +++ b/drivers/net/wireless/ath/ath9k/pci.c @@ -227,11 +227,6 @@ static int ath_pci_suspend(struct pci_dev *pdev, pm_message_t state) ath9k_hw_set_gpio(sc->sc_ah, ATH_LED_PIN, 1); -#if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE) - if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_RFSILENT) - cancel_delayed_work_sync(&sc->rf_kill.rfkill_poll); -#endif - pci_save_state(pdev); pci_disable_device(pdev); pci_set_power_state(pdev, PCI_D3hot); @@ -256,16 +251,6 @@ static int ath_pci_resume(struct pci_dev *pdev) AR_GPIO_OUTPUT_MUX_AS_OUTPUT); ath9k_hw_set_gpio(sc->sc_ah, ATH_LED_PIN, 1); -#if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE) - /* - * check the h/w rfkill state on resume - * and start the rfkill poll timer - */ - if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_RFSILENT) - queue_delayed_work(sc->hw->workqueue, - &sc->rf_kill.rfkill_poll, 0); -#endif - return 0; } diff --git a/drivers/net/wireless/b43/Kconfig b/drivers/net/wireless/b43/Kconfig index 21572e40b79..07a99e3faf9 100644 --- a/drivers/net/wireless/b43/Kconfig +++ b/drivers/net/wireless/b43/Kconfig @@ -102,7 +102,7 @@ config B43_LEDS # if it's possible. 
config B43_RFKILL bool - depends on B43 && (RFKILL = y || RFKILL = B43) && RFKILL_INPUT && (INPUT_POLLDEV = y || INPUT_POLLDEV = B43) + depends on B43 && (RFKILL = y || RFKILL = B43) default y # This config option automatically enables b43 HW-RNG support, diff --git a/drivers/net/wireless/b43/leds.c b/drivers/net/wireless/b43/leds.c index 76f4c7bad8b..9a498d3fc65 100644 --- a/drivers/net/wireless/b43/leds.c +++ b/drivers/net/wireless/b43/leds.c @@ -87,7 +87,7 @@ static void b43_led_brightness_set(struct led_classdev *led_dev, } static int b43_register_led(struct b43_wldev *dev, struct b43_led *led, - const char *name, char *default_trigger, + const char *name, const char *default_trigger, u8 led_index, bool activelow) { int err; diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c index cb4a8712946..1d3e40095ad 100644 --- a/drivers/net/wireless/b43/main.c +++ b/drivers/net/wireless/b43/main.c @@ -3470,7 +3470,7 @@ static int b43_op_config(struct ieee80211_hw *hw, u32 changed) if (!!conf->radio_enabled != phy->radio_on) { if (conf->radio_enabled) { - b43_software_rfkill(dev, RFKILL_STATE_UNBLOCKED); + b43_software_rfkill(dev, false); b43info(dev->wl, "Radio turned on by software\n"); if (!dev->radio_hw_enable) { b43info(dev->wl, "The hardware RF-kill button " @@ -3478,7 +3478,7 @@ static int b43_op_config(struct ieee80211_hw *hw, u32 changed) "Press the button to turn it on.\n"); } } else { - b43_software_rfkill(dev, RFKILL_STATE_SOFT_BLOCKED); + b43_software_rfkill(dev, true); b43info(dev->wl, "Radio turned off by software\n"); } } diff --git a/drivers/net/wireless/b43/phy_a.c b/drivers/net/wireless/b43/phy_a.c index c836c077d51..816e028a262 100644 --- a/drivers/net/wireless/b43/phy_a.c +++ b/drivers/net/wireless/b43/phy_a.c @@ -480,11 +480,11 @@ static bool b43_aphy_op_supports_hwpctl(struct b43_wldev *dev) } static void b43_aphy_op_software_rfkill(struct b43_wldev *dev, - enum rfkill_state state) + bool blocked) { struct b43_phy *phy = &dev->phy; - if (state == RFKILL_STATE_UNBLOCKED) { + if (!blocked) { if (phy->radio_on) return; b43_radio_write16(dev, 0x0004, 0x00C0); diff --git a/drivers/net/wireless/b43/phy_common.c b/drivers/net/wireless/b43/phy_common.c index e176b6e0d9c..6d241622210 100644 --- a/drivers/net/wireless/b43/phy_common.c +++ b/drivers/net/wireless/b43/phy_common.c @@ -84,7 +84,7 @@ int b43_phy_init(struct b43_wldev *dev) phy->channel = ops->get_default_chan(dev); - ops->software_rfkill(dev, RFKILL_STATE_UNBLOCKED); + ops->software_rfkill(dev, false); err = ops->init(dev); if (err) { b43err(dev->wl, "PHY init failed\n"); @@ -104,7 +104,7 @@ err_phy_exit: if (ops->exit) ops->exit(dev); err_block_rf: - ops->software_rfkill(dev, RFKILL_STATE_SOFT_BLOCKED); + ops->software_rfkill(dev, true); return err; } @@ -113,7 +113,7 @@ void b43_phy_exit(struct b43_wldev *dev) { const struct b43_phy_operations *ops = dev->phy.ops; - ops->software_rfkill(dev, RFKILL_STATE_SOFT_BLOCKED); + ops->software_rfkill(dev, true); if (ops->exit) ops->exit(dev); } @@ -295,18 +295,13 @@ err_restore_cookie: return err; } -void b43_software_rfkill(struct b43_wldev *dev, enum rfkill_state state) +void b43_software_rfkill(struct b43_wldev *dev, bool blocked) { struct b43_phy *phy = &dev->phy; - if (state == RFKILL_STATE_HARD_BLOCKED) { - /* We cannot hardware-block the device */ - state = RFKILL_STATE_SOFT_BLOCKED; - } - b43_mac_suspend(dev); - phy->ops->software_rfkill(dev, state); - phy->radio_on = (state == RFKILL_STATE_UNBLOCKED); + phy->ops->software_rfkill(dev, blocked); 
+ phy->radio_on = !blocked; b43_mac_enable(dev); } diff --git a/drivers/net/wireless/b43/phy_common.h b/drivers/net/wireless/b43/phy_common.h index b2d99101947..f4c2d79cbc8 100644 --- a/drivers/net/wireless/b43/phy_common.h +++ b/drivers/net/wireless/b43/phy_common.h @@ -159,7 +159,7 @@ struct b43_phy_operations { /* Radio */ bool (*supports_hwpctl)(struct b43_wldev *dev); - void (*software_rfkill)(struct b43_wldev *dev, enum rfkill_state state); + void (*software_rfkill)(struct b43_wldev *dev, bool blocked); void (*switch_analog)(struct b43_wldev *dev, bool on); int (*switch_channel)(struct b43_wldev *dev, unsigned int new_channel); unsigned int (*get_default_chan)(struct b43_wldev *dev); @@ -364,7 +364,7 @@ int b43_switch_channel(struct b43_wldev *dev, unsigned int new_channel); /** * b43_software_rfkill - Turn the radio ON or OFF in software. */ -void b43_software_rfkill(struct b43_wldev *dev, enum rfkill_state state); +void b43_software_rfkill(struct b43_wldev *dev, bool blocked); /** * b43_phy_txpower_check - Check TX power output. diff --git a/drivers/net/wireless/b43/phy_g.c b/drivers/net/wireless/b43/phy_g.c index e7b98f013b0..5300232449f 100644 --- a/drivers/net/wireless/b43/phy_g.c +++ b/drivers/net/wireless/b43/phy_g.c @@ -2592,7 +2592,7 @@ static bool b43_gphy_op_supports_hwpctl(struct b43_wldev *dev) } static void b43_gphy_op_software_rfkill(struct b43_wldev *dev, - enum rfkill_state state) + bool blocked) { struct b43_phy *phy = &dev->phy; struct b43_phy_g *gphy = phy->g; @@ -2600,7 +2600,7 @@ static void b43_gphy_op_software_rfkill(struct b43_wldev *dev, might_sleep(); - if (state == RFKILL_STATE_UNBLOCKED) { + if (!blocked) { /* Turn radio ON */ if (phy->radio_on) return; diff --git a/drivers/net/wireless/b43/phy_lp.c b/drivers/net/wireless/b43/phy_lp.c index 58e319d6b1e..ea0d3a3a6a6 100644 --- a/drivers/net/wireless/b43/phy_lp.c +++ b/drivers/net/wireless/b43/phy_lp.c @@ -488,7 +488,7 @@ static void b43_lpphy_op_radio_write(struct b43_wldev *dev, u16 reg, u16 value) } static void b43_lpphy_op_software_rfkill(struct b43_wldev *dev, - enum rfkill_state state) + bool blocked) { //TODO } diff --git a/drivers/net/wireless/b43/phy_n.c b/drivers/net/wireless/b43/phy_n.c index 8bcfda5f3f0..be7b5604947 100644 --- a/drivers/net/wireless/b43/phy_n.c +++ b/drivers/net/wireless/b43/phy_n.c @@ -579,7 +579,7 @@ static void b43_nphy_op_radio_write(struct b43_wldev *dev, u16 reg, u16 value) } static void b43_nphy_op_software_rfkill(struct b43_wldev *dev, - enum rfkill_state state) + bool blocked) {//TODO } diff --git a/drivers/net/wireless/b43/rfkill.c b/drivers/net/wireless/b43/rfkill.c index 9e1d00bc24d..96047843cd5 100644 --- a/drivers/net/wireless/b43/rfkill.c +++ b/drivers/net/wireless/b43/rfkill.c @@ -45,12 +45,11 @@ static bool b43_is_hw_radio_enabled(struct b43_wldev *dev) } /* The poll callback for the hardware button. */ -static void b43_rfkill_poll(struct input_polled_dev *poll_dev) +static void b43_rfkill_poll(struct rfkill *rfkill, void *data) { - struct b43_wldev *dev = poll_dev->private; + struct b43_wldev *dev = data; struct b43_wl *wl = dev->wl; bool enabled; - bool report_change = 0; mutex_lock(&wl->mutex); if (unlikely(b43_status(dev) < B43_STAT_INITIALIZED)) { @@ -60,68 +59,55 @@ static void b43_rfkill_poll(struct input_polled_dev *poll_dev) enabled = b43_is_hw_radio_enabled(dev); if (unlikely(enabled != dev->radio_hw_enable)) { dev->radio_hw_enable = enabled; - report_change = 1; b43info(wl, "Radio hardware status changed to %s\n", enabled ? 
"ENABLED" : "DISABLED"); + enabled = !rfkill_set_hw_state(rfkill, !enabled); + if (enabled != dev->phy.radio_on) + b43_software_rfkill(dev, !enabled); } mutex_unlock(&wl->mutex); - - /* send the radio switch event to the system - note both a key press - * and a release are required */ - if (unlikely(report_change)) { - input_report_key(poll_dev->input, KEY_WLAN, 1); - input_report_key(poll_dev->input, KEY_WLAN, 0); - } } /* Called when the RFKILL toggled in software. */ -static int b43_rfkill_soft_toggle(void *data, enum rfkill_state state) +static int b43_rfkill_soft_set(void *data, bool blocked) { struct b43_wldev *dev = data; struct b43_wl *wl = dev->wl; - int err = -EBUSY; + int err = -EINVAL; - if (!wl->rfkill.registered) - return 0; + if (WARN_ON(!wl->rfkill.registered)) + return -EINVAL; mutex_lock(&wl->mutex); + if (b43_status(dev) < B43_STAT_INITIALIZED) goto out_unlock; + + if (!dev->radio_hw_enable) + goto out_unlock; + + if (!blocked != dev->phy.radio_on) + b43_software_rfkill(dev, blocked); err = 0; - switch (state) { - case RFKILL_STATE_UNBLOCKED: - if (!dev->radio_hw_enable) { - /* No luck. We can't toggle the hardware RF-kill - * button from software. */ - err = -EBUSY; - goto out_unlock; - } - if (!dev->phy.radio_on) - b43_software_rfkill(dev, state); - break; - case RFKILL_STATE_SOFT_BLOCKED: - if (dev->phy.radio_on) - b43_software_rfkill(dev, state); - break; - default: - b43warn(wl, "Received unexpected rfkill state %d.\n", state); - break; - } out_unlock: mutex_unlock(&wl->mutex); - return err; } -char *b43_rfkill_led_name(struct b43_wldev *dev) +const char *b43_rfkill_led_name(struct b43_wldev *dev) { struct b43_rfkill *rfk = &(dev->wl->rfkill); if (!rfk->registered) return NULL; - return rfkill_get_led_name(rfk->rfkill); + return rfkill_get_led_trigger_name(rfk->rfkill); } +static const struct rfkill_ops b43_rfkill_ops = { + .set_block = b43_rfkill_soft_set, + .poll = b43_rfkill_poll, +}; + void b43_rfkill_init(struct b43_wldev *dev) { struct b43_wl *wl = dev->wl; @@ -130,65 +116,26 @@ void b43_rfkill_init(struct b43_wldev *dev) rfk->registered = 0; - rfk->rfkill = rfkill_allocate(dev->dev->dev, RFKILL_TYPE_WLAN); - if (!rfk->rfkill) - goto out_error; snprintf(rfk->name, sizeof(rfk->name), "b43-%s", wiphy_name(wl->hw->wiphy)); - rfk->rfkill->name = rfk->name; - rfk->rfkill->state = RFKILL_STATE_UNBLOCKED; - rfk->rfkill->data = dev; - rfk->rfkill->toggle_radio = b43_rfkill_soft_toggle; - - rfk->poll_dev = input_allocate_polled_device(); - if (!rfk->poll_dev) { - rfkill_free(rfk->rfkill); - goto err_freed_rfk; - } - - rfk->poll_dev->private = dev; - rfk->poll_dev->poll = b43_rfkill_poll; - rfk->poll_dev->poll_interval = 1000; /* msecs */ - rfk->poll_dev->input->name = rfk->name; - rfk->poll_dev->input->id.bustype = BUS_HOST; - rfk->poll_dev->input->id.vendor = dev->dev->bus->boardinfo.vendor; - rfk->poll_dev->input->evbit[0] = BIT(EV_KEY); - set_bit(KEY_WLAN, rfk->poll_dev->input->keybit); + rfk->rfkill = rfkill_alloc(rfk->name, + dev->dev->dev, + RFKILL_TYPE_WLAN, + &b43_rfkill_ops, dev); + if (!rfk->rfkill) + goto out_error; err = rfkill_register(rfk->rfkill); if (err) - goto err_free_polldev; - -#ifdef CONFIG_RFKILL_INPUT_MODULE - /* B43 RF-kill isn't useful without the rfkill-input subsystem. - * Try to load the module. */ - err = request_module("rfkill-input"); - if (err) - b43warn(wl, "Failed to load the rfkill-input module. 
" - "The built-in radio LED will not work.\n"); -#endif /* CONFIG_RFKILL_INPUT */ - -#if !defined(CONFIG_RFKILL_INPUT) && !defined(CONFIG_RFKILL_INPUT_MODULE) - b43warn(wl, "The rfkill-input subsystem is not available. " - "The built-in radio LED will not work.\n"); -#endif - - err = input_register_polled_device(rfk->poll_dev); - if (err) - goto err_unreg_rfk; + goto err_free; rfk->registered = 1; return; -err_unreg_rfk: - rfkill_unregister(rfk->rfkill); -err_free_polldev: - input_free_polled_device(rfk->poll_dev); - rfk->poll_dev = NULL; -err_freed_rfk: - rfk->rfkill = NULL; -out_error: + err_free: + rfkill_destroy(rfk->rfkill); + out_error: rfk->registered = 0; b43warn(wl, "RF-kill button init failed\n"); } @@ -201,9 +148,7 @@ void b43_rfkill_exit(struct b43_wldev *dev) return; rfk->registered = 0; - input_unregister_polled_device(rfk->poll_dev); rfkill_unregister(rfk->rfkill); - input_free_polled_device(rfk->poll_dev); - rfk->poll_dev = NULL; + rfkill_destroy(rfk->rfkill); rfk->rfkill = NULL; } diff --git a/drivers/net/wireless/b43/rfkill.h b/drivers/net/wireless/b43/rfkill.h index adacf936d81..da497e01bbb 100644 --- a/drivers/net/wireless/b43/rfkill.h +++ b/drivers/net/wireless/b43/rfkill.h @@ -7,14 +7,11 @@ struct b43_wldev; #ifdef CONFIG_B43_RFKILL #include -#include struct b43_rfkill { /* The RFKILL subsystem data structure */ struct rfkill *rfkill; - /* The poll device for the RFKILL input button */ - struct input_polled_dev *poll_dev; /* Did initialization succeed? Used for freeing. */ bool registered; /* The unique name of this rfkill switch */ @@ -26,7 +23,7 @@ struct b43_rfkill { void b43_rfkill_init(struct b43_wldev *dev); void b43_rfkill_exit(struct b43_wldev *dev); -char * b43_rfkill_led_name(struct b43_wldev *dev); +const char *b43_rfkill_led_name(struct b43_wldev *dev); #else /* CONFIG_B43_RFKILL */ diff --git a/drivers/net/wireless/b43legacy/Kconfig b/drivers/net/wireless/b43legacy/Kconfig index d4f628a74bb..6893f439df7 100644 --- a/drivers/net/wireless/b43legacy/Kconfig +++ b/drivers/net/wireless/b43legacy/Kconfig @@ -47,7 +47,7 @@ config B43LEGACY_LEDS # if it's possible. config B43LEGACY_RFKILL bool - depends on B43LEGACY && (RFKILL = y || RFKILL = B43LEGACY) && RFKILL_INPUT && (INPUT_POLLDEV = y || INPUT_POLLDEV = B43LEGACY) + depends on B43LEGACY && (RFKILL = y || RFKILL = B43LEGACY) default y # This config option automatically enables b43 HW-RNG support, diff --git a/drivers/net/wireless/b43legacy/leds.c b/drivers/net/wireless/b43legacy/leds.c index 3ea55b18c70..538d3117594 100644 --- a/drivers/net/wireless/b43legacy/leds.c +++ b/drivers/net/wireless/b43legacy/leds.c @@ -86,7 +86,8 @@ static void b43legacy_led_brightness_set(struct led_classdev *led_dev, static int b43legacy_register_led(struct b43legacy_wldev *dev, struct b43legacy_led *led, - const char *name, char *default_trigger, + const char *name, + const char *default_trigger, u8 led_index, bool activelow) { int err; diff --git a/drivers/net/wireless/b43legacy/rfkill.c b/drivers/net/wireless/b43legacy/rfkill.c index 4b0c7d27a51..c6230a64505 100644 --- a/drivers/net/wireless/b43legacy/rfkill.c +++ b/drivers/net/wireless/b43legacy/rfkill.c @@ -45,12 +45,11 @@ static bool b43legacy_is_hw_radio_enabled(struct b43legacy_wldev *dev) } /* The poll callback for the hardware button. 
*/ -static void b43legacy_rfkill_poll(struct input_polled_dev *poll_dev) +static void b43legacy_rfkill_poll(struct rfkill *rfkill, void *data) { - struct b43legacy_wldev *dev = poll_dev->private; + struct b43legacy_wldev *dev = data; struct b43legacy_wl *wl = dev->wl; bool enabled; - bool report_change = 0; mutex_lock(&wl->mutex); if (unlikely(b43legacy_status(dev) < B43legacy_STAT_INITIALIZED)) { @@ -60,71 +59,64 @@ static void b43legacy_rfkill_poll(struct input_polled_dev *poll_dev) enabled = b43legacy_is_hw_radio_enabled(dev); if (unlikely(enabled != dev->radio_hw_enable)) { dev->radio_hw_enable = enabled; - report_change = 1; b43legacyinfo(wl, "Radio hardware status changed to %s\n", enabled ? "ENABLED" : "DISABLED"); + enabled = !rfkill_set_hw_state(rfkill, !enabled); + if (enabled != dev->phy.radio_on) { + if (enabled) + b43legacy_radio_turn_on(dev); + else + b43legacy_radio_turn_off(dev, 0); + } } mutex_unlock(&wl->mutex); - - /* send the radio switch event to the system - note both a key press - * and a release are required */ - if (unlikely(report_change)) { - input_report_key(poll_dev->input, KEY_WLAN, 1); - input_report_key(poll_dev->input, KEY_WLAN, 0); - } } /* Called when the RFKILL toggled in software. * This is called without locking. */ -static int b43legacy_rfkill_soft_toggle(void *data, enum rfkill_state state) +static int b43legacy_rfkill_soft_set(void *data, bool blocked) { struct b43legacy_wldev *dev = data; struct b43legacy_wl *wl = dev->wl; - int err = -EBUSY; + int ret = -EINVAL; if (!wl->rfkill.registered) - return 0; + return -EINVAL; mutex_lock(&wl->mutex); if (b43legacy_status(dev) < B43legacy_STAT_INITIALIZED) goto out_unlock; - err = 0; - switch (state) { - case RFKILL_STATE_UNBLOCKED: - if (!dev->radio_hw_enable) { - /* No luck. We can't toggle the hardware RF-kill - * button from software. 
*/ - err = -EBUSY; - goto out_unlock; - } - if (!dev->phy.radio_on) + + if (!dev->radio_hw_enable) + goto out_unlock; + + if (!blocked != dev->phy.radio_on) { + if (!blocked) b43legacy_radio_turn_on(dev); - break; - case RFKILL_STATE_SOFT_BLOCKED: - if (dev->phy.radio_on) + else b43legacy_radio_turn_off(dev, 0); - break; - default: - b43legacywarn(wl, "Received unexpected rfkill state %d.\n", - state); - break; } + ret = 0; out_unlock: mutex_unlock(&wl->mutex); - - return err; + return ret; } -char *b43legacy_rfkill_led_name(struct b43legacy_wldev *dev) +const char *b43legacy_rfkill_led_name(struct b43legacy_wldev *dev) { struct b43legacy_rfkill *rfk = &(dev->wl->rfkill); if (!rfk->registered) return NULL; - return rfkill_get_led_name(rfk->rfkill); + return rfkill_get_led_trigger_name(rfk->rfkill); } +static const struct rfkill_ops b43legacy_rfkill_ops = { + .set_block = b43legacy_rfkill_soft_set, + .poll = b43legacy_rfkill_poll, +}; + void b43legacy_rfkill_init(struct b43legacy_wldev *dev) { struct b43legacy_wl *wl = dev->wl; @@ -133,60 +125,25 @@ void b43legacy_rfkill_init(struct b43legacy_wldev *dev) rfk->registered = 0; - rfk->rfkill = rfkill_allocate(dev->dev->dev, RFKILL_TYPE_WLAN); - if (!rfk->rfkill) - goto out_error; snprintf(rfk->name, sizeof(rfk->name), "b43legacy-%s", wiphy_name(wl->hw->wiphy)); - rfk->rfkill->name = rfk->name; - rfk->rfkill->state = RFKILL_STATE_UNBLOCKED; - rfk->rfkill->data = dev; - rfk->rfkill->toggle_radio = b43legacy_rfkill_soft_toggle; - - rfk->poll_dev = input_allocate_polled_device(); - if (!rfk->poll_dev) { - rfkill_free(rfk->rfkill); - goto err_freed_rfk; - } - - rfk->poll_dev->private = dev; - rfk->poll_dev->poll = b43legacy_rfkill_poll; - rfk->poll_dev->poll_interval = 1000; /* msecs */ - - rfk->poll_dev->input->name = rfk->name; - rfk->poll_dev->input->id.bustype = BUS_HOST; - rfk->poll_dev->input->id.vendor = dev->dev->bus->boardinfo.vendor; - rfk->poll_dev->input->evbit[0] = BIT(EV_KEY); - set_bit(KEY_WLAN, rfk->poll_dev->input->keybit); + rfk->rfkill = rfkill_alloc(rfk->name, + dev->dev->dev, + RFKILL_TYPE_WLAN, + &b43legacy_rfkill_ops, dev); + if (!rfk->rfkill) + goto out_error; err = rfkill_register(rfk->rfkill); if (err) - goto err_free_polldev; - -#ifdef CONFIG_RFKILL_INPUT_MODULE - /* B43legacy RF-kill isn't useful without the rfkill-input subsystem. - * Try to load the module. */ - err = request_module("rfkill-input"); - if (err) - b43legacywarn(wl, "Failed to load the rfkill-input module." 
- "The built-in radio LED will not work.\n"); -#endif /* CONFIG_RFKILL_INPUT */ - - err = input_register_polled_device(rfk->poll_dev); - if (err) - goto err_unreg_rfk; + goto err_free; rfk->registered = 1; return; -err_unreg_rfk: - rfkill_unregister(rfk->rfkill); -err_free_polldev: - input_free_polled_device(rfk->poll_dev); - rfk->poll_dev = NULL; -err_freed_rfk: - rfk->rfkill = NULL; -out_error: + err_free: + rfkill_destroy(rfk->rfkill); + out_error: rfk->registered = 0; b43legacywarn(wl, "RF-kill button init failed\n"); } @@ -199,10 +156,8 @@ void b43legacy_rfkill_exit(struct b43legacy_wldev *dev) return; rfk->registered = 0; - input_unregister_polled_device(rfk->poll_dev); rfkill_unregister(rfk->rfkill); - input_free_polled_device(rfk->poll_dev); - rfk->poll_dev = NULL; + rfkill_destroy(rfk->rfkill); rfk->rfkill = NULL; } diff --git a/drivers/net/wireless/b43legacy/rfkill.h b/drivers/net/wireless/b43legacy/rfkill.h index 11150a8032f..adffc503a6a 100644 --- a/drivers/net/wireless/b43legacy/rfkill.h +++ b/drivers/net/wireless/b43legacy/rfkill.h @@ -6,16 +6,12 @@ struct b43legacy_wldev; #ifdef CONFIG_B43LEGACY_RFKILL #include -#include -#include struct b43legacy_rfkill { /* The RFKILL subsystem data structure */ struct rfkill *rfkill; - /* The poll device for the RFKILL input button */ - struct input_polled_dev *poll_dev; /* Did initialization succeed? Used for freeing. */ bool registered; /* The unique name of this rfkill switch */ @@ -27,7 +23,7 @@ struct b43legacy_rfkill { void b43legacy_rfkill_init(struct b43legacy_wldev *dev); void b43legacy_rfkill_exit(struct b43legacy_wldev *dev); -char *b43legacy_rfkill_led_name(struct b43legacy_wldev *dev); +const char *b43legacy_rfkill_led_name(struct b43legacy_wldev *dev); #else /* CONFIG_B43LEGACY_RFKILL */ diff --git a/drivers/net/wireless/iwlwifi/Kconfig b/drivers/net/wireless/iwlwifi/Kconfig index 8304f6406a1..6fe259fcfb8 100644 --- a/drivers/net/wireless/iwlwifi/Kconfig +++ b/drivers/net/wireless/iwlwifi/Kconfig @@ -5,15 +5,14 @@ config IWLWIFI select FW_LOADER select MAC80211_LEDS if IWLWIFI_LEDS select LEDS_CLASS if IWLWIFI_LEDS - select RFKILL if IWLWIFI_RFKILL config IWLWIFI_LEDS bool "Enable LED support in iwlagn and iwl3945 drivers" depends on IWLWIFI config IWLWIFI_RFKILL - bool "Enable RF kill support in iwlagn and iwl3945 drivers" - depends on IWLWIFI + def_bool y + depends on IWLWIFI && RFKILL config IWLWIFI_SPECTRUM_MEASUREMENT bool "Enable Spectrum Measurement in iwlagn driver" diff --git a/drivers/net/wireless/iwlwifi/iwl-rfkill.c b/drivers/net/wireless/iwlwifi/iwl-rfkill.c index 65605ad44e4..13149936fd2 100644 --- a/drivers/net/wireless/iwlwifi/iwl-rfkill.c +++ b/drivers/net/wireless/iwlwifi/iwl-rfkill.c @@ -36,42 +36,37 @@ #include "iwl-core.h" /* software rf-kill from user */ -static int iwl_rfkill_soft_rf_kill(void *data, enum rfkill_state state) +static int iwl_rfkill_soft_rf_kill(void *data, bool blocked) { struct iwl_priv *priv = data; - int err = 0; if (!priv->rfkill) - return 0; + return -EINVAL; if (test_bit(STATUS_EXIT_PENDING, &priv->status)) return 0; - IWL_DEBUG_RF_KILL(priv, "we received soft RFKILL set to state %d\n", state); + IWL_DEBUG_RF_KILL(priv, "received soft RFKILL: block=%d\n", blocked); + mutex_lock(&priv->mutex); - switch (state) { - case RFKILL_STATE_UNBLOCKED: - if (iwl_is_rfkill_hw(priv)) { - err = -EBUSY; - goto out_unlock; - } + if (iwl_is_rfkill_hw(priv)) + goto out_unlock; + + if (!blocked) iwl_radio_kill_sw_enable_radio(priv); - break; - case RFKILL_STATE_SOFT_BLOCKED: + else 
iwl_radio_kill_sw_disable_radio(priv); - break; - default: - IWL_WARN(priv, "we received unexpected RFKILL state %d\n", - state); - break; - } + out_unlock: mutex_unlock(&priv->mutex); - - return err; + return 0; } +static const struct rfkill_ops iwl_rfkill_ops = { + .set_block = iwl_rfkill_soft_rf_kill, +}; + int iwl_rfkill_init(struct iwl_priv *priv) { struct device *device = wiphy_dev(priv->hw->wiphy); @@ -80,21 +75,16 @@ int iwl_rfkill_init(struct iwl_priv *priv) BUG_ON(device == NULL); IWL_DEBUG_RF_KILL(priv, "Initializing RFKILL.\n"); - priv->rfkill = rfkill_allocate(device, RFKILL_TYPE_WLAN); + priv->rfkill = rfkill_alloc(priv->cfg->name, + device, + RFKILL_TYPE_WLAN, + &iwl_rfkill_ops, priv); if (!priv->rfkill) { IWL_ERR(priv, "Unable to allocate RFKILL device.\n"); ret = -ENOMEM; goto error; } - priv->rfkill->name = priv->cfg->name; - priv->rfkill->data = priv; - priv->rfkill->state = RFKILL_STATE_UNBLOCKED; - priv->rfkill->toggle_radio = iwl_rfkill_soft_rf_kill; - - priv->rfkill->dev.class->suspend = NULL; - priv->rfkill->dev.class->resume = NULL; - ret = rfkill_register(priv->rfkill); if (ret) { IWL_ERR(priv, "Unable to register RFKILL: %d\n", ret); @@ -102,11 +92,10 @@ int iwl_rfkill_init(struct iwl_priv *priv) } IWL_DEBUG_RF_KILL(priv, "RFKILL initialization complete.\n"); - return ret; + return 0; free_rfkill: - if (priv->rfkill != NULL) - rfkill_free(priv->rfkill); + rfkill_destroy(priv->rfkill); priv->rfkill = NULL; error: @@ -118,8 +107,10 @@ EXPORT_SYMBOL(iwl_rfkill_init); void iwl_rfkill_unregister(struct iwl_priv *priv) { - if (priv->rfkill) + if (priv->rfkill) { rfkill_unregister(priv->rfkill); + rfkill_destroy(priv->rfkill); + } priv->rfkill = NULL; } @@ -131,14 +122,10 @@ void iwl_rfkill_set_hw_state(struct iwl_priv *priv) if (!priv->rfkill) return; - if (iwl_is_rfkill_hw(priv)) { - rfkill_force_state(priv->rfkill, RFKILL_STATE_HARD_BLOCKED); - return; - } - - if (!iwl_is_rfkill_sw(priv)) - rfkill_force_state(priv->rfkill, RFKILL_STATE_UNBLOCKED); + if (rfkill_set_hw_state(priv->rfkill, + !!iwl_is_rfkill_hw(priv))) + iwl_radio_kill_sw_disable_radio(priv); else - rfkill_force_state(priv->rfkill, RFKILL_STATE_SOFT_BLOCKED); + iwl_radio_kill_sw_enable_radio(priv); } EXPORT_SYMBOL(iwl_rfkill_set_hw_state); diff --git a/drivers/net/wireless/iwmc3200wifi/rfkill.c b/drivers/net/wireless/iwmc3200wifi/rfkill.c index 4ca8b495f82..8ee2c3c09a0 100644 --- a/drivers/net/wireless/iwmc3200wifi/rfkill.c +++ b/drivers/net/wireless/iwmc3200wifi/rfkill.c @@ -25,47 +25,42 @@ #include "iwm.h" -static int iwm_rfkill_soft_toggle(void *data, enum rfkill_state state) +static int iwm_rfkill_set_block(void *data, bool blocked) { struct iwm_priv *iwm = data; - switch (state) { - case RFKILL_STATE_UNBLOCKED: + if (!blocked) { if (test_bit(IWM_RADIO_RFKILL_HW, &iwm->radio)) return -EBUSY; if (test_and_clear_bit(IWM_RADIO_RFKILL_SW, &iwm->radio) && (iwm_to_ndev(iwm)->flags & IFF_UP)) - iwm_up(iwm); - - break; - case RFKILL_STATE_SOFT_BLOCKED: + return iwm_up(iwm); + } else { if (!test_and_set_bit(IWM_RADIO_RFKILL_SW, &iwm->radio)) - iwm_down(iwm); - - break; - default: - break; + return iwm_down(iwm); } return 0; } +static const struct rfkill_ops iwm_rfkill_ops = { + .set_block = iwm_rfkill_set_block, +}; + int iwm_rfkill_init(struct iwm_priv *iwm) { int ret; - iwm->rfkill = rfkill_allocate(iwm_to_dev(iwm), RFKILL_TYPE_WLAN); + iwm->rfkill = rfkill_alloc(KBUILD_MODNAME, + iwm_to_dev(iwm), + RFKILL_TYPE_WLAN, + &iwm_rfkill_ops, iwm); if (!iwm->rfkill) { IWM_ERR(iwm, "Unable to allocate rfkill 
device\n"); return -ENOMEM; } - iwm->rfkill->name = KBUILD_MODNAME; - iwm->rfkill->data = iwm; - iwm->rfkill->state = RFKILL_STATE_UNBLOCKED; - iwm->rfkill->toggle_radio = iwm_rfkill_soft_toggle; - ret = rfkill_register(iwm->rfkill); if (ret) { IWM_ERR(iwm, "Failed to register rfkill device\n"); @@ -74,15 +69,15 @@ int iwm_rfkill_init(struct iwm_priv *iwm) return 0; fail: - rfkill_free(iwm->rfkill); + rfkill_destroy(iwm->rfkill); return ret; } void iwm_rfkill_exit(struct iwm_priv *iwm) { - if (iwm->rfkill) + if (iwm->rfkill) { rfkill_unregister(iwm->rfkill); - - rfkill_free(iwm->rfkill); + rfkill_destroy(iwm->rfkill); + } iwm->rfkill = NULL; } diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 284ebaca6e4..c682ac53641 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -21,7 +21,7 @@ config ACER_WMI depends on NEW_LEDS depends on BACKLIGHT_CLASS_DEVICE depends on SERIO_I8042 - depends on RFKILL + depends on RFKILL || RFKILL = n select ACPI_WMI ---help--- This is a driver for newer Acer (and Wistron) laptops. It adds @@ -60,7 +60,7 @@ config DELL_LAPTOP depends on DCDBAS depends on EXPERIMENTAL depends on BACKLIGHT_CLASS_DEVICE - depends on RFKILL + depends on RFKILL || RFKILL = n depends on POWER_SUPPLY default n ---help--- @@ -117,7 +117,7 @@ config HP_WMI tristate "HP WMI extras" depends on ACPI_WMI depends on INPUT - depends on RFKILL + depends on RFKILL || RFKILL = n help Say Y here if you want to support WMI-based hotkeys on HP laptops and to read data from WMI such as docking or ambient light sensor state. @@ -196,14 +196,13 @@ config THINKPAD_ACPI tristate "ThinkPad ACPI Laptop Extras" depends on ACPI depends on INPUT + depends on RFKILL || RFKILL = n select BACKLIGHT_LCD_SUPPORT select BACKLIGHT_CLASS_DEVICE select HWMON select NVRAM select NEW_LEDS select LEDS_CLASS - select NET - select RFKILL ---help--- This is a driver for the IBM and Lenovo ThinkPad laptops. It adds support for Fn-Fx key combinations, Bluetooth control, video @@ -338,9 +337,9 @@ config EEEPC_LAPTOP depends on ACPI depends on INPUT depends on EXPERIMENTAL + depends on RFKILL || RFKILL = n select BACKLIGHT_CLASS_DEVICE select HWMON - select RFKILL ---help--- This driver supports the Fn-Fx keys on Eee PC laptops. It also adds the ability to switch camera/wlan on/off. @@ -405,9 +404,8 @@ config ACPI_TOSHIBA tristate "Toshiba Laptop Extras" depends on ACPI depends on INPUT + depends on RFKILL || RFKILL = n select INPUT_POLLDEV - select NET - select RFKILL select BACKLIGHT_CLASS_DEVICE ---help--- This driver adds support for access to certain system settings diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index 62d02b3c998..b618fa51db2 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -958,58 +958,50 @@ static void acer_rfkill_update(struct work_struct *ignored) status = get_u32(&state, ACER_CAP_WIRELESS); if (ACPI_SUCCESS(status)) - rfkill_force_state(wireless_rfkill, state ? - RFKILL_STATE_UNBLOCKED : RFKILL_STATE_SOFT_BLOCKED); + rfkill_set_sw_state(wireless_rfkill, !!state); if (has_cap(ACER_CAP_BLUETOOTH)) { status = get_u32(&state, ACER_CAP_BLUETOOTH); if (ACPI_SUCCESS(status)) - rfkill_force_state(bluetooth_rfkill, state ? 
- RFKILL_STATE_UNBLOCKED : - RFKILL_STATE_SOFT_BLOCKED); + rfkill_set_sw_state(bluetooth_rfkill, !!state); } schedule_delayed_work(&acer_rfkill_work, round_jiffies_relative(HZ)); } -static int acer_rfkill_set(void *data, enum rfkill_state state) +static int acer_rfkill_set(void *data, bool blocked) { acpi_status status; - u32 *cap = data; - status = set_u32((u32) (state == RFKILL_STATE_UNBLOCKED), *cap); + u32 cap = (unsigned long)data; + status = set_u32(!!blocked, cap); if (ACPI_FAILURE(status)) return -ENODEV; return 0; } -static struct rfkill * acer_rfkill_register(struct device *dev, -enum rfkill_type type, char *name, u32 cap) +static const struct rfkill_ops acer_rfkill_ops = { + .set_block = acer_rfkill_set, +}; + +static struct rfkill *acer_rfkill_register(struct device *dev, + enum rfkill_type type, + char *name, u32 cap) { int err; u32 state; - u32 *data; struct rfkill *rfkill_dev; - rfkill_dev = rfkill_allocate(dev, type); + rfkill_dev = rfkill_alloc(name, dev, type, + &acer_rfkill_ops, + (void *)(unsigned long)cap); if (!rfkill_dev) return ERR_PTR(-ENOMEM); - rfkill_dev->name = name; get_u32(&state, cap); - rfkill_dev->state = state ? RFKILL_STATE_UNBLOCKED : - RFKILL_STATE_SOFT_BLOCKED; - data = kzalloc(sizeof(u32), GFP_KERNEL); - if (!data) { - rfkill_free(rfkill_dev); - return ERR_PTR(-ENOMEM); - } - *data = cap; - rfkill_dev->data = data; - rfkill_dev->toggle_radio = acer_rfkill_set; + rfkill_set_sw_state(rfkill_dev, !state); err = rfkill_register(rfkill_dev); if (err) { - kfree(rfkill_dev->data); - rfkill_free(rfkill_dev); + rfkill_destroy(rfkill_dev); return ERR_PTR(err); } return rfkill_dev; @@ -1027,8 +1019,8 @@ static int acer_rfkill_init(struct device *dev) RFKILL_TYPE_BLUETOOTH, "acer-bluetooth", ACER_CAP_BLUETOOTH); if (IS_ERR(bluetooth_rfkill)) { - kfree(wireless_rfkill->data); rfkill_unregister(wireless_rfkill); + rfkill_destroy(wireless_rfkill); return PTR_ERR(bluetooth_rfkill); } } @@ -1041,11 +1033,13 @@ static int acer_rfkill_init(struct device *dev) static void acer_rfkill_exit(void) { cancel_delayed_work_sync(&acer_rfkill_work); - kfree(wireless_rfkill->data); + rfkill_unregister(wireless_rfkill); + rfkill_destroy(wireless_rfkill); + if (has_cap(ACER_CAP_BLUETOOTH)) { - kfree(bluetooth_rfkill->data); rfkill_unregister(bluetooth_rfkill); + rfkill_destroy(bluetooth_rfkill); } return; } diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c index af9f4302117..2faf0e14f05 100644 --- a/drivers/platform/x86/dell-laptop.c +++ b/drivers/platform/x86/dell-laptop.c @@ -174,10 +174,11 @@ dell_send_request(struct calling_interface_buffer *buffer, int class, result[3]: NVRAM format version number */ -static int dell_rfkill_set(int radio, enum rfkill_state state) +static int dell_rfkill_set(void *data, bool blocked) { struct calling_interface_buffer buffer; - int disable = (state == RFKILL_STATE_UNBLOCKED) ? 0 : 1; + int disable = blocked ? 
1 : 0;
+	unsigned long radio = (unsigned long)data;
 	memset(&buffer, 0, sizeof(struct calling_interface_buffer));
 	buffer.input[0] = (1 | (radio<<8) | (disable << 16));
@@ -186,56 +187,24 @@ static int dell_rfkill_set(void *data, bool blocked)
 	return 0;
 }
-static int dell_wifi_set(void *data, enum rfkill_state state)
-{
-	return dell_rfkill_set(1, state);
-}
-
-static int dell_bluetooth_set(void *data, enum rfkill_state state)
-{
-	return dell_rfkill_set(2, state);
-}
-
-static int dell_wwan_set(void *data, enum rfkill_state state)
-{
-	return dell_rfkill_set(3, state);
-}
-
-static int dell_rfkill_get(int bit, enum rfkill_state *state)
+static void dell_rfkill_query(struct rfkill *rfkill, void *data)
 {
 	struct calling_interface_buffer buffer;
 	int status;
-	int new_state = RFKILL_STATE_HARD_BLOCKED;
+	int bit = (unsigned long)data + 16;
 	memset(&buffer, 0, sizeof(struct calling_interface_buffer));
 	dell_send_request(&buffer, 17, 11);
 	status = buffer.output[1];
-	if (status & (1<<16))
-		new_state = RFKILL_STATE_SOFT_BLOCKED;
-
-	if (status & (1<name = "dell-wifi";
-	wifi_rfkill->toggle_radio = dell_wifi_set;
-	wifi_rfkill->get_state = dell_wifi_get;
+	}
 	ret = rfkill_register(wifi_rfkill);
 	if (ret)
 		goto err_wifi;
 }
 	if ((status & (1<<3|1<<9)) == (1<<3|1<<9)) {
-		bluetooth_rfkill = rfkill_allocate(NULL, RFKILL_TYPE_BLUETOOTH);
-		if (!bluetooth_rfkill)
+		bluetooth_rfkill = rfkill_alloc("dell-bluetooth", NULL,
+						RFKILL_TYPE_BLUETOOTH,
+						&dell_rfkill_ops, (void *) 2);
+		if (!bluetooth_rfkill) {
+			ret = -ENOMEM;
 			goto err_bluetooth;
-		bluetooth_rfkill->name = "dell-bluetooth";
-		bluetooth_rfkill->toggle_radio = dell_bluetooth_set;
-		bluetooth_rfkill->get_state = dell_bluetooth_get;
+		}
 		ret = rfkill_register(bluetooth_rfkill);
 		if (ret)
 			goto err_bluetooth;
 	}
 	if ((status & (1<<4|1<<10)) == (1<<4|1<<10)) {
-		wwan_rfkill = rfkill_allocate(NULL, RFKILL_TYPE_WWAN);
-		if (!wwan_rfkill)
+		wwan_rfkill = rfkill_alloc("dell-wwan", NULL, RFKILL_TYPE_WWAN,
+					   &dell_rfkill_ops, (void *) 3);
+		if (!wwan_rfkill) {
+			ret = -ENOMEM;
 			goto err_wwan;
-		wwan_rfkill->name = "dell-wwan";
-		wwan_rfkill->toggle_radio = dell_wwan_set;
-		wwan_rfkill->get_state = dell_wwan_get;
+		}
 		ret = rfkill_register(wwan_rfkill);
 		if (ret)
 			goto err_wwan;
@@ -285,22 +255,15 @@ static int dell_setup_rfkill(void)
 	return 0;
 err_wwan:
-	if (wwan_rfkill)
-		rfkill_free(wwan_rfkill);
-	if (bluetooth_rfkill) {
+	rfkill_destroy(wwan_rfkill);
+	if (bluetooth_rfkill)
 		rfkill_unregister(bluetooth_rfkill);
-		bluetooth_rfkill = NULL;
-	}
 err_bluetooth:
-	if (bluetooth_rfkill)
-		rfkill_free(bluetooth_rfkill);
-	if (wifi_rfkill) {
+	rfkill_destroy(bluetooth_rfkill);
+	if (wifi_rfkill)
 		rfkill_unregister(wifi_rfkill);
-		wifi_rfkill = NULL;
-	}
 err_wifi:
-	if (wifi_rfkill)
-		rfkill_free(wifi_rfkill);
+	rfkill_destroy(wifi_rfkill);
 	return ret;
 }
diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c
index 353a898c369..1208d0cedd1 100644
--- a/drivers/platform/x86/eeepc-laptop.c
+++ b/drivers/platform/x86/eeepc-laptop.c
@@ -299,39 +299,22 @@ static int update_bl_status(struct backlight_device *bd)
  * Rfkill helpers
  */
-static int eeepc_wlan_rfkill_set(void *data, enum rfkill_state state)
-{
-	if (state == RFKILL_STATE_SOFT_BLOCKED)
-		return set_acpi(CM_ASL_WLAN, 0);
-	else
-		return set_acpi(CM_ASL_WLAN, 1);
-}
-
-static int eeepc_wlan_rfkill_state(void *data, enum rfkill_state *state)
+static bool eeepc_wlan_rfkill_blocked(void)
 {
 	if (get_acpi(CM_ASL_WLAN) == 1)
-		*state = RFKILL_STATE_UNBLOCKED;
-	else
-		*state =
RFKILL_STATE_SOFT_BLOCKED; - return 0; + return false; + return true; } -static int eeepc_bluetooth_rfkill_set(void *data, enum rfkill_state state) +static int eeepc_rfkill_set(void *data, bool blocked) { - if (state == RFKILL_STATE_SOFT_BLOCKED) - return set_acpi(CM_ASL_BLUETOOTH, 0); - else - return set_acpi(CM_ASL_BLUETOOTH, 1); + unsigned long asl = (unsigned long)data; + return set_acpi(asl, !blocked); } -static int eeepc_bluetooth_rfkill_state(void *data, enum rfkill_state *state) -{ - if (get_acpi(CM_ASL_BLUETOOTH) == 1) - *state = RFKILL_STATE_UNBLOCKED; - else - *state = RFKILL_STATE_SOFT_BLOCKED; - return 0; -} +static const struct rfkill_ops eeepc_rfkill_ops = { + .set_block = eeepc_rfkill_set, +}; /* * Sys helpers @@ -531,9 +514,9 @@ static int notify_brn(void) static void eeepc_rfkill_notify(acpi_handle handle, u32 event, void *data) { - enum rfkill_state state; struct pci_dev *dev; struct pci_bus *bus = pci_find_bus(0, 1); + bool blocked; if (event != ACPI_NOTIFY_BUS_CHECK) return; @@ -543,9 +526,8 @@ static void eeepc_rfkill_notify(acpi_handle handle, u32 event, void *data) return; } - eeepc_wlan_rfkill_state(ehotk->eeepc_wlan_rfkill, &state); - - if (state == RFKILL_STATE_UNBLOCKED) { + blocked = eeepc_wlan_rfkill_blocked(); + if (!blocked) { dev = pci_get_slot(bus, 0); if (dev) { /* Device already present */ @@ -566,7 +548,7 @@ static void eeepc_rfkill_notify(acpi_handle handle, u32 event, void *data) } } - rfkill_force_state(ehotk->eeepc_wlan_rfkill, state); + rfkill_set_sw_state(ehotk->eeepc_wlan_rfkill, blocked); } static void eeepc_hotk_notify(acpi_handle handle, u32 event, void *data) @@ -684,26 +666,17 @@ static int eeepc_hotk_add(struct acpi_device *device) eeepc_register_rfkill_notifier("\\_SB.PCI0.P0P7"); if (get_acpi(CM_ASL_WLAN) != -1) { - ehotk->eeepc_wlan_rfkill = rfkill_allocate(&device->dev, - RFKILL_TYPE_WLAN); + ehotk->eeepc_wlan_rfkill = rfkill_alloc("eeepc-wlan", + &device->dev, + RFKILL_TYPE_WLAN, + &eeepc_rfkill_ops, + (void *)CM_ASL_WLAN); if (!ehotk->eeepc_wlan_rfkill) goto wlan_fail; - ehotk->eeepc_wlan_rfkill->name = "eeepc-wlan"; - ehotk->eeepc_wlan_rfkill->toggle_radio = eeepc_wlan_rfkill_set; - ehotk->eeepc_wlan_rfkill->get_state = eeepc_wlan_rfkill_state; - if (get_acpi(CM_ASL_WLAN) == 1) { - ehotk->eeepc_wlan_rfkill->state = - RFKILL_STATE_UNBLOCKED; - rfkill_set_default(RFKILL_TYPE_WLAN, - RFKILL_STATE_UNBLOCKED); - } else { - ehotk->eeepc_wlan_rfkill->state = - RFKILL_STATE_SOFT_BLOCKED; - rfkill_set_default(RFKILL_TYPE_WLAN, - RFKILL_STATE_SOFT_BLOCKED); - } + rfkill_set_global_sw_state(RFKILL_TYPE_WLAN, + get_acpi(CM_ASL_WLAN) != 1); result = rfkill_register(ehotk->eeepc_wlan_rfkill); if (result) goto wlan_fail; @@ -711,28 +684,17 @@ static int eeepc_hotk_add(struct acpi_device *device) if (get_acpi(CM_ASL_BLUETOOTH) != -1) { ehotk->eeepc_bluetooth_rfkill = - rfkill_allocate(&device->dev, RFKILL_TYPE_BLUETOOTH); + rfkill_alloc("eeepc-bluetooth", + &device->dev, + RFKILL_TYPE_BLUETOOTH, + &eeepc_rfkill_ops, + (void *)CM_ASL_BLUETOOTH); if (!ehotk->eeepc_bluetooth_rfkill) goto bluetooth_fail; - ehotk->eeepc_bluetooth_rfkill->name = "eeepc-bluetooth"; - ehotk->eeepc_bluetooth_rfkill->toggle_radio = - eeepc_bluetooth_rfkill_set; - ehotk->eeepc_bluetooth_rfkill->get_state = - eeepc_bluetooth_rfkill_state; - if (get_acpi(CM_ASL_BLUETOOTH) == 1) { - ehotk->eeepc_bluetooth_rfkill->state = - RFKILL_STATE_UNBLOCKED; - rfkill_set_default(RFKILL_TYPE_BLUETOOTH, - RFKILL_STATE_UNBLOCKED); - } else { - ehotk->eeepc_bluetooth_rfkill->state = - 
RFKILL_STATE_SOFT_BLOCKED; - rfkill_set_default(RFKILL_TYPE_BLUETOOTH, - RFKILL_STATE_SOFT_BLOCKED); - } - + rfkill_set_global_sw_state(RFKILL_TYPE_BLUETOOTH, + get_acpi(CM_ASL_BLUETOOTH) != 1); result = rfkill_register(ehotk->eeepc_bluetooth_rfkill); if (result) goto bluetooth_fail; @@ -741,13 +703,10 @@ static int eeepc_hotk_add(struct acpi_device *device) return 0; bluetooth_fail: - if (ehotk->eeepc_bluetooth_rfkill) - rfkill_free(ehotk->eeepc_bluetooth_rfkill); + rfkill_destroy(ehotk->eeepc_bluetooth_rfkill); rfkill_unregister(ehotk->eeepc_wlan_rfkill); - ehotk->eeepc_wlan_rfkill = NULL; wlan_fail: - if (ehotk->eeepc_wlan_rfkill) - rfkill_free(ehotk->eeepc_wlan_rfkill); + rfkill_destroy(ehotk->eeepc_wlan_rfkill); eeepc_unregister_rfkill_notifier("\\_SB.PCI0.P0P6"); eeepc_unregister_rfkill_notifier("\\_SB.PCI0.P0P7"); ehotk_fail: diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index fe171fad12c..8d931145cbf 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -154,58 +154,46 @@ static int hp_wmi_dock_state(void) return hp_wmi_perform_query(HPWMI_DOCK_QUERY, 0, 0); } -static int hp_wmi_wifi_set(void *data, enum rfkill_state state) +static int hp_wmi_set_block(void *data, bool blocked) { - if (state) - return hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, 0x101); - else - return hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, 0x100); -} + unsigned long b = (unsigned long) data; + int query = BIT(b + 8) | ((!blocked) << b); -static int hp_wmi_bluetooth_set(void *data, enum rfkill_state state) -{ - if (state) - return hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, 0x202); - else - return hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, 0x200); + return hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, query); } -static int hp_wmi_wwan_set(void *data, enum rfkill_state state) -{ - if (state) - return hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, 0x404); - else - return hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, 0x400); -} +static const struct rfkill_ops hp_wmi_rfkill_ops = { + .set_block = hp_wmi_set_block, +}; -static int hp_wmi_wifi_state(void) +static bool hp_wmi_wifi_state(void) { int wireless = hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 0, 0); if (wireless & 0x100) - return RFKILL_STATE_UNBLOCKED; + return false; else - return RFKILL_STATE_SOFT_BLOCKED; + return true; } -static int hp_wmi_bluetooth_state(void) +static bool hp_wmi_bluetooth_state(void) { int wireless = hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 0, 0); if (wireless & 0x10000) - return RFKILL_STATE_UNBLOCKED; + return false; else - return RFKILL_STATE_SOFT_BLOCKED; + return true; } -static int hp_wmi_wwan_state(void) +static bool hp_wmi_wwan_state(void) { int wireless = hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 0, 0); if (wireless & 0x1000000) - return RFKILL_STATE_UNBLOCKED; + return false; else - return RFKILL_STATE_SOFT_BLOCKED; + return true; } static ssize_t show_display(struct device *dev, struct device_attribute *attr, @@ -347,14 +335,14 @@ static void hp_wmi_notify(u32 value, void *context) } } else if (eventcode == 0x5) { if (wifi_rfkill) - rfkill_force_state(wifi_rfkill, - hp_wmi_wifi_state()); + rfkill_set_sw_state(wifi_rfkill, + hp_wmi_wifi_state()); if (bluetooth_rfkill) - rfkill_force_state(bluetooth_rfkill, - hp_wmi_bluetooth_state()); + rfkill_set_sw_state(bluetooth_rfkill, + hp_wmi_bluetooth_state()); if (wwan_rfkill) - rfkill_force_state(wwan_rfkill, - hp_wmi_wwan_state()); + rfkill_set_sw_state(wwan_rfkill, + hp_wmi_wwan_state()); } else printk(KERN_INFO
"HP WMI: Unknown key pressed - %x\n", eventcode); @@ -430,31 +418,34 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) goto add_sysfs_error; if (wireless & 0x1) { - wifi_rfkill = rfkill_allocate(&device->dev, RFKILL_TYPE_WLAN); - wifi_rfkill->name = "hp-wifi"; - wifi_rfkill->state = hp_wmi_wifi_state(); - wifi_rfkill->toggle_radio = hp_wmi_wifi_set; + wifi_rfkill = rfkill_alloc("hp-wifi", &device->dev, + RFKILL_TYPE_WLAN, + &hp_wmi_rfkill_ops, + (void *) 0); + rfkill_set_sw_state(wifi_rfkill, hp_wmi_wifi_state()); err = rfkill_register(wifi_rfkill); if (err) - goto add_sysfs_error; + goto register_wifi_error; } if (wireless & 0x2) { - bluetooth_rfkill = rfkill_allocate(&device->dev, - RFKILL_TYPE_BLUETOOTH); - bluetooth_rfkill->name = "hp-bluetooth"; - bluetooth_rfkill->state = hp_wmi_bluetooth_state(); - bluetooth_rfkill->toggle_radio = hp_wmi_bluetooth_set; + bluetooth_rfkill = rfkill_alloc("hp-bluetooth", &device->dev, + RFKILL_TYPE_BLUETOOTH, + &hp_wmi_rfkill_ops, + (void *) 1); + rfkill_set_sw_state(bluetooth_rfkill, + hp_wmi_bluetooth_state()); err = rfkill_register(bluetooth_rfkill); if (err) goto register_bluetooth_error; } if (wireless & 0x4) { - wwan_rfkill = rfkill_allocate(&device->dev, RFKILL_TYPE_WWAN); - wwan_rfkill->name = "hp-wwan"; - wwan_rfkill->state = hp_wmi_wwan_state(); - wwan_rfkill->toggle_radio = hp_wmi_wwan_set; + wwan_rfkill = rfkill_alloc("hp-wwan", &device->dev, + RFKILL_TYPE_WWAN, + &hp_wmi_rfkill_ops, + (void *) 2); + rfkill_set_sw_state(wwan_rfkill, hp_wmi_wwan_state()); err = rfkill_register(wwan_rfkill); if (err) goto register_wwan_err; @@ -462,11 +453,15 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) return 0; register_wwan_err: + rfkill_destroy(wwan_rfkill); if (bluetooth_rfkill) rfkill_unregister(bluetooth_rfkill); register_bluetooth_error: + rfkill_destroy(bluetooth_rfkill); if (wifi_rfkill) rfkill_unregister(wifi_rfkill); +register_wifi_error: + rfkill_destroy(wifi_rfkill); add_sysfs_error: cleanup_sysfs(device); return err; @@ -476,12 +471,18 @@ static int __exit hp_wmi_bios_remove(struct platform_device *device) { cleanup_sysfs(device); - if (wifi_rfkill) + if (wifi_rfkill) { rfkill_unregister(wifi_rfkill); - if (bluetooth_rfkill) + rfkill_destroy(wifi_rfkill); + } + if (bluetooth_rfkill) { rfkill_unregister(bluetooth_rfkill); - if (wwan_rfkill) + rfkill_destroy(bluetooth_rfkill); + } + if (wwan_rfkill) { rfkill_unregister(wwan_rfkill); + rfkill_destroy(wwan_rfkill); + } return 0; } diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index f1963b05175..aec0b27fd77 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -128,11 +128,11 @@ enum sony_nc_rfkill { SONY_BLUETOOTH, SONY_WWAN, SONY_WIMAX, - SONY_RFKILL_MAX, + N_SONY_RFKILL, }; -static struct rfkill *sony_rfkill_devices[SONY_RFKILL_MAX]; -static int sony_rfkill_address[SONY_RFKILL_MAX] = {0x300, 0x500, 0x700, 0x900}; +static struct rfkill *sony_rfkill_devices[N_SONY_RFKILL]; +static int sony_rfkill_address[N_SONY_RFKILL] = {0x300, 0x500, 0x700, 0x900}; static void sony_nc_rfkill_update(void); /*********** Input Devices ***********/ @@ -1051,147 +1051,98 @@ static void sony_nc_rfkill_cleanup(void) { int i; - for (i = 0; i < SONY_RFKILL_MAX; i++) { - if (sony_rfkill_devices[i]) + for (i = 0; i < N_SONY_RFKILL; i++) { + if (sony_rfkill_devices[i]) { rfkill_unregister(sony_rfkill_devices[i]); + rfkill_destroy(sony_rfkill_devices[i]); + } } } -static int sony_nc_rfkill_get(void *data,
enum rfkill_state *state) -{ - int result; - int argument = sony_rfkill_address[(long) data]; - - sony_call_snc_handle(0x124, 0x200, &result); - if (result & 0x1) { - sony_call_snc_handle(0x124, argument, &result); - if (result & 0xf) - *state = RFKILL_STATE_UNBLOCKED; - else - *state = RFKILL_STATE_SOFT_BLOCKED; - } else { - *state = RFKILL_STATE_HARD_BLOCKED; - } - - return 0; -} - -static int sony_nc_rfkill_set(void *data, enum rfkill_state state) +static int sony_nc_rfkill_set(void *data, bool blocked) { int result; int argument = sony_rfkill_address[(long) data] + 0x100; - if (state == RFKILL_STATE_UNBLOCKED) + if (!blocked) argument |= 0xff0000; return sony_call_snc_handle(0x124, argument, &result); } -static int sony_nc_setup_wifi_rfkill(struct acpi_device *device) -{ - int err = 0; - struct rfkill *sony_wifi_rfkill; - - sony_wifi_rfkill = rfkill_allocate(&device->dev, RFKILL_TYPE_WLAN); - if (!sony_wifi_rfkill) - return -1; - sony_wifi_rfkill->name = "sony-wifi"; - sony_wifi_rfkill->toggle_radio = sony_nc_rfkill_set; - sony_wifi_rfkill->get_state = sony_nc_rfkill_get; - sony_wifi_rfkill->data = (void *)SONY_WIFI; - err = rfkill_register(sony_wifi_rfkill); - if (err) - rfkill_free(sony_wifi_rfkill); - else { - sony_rfkill_devices[SONY_WIFI] = sony_wifi_rfkill; - sony_nc_rfkill_set(sony_wifi_rfkill->data, - RFKILL_STATE_UNBLOCKED); - } - return err; -} +static const struct rfkill_ops sony_rfkill_ops = { + .set_block = sony_nc_rfkill_set, +}; -static int sony_nc_setup_bluetooth_rfkill(struct acpi_device *device) +static int sony_nc_setup_rfkill(struct acpi_device *device, + enum sony_nc_rfkill nc_type) { int err = 0; - struct rfkill *sony_bluetooth_rfkill; - - sony_bluetooth_rfkill = rfkill_allocate(&device->dev, - RFKILL_TYPE_BLUETOOTH); - if (!sony_bluetooth_rfkill) - return -1; - sony_bluetooth_rfkill->name = "sony-bluetooth"; - sony_bluetooth_rfkill->toggle_radio = sony_nc_rfkill_set; - sony_bluetooth_rfkill->get_state = sony_nc_rfkill_get; - sony_bluetooth_rfkill->data = (void *)SONY_BLUETOOTH; - err = rfkill_register(sony_bluetooth_rfkill); - if (err) - rfkill_free(sony_bluetooth_rfkill); - else { - sony_rfkill_devices[SONY_BLUETOOTH] = sony_bluetooth_rfkill; - sony_nc_rfkill_set(sony_bluetooth_rfkill->data, - RFKILL_STATE_UNBLOCKED); + struct rfkill *rfk; + enum rfkill_type type; + const char *name; + + switch (nc_type) { + case SONY_WIFI: + type = RFKILL_TYPE_WLAN; + name = "sony-wifi"; + break; + case SONY_BLUETOOTH: + type = RFKILL_TYPE_BLUETOOTH; + name = "sony-bluetooth"; + break; + case SONY_WWAN: + type = RFKILL_TYPE_WWAN; + name = "sony-wwan"; + break; + case SONY_WIMAX: + type = RFKILL_TYPE_WIMAX; + name = "sony-wimax"; + break; + default: + return -EINVAL; } - return err; -} -static int sony_nc_setup_wwan_rfkill(struct acpi_device *device) -{ - int err = 0; - struct rfkill *sony_wwan_rfkill; + rfk = rfkill_alloc(name, &device->dev, type, + &sony_rfkill_ops, (void *)nc_type); + if (!rfk) + return -ENOMEM; - sony_wwan_rfkill = rfkill_allocate(&device->dev, RFKILL_TYPE_WWAN); - if (!sony_wwan_rfkill) - return -1; - sony_wwan_rfkill->name = "sony-wwan"; - sony_wwan_rfkill->toggle_radio = sony_nc_rfkill_set; - sony_wwan_rfkill->get_state = sony_nc_rfkill_get; - sony_wwan_rfkill->data = (void *)SONY_WWAN; - err = rfkill_register(sony_wwan_rfkill); - if (err) - rfkill_free(sony_wwan_rfkill); - else { - sony_rfkill_devices[SONY_WWAN] = sony_wwan_rfkill; - sony_nc_rfkill_set(sony_wwan_rfkill->data, - RFKILL_STATE_UNBLOCKED); + err = rfkill_register(rfk); + if (err) { + 
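/*
 * [Editor's sketch -- not part of the patch.] Both hp-wmi above and
 * sony-laptop here pass a radio index through the rfkill ops_data
 * pointer (rfkill_alloc() is called with (void *) 0/1/2 resp.
 * (void *)nc_type), so set_block() recovers it with a cast instead of
 * dereferencing anything. In hp_wmi_set_block() the index is then
 * folded into the query word: BIT(b + 8) selects the radio, bit b
 * carries the new power state, reproducing the old constants:
 *
 *	b = 0 (wifi):      unblock -> 0x101, block -> 0x100
 *	b = 1 (bluetooth): unblock -> 0x202, block -> 0x200
 *	b = 2 (wwan):      unblock -> 0x404, block -> 0x400
 *
 * The same function, restated with the bit arithmetic annotated:
 */
static int example_set_block(void *data, bool blocked)
{
	unsigned long b = (unsigned long) data;		/* 0=wifi, 1=bt, 2=wwan */
	int query = BIT(b + 8) | ((!blocked) << b);	/* bit b set == radio on */

	return hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, query);
}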
rfkill_destroy(rfk); + return err; } + sony_rfkill_devices[nc_type] = rfk; + sony_nc_rfkill_set((void *)nc_type, false); return err; } -static int sony_nc_setup_wimax_rfkill(struct acpi_device *device) +static void sony_nc_rfkill_update() { - int err = 0; - struct rfkill *sony_wimax_rfkill; + enum sony_nc_rfkill i; + int result; + bool hwblock; - sony_wimax_rfkill = rfkill_allocate(&device->dev, RFKILL_TYPE_WIMAX); - if (!sony_wimax_rfkill) - return -1; - sony_wimax_rfkill->name = "sony-wimax"; - sony_wimax_rfkill->toggle_radio = sony_nc_rfkill_set; - sony_wimax_rfkill->get_state = sony_nc_rfkill_get; - sony_wimax_rfkill->data = (void *)SONY_WIMAX; - err = rfkill_register(sony_wimax_rfkill); - if (err) - rfkill_free(sony_wimax_rfkill); - else { - sony_rfkill_devices[SONY_WIMAX] = sony_wimax_rfkill; - sony_nc_rfkill_set(sony_wimax_rfkill->data, - RFKILL_STATE_UNBLOCKED); - } - return err; -} + sony_call_snc_handle(0x124, 0x200, &result); + hwblock = !(result & 0x1); -static void sony_nc_rfkill_update() -{ - int i; - enum rfkill_state state; + for (i = 0; i < N_SONY_RFKILL; i++) { + int argument = sony_rfkill_address[i]; - for (i = 0; i < SONY_RFKILL_MAX; i++) { - if (sony_rfkill_devices[i]) { - sony_rfkill_devices[i]-> - get_state(sony_rfkill_devices[i]->data, - &state); - rfkill_force_state(sony_rfkill_devices[i], state); + if (!sony_rfkill_devices[i]) + continue; + + if (hwblock) { + if (rfkill_set_hw_state(sony_rfkill_devices[i], true)) + sony_nc_rfkill_set((void *)i, true); + continue; } + + sony_call_snc_handle(0x124, argument, &result); + rfkill_set_states(sony_rfkill_devices[i], + !(result & 0xf), false); } } @@ -1210,13 +1161,13 @@ static int sony_nc_rfkill_setup(struct acpi_device *device) } if (result & 0x1) - sony_nc_setup_wifi_rfkill(device); + sony_nc_setup_rfkill(device, SONY_WIFI); if (result & 0x2) - sony_nc_setup_bluetooth_rfkill(device); + sony_nc_setup_rfkill(device, SONY_BLUETOOTH); if (result & 0x1c) - sony_nc_setup_wwan_rfkill(device); + sony_nc_setup_rfkill(device, SONY_WWAN); if (result & 0x20) - sony_nc_setup_wimax_rfkill(device); + sony_nc_setup_rfkill(device, SONY_WIMAX); return 0; } diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 912be65b626..cfcafa4e947 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -166,13 +166,6 @@ enum { #define TPACPI_MAX_ACPI_ARGS 3 -/* rfkill switches */ -enum { - TPACPI_RFK_BLUETOOTH_SW_ID = 0, - TPACPI_RFK_WWAN_SW_ID, - TPACPI_RFK_UWB_SW_ID, -}; - /* printk headers */ #define TPACPI_LOG TPACPI_FILE ": " #define TPACPI_EMERG KERN_EMERG TPACPI_LOG @@ -1005,67 +998,237 @@ static int __init tpacpi_check_std_acpi_brightness_support(void) return 0; } -static int __init tpacpi_new_rfkill(const unsigned int id, - struct rfkill **rfk, +static void printk_deprecated_attribute(const char * const what, + const char * const details) +{ + tpacpi_log_usertask("deprecated sysfs attribute"); + printk(TPACPI_WARN "WARNING: sysfs attribute %s is deprecated and " + "will be removed. %s\n", + what, details); +} + +/************************************************************************* + * rfkill and radio control support helpers + */ + +/* + * ThinkPad-ACPI firmware handling model: + * + * WLSW (master wireless switch) is event-driven, and is common to all + * firmware-controlled radios. It cannot be controlled, just monitored, + * as expected.
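/*
 * [Editor's sketch, hypothetical names.] The sony_nc_rfkill_update()
 * rewrite above follows the new core's contract: rfkill_set_hw_state()
 * returns the combined block state, so the driver only forces the
 * transmitter off when the core says the radio must end up blocked,
 * and otherwise just reports the soft state it read from firmware:
 */
static void example_radio_off(struct rfkill *rfk);	/* hypothetical */

static void example_update_one(struct rfkill *rfk, bool hw_blocked,
			       bool sw_blocked)
{
	if (hw_blocked) {
		/* returns true if the radio must be off */
		if (rfkill_set_hw_state(rfk, true))
			example_radio_off(rfk);
		return;
	}
	/* no hard block: report the soft state read from firmware */
	rfkill_set_states(rfk, sw_blocked, false);
}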
It overrides all radio state in firmware + * + * The kernel, a masked-off hotkey, and WLSW can change the radio state + * (TODO: verify how WLSW interacts with the returned radio state). + * + * The only time there are shadow radio state changes, is when + * masked-off hotkeys are used. + */ + +/* + * Internal driver API for radio state: + * + * int: < 0 = error, otherwise enum tpacpi_rfkill_state + * bool: true means radio blocked (off) + */ +enum tpacpi_rfkill_state { + TPACPI_RFK_RADIO_OFF = 0, + TPACPI_RFK_RADIO_ON +}; + +/* rfkill switches */ +enum tpacpi_rfk_id { + TPACPI_RFK_BLUETOOTH_SW_ID = 0, + TPACPI_RFK_WWAN_SW_ID, + TPACPI_RFK_UWB_SW_ID, + TPACPI_RFK_SW_MAX +}; + +static const char *tpacpi_rfkill_names[] = { + [TPACPI_RFK_BLUETOOTH_SW_ID] = "bluetooth", + [TPACPI_RFK_WWAN_SW_ID] = "wwan", + [TPACPI_RFK_UWB_SW_ID] = "uwb", + [TPACPI_RFK_SW_MAX] = NULL +}; + +/* ThinkPad-ACPI rfkill subdriver */ +struct tpacpi_rfk { + struct rfkill *rfkill; + enum tpacpi_rfk_id id; + const struct tpacpi_rfk_ops *ops; +}; + +struct tpacpi_rfk_ops { + /* firmware interface */ + int (*get_status)(void); + int (*set_status)(const enum tpacpi_rfkill_state); +}; + +static struct tpacpi_rfk *tpacpi_rfkill_switches[TPACPI_RFK_SW_MAX]; + +/* Query FW and update rfkill sw state for a given rfkill switch */ +static int tpacpi_rfk_update_swstate(const struct tpacpi_rfk *tp_rfk) +{ + int status; + + if (!tp_rfk) + return -ENODEV; + + status = (tp_rfk->ops->get_status)(); + if (status < 0) + return status; + + rfkill_set_sw_state(tp_rfk->rfkill, + (status == TPACPI_RFK_RADIO_OFF)); + + return status; +} + +/* Query FW and update rfkill sw state for all rfkill switches */ +static void tpacpi_rfk_update_swstate_all(void) +{ + unsigned int i; + + for (i = 0; i < TPACPI_RFK_SW_MAX; i++) + tpacpi_rfk_update_swstate(tpacpi_rfkill_switches[i]); +} + +/* + * Sync the HW-blocking state of all rfkill switches, + * do notice it causes the rfkill core to schedule uevents + */ +static void tpacpi_rfk_update_hwblock_state(bool blocked) +{ + unsigned int i; + struct tpacpi_rfk *tp_rfk; + + for (i = 0; i < TPACPI_RFK_SW_MAX; i++) { + tp_rfk = tpacpi_rfkill_switches[i]; + if (tp_rfk) { + if (rfkill_set_hw_state(tp_rfk->rfkill, + blocked)) { + /* ignore -- we track sw block */ + } + } + } +} + +/* Call to get the WLSW state from the firmware */ +static int hotkey_get_wlsw(void); + +/* Call to query WLSW state and update all rfkill switches */ +static bool tpacpi_rfk_check_hwblock_state(void) +{ + int res = hotkey_get_wlsw(); + int hw_blocked; + + /* When unknown or unsupported, we have to assume it is unblocked */ + if (res < 0) + return false; + + hw_blocked = (res == TPACPI_RFK_RADIO_OFF); + tpacpi_rfk_update_hwblock_state(hw_blocked); + + return hw_blocked; +} + +static int tpacpi_rfk_hook_set_block(void *data, bool blocked) +{ + struct tpacpi_rfk *tp_rfk = data; + int res; + + dbg_printk(TPACPI_DBG_RFKILL, + "request to change radio state to %s\n", + blocked ? "blocked" : "unblocked"); + + /* try to set radio state */ + res = (tp_rfk->ops->set_status)(blocked ? + TPACPI_RFK_RADIO_OFF : TPACPI_RFK_RADIO_ON); + + /* and update the rfkill core with whatever the FW really did */ + tpacpi_rfk_update_swstate(tp_rfk); + + return (res < 0) ? 
res : 0; +} + +static const struct rfkill_ops tpacpi_rfk_rfkill_ops = { + .set_block = tpacpi_rfk_hook_set_block, +}; + +static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id, + const struct tpacpi_rfk_ops *tp_rfkops, const enum rfkill_type rfktype, const char *name, - const bool set_default, - int (*toggle_radio)(void *, enum rfkill_state), - int (*get_state)(void *, enum rfkill_state *)) + const bool set_default) { + struct tpacpi_rfk *atp_rfk; int res; - enum rfkill_state initial_state = RFKILL_STATE_SOFT_BLOCKED; + bool initial_sw_state = false; + int initial_sw_status; - res = get_state(NULL, &initial_state); - if (res < 0) { + BUG_ON(id >= TPACPI_RFK_SW_MAX || tpacpi_rfkill_switches[id]); + + initial_sw_status = (tp_rfkops->get_status)(); + if (initial_sw_status < 0) { printk(TPACPI_ERR "failed to read initial state for %s, error %d; " - "will turn radio off\n", name, res); - } else if (set_default) { - /* try to set the initial state as the default for the rfkill - * type, since we ask the firmware to preserve it across S5 in - * NVRAM */ - if (rfkill_set_default(rfktype, - (initial_state == RFKILL_STATE_UNBLOCKED) ? - RFKILL_STATE_UNBLOCKED : - RFKILL_STATE_SOFT_BLOCKED) == -EPERM) - vdbg_printk(TPACPI_DBG_RFKILL, - "Default state for %s cannot be changed\n", - name); - } - - *rfk = rfkill_allocate(&tpacpi_pdev->dev, rfktype); - if (!*rfk) { + "will turn radio off\n", name, initial_sw_status); + } else { + initial_sw_state = (initial_sw_status == TPACPI_RFK_RADIO_OFF); + if (set_default) { + /* try to set the initial state as the default for the + * rfkill type, since we ask the firmware to preserve + * it across S5 in NVRAM */ + rfkill_set_global_sw_state(rfktype, initial_sw_state); + } + } + + atp_rfk = kzalloc(sizeof(struct tpacpi_rfk), GFP_KERNEL); + if (atp_rfk) + atp_rfk->rfkill = rfkill_alloc(name, + &tpacpi_pdev->dev, + rfktype, + &tpacpi_rfk_rfkill_ops, + atp_rfk); + if (!atp_rfk || !atp_rfk->rfkill) { printk(TPACPI_ERR "failed to allocate memory for rfkill class\n"); + kfree(atp_rfk); return -ENOMEM; } - (*rfk)->name = name; - (*rfk)->get_state = get_state; - (*rfk)->toggle_radio = toggle_radio; - (*rfk)->state = initial_state; + atp_rfk->id = id; + atp_rfk->ops = tp_rfkops; + + rfkill_set_states(atp_rfk->rfkill, initial_sw_state, + tpacpi_rfk_check_hwblock_state()); - res = rfkill_register(*rfk); + res = rfkill_register(atp_rfk->rfkill); if (res < 0) { printk(TPACPI_ERR "failed to register %s rfkill switch: %d\n", name, res); - rfkill_free(*rfk); - *rfk = NULL; + rfkill_destroy(atp_rfk->rfkill); + kfree(atp_rfk); return res; } + tpacpi_rfkill_switches[id] = atp_rfk; return 0; } -static void printk_deprecated_attribute(const char * const what, - const char * const details) +static void tpacpi_destroy_rfkill(const enum tpacpi_rfk_id id) { - tpacpi_log_usertask("deprecated sysfs attribute"); - printk(TPACPI_WARN "WARNING: sysfs attribute %s is deprecated and " - "will be removed. 
%s\n", - what, details); + struct tpacpi_rfk *tp_rfk; + + BUG_ON(id >= TPACPI_RFK_SW_MAX); + + tp_rfk = tpacpi_rfkill_switches[id]; + if (tp_rfk) { + rfkill_unregister(tp_rfk->rfkill); + rfkill_destroy(tp_rfk->rfkill); + tpacpi_rfkill_switches[id] = NULL; + kfree(tp_rfk); + } } static void printk_deprecated_rfkill_attribute(const char * const what) @@ -1074,6 +1237,112 @@ static void printk_deprecated_rfkill_attribute(const char * const what) "Please switch to generic rfkill before year 2010"); } +/* sysfs enable ------------------------------------------------ */ +static ssize_t tpacpi_rfk_sysfs_enable_show(const enum tpacpi_rfk_id id, + struct device_attribute *attr, + char *buf) +{ + int status; + + printk_deprecated_rfkill_attribute(attr->attr.name); + + /* This is in the ABI... */ + if (tpacpi_rfk_check_hwblock_state()) { + status = TPACPI_RFK_RADIO_OFF; + } else { + status = tpacpi_rfk_update_swstate(tpacpi_rfkill_switches[id]); + if (status < 0) + return status; + } + + return snprintf(buf, PAGE_SIZE, "%d\n", + (status == TPACPI_RFK_RADIO_ON) ? 1 : 0); +} + +static ssize_t tpacpi_rfk_sysfs_enable_store(const enum tpacpi_rfk_id id, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long t; + int res; + + printk_deprecated_rfkill_attribute(attr->attr.name); + + if (parse_strtoul(buf, 1, &t)) + return -EINVAL; + + tpacpi_disclose_usertask(attr->attr.name, "set to %ld\n", t); + + /* This is in the ABI... */ + if (tpacpi_rfk_check_hwblock_state() && !!t) + return -EPERM; + + res = tpacpi_rfkill_switches[id]->ops->set_status((!!t) ? + TPACPI_RFK_RADIO_ON : TPACPI_RFK_RADIO_OFF); + tpacpi_rfk_update_swstate(tpacpi_rfkill_switches[id]); + + return (res < 0) ? res : count; +} + +/* procfs -------------------------------------------------------------- */ +static int tpacpi_rfk_procfs_read(const enum tpacpi_rfk_id id, char *p) +{ + int len = 0; + + if (id >= TPACPI_RFK_SW_MAX) + len += sprintf(p + len, "status:\t\tnot supported\n"); + else { + int status; + + /* This is in the ABI... */ + if (tpacpi_rfk_check_hwblock_state()) { + status = TPACPI_RFK_RADIO_OFF; + } else { + status = tpacpi_rfk_update_swstate( + tpacpi_rfkill_switches[id]); + if (status < 0) + return status; + } + + len += sprintf(p + len, "status:\t\t%s\n", + (status == TPACPI_RFK_RADIO_ON) ? + "enabled" : "disabled"); + len += sprintf(p + len, "commands:\tenable, disable\n"); + } + + return len; +} + +static int tpacpi_rfk_procfs_write(const enum tpacpi_rfk_id id, char *buf) +{ + char *cmd; + int status = -1; + int res = 0; + + if (id >= TPACPI_RFK_SW_MAX) + return -ENODEV; + + while ((cmd = next_cmd(&buf))) { + if (strlencmp(cmd, "enable") == 0) + status = TPACPI_RFK_RADIO_ON; + else if (strlencmp(cmd, "disable") == 0) + status = TPACPI_RFK_RADIO_OFF; + else + return -EINVAL; + } + + if (status != -1) { + tpacpi_disclose_usertask("procfs", "attempt to %s %s\n", + (status == TPACPI_RFK_RADIO_ON) ?
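/*
 * [Editor's sketch, hypothetical subdriver.] With the helpers above, a
 * new ThinkPad radio only has to supply a get_status/set_status pair;
 * the shared code handles the rfkill glue, the deprecated sysfs/procfs
 * ABI and the WLSW interaction:
 */
static int example_get_status(void)
{
	/* query firmware here; return < 0 on error */
	return TPACPI_RFK_RADIO_ON;
}

static int example_set_status(const enum tpacpi_rfkill_state state)
{
	/* program firmware here */
	return 0;
}

static const struct tpacpi_rfk_ops example_tprfk_ops = {
	.get_status = example_get_status,
	.set_status = example_set_status,
};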
+ "enable" : "disable", + tpacpi_rfkill_names[id]); + res = (tpacpi_rfkill_switches[id]->ops->set_status)(status); + tpacpi_rfk_update_swstate(tpacpi_rfkill_switches[id]); + } + + return res; +} + /************************************************************************* * thinkpad-acpi driver attributes */ @@ -1127,8 +1396,6 @@ static DRIVER_ATTR(version, S_IRUGO, #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES -static void tpacpi_send_radiosw_update(void); - /* wlsw_emulstate ------------------------------------------------------ */ static ssize_t tpacpi_driver_wlsw_emulstate_show(struct device_driver *drv, char *buf) @@ -1144,11 +1411,10 @@ static ssize_t tpacpi_driver_wlsw_emulstate_store(struct device_driver *drv, if (parse_strtoul(buf, 1, &t)) return -EINVAL; - if (tpacpi_wlsw_emulstate != t) { - tpacpi_wlsw_emulstate = !!t; - tpacpi_send_radiosw_update(); - } else + if (tpacpi_wlsw_emulstate != !!t) { tpacpi_wlsw_emulstate = !!t; + tpacpi_rfk_update_hwblock_state(!t); /* negative logic */ + } return count; } @@ -1463,17 +1729,23 @@ static struct attribute_set *hotkey_dev_attributes; /* HKEY.MHKG() return bits */ #define TP_HOTKEY_TABLET_MASK (1 << 3) -static int hotkey_get_wlsw(int *status) +static int hotkey_get_wlsw(void) { + int status; + + if (!tp_features.hotkey_wlsw) + return -ENODEV; + #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES - if (dbg_wlswemul) { - *status = !!tpacpi_wlsw_emulstate; - return 0; - } + if (dbg_wlswemul) + return (tpacpi_wlsw_emulstate) ? + TPACPI_RFK_RADIO_ON : TPACPI_RFK_RADIO_OFF; #endif - if (!acpi_evalf(hkey_handle, status, "WLSW", "d")) + + if (!acpi_evalf(hkey_handle, &status, "WLSW", "d")) return -EIO; - return 0; + + return (status) ? TPACPI_RFK_RADIO_ON : TPACPI_RFK_RADIO_OFF; } static int hotkey_get_tablet_mode(int *status) @@ -2107,12 +2379,16 @@ static ssize_t hotkey_radio_sw_show(struct device *dev, struct device_attribute *attr, char *buf) { - int res, s; - res = hotkey_get_wlsw(&s); + int res; + res = hotkey_get_wlsw(); if (res < 0) return res; - return snprintf(buf, PAGE_SIZE, "%d\n", !!s); + /* Opportunistic update */ + tpacpi_rfk_update_hwblock_state((res == TPACPI_RFK_RADIO_OFF)); + + return snprintf(buf, PAGE_SIZE, "%d\n", + (res == TPACPI_RFK_RADIO_OFF) ? 0 : 1); } static struct device_attribute dev_attr_hotkey_radio_sw = @@ -2223,30 +2499,52 @@ static struct attribute *hotkey_mask_attributes[] __initdata = { &dev_attr_hotkey_wakeup_hotunplug_complete.attr, }; -static void bluetooth_update_rfk(void); -static void wan_update_rfk(void); -static void uwb_update_rfk(void); +/* + * Sync both the hw and sw blocking state of all switches + */ static void tpacpi_send_radiosw_update(void) { int wlsw; - /* Sync these BEFORE sending any rfkill events */ - if (tp_features.bluetooth) - bluetooth_update_rfk(); - if (tp_features.wan) - wan_update_rfk(); - if (tp_features.uwb) - uwb_update_rfk(); + /* + * We must sync all rfkill controllers *before* issuing any + * rfkill input events, or we will race the rfkill core input + * handler. + * + * tpacpi_inputdev_send_mutex works as a synchronization point + * for the above. + * + * We optimize to avoid numerous calls to hotkey_get_wlsw.
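/*
 * [Editor's note.] hotkey_get_wlsw() above now folds feature detection,
 * emulation and the ACPI read into one tri-state result; a caller
 * consumes it like this (sketch, mirroring
 * tpacpi_rfk_check_hwblock_state()):
 */
static bool example_wlsw_blocked(void)
{
	int res = hotkey_get_wlsw();

	/* unknown or unsupported: assume unblocked, as the code above does */
	if (res < 0)
		return false;

	return res == TPACPI_RFK_RADIO_OFF;
}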
+ */ - if (tp_features.hotkey_wlsw && !hotkey_get_wlsw(&wlsw)) { + wlsw = hotkey_get_wlsw(); + + /* Sync hw blocking state first if it is hw-blocked */ + if (wlsw == TPACPI_RFK_RADIO_OFF) + tpacpi_rfk_update_hwblock_state(true); + + /* Sync sw blocking state */ + tpacpi_rfk_update_swstate_all(); + + /* Sync hw blocking state last if it is hw-unblocked */ + if (wlsw == TPACPI_RFK_RADIO_ON) + tpacpi_rfk_update_hwblock_state(false); + + /* Issue rfkill input event for WLSW switch */ + if (!(wlsw < 0)) { mutex_lock(&tpacpi_inputdev_send_mutex); input_report_switch(tpacpi_inputdev, - SW_RFKILL_ALL, !!wlsw); + SW_RFKILL_ALL, (wlsw > 0)); input_sync(tpacpi_inputdev); mutex_unlock(&tpacpi_inputdev_send_mutex); } + + /* + * this can be unconditional, as we will poll state again + * if userspace uses the notify to read data + */ hotkey_radio_sw_notify_change(); } @@ -3056,8 +3354,6 @@ enum { #define TPACPI_RFK_BLUETOOTH_SW_NAME "tpacpi_bluetooth_sw" -static struct rfkill *tpacpi_bluetooth_rfkill; - static void bluetooth_suspend(pm_message_t state) { /* Try to make sure radio will resume powered off */ @@ -3067,83 +3363,47 @@ static void bluetooth_suspend(pm_message_t state) "bluetooth power down on resume request failed\n"); } -static int bluetooth_get_radiosw(void) +static int bluetooth_get_status(void) { int status; - if (!tp_features.bluetooth) - return -ENODEV; - - /* WLSW overrides bluetooth in firmware/hardware, reflect that */ - if (tp_features.hotkey_wlsw && !hotkey_get_wlsw(&status) && !status) - return RFKILL_STATE_HARD_BLOCKED; - #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES if (dbg_bluetoothemul) return (tpacpi_bluetooth_emulstate) ? - RFKILL_STATE_UNBLOCKED : RFKILL_STATE_SOFT_BLOCKED; + TPACPI_RFK_RADIO_ON : TPACPI_RFK_RADIO_OFF; #endif if (!acpi_evalf(hkey_handle, &status, "GBDC", "d")) return -EIO; return ((status & TP_ACPI_BLUETOOTH_RADIOSSW) != 0) ? - RFKILL_STATE_UNBLOCKED : RFKILL_STATE_SOFT_BLOCKED; + TPACPI_RFK_RADIO_ON : TPACPI_RFK_RADIO_OFF; } -static void bluetooth_update_rfk(void) +static int bluetooth_set_status(enum tpacpi_rfkill_state state) { int status; - if (!tpacpi_bluetooth_rfkill) - return; - - status = bluetooth_get_radiosw(); - if (status < 0) - return; - rfkill_force_state(tpacpi_bluetooth_rfkill, status); - vdbg_printk(TPACPI_DBG_RFKILL, - "forced rfkill state to %d\n", - status); -} - -static int bluetooth_set_radiosw(int radio_on, int update_rfk) -{ - int status; - - if (!tp_features.bluetooth) - return -ENODEV; - - /* WLSW overrides bluetooth in firmware/hardware, but there is no - * reason to risk weird behaviour. */ - if (tp_features.hotkey_wlsw && !hotkey_get_wlsw(&status) && !status - && radio_on) - return -EPERM; - - vdbg_printk(TPACPI_DBG_RFKILL, - "will %s bluetooth\n", radio_on ? "enable" : "disable"); + "will attempt to %s bluetooth\n", + (state == TPACPI_RFK_RADIO_ON) ? 
"enable" : "disable"); #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES if (dbg_bluetoothemul) { - tpacpi_bluetooth_emulstate = !!radio_on; - if (update_rfk) - bluetooth_update_rfk(); + tpacpi_bluetooth_emulstate = (state == TPACPI_RFK_RADIO_ON); return 0; } #endif /* We make sure to keep TP_ACPI_BLUETOOTH_RESUMECTRL off */ - if (radio_on) + if (state == TPACPI_RFK_RADIO_ON) status = TP_ACPI_BLUETOOTH_RADIOSSW; else status = 0; + if (!acpi_evalf(hkey_handle, NULL, "SBDC", "vd", status)) return -EIO; - if (update_rfk) - bluetooth_update_rfk(); - return 0; } @@ -3152,35 +3412,16 @@ static ssize_t bluetooth_enable_show(struct device *dev, struct device_attribute *attr, char *buf) { - int status; - - printk_deprecated_rfkill_attribute("bluetooth_enable"); - - status = bluetooth_get_radiosw(); - if (status < 0) - return status; - - return snprintf(buf, PAGE_SIZE, "%d\n", - (status == RFKILL_STATE_UNBLOCKED) ? 1 : 0); + return tpacpi_rfk_sysfs_enable_show(TPACPI_RFK_BLUETOOTH_SW_ID, + attr, buf); } static ssize_t bluetooth_enable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long t; - int res; - - printk_deprecated_rfkill_attribute("bluetooth_enable"); - - if (parse_strtoul(buf, 1, &t)) - return -EINVAL; - - tpacpi_disclose_usertask("bluetooth_enable", "set to %ld\n", t); - - res = bluetooth_set_radiosw(t, 1); - - return (res) ? res : count; + return tpacpi_rfk_sysfs_enable_store(TPACPI_RFK_BLUETOOTH_SW_ID, + attr, buf, count); } static struct device_attribute dev_attr_bluetooth_enable = @@ -3198,23 +3439,10 @@ static const struct attribute_group bluetooth_attr_group = { .attrs = bluetooth_attributes, }; -static int tpacpi_bluetooth_rfk_get(void *data, enum rfkill_state *state) -{ - int bts = bluetooth_get_radiosw(); - - if (bts < 0) - return bts; - - *state = bts; - return 0; -} - -static int tpacpi_bluetooth_rfk_set(void *data, enum rfkill_state state) -{ - dbg_printk(TPACPI_DBG_RFKILL, - "request to change radio state to %d\n", state); - return bluetooth_set_radiosw((state == RFKILL_STATE_UNBLOCKED), 0); -} +static const struct tpacpi_rfk_ops bluetooth_tprfk_ops = { + .get_status = bluetooth_get_status, + .set_status = bluetooth_set_status, +}; static void bluetooth_shutdown(void) { @@ -3230,13 +3458,12 @@ static void bluetooth_shutdown(void) static void bluetooth_exit(void) { - bluetooth_shutdown(); - - if (tpacpi_bluetooth_rfkill) - rfkill_unregister(tpacpi_bluetooth_rfkill); - sysfs_remove_group(&tpacpi_pdev->dev.kobj, &bluetooth_attr_group); + + tpacpi_destroy_rfkill(TPACPI_RFK_BLUETOOTH_SW_ID); + + bluetooth_shutdown(); } static int __init bluetooth_init(struct ibm_init_struct *iibm) @@ -3277,20 +3504,18 @@ static int __init bluetooth_init(struct ibm_init_struct *iibm) if (!tp_features.bluetooth) return 1; - res = sysfs_create_group(&tpacpi_pdev->dev.kobj, - &bluetooth_attr_group); - if (res) - return res; - res = tpacpi_new_rfkill(TPACPI_RFK_BLUETOOTH_SW_ID, - &tpacpi_bluetooth_rfkill, + &bluetooth_tprfk_ops, RFKILL_TYPE_BLUETOOTH, TPACPI_RFK_BLUETOOTH_SW_NAME, - true, - tpacpi_bluetooth_rfk_set, - tpacpi_bluetooth_rfk_get); + true); + if (res) + return res; + + res = sysfs_create_group(&tpacpi_pdev->dev.kobj, + &bluetooth_attr_group); if (res) { - bluetooth_exit(); + tpacpi_destroy_rfkill(TPACPI_RFK_BLUETOOTH_SW_ID); return res; } @@ -3300,46 +3525,12 @@ static int __init bluetooth_init(struct ibm_init_struct *iibm) /* procfs -------------------------------------------------------------- */ static int bluetooth_read(char *p) { - int len 
= 0; - int status = bluetooth_get_radiosw(); - - if (!tp_features.bluetooth) - len += sprintf(p + len, "status:\t\tnot supported\n"); - else { - len += sprintf(p + len, "status:\t\t%s\n", - (status == RFKILL_STATE_UNBLOCKED) ? - "enabled" : "disabled"); - len += sprintf(p + len, "commands:\tenable, disable\n"); - } - - return len; + return tpacpi_rfk_procfs_read(TPACPI_RFK_BLUETOOTH_SW_ID, p); } static int bluetooth_write(char *buf) { - char *cmd; - int state = -1; - - if (!tp_features.bluetooth) - return -ENODEV; - - while ((cmd = next_cmd(&buf))) { - if (strlencmp(cmd, "enable") == 0) { - state = 1; - } else if (strlencmp(cmd, "disable") == 0) { - state = 0; - } else - return -EINVAL; - } - - if (state != -1) { - tpacpi_disclose_usertask("procfs bluetooth", - "attempt to %s\n", - state ? "enable" : "disable"); - bluetooth_set_radiosw(state, 1); - } - - return 0; + return tpacpi_rfk_procfs_write(TPACPI_RFK_BLUETOOTH_SW_ID, buf); } static struct ibm_struct bluetooth_driver_data = { @@ -3365,8 +3556,6 @@ enum { #define TPACPI_RFK_WWAN_SW_NAME "tpacpi_wwan_sw" -static struct rfkill *tpacpi_wan_rfkill; - static void wan_suspend(pm_message_t state) { /* Try to make sure radio will resume powered off */ @@ -3376,83 +3565,47 @@ static void wan_suspend(pm_message_t state) "WWAN power down on resume request failed\n"); } -static int wan_get_radiosw(void) +static int wan_get_status(void) { int status; - if (!tp_features.wan) - return -ENODEV; - - /* WLSW overrides WWAN in firmware/hardware, reflect that */ - if (tp_features.hotkey_wlsw && !hotkey_get_wlsw(&status) && !status) - return RFKILL_STATE_HARD_BLOCKED; - #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES if (dbg_wwanemul) return (tpacpi_wwan_emulstate) ? - RFKILL_STATE_UNBLOCKED : RFKILL_STATE_SOFT_BLOCKED; + TPACPI_RFK_RADIO_ON : TPACPI_RFK_RADIO_OFF; #endif if (!acpi_evalf(hkey_handle, &status, "GWAN", "d")) return -EIO; return ((status & TP_ACPI_WANCARD_RADIOSSW) != 0) ? - RFKILL_STATE_UNBLOCKED : RFKILL_STATE_SOFT_BLOCKED; -} - -static void wan_update_rfk(void) -{ - int status; - - if (!tpacpi_wan_rfkill) - return; - - status = wan_get_radiosw(); - if (status < 0) - return; - rfkill_force_state(tpacpi_wan_rfkill, status); - - vdbg_printk(TPACPI_DBG_RFKILL, - "forced rfkill state to %d\n", - status); + TPACPI_RFK_RADIO_ON : TPACPI_RFK_RADIO_OFF; } -static int wan_set_radiosw(int radio_on, int update_rfk) +static int wan_set_status(enum tpacpi_rfkill_state state) { int status; - if (!tp_features.wan) - return -ENODEV; - - /* WLSW overrides bluetooth in firmware/hardware, but there is no - * reason to risk weird behaviour. */ - if (tp_features.hotkey_wlsw && !hotkey_get_wlsw(&status) && !status - && radio_on) - return -EPERM; - vdbg_printk(TPACPI_DBG_RFKILL, - "will %s WWAN\n", radio_on ? "enable" : "disable"); + "will attempt to %s wwan\n", + (state == TPACPI_RFK_RADIO_ON) ? 
"enable" : "disable"); #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES if (dbg_wwanemul) { - tpacpi_wwan_emulstate = !!radio_on; - if (update_rfk) - wan_update_rfk(); + tpacpi_wwan_emulstate = (state == TPACPI_RFK_RADIO_ON); return 0; } #endif /* We make sure to keep TP_ACPI_WANCARD_RESUMECTRL off */ - if (radio_on) + if (state == TPACPI_RFK_RADIO_ON) status = TP_ACPI_WANCARD_RADIOSSW; else status = 0; + if (!acpi_evalf(hkey_handle, NULL, "SWAN", "vd", status)) return -EIO; - if (update_rfk) - wan_update_rfk(); - return 0; } @@ -3461,35 +3614,16 @@ static ssize_t wan_enable_show(struct device *dev, struct device_attribute *attr, char *buf) { - int status; - - printk_deprecated_rfkill_attribute("wwan_enable"); - - status = wan_get_radiosw(); - if (status < 0) - return status; - - return snprintf(buf, PAGE_SIZE, "%d\n", - (status == RFKILL_STATE_UNBLOCKED) ? 1 : 0); + return tpacpi_rfk_sysfs_enable_show(TPACPI_RFK_WWAN_SW_ID, + attr, buf); } static ssize_t wan_enable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long t; - int res; - - printk_deprecated_rfkill_attribute("wwan_enable"); - - if (parse_strtoul(buf, 1, &t)) - return -EINVAL; - - tpacpi_disclose_usertask("wwan_enable", "set to %ld\n", t); - - res = wan_set_radiosw(t, 1); - - return (res) ? res : count; + return tpacpi_rfk_sysfs_enable_store(TPACPI_RFK_WWAN_SW_ID, + attr, buf, count); } static struct device_attribute dev_attr_wan_enable = @@ -3507,23 +3641,10 @@ static const struct attribute_group wan_attr_group = { .attrs = wan_attributes, }; -static int tpacpi_wan_rfk_get(void *data, enum rfkill_state *state) -{ - int wans = wan_get_radiosw(); - - if (wans < 0) - return wans; - - *state = wans; - return 0; -} - -static int tpacpi_wan_rfk_set(void *data, enum rfkill_state state) -{ - dbg_printk(TPACPI_DBG_RFKILL, - "request to change radio state to %d\n", state); - return wan_set_radiosw((state == RFKILL_STATE_UNBLOCKED), 0); -} +static const struct tpacpi_rfk_ops wan_tprfk_ops = { + .get_status = wan_get_status, + .set_status = wan_set_status, +}; static void wan_shutdown(void) { @@ -3539,13 +3660,12 @@ static void wan_shutdown(void) static void wan_exit(void) { - wan_shutdown(); - - if (tpacpi_wan_rfkill) - rfkill_unregister(tpacpi_wan_rfkill); - sysfs_remove_group(&tpacpi_pdev->dev.kobj, &wan_attr_group); + + tpacpi_destroy_rfkill(TPACPI_RFK_WWAN_SW_ID); + + wan_shutdown(); } static int __init wan_init(struct ibm_init_struct *iibm) @@ -3584,20 +3704,19 @@ static int __init wan_init(struct ibm_init_struct *iibm) if (!tp_features.wan) return 1; - res = sysfs_create_group(&tpacpi_pdev->dev.kobj, - &wan_attr_group); - if (res) - return res; - res = tpacpi_new_rfkill(TPACPI_RFK_WWAN_SW_ID, - &tpacpi_wan_rfkill, + &wan_tprfk_ops, RFKILL_TYPE_WWAN, TPACPI_RFK_WWAN_SW_NAME, - true, - tpacpi_wan_rfk_set, - tpacpi_wan_rfk_get); + true); + if (res) + return res; + + res = sysfs_create_group(&tpacpi_pdev->dev.kobj, + &wan_attr_group); + if (res) { - wan_exit(); + tpacpi_destroy_rfkill(TPACPI_RFK_WWAN_SW_ID); return res; } @@ -3607,48 +3726,12 @@ static int __init wan_init(struct ibm_init_struct *iibm) /* procfs -------------------------------------------------------------- */ static int wan_read(char *p) { - int len = 0; - int status = wan_get_radiosw(); - - tpacpi_disclose_usertask("procfs wan", "read"); - - if (!tp_features.wan) - len += sprintf(p + len, "status:\t\tnot supported\n"); - else { - len += sprintf(p + len, "status:\t\t%s\n", - (status == RFKILL_STATE_UNBLOCKED) ? 
- "enabled" : "disabled"); - len += sprintf(p + len, "commands:\tenable, disable\n"); - } - - return len; + return tpacpi_rfk_procfs_read(TPACPI_RFK_WWAN_SW_ID, p); } static int wan_write(char *buf) { - char *cmd; - int state = -1; - - if (!tp_features.wan) - return -ENODEV; - - while ((cmd = next_cmd(&buf))) { - if (strlencmp(cmd, "enable") == 0) { - state = 1; - } else if (strlencmp(cmd, "disable") == 0) { - state = 0; - } else - return -EINVAL; - } - - if (state != -1) { - tpacpi_disclose_usertask("procfs wan", - "attempt to %s\n", - state ? "enable" : "disable"); - wan_set_radiosw(state, 1); - } - - return 0; + return tpacpi_rfk_procfs_write(TPACPI_RFK_WWAN_SW_ID, buf); } static struct ibm_struct wan_driver_data = { @@ -3672,108 +3755,59 @@ enum { #define TPACPI_RFK_UWB_SW_NAME "tpacpi_uwb_sw" -static struct rfkill *tpacpi_uwb_rfkill; - -static int uwb_get_radiosw(void) +static int uwb_get_status(void) { int status; - if (!tp_features.uwb) - return -ENODEV; - - /* WLSW overrides UWB in firmware/hardware, reflect that */ - if (tp_features.hotkey_wlsw && !hotkey_get_wlsw(&status) && !status) - return RFKILL_STATE_HARD_BLOCKED; - #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES if (dbg_uwbemul) return (tpacpi_uwb_emulstate) ? - RFKILL_STATE_UNBLOCKED : RFKILL_STATE_SOFT_BLOCKED; + TPACPI_RFK_RADIO_ON : TPACPI_RFK_RADIO_OFF; #endif if (!acpi_evalf(hkey_handle, &status, "GUWB", "d")) return -EIO; return ((status & TP_ACPI_UWB_RADIOSSW) != 0) ? - RFKILL_STATE_UNBLOCKED : RFKILL_STATE_SOFT_BLOCKED; + TPACPI_RFK_RADIO_ON : TPACPI_RFK_RADIO_OFF; } -static void uwb_update_rfk(void) +static int uwb_set_status(enum tpacpi_rfkill_state state) { int status; - if (!tpacpi_uwb_rfkill) - return; - - status = uwb_get_radiosw(); - if (status < 0) - return; - rfkill_force_state(tpacpi_uwb_rfkill, status); - vdbg_printk(TPACPI_DBG_RFKILL, - "forced rfkill state to %d\n", - status); -} - -static int uwb_set_radiosw(int radio_on, int update_rfk) -{ - int status; - - if (!tp_features.uwb) - return -ENODEV; - - /* WLSW overrides UWB in firmware/hardware, but there is no - * reason to risk weird behaviour. */ - if (tp_features.hotkey_wlsw && !hotkey_get_wlsw(&status) && !status - && radio_on) - return -EPERM; - - vdbg_printk(TPACPI_DBG_RFKILL, - "will %s UWB\n", radio_on ? "enable" : "disable"); + "will attempt to %s UWB\n", + (state == TPACPI_RFK_RADIO_ON) ? "enable" : "disable"); #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES if (dbg_uwbemul) { - tpacpi_uwb_emulstate = !!radio_on; - if (update_rfk) - uwb_update_rfk(); + tpacpi_uwb_emulstate = (state == TPACPI_RFK_RADIO_ON); return 0; } #endif - status = (radio_on) ? 
TP_ACPI_UWB_RADIOSSW : 0; + if (state == TPACPI_RFK_RADIO_ON) + status = TP_ACPI_UWB_RADIOSSW; + else + status = 0; + if (!acpi_evalf(hkey_handle, NULL, "SUWB", "vd", status)) return -EIO; - if (update_rfk) - uwb_update_rfk(); - return 0; } /* --------------------------------------------------------------------- */ -static int tpacpi_uwb_rfk_get(void *data, enum rfkill_state *state) -{ - int uwbs = uwb_get_radiosw(); - - if (uwbs < 0) - return uwbs; - - *state = uwbs; - return 0; -} - -static int tpacpi_uwb_rfk_set(void *data, enum rfkill_state state) -{ - dbg_printk(TPACPI_DBG_RFKILL, - "request to change radio state to %d\n", state); - return uwb_set_radiosw((state == RFKILL_STATE_UNBLOCKED), 0); -} +static const struct tpacpi_rfk_ops uwb_tprfk_ops = { + .get_status = uwb_get_status, + .set_status = uwb_set_status, +}; static void uwb_exit(void) { - if (tpacpi_uwb_rfkill) - rfkill_unregister(tpacpi_uwb_rfkill); + tpacpi_destroy_rfkill(TPACPI_RFK_UWB_SW_ID); } static int __init uwb_init(struct ibm_init_struct *iibm) @@ -3813,13 +3847,10 @@ static int __init uwb_init(struct ibm_init_struct *iibm) return 1; res = tpacpi_new_rfkill(TPACPI_RFK_UWB_SW_ID, - &tpacpi_uwb_rfkill, + &uwb_tprfk_ops, RFKILL_TYPE_UWB, TPACPI_RFK_UWB_SW_NAME, - false, - tpacpi_uwb_rfk_set, - tpacpi_uwb_rfk_get); - + false); return res; } diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 4345089f517..81d31ea507d 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -45,7 +45,6 @@ #include #include #include -#include #include @@ -250,21 +249,15 @@ static acpi_status hci_read2(u32 reg, u32 *out1, u32 *out2, u32 *result) struct toshiba_acpi_dev { struct platform_device *p_dev; - struct rfkill *rfk_dev; - struct input_polled_dev *poll_dev; + struct rfkill *bt_rfk; const char *bt_name; - const char *rfk_name; - - bool last_rfk_state; struct mutex mutex; }; static struct toshiba_acpi_dev toshiba_acpi = { .bt_name = "Toshiba Bluetooth", - .rfk_name = "Toshiba RFKill Switch", - .last_rfk_state = false, }; /* Bluetooth rfkill handlers */ @@ -283,21 +276,6 @@ static u32 hci_get_bt_present(bool *present) return hci_result; } -static u32 hci_get_bt_on(bool *on) -{ - u32 hci_result; - u32 value, value2; - - value = 0; - value2 = 0x0001; - hci_read2(HCI_WIRELESS, &value, &value2, &hci_result); - if (hci_result == HCI_SUCCESS) - *on = (value & HCI_WIRELESS_BT_POWER) && - (value & HCI_WIRELESS_BT_ATTACH); - - return hci_result; -} - static u32 hci_get_radio_state(bool *radio_state) { u32 hci_result; @@ -311,70 +289,67 @@ static u32 hci_get_radio_state(bool *radio_state) return hci_result; } -static int bt_rfkill_toggle_radio(void *data, enum rfkill_state state) +static int bt_rfkill_set_block(void *data, bool blocked) { + struct toshiba_acpi_dev *dev = data; u32 result1, result2; u32 value; + int err; bool radio_state; - struct toshiba_acpi_dev *dev = data; - value = (state == RFKILL_STATE_UNBLOCKED); + value = (blocked == false); - if (hci_get_radio_state(&radio_state) != HCI_SUCCESS) - return -EFAULT; + mutex_lock(&dev->mutex); + if (hci_get_radio_state(&radio_state) != HCI_SUCCESS) { + err = -EBUSY; + goto out; + } - switch (state) { - case RFKILL_STATE_UNBLOCKED: - if (!radio_state) - return -EPERM; - break; - case RFKILL_STATE_SOFT_BLOCKED: - break; - default: - return -EINVAL; + if (!radio_state) { + err = 0; + goto out; } - mutex_lock(&dev->mutex); hci_write2(HCI_WIRELESS, value, HCI_WIRELESS_BT_POWER, &result1); hci_write2(HCI_WIRELESS, value, 
HCI_WIRELESS_BT_ATTACH, &result2); - mutex_unlock(&dev->mutex); if (result1 != HCI_SUCCESS || result2 != HCI_SUCCESS) - return -EFAULT; - - return 0; + err = -EBUSY; + else + err = 0; + out: + mutex_unlock(&dev->mutex); + return err; } -static void bt_poll_rfkill(struct input_polled_dev *poll_dev) +static void bt_rfkill_poll(struct rfkill *rfkill, void *data) { - bool state_changed; bool new_rfk_state; bool value; u32 hci_result; - struct toshiba_acpi_dev *dev = poll_dev->private; + struct toshiba_acpi_dev *dev = data; + + mutex_lock(&dev->mutex); hci_result = hci_get_radio_state(&value); - if (hci_result != HCI_SUCCESS) - return; /* Can't do anything useful */ + if (hci_result != HCI_SUCCESS) { + /* Can't do anything useful */ + mutex_unlock(&dev->mutex); + return; + } new_rfk_state = value; - mutex_lock(&dev->mutex); - state_changed = new_rfk_state != dev->last_rfk_state; - dev->last_rfk_state = new_rfk_state; mutex_unlock(&dev->mutex); - if (unlikely(state_changed)) { - rfkill_force_state(dev->rfk_dev, - new_rfk_state ? - RFKILL_STATE_SOFT_BLOCKED : - RFKILL_STATE_HARD_BLOCKED); - input_report_switch(poll_dev->input, SW_RFKILL_ALL, - new_rfk_state); - input_sync(poll_dev->input); - } + if (rfkill_set_hw_state(rfkill, !new_rfk_state)) + bt_rfkill_set_block(data, true); } +static const struct rfkill_ops toshiba_rfk_ops = { + .set_block = bt_rfkill_set_block, + .poll = bt_rfkill_poll, +}; + static struct proc_dir_entry *toshiba_proc_dir /*= 0*/ ; static struct backlight_device *toshiba_backlight_device; static int force_fan; @@ -702,14 +677,11 @@ static struct backlight_ops toshiba_backlight_data = { static void toshiba_acpi_exit(void) { - if (toshiba_acpi.poll_dev) { - input_unregister_polled_device(toshiba_acpi.poll_dev); - input_free_polled_device(toshiba_acpi.poll_dev); + if (toshiba_acpi.bt_rfk) { + rfkill_unregister(toshiba_acpi.bt_rfk); + rfkill_destroy(toshiba_acpi.bt_rfk); } - if (toshiba_acpi.rfk_dev) - rfkill_unregister(toshiba_acpi.rfk_dev); - if (toshiba_backlight_device) backlight_device_unregister(toshiba_backlight_device); @@ -728,8 +700,6 @@ static int __init toshiba_acpi_init(void) acpi_status status = AE_OK; u32 hci_result; bool bt_present; - bool bt_on; - bool radio_on; int ret = 0; if (acpi_disabled) @@ -793,60 +763,21 @@ static int __init toshiba_acpi_init(void) /* Register rfkill switch for Bluetooth */ if (hci_get_bt_present(&bt_present) == HCI_SUCCESS && bt_present) { - toshiba_acpi.rfk_dev = rfkill_allocate(&toshiba_acpi.p_dev->dev, - RFKILL_TYPE_BLUETOOTH); - if (!toshiba_acpi.rfk_dev) { + toshiba_acpi.bt_rfk = rfkill_alloc(toshiba_acpi.bt_name, + &toshiba_acpi.p_dev->dev, + RFKILL_TYPE_BLUETOOTH, + &toshiba_rfk_ops, + &toshiba_acpi); + if (!toshiba_acpi.bt_rfk) { printk(MY_ERR "unable to allocate rfkill device\n"); toshiba_acpi_exit(); return -ENOMEM; } - toshiba_acpi.rfk_dev->name = toshiba_acpi.bt_name; - toshiba_acpi.rfk_dev->toggle_radio = bt_rfkill_toggle_radio; - toshiba_acpi.rfk_dev->data = &toshiba_acpi; - - if (hci_get_bt_on(&bt_on) == HCI_SUCCESS && bt_on) { - toshiba_acpi.rfk_dev->state = RFKILL_STATE_UNBLOCKED; - } else if (hci_get_radio_state(&radio_on) == HCI_SUCCESS && - radio_on) { - toshiba_acpi.rfk_dev->state = RFKILL_STATE_SOFT_BLOCKED; - } else { - toshiba_acpi.rfk_dev->state = RFKILL_STATE_HARD_BLOCKED; - } - - ret = rfkill_register(toshiba_acpi.rfk_dev); + ret = rfkill_register(toshiba_acpi.bt_rfk); if (ret) { printk(MY_ERR "unable to register rfkill device\n"); - toshiba_acpi_exit(); - return -ENOMEM; - } - - /* Register input device for kill switch
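/*
 * [Editor's note, sketch with hypothetical accessors.] Under the new
 * core a driver that gets no change interrupts just supplies a .poll
 * method; the core calls it periodically, which is what lets this
 * patch delete the hand-rolled input_polled_dev below:
 */
static bool example_read_switch(void *data);	/* hypothetical */
static void example_force_off(void *data);	/* hypothetical */

static void example_poll(struct rfkill *rfkill, void *data)
{
	/* true from rfkill_set_hw_state() means the radio must be off */
	if (rfkill_set_hw_state(rfkill, !example_read_switch(data)))
		example_force_off(data);
}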
*/ - toshiba_acpi.poll_dev = input_allocate_polled_device(); - if (!toshiba_acpi.poll_dev) { - printk(MY_ERR - "unable to allocate kill-switch input device\n"); - toshiba_acpi_exit(); - return -ENOMEM; - } - toshiba_acpi.poll_dev->private = &toshiba_acpi; - toshiba_acpi.poll_dev->poll = bt_poll_rfkill; - toshiba_acpi.poll_dev->poll_interval = 1000; /* msecs */ - - toshiba_acpi.poll_dev->input->name = toshiba_acpi.rfk_name; - toshiba_acpi.poll_dev->input->id.bustype = BUS_HOST; - /* Toshiba USB ID */ - toshiba_acpi.poll_dev->input->id.vendor = 0x0930; - set_bit(EV_SW, toshiba_acpi.poll_dev->input->evbit); - set_bit(SW_RFKILL_ALL, toshiba_acpi.poll_dev->input->swbit); - input_report_switch(toshiba_acpi.poll_dev->input, - SW_RFKILL_ALL, TRUE); - input_sync(toshiba_acpi.poll_dev->input); - - ret = input_register_polled_device(toshiba_acpi.poll_dev); - if (ret) { - printk(MY_ERR - "unable to register kill-switch input device\n"); + rfkill_destroy(toshiba_acpi.bt_rfk); toshiba_acpi_exit(); return ret; } diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 3f0eaa397ef..7e09c5c1ed0 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -311,6 +311,7 @@ unifdef-y += ptrace.h unifdef-y += qnx4_fs.h unifdef-y += quota.h unifdef-y += random.h +unifdef-y += rfkill.h unifdef-y += irqnr.h unifdef-y += reboot.h unifdef-y += reiserfs_fs.h diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index de18ef227e0..090852c8de7 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -4,6 +4,7 @@ /* * Copyright (C) 2006 - 2007 Ivo van Doorn * Copyright (C) 2007 Dmitry Torokhov + * Copyright 2009 Johannes Berg * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,6 +22,24 @@ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +/* define userspace visible states */ +#define RFKILL_STATE_SOFT_BLOCKED 0 +#define RFKILL_STATE_UNBLOCKED 1 +#define RFKILL_STATE_HARD_BLOCKED 2 + +/* and that's all userspace gets */ +#ifdef __KERNEL__ +/* don't allow anyone to use these in the kernel */ +enum rfkill_user_states { + RFKILL_USER_STATE_SOFT_BLOCKED = RFKILL_STATE_SOFT_BLOCKED, + RFKILL_USER_STATE_UNBLOCKED = RFKILL_STATE_UNBLOCKED, + RFKILL_USER_STATE_HARD_BLOCKED = RFKILL_STATE_HARD_BLOCKED, +}; +#undef RFKILL_STATE_SOFT_BLOCKED +#undef RFKILL_STATE_UNBLOCKED +#undef RFKILL_STATE_HARD_BLOCKED + #include #include #include @@ -30,109 +49,267 @@ /** * enum rfkill_type - type of rfkill switch. - * RFKILL_TYPE_WLAN: switch is on a 802.11 wireless network device. - * RFKILL_TYPE_BLUETOOTH: switch is on a bluetooth device. - * RFKILL_TYPE_UWB: switch is on a ultra wideband device. - * RFKILL_TYPE_WIMAX: switch is on a WiMAX device. - * RFKILL_TYPE_WWAN: switch is on a wireless WAN device. + * + * @RFKILL_TYPE_WLAN: switch is on a 802.11 wireless network device. + * @RFKILL_TYPE_BLUETOOTH: switch is on a bluetooth device. + * @RFKILL_TYPE_UWB: switch is on a ultra wideband device. + * @RFKILL_TYPE_WIMAX: switch is on a WiMAX device. + * @RFKILL_TYPE_WWAN: switch is on a wireless WAN device. 
+ * @NUM_RFKILL_TYPES: number of defined rfkill types */ enum rfkill_type { - RFKILL_TYPE_WLAN , + RFKILL_TYPE_WLAN, RFKILL_TYPE_BLUETOOTH, RFKILL_TYPE_UWB, RFKILL_TYPE_WIMAX, RFKILL_TYPE_WWAN, - RFKILL_TYPE_MAX, + NUM_RFKILL_TYPES, }; -enum rfkill_state { - RFKILL_STATE_SOFT_BLOCKED = 0, /* Radio output blocked */ - RFKILL_STATE_UNBLOCKED = 1, /* Radio output allowed */ - RFKILL_STATE_HARD_BLOCKED = 2, /* Output blocked, non-overrideable */ - RFKILL_STATE_MAX, /* marker for last valid state */ +/* this is opaque */ +struct rfkill; + +/** + * struct rfkill_ops - rfkill driver methods + * + * @poll: poll the rfkill block state(s) -- only assign this method + * when you need polling. When called, simply call one of the + * rfkill_set{,_hw,_sw}_state family of functions. If the hw + * is getting unblocked you need to take into account the return + * value of those functions to make sure the software block is + * properly used. + * @query: query the rfkill block state(s) and call exactly one of the + * rfkill_set{,_hw,_sw}_state family of functions. Assign this + * method if input events can cause hardware state changes to make + * the rfkill core query your driver before setting a requested + * block. + * @set_block: turn the transmitter on (blocked == false) or off + * (blocked == true) -- this is called only while the transmitter + * is not hard-blocked, but note that the core's view of whether + * the transmitter is hard-blocked might differ from your driver's + * view due to race conditions, so it is possible that it is still + * called at the same time as you are calling rfkill_set_hw_state(). + * This callback must be assigned. + */ +struct rfkill_ops { + void (*poll)(struct rfkill *rfkill, void *data); + void (*query)(struct rfkill *rfkill, void *data); + int (*set_block)(void *data, bool blocked); }; +#if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE) /** - * struct rfkill - rfkill control structure. - * @name: Name of the switch. - * @type: Radio type which the button controls, the value stored - * here should be a value from enum rfkill_type. - * @state: State of the switch, "UNBLOCKED" means radio can operate. - * @mutex: Guards switch state transitions. It serializes callbacks - * and also protects the state. - * @data: Pointer to the RF button drivers private data which will be - * passed along when toggling radio state. - * @toggle_radio(): Mandatory handler to control state of the radio. - * only RFKILL_STATE_SOFT_BLOCKED and RFKILL_STATE_UNBLOCKED are - * valid parameters. - * @get_state(): handler to read current radio state from hardware, - * may be called from atomic context, should return 0 on success. - * Either this handler OR judicious use of rfkill_force_state() is - * MANDATORY for any driver capable of RFKILL_STATE_HARD_BLOCKED. - * @led_trigger: A LED trigger for this button's LED. - * @dev: Device structure integrating the switch into device tree. - * @node: Used to place switch into list of all switches known to the - * the system. - * - * This structure represents a RF switch located on a network device. + * rfkill_alloc - allocate rfkill structure + * @name: name of the struct -- the string is not copied internally + * @parent: device that has rf switch on it + * @type: type of the switch (RFKILL_TYPE_*) + * @ops: rfkill methods + * @ops_data: data passed to each method + * + * This function should be called by the transmitter driver to allocate an + * rfkill structure. Returns %NULL on failure. 
*/ -struct rfkill { - const char *name; - enum rfkill_type type; - - /* the mutex serializes callbacks and also protects - * the state */ - struct mutex mutex; - enum rfkill_state state; - void *data; - int (*toggle_radio)(void *data, enum rfkill_state state); - int (*get_state)(void *data, enum rfkill_state *state); +struct rfkill * __must_check rfkill_alloc(const char *name, + struct device *parent, + const enum rfkill_type type, + const struct rfkill_ops *ops, + void *ops_data); -#ifdef CONFIG_RFKILL_LEDS - struct led_trigger led_trigger; -#endif +/** + * rfkill_register - Register a rfkill structure. + * @rfkill: rfkill structure to be registered + * + * This function should be called by the transmitter driver to register + * the rfkill structure. Before calling this function + * the driver needs to be ready to service method calls from rfkill. + */ +int __must_check rfkill_register(struct rfkill *rfkill); - struct device dev; - struct list_head node; - enum rfkill_state state_for_resume; -}; -#define to_rfkill(d) container_of(d, struct rfkill, dev) +/** + * rfkill_pause_polling(struct rfkill *rfkill) + * + * Pause polling -- say transmitter is off for other reasons. + * NOTE: not necessary for suspend/resume -- in that case the + * core stops polling anyway + */ +void rfkill_pause_polling(struct rfkill *rfkill); -struct rfkill * __must_check rfkill_allocate(struct device *parent, - enum rfkill_type type); -void rfkill_free(struct rfkill *rfkill); -int __must_check rfkill_register(struct rfkill *rfkill); +/** + * rfkill_resume_polling(struct rfkill *rfkill) + * + * Resume polling after a previous rfkill_pause_polling(). + * NOTE: not necessary for suspend/resume -- in that case the + * core stops polling anyway + */ +void rfkill_resume_polling(struct rfkill *rfkill); + + +/** + * rfkill_unregister - Unregister a rfkill structure. + * @rfkill: rfkill structure to be unregistered + * + * This function should be called by the network driver during device + * teardown to destroy the rfkill structure. Until it returns, the driver + * needs to be able to service method calls. + */ void rfkill_unregister(struct rfkill *rfkill); -int rfkill_force_state(struct rfkill *rfkill, enum rfkill_state state); -int rfkill_set_default(enum rfkill_type type, enum rfkill_state state); +/** + * rfkill_destroy - free rfkill structure + * @rfkill: rfkill structure to be destroyed + * + * Destroys the rfkill structure. + */ +void rfkill_destroy(struct rfkill *rfkill); + +/** + * rfkill_set_hw_state - Set the internal rfkill hardware block state + * @rfkill: pointer to the rfkill class to modify. + * @blocked: the current hardware block state to set + * + * rfkill drivers that get events when the hard-blocked state changes + * use this function to notify the rfkill core (and through that also + * userspace) of the current state -- they should also use this after + * resume if the state could have changed. + * + * You need not (but may) call this function if the poll method is assigned. + * + * This function can be called in any context, even from within rfkill + * callbacks. + * + * The function returns the combined block state (true if transmitter + * should be blocked) so that drivers need not keep track of the soft + * block state -- which they might not be able to. + */ +bool __must_check rfkill_set_hw_state(struct rfkill *rfkill, bool blocked); + +/** + * rfkill_set_sw_state - Set the internal rfkill software block state + * @rfkill: pointer to the rfkill class to modify.
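/*
 * [Editor's usage sketch for the API declared above; all names are
 * hypothetical.] Allocate, register, and tear down in pairs:
 */
#include <linux/rfkill.h>

static int demo_set_block(void *data, bool blocked)
{
	/* program the transmitter; return 0 on success */
	return 0;
}

static const struct rfkill_ops demo_ops = {
	.set_block = demo_set_block,
};

static struct rfkill *demo_rfk;

static int demo_probe(struct device *dev)
{
	int err;

	demo_rfk = rfkill_alloc("demo-wlan", dev, RFKILL_TYPE_WLAN,
				&demo_ops, NULL);
	if (!demo_rfk)
		return -ENOMEM;

	err = rfkill_register(demo_rfk);
	if (err) {
		rfkill_destroy(demo_rfk);
		return err;
	}
	return 0;
}

static void demo_remove(void)
{
	rfkill_unregister(demo_rfk);
	rfkill_destroy(demo_rfk);
}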
+ * @blocked: the current software block state to set + * + * rfkill drivers that get events when the soft-blocked state changes + * (yes, some platforms directly act on input but allow changing again) + * use this function to notify the rfkill core (and through that also + * userspace) of the current state -- they should also use this after + * resume if the state could have changed. + * + * This function can be called in any context, even from within rfkill + * callbacks. + * + * The function returns the combined block state (true if transmitter + * should be blocked). + */ +bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked); + +/** + * rfkill_set_states - Set the internal rfkill block states + * @rfkill: pointer to the rfkill class to modify. + * @sw: the current software block state to set + * @hw: the current hardware block state to set + * + * This function can be called in any context, even from within rfkill + * callbacks. + */ +void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw); /** - * rfkill_state_complement - return complementar state - * @state: state to return the complement of + * rfkill_set_global_sw_state - set global sw block default + * @type: rfkill type to set default for + * @blocked: default to set * - * Returns RFKILL_STATE_SOFT_BLOCKED if @state is RFKILL_STATE_UNBLOCKED, - * returns RFKILL_STATE_UNBLOCKED otherwise. + * This function sets the global default -- use at boot if your platform has + * an rfkill switch. If not early enough this call may be ignored. + * + * XXX: instead of ignoring -- how about just updating all currently + * registered drivers? */ -static inline enum rfkill_state rfkill_state_complement(enum rfkill_state state) +void rfkill_set_global_sw_state(const enum rfkill_type type, bool blocked); +#else /* !RFKILL */ +static inline struct rfkill * __must_check +rfkill_alloc(const char *name, + struct device *parent, + const enum rfkill_type type, + const struct rfkill_ops *ops, + void *ops_data) +{ + return ERR_PTR(-ENODEV); +} + +static inline int __must_check rfkill_register(struct rfkill *rfkill) +{ + if (rfkill == ERR_PTR(-ENODEV)) + return 0; + return -EINVAL; +} + +static inline void rfkill_pause_polling(struct rfkill *rfkill) +{ +} + +static inline void rfkill_resume_polling(struct rfkill *rfkill) +{ +} + +static inline void rfkill_unregister(struct rfkill *rfkill) +{ +} + +static inline void rfkill_destroy(struct rfkill *rfkill) +{ +} + +static inline bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) +{ + return blocked; +} + +static inline bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) +{ + return blocked; +} + +static inline void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) +{ +} + +static inline void rfkill_set_global_sw_state(const enum rfkill_type type, + bool blocked) { - return (state == RFKILL_STATE_UNBLOCKED) ? - RFKILL_STATE_SOFT_BLOCKED : RFKILL_STATE_UNBLOCKED; } +#endif /* RFKILL || RFKILL_MODULE */ + +#ifdef CONFIG_RFKILL_LEDS /** - * rfkill_get_led_name - Get the LED trigger name for the button's LED. + * rfkill_get_led_trigger_name - Get the LED trigger name for the button's LED. * This function might return a NULL pointer if registering of the - * LED trigger failed. - * Use this as "default_trigger" for the LED. + * LED trigger failed. Use this as "default_trigger" for the LED.
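/*
 * [Editor's note on the CONFIG_RFKILL=n stubs above.] The dummy
 * rfkill_alloc() returns ERR_PTR(-ENODEV), and the dummy
 * rfkill_register() accepts exactly that pointer, so a caller written
 * against the real API compiles and runs unchanged without the
 * subsystem:
 *
 *	rfk = rfkill_alloc("demo", dev, RFKILL_TYPE_WLAN, &demo_ops, NULL);
 *	err = rfkill_register(rfk);	(always 0 when rfkill is compiled out)
 */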
*/ -static inline char *rfkill_get_led_name(struct rfkill *rfkill) -{ -#ifdef CONFIG_RFKILL_LEDS - return (char *)(rfkill->led_trigger.name); +const char *rfkill_get_led_trigger_name(struct rfkill *rfkill); + +/** + * rfkill_set_led_trigger_name -- set the LED trigger name + * @rfkill: rfkill struct + * @name: LED trigger name + * + * This function sets the LED trigger name of the radio LED + * trigger that rfkill creates. It is optional, but if called + * must be called before rfkill_register() to be effective. + */ +void rfkill_set_led_trigger_name(struct rfkill *rfkill, const char *name); #else +static inline const char *rfkill_get_led_trigger_name(struct rfkill *rfkill) +{ return NULL; -#endif } +static inline void +rfkill_set_led_trigger_name(struct rfkill *rfkill, const char *name) +{ +} +#endif + +#endif /* __KERNEL__ */ + #endif /* RFKILL_H */ diff --git a/include/net/wimax.h b/include/net/wimax.h index 6b3824edb39..2af7bf839f2 100644 --- a/include/net/wimax.h +++ b/include/net/wimax.h @@ -253,7 +253,6 @@ struct net_device; struct genl_info; struct wimax_dev; -struct input_dev; /** * struct wimax_dev - Generic WiMAX device @@ -293,8 +292,8 @@ struct input_dev; * See wimax_reset()'s documentation. * * @name: [fill] A way to identify this device. We need to register a - * name with many subsystems (input for RFKILL, workqueue - * creation, etc). We can't use the network device name as that + * name with many subsystems (rfkill, workqueue creation, etc). + * We can't use the network device name as that * might change and in some instances we don't know it yet (until * we don't call register_netdev()). So we generate an unique one * using the driver name and device bus id, place it here and use @@ -316,9 +315,6 @@ struct input_dev; * * @rfkill: [private] integration into the RF-Kill infrastructure. * - * @rfkill_input: [private] virtual input device to process the - * hardware RF Kill switches. - * * @rf_sw: [private] State of the software radio switch (OFF/ON) * * @rf_hw: [private] State of the hardware radio switch (OFF/ON) diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig index 7f807b30cfb..b47f72fae05 100644 --- a/net/rfkill/Kconfig +++ b/net/rfkill/Kconfig @@ -10,22 +10,15 @@ menuconfig RFKILL To compile this driver as a module, choose M here: the module will be called rfkill. -config RFKILL_INPUT - tristate "Input layer to RF switch connector" - depends on RFKILL && INPUT - help - Say Y here if you want kernel automatically toggle state - of RF switches on and off when user presses appropriate - button or a key on the keyboard. Without this module you - need a some kind of userspace application to control - state of the switches. - - To compile this driver as a module, choose M here: the - module will be called rfkill-input. - # LED trigger support config RFKILL_LEDS bool - depends on RFKILL && LEDS_TRIGGERS + depends on RFKILL + depends on LEDS_TRIGGERS = y || RFKILL = LEDS_TRIGGERS default y +config RFKILL_INPUT + bool + depends on RFKILL + depends on INPUT = y || RFKILL = INPUT + default y diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile index b38c430be05..66210535269 100644 --- a/net/rfkill/Makefile +++ b/net/rfkill/Makefile @@ -2,5 +2,6 @@ # Makefile for the RF switch subsystem. 
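# (Editor's note: the module name stays "rfkill" -- rfkill.ko is now
# linked from core.o, plus input.o when CONFIG_RFKILL_INPUT is set,
# per the rfkill-y/rfkill-$(...) lines below.)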
# -obj-$(CONFIG_RFKILL) += rfkill.o -obj-$(CONFIG_RFKILL_INPUT) += rfkill-input.o +rfkill-y += core.o +rfkill-$(CONFIG_RFKILL_INPUT) += input.o +obj-$(CONFIG_RFKILL) += rfkill.o diff --git a/net/rfkill/core.c b/net/rfkill/core.c new file mode 100644 index 00000000000..30a6f8d819b --- /dev/null +++ b/net/rfkill/core.c @@ -0,0 +1,896 @@ +/* + * Copyright (C) 2006 - 2007 Ivo van Doorn + * Copyright (C) 2007 Dmitry Torokhov + * Copyright 2009 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rfkill.h" + +#define POLL_INTERVAL (5 * HZ) + +#define RFKILL_BLOCK_HW BIT(0) +#define RFKILL_BLOCK_SW BIT(1) +#define RFKILL_BLOCK_SW_PREV BIT(2) +#define RFKILL_BLOCK_ANY (RFKILL_BLOCK_HW |\ + RFKILL_BLOCK_SW |\ + RFKILL_BLOCK_SW_PREV) +#define RFKILL_BLOCK_SW_SETCALL BIT(31) + +struct rfkill { + spinlock_t lock; + + const char *name; + enum rfkill_type type; + + unsigned long state; + + bool registered; + bool suspended; + + const struct rfkill_ops *ops; + void *data; + +#ifdef CONFIG_RFKILL_LEDS + struct led_trigger led_trigger; + const char *ledtrigname; +#endif + + struct device dev; + struct list_head node; + + struct delayed_work poll_work; + struct work_struct uevent_work; + struct work_struct sync_work; +}; +#define to_rfkill(d) container_of(d, struct rfkill, dev) + + + +MODULE_AUTHOR("Ivo van Doorn "); +MODULE_AUTHOR("Johannes Berg "); +MODULE_DESCRIPTION("RF switch support"); +MODULE_LICENSE("GPL"); + + +/* + * The locking here should be made much smarter, we currently have + * a bit of a stupid situation because drivers might want to register + * the rfkill struct under their own lock, and take this lock during + * rfkill method calls -- which will cause an AB-BA deadlock situation. + * + * To fix that, we need to rework this code here to be mostly lock-free + * and only use the mutex for list manipulations, not to protect the + * various other global variables. Then we can avoid holding the mutex + * around driver operations, and all is happy. 
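+ *
+ * The AB-BA pattern described above, spelled out (editor's sketch;
+ * "drv->lock" stands for a hypothetical driver lock):
+ *
+ *	driver context			rfkill core context
+ *	mutex_lock(&drv->lock);		mutex_lock(&rfkill_global_mutex);
+ *	rfkill_register(...)		ops->set_block(...)
+ *	  wants rfkill_global_mutex	  wants drv->lock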
+ */ +static LIST_HEAD(rfkill_list); /* list of registered rf switches */ +static DEFINE_MUTEX(rfkill_global_mutex); + +static unsigned int rfkill_default_state = 1; +module_param_named(default_state, rfkill_default_state, uint, 0444); +MODULE_PARM_DESC(default_state, + "Default initial state for all radio types, 0 = radio off"); + +static struct { + bool cur, def; +} rfkill_global_states[NUM_RFKILL_TYPES]; + +static unsigned long rfkill_states_default_locked; + +static bool rfkill_epo_lock_active; + + +#ifdef CONFIG_RFKILL_LEDS +static void rfkill_led_trigger_event(struct rfkill *rfkill) +{ + struct led_trigger *trigger; + + if (!rfkill->registered) + return; + + trigger = &rfkill->led_trigger; + + if (rfkill->state & RFKILL_BLOCK_ANY) + led_trigger_event(trigger, LED_OFF); + else + led_trigger_event(trigger, LED_FULL); +} + +static void rfkill_led_trigger_activate(struct led_classdev *led) +{ + struct rfkill *rfkill; + + rfkill = container_of(led->trigger, struct rfkill, led_trigger); + + rfkill_led_trigger_event(rfkill); +} + +const char *rfkill_get_led_trigger_name(struct rfkill *rfkill) +{ + return rfkill->led_trigger.name; +} +EXPORT_SYMBOL(rfkill_get_led_trigger_name); + +void rfkill_set_led_trigger_name(struct rfkill *rfkill, const char *name) +{ + BUG_ON(!rfkill); + + rfkill->ledtrigname = name; +} +EXPORT_SYMBOL(rfkill_set_led_trigger_name); + +static int rfkill_led_trigger_register(struct rfkill *rfkill) +{ + rfkill->led_trigger.name = rfkill->ledtrigname + ? : dev_name(&rfkill->dev); + rfkill->led_trigger.activate = rfkill_led_trigger_activate; + return led_trigger_register(&rfkill->led_trigger); +} + +static void rfkill_led_trigger_unregister(struct rfkill *rfkill) +{ + led_trigger_unregister(&rfkill->led_trigger); +} +#else +static void rfkill_led_trigger_event(struct rfkill *rfkill) +{ +} + +static inline int rfkill_led_trigger_register(struct rfkill *rfkill) +{ + return 0; +} + +static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) +{ +} +#endif /* CONFIG_RFKILL_LEDS */ + +static void rfkill_uevent(struct rfkill *rfkill) +{ + if (!rfkill->registered || rfkill->suspended) + return; + + kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE); +} + +static bool __rfkill_set_hw_state(struct rfkill *rfkill, + bool blocked, bool *change) +{ + unsigned long flags; + bool prev, any; + + BUG_ON(!rfkill); + + spin_lock_irqsave(&rfkill->lock, flags); + prev = !!(rfkill->state & RFKILL_BLOCK_HW); + if (blocked) + rfkill->state |= RFKILL_BLOCK_HW; + else + rfkill->state &= ~RFKILL_BLOCK_HW; + *change = prev != blocked; + any = rfkill->state & RFKILL_BLOCK_ANY; + spin_unlock_irqrestore(&rfkill->lock, flags); + + rfkill_led_trigger_event(rfkill); + + return any; +} + +/** + * rfkill_set_block - wrapper for set_block method + * + * @rfkill: the rfkill struct to use + * @blocked: the new software state + * + * Calls the set_block method (when applicable) and handles notifications + * etc. as well. + */ +static void rfkill_set_block(struct rfkill *rfkill, bool blocked) +{ + unsigned long flags; + int err; + + /* + * Some platforms (...!) generate input events which affect the + * _hard_ kill state -- whenever something tries to change the + * current software state query the hardware state too. 
+	 */
+	if (rfkill->ops->query)
+		rfkill->ops->query(rfkill, rfkill->data);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	if (rfkill->state & RFKILL_BLOCK_SW)
+		rfkill->state |= RFKILL_BLOCK_SW_PREV;
+	else
+		rfkill->state &= ~RFKILL_BLOCK_SW_PREV;
+
+	if (blocked)
+		rfkill->state |= RFKILL_BLOCK_SW;
+	else
+		rfkill->state &= ~RFKILL_BLOCK_SW;
+
+	rfkill->state |= RFKILL_BLOCK_SW_SETCALL;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
+	if (unlikely(rfkill->dev.power.power_state.event & PM_EVENT_SLEEP))
+		return;
+
+	err = rfkill->ops->set_block(rfkill->data, blocked);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	if (err) {
+		/*
+		 * Failed -- reset status to _prev, this may be different
+		 * from what we set _PREV to earlier in this function
+		 * if rfkill_set_sw_state was invoked.
+		 */
+		if (rfkill->state & RFKILL_BLOCK_SW_PREV)
+			rfkill->state |= RFKILL_BLOCK_SW;
+		else
+			rfkill->state &= ~RFKILL_BLOCK_SW;
+	}
+	rfkill->state &= ~RFKILL_BLOCK_SW_SETCALL;
+	rfkill->state &= ~RFKILL_BLOCK_SW_PREV;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
+	rfkill_led_trigger_event(rfkill);
+	rfkill_uevent(rfkill);
+}
+
+/**
+ * __rfkill_switch_all - Toggle state of all switches of given type
+ * @type: type of interfaces to be affected
+ * @blocked: the new state
+ *
+ * This function sets the state of all switches of given type,
+ * unless a specific switch is claimed by userspace (in which case,
+ * that switch is left alone) or suspended.
+ *
+ * Caller must have acquired rfkill_global_mutex.
+ */
+static void __rfkill_switch_all(const enum rfkill_type type, bool blocked)
+{
+	struct rfkill *rfkill;
+
+	rfkill_global_states[type].cur = blocked;
+	list_for_each_entry(rfkill, &rfkill_list, node) {
+		if (rfkill->type != type)
+			continue;
+
+		rfkill_set_block(rfkill, blocked);
+	}
+}
+
+/**
+ * rfkill_switch_all - Toggle state of all switches of given type
+ * @type: type of interfaces to be affected
+ * @blocked: the new state
+ *
+ * Acquires rfkill_global_mutex and calls __rfkill_switch_all(@type, @blocked).
+ * Please refer to __rfkill_switch_all() for details.
+ *
+ * Does nothing if the EPO lock is active.
+ */
+void rfkill_switch_all(enum rfkill_type type, bool blocked)
+{
+	mutex_lock(&rfkill_global_mutex);
+
+	if (!rfkill_epo_lock_active)
+		__rfkill_switch_all(type, blocked);
+
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_epo - emergency power off all transmitters
+ *
+ * This kicks all non-suspended rfkill devices into the soft-blocked
+ * state, ignoring everything in its path but rfkill_global_mutex.
+ *
+ * The global state before the EPO is saved and can be restored later
+ * using rfkill_restore_states().
+ */
+void rfkill_epo(void)
+{
+	struct rfkill *rfkill;
+	int i;
+
+	mutex_lock(&rfkill_global_mutex);
+
+	rfkill_epo_lock_active = true;
+	list_for_each_entry(rfkill, &rfkill_list, node)
+		rfkill_set_block(rfkill, true);
+
+	for (i = 0; i < NUM_RFKILL_TYPES; i++) {
+		rfkill_global_states[i].def = rfkill_global_states[i].cur;
+		rfkill_global_states[i].cur = true;
+	}
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_restore_states - restore global states
+ *
+ * Restore (and sync switches to) the global states saved in
+ * rfkill_global_states. This can undo the effects of
+ * a call to rfkill_epo().
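+ *
+ * The intended pairing, as driven by rfkill-input (editor's sketch):
+ *
+ *	rfkill_epo();			- block all, save previous state
+ *	...				- EPO condition ends
+ *	rfkill_restore_states();	- undo the EPO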
+ */
+void rfkill_restore_states(void)
+{
+	int i;
+
+	mutex_lock(&rfkill_global_mutex);
+
+	rfkill_epo_lock_active = false;
+	for (i = 0; i < NUM_RFKILL_TYPES; i++)
+		__rfkill_switch_all(i, rfkill_global_states[i].def);
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_remove_epo_lock - unlock state changes
+ *
+ * Used by rfkill-input to manually unlock state changes, when
+ * the EPO switch is deactivated.
+ */
+void rfkill_remove_epo_lock(void)
+{
+	mutex_lock(&rfkill_global_mutex);
+	rfkill_epo_lock_active = false;
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_is_epo_lock_active - returns true if EPO is active
+ *
+ * Returns 0 (false) if there is NOT an active EPO condition,
+ * and 1 (true) if there is an active EPO condition, which
+ * locks all radios in one of the BLOCKED states.
+ *
+ * Can be called in atomic context.
+ */
+bool rfkill_is_epo_lock_active(void)
+{
+	return rfkill_epo_lock_active;
+}
+
+/**
+ * rfkill_get_global_sw_state - returns global state for a type
+ * @type: the type to get the global state of
+ *
+ * Returns the current global state for a given wireless
+ * device type.
+ */
+bool rfkill_get_global_sw_state(const enum rfkill_type type)
+{
+	return rfkill_global_states[type].cur;
+}
+
+void rfkill_set_global_sw_state(const enum rfkill_type type, bool blocked)
+{
+	mutex_lock(&rfkill_global_mutex);
+
+	/* don't allow unblock when epo */
+	if (rfkill_epo_lock_active && !blocked)
+		goto out;
+
+	/* too late */
+	if (rfkill_states_default_locked & BIT(type))
+		goto out;
+
+	rfkill_states_default_locked |= BIT(type);
+
+	rfkill_global_states[type].cur = blocked;
+	rfkill_global_states[type].def = blocked;
+ out:
+	mutex_unlock(&rfkill_global_mutex);
+}
+EXPORT_SYMBOL(rfkill_set_global_sw_state);
+
+
+bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
+{
+	bool ret, change;
+
+	ret = __rfkill_set_hw_state(rfkill, blocked, &change);
+
+	if (!rfkill->registered)
+		return ret;
+
+	if (change)
+		schedule_work(&rfkill->uevent_work);
+
+	return ret;
+}
+EXPORT_SYMBOL(rfkill_set_hw_state);
+
+static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
+{
+	u32 bit = RFKILL_BLOCK_SW;
+
+	/* if in an ops->set_block right now, use the other bit */
+	if (rfkill->state & RFKILL_BLOCK_SW_SETCALL)
+		bit = RFKILL_BLOCK_SW_PREV;
+
+	if (blocked)
+		rfkill->state |= bit;
+	else
+		rfkill->state &= ~bit;
+}
+
+bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
+{
+	unsigned long flags;
+	bool prev, hwblock;
+
+	BUG_ON(!rfkill);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	prev = !!(rfkill->state & RFKILL_BLOCK_SW);
+	__rfkill_set_sw_state(rfkill, blocked);
+	hwblock = !!(rfkill->state & RFKILL_BLOCK_HW);
+	blocked = blocked || hwblock;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
+	if (!rfkill->registered)
+		return blocked;
+
+	if (prev != blocked && !hwblock)
+		schedule_work(&rfkill->uevent_work);
+
+	rfkill_led_trigger_event(rfkill);
+
+	return blocked;
+}
+EXPORT_SYMBOL(rfkill_set_sw_state);
+
+void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
+{
+	unsigned long flags;
+	bool swprev, hwprev;
+
+	BUG_ON(!rfkill);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+
+	/*
+	 * No need to care about prev/setblock ... this is for uevent only
+	 * and that will get triggered by rfkill_set_block anyway.
+ */ + swprev = !!(rfkill->state & RFKILL_BLOCK_SW); + hwprev = !!(rfkill->state & RFKILL_BLOCK_HW); + __rfkill_set_sw_state(rfkill, sw); + + spin_unlock_irqrestore(&rfkill->lock, flags); + + if (!rfkill->registered) + return; + + if (swprev != sw || hwprev != hw) + schedule_work(&rfkill->uevent_work); + + rfkill_led_trigger_event(rfkill); +} +EXPORT_SYMBOL(rfkill_set_states); + +static ssize_t rfkill_name_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%s\n", rfkill->name); +} + +static const char *rfkill_get_type_str(enum rfkill_type type) +{ + switch (type) { + case RFKILL_TYPE_WLAN: + return "wlan"; + case RFKILL_TYPE_BLUETOOTH: + return "bluetooth"; + case RFKILL_TYPE_UWB: + return "ultrawideband"; + case RFKILL_TYPE_WIMAX: + return "wimax"; + case RFKILL_TYPE_WWAN: + return "wwan"; + default: + BUG(); + } + + BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_WWAN + 1); +} + +static ssize_t rfkill_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type)); +} + +static u8 user_state_from_blocked(unsigned long state) +{ + if (state & RFKILL_BLOCK_HW) + return RFKILL_USER_STATE_HARD_BLOCKED; + if (state & RFKILL_BLOCK_SW) + return RFKILL_USER_STATE_SOFT_BLOCKED; + + return RFKILL_USER_STATE_UNBLOCKED; +} + +static ssize_t rfkill_state_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + unsigned long flags; + u32 state; + + spin_lock_irqsave(&rfkill->lock, flags); + state = rfkill->state; + spin_unlock_irqrestore(&rfkill->lock, flags); + + return sprintf(buf, "%d\n", user_state_from_blocked(state)); +} + +static ssize_t rfkill_state_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + /* + * The intention was that userspace can only take control over + * a given device when/if rfkill-input doesn't control it due + * to user_claim. Since user_claim is currently unsupported, + * we never support changing the state from userspace -- this + * can be implemented again later. 
+ */ + + return -EPERM; +} + +static ssize_t rfkill_claim_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", 0); +} + +static ssize_t rfkill_claim_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return -EOPNOTSUPP; +} + +static struct device_attribute rfkill_dev_attrs[] = { + __ATTR(name, S_IRUGO, rfkill_name_show, NULL), + __ATTR(type, S_IRUGO, rfkill_type_show, NULL), + __ATTR(state, S_IRUGO|S_IWUSR, rfkill_state_show, rfkill_state_store), + __ATTR(claim, S_IRUGO|S_IWUSR, rfkill_claim_show, rfkill_claim_store), + __ATTR_NULL +}; + +static void rfkill_release(struct device *dev) +{ + struct rfkill *rfkill = to_rfkill(dev); + + kfree(rfkill); +} + +static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct rfkill *rfkill = to_rfkill(dev); + unsigned long flags; + u32 state; + int error; + + error = add_uevent_var(env, "RFKILL_NAME=%s", rfkill->name); + if (error) + return error; + error = add_uevent_var(env, "RFKILL_TYPE=%s", + rfkill_get_type_str(rfkill->type)); + if (error) + return error; + spin_lock_irqsave(&rfkill->lock, flags); + state = rfkill->state; + spin_unlock_irqrestore(&rfkill->lock, flags); + error = add_uevent_var(env, "RFKILL_STATE=%d", + user_state_from_blocked(state)); + return error; +} + +void rfkill_pause_polling(struct rfkill *rfkill) +{ + BUG_ON(!rfkill); + + if (!rfkill->ops->poll) + return; + + cancel_delayed_work_sync(&rfkill->poll_work); +} +EXPORT_SYMBOL(rfkill_pause_polling); + +void rfkill_resume_polling(struct rfkill *rfkill) +{ + BUG_ON(!rfkill); + + if (!rfkill->ops->poll) + return; + + schedule_work(&rfkill->poll_work.work); +} +EXPORT_SYMBOL(rfkill_resume_polling); + +static int rfkill_suspend(struct device *dev, pm_message_t state) +{ + struct rfkill *rfkill = to_rfkill(dev); + + rfkill_pause_polling(rfkill); + + rfkill->suspended = true; + + return 0; +} + +static int rfkill_resume(struct device *dev) +{ + struct rfkill *rfkill = to_rfkill(dev); + bool cur; + + mutex_lock(&rfkill_global_mutex); + cur = rfkill_global_states[rfkill->type].cur; + rfkill_set_block(rfkill, cur); + mutex_unlock(&rfkill_global_mutex); + + rfkill->suspended = false; + + schedule_work(&rfkill->uevent_work); + + rfkill_resume_polling(rfkill); + + return 0; +} + +static struct class rfkill_class = { + .name = "rfkill", + .dev_release = rfkill_release, + .dev_attrs = rfkill_dev_attrs, + .dev_uevent = rfkill_dev_uevent, + .suspend = rfkill_suspend, + .resume = rfkill_resume, +}; + + +struct rfkill * __must_check rfkill_alloc(const char *name, + struct device *parent, + const enum rfkill_type type, + const struct rfkill_ops *ops, + void *ops_data) +{ + struct rfkill *rfkill; + struct device *dev; + + if (WARN_ON(!ops)) + return NULL; + + if (WARN_ON(!ops->set_block)) + return NULL; + + if (WARN_ON(!name)) + return NULL; + + if (WARN_ON(type >= NUM_RFKILL_TYPES)) + return NULL; + + rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL); + if (!rfkill) + return NULL; + + spin_lock_init(&rfkill->lock); + INIT_LIST_HEAD(&rfkill->node); + rfkill->type = type; + rfkill->name = name; + rfkill->ops = ops; + rfkill->data = ops_data; + + dev = &rfkill->dev; + dev->class = &rfkill_class; + dev->parent = parent; + device_initialize(dev); + + return rfkill; +} +EXPORT_SYMBOL(rfkill_alloc); + +static void rfkill_poll(struct work_struct *work) +{ + struct rfkill *rfkill; + + rfkill = container_of(work, struct rfkill, poll_work.work); + + /* + * Poll hardware state -- driver 
will use one of the + * rfkill_set{,_hw,_sw}_state functions and use its + * return value to update the current status. + */ + rfkill->ops->poll(rfkill, rfkill->data); + + schedule_delayed_work(&rfkill->poll_work, + round_jiffies_relative(POLL_INTERVAL)); +} + +static void rfkill_uevent_work(struct work_struct *work) +{ + struct rfkill *rfkill; + + rfkill = container_of(work, struct rfkill, uevent_work); + + rfkill_uevent(rfkill); +} + +static void rfkill_sync_work(struct work_struct *work) +{ + struct rfkill *rfkill; + bool cur; + + rfkill = container_of(work, struct rfkill, sync_work); + + mutex_lock(&rfkill_global_mutex); + cur = rfkill_global_states[rfkill->type].cur; + rfkill_set_block(rfkill, cur); + mutex_unlock(&rfkill_global_mutex); +} + +int __must_check rfkill_register(struct rfkill *rfkill) +{ + static unsigned long rfkill_no; + struct device *dev = &rfkill->dev; + int error; + + BUG_ON(!rfkill); + + mutex_lock(&rfkill_global_mutex); + + if (rfkill->registered) { + error = -EALREADY; + goto unlock; + } + + dev_set_name(dev, "rfkill%lu", rfkill_no); + rfkill_no++; + + if (!(rfkill_states_default_locked & BIT(rfkill->type))) { + /* first of its kind */ + BUILD_BUG_ON(NUM_RFKILL_TYPES > + sizeof(rfkill_states_default_locked) * 8); + rfkill_states_default_locked |= BIT(rfkill->type); + rfkill_global_states[rfkill->type].cur = + rfkill_global_states[rfkill->type].def; + } + + list_add_tail(&rfkill->node, &rfkill_list); + + error = device_add(dev); + if (error) + goto remove; + + error = rfkill_led_trigger_register(rfkill); + if (error) + goto devdel; + + rfkill->registered = true; + + if (rfkill->ops->poll) { + INIT_DELAYED_WORK(&rfkill->poll_work, rfkill_poll); + schedule_delayed_work(&rfkill->poll_work, + round_jiffies_relative(POLL_INTERVAL)); + } + + INIT_WORK(&rfkill->uevent_work, rfkill_uevent_work); + + INIT_WORK(&rfkill->sync_work, rfkill_sync_work); + schedule_work(&rfkill->sync_work); + + mutex_unlock(&rfkill_global_mutex); + return 0; + + devdel: + device_del(&rfkill->dev); + remove: + list_del_init(&rfkill->node); + unlock: + mutex_unlock(&rfkill_global_mutex); + return error; +} +EXPORT_SYMBOL(rfkill_register); + +void rfkill_unregister(struct rfkill *rfkill) +{ + BUG_ON(!rfkill); + + if (rfkill->ops->poll) + cancel_delayed_work_sync(&rfkill->poll_work); + + cancel_work_sync(&rfkill->uevent_work); + cancel_work_sync(&rfkill->sync_work); + + rfkill->registered = false; + + device_del(&rfkill->dev); + + mutex_lock(&rfkill_global_mutex); + list_del_init(&rfkill->node); + mutex_unlock(&rfkill_global_mutex); + + rfkill_led_trigger_unregister(rfkill); +} +EXPORT_SYMBOL(rfkill_unregister); + +void rfkill_destroy(struct rfkill *rfkill) +{ + if (rfkill) + put_device(&rfkill->dev); +} +EXPORT_SYMBOL(rfkill_destroy); + + +static int __init rfkill_init(void) +{ + int error; + int i; + + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].def = !rfkill_default_state; + + error = class_register(&rfkill_class); + if (error) + goto out; + +#ifdef CONFIG_RFKILL_INPUT + error = rfkill_handler_init(); + if (error) + class_unregister(&rfkill_class); +#endif + + out: + return error; +} +subsys_initcall(rfkill_init); + +static void __exit rfkill_exit(void) +{ +#ifdef CONFIG_RFKILL_INPUT + rfkill_handler_exit(); +#endif + class_unregister(&rfkill_class); +} +module_exit(rfkill_exit); diff --git a/net/rfkill/input.c b/net/rfkill/input.c new file mode 100644 index 00000000000..a7295ad5f9c --- /dev/null +++ b/net/rfkill/input.c @@ -0,0 +1,342 @@ +/* + * Input layer to RF Kill 
interface connector + * + * Copyright (c) 2007 Dmitry Torokhov + * Copyright 2009 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * If you ever run into a situation in which you have a SW_ type rfkill + * input device, then you can revive code that was removed in the patch + * "rfkill-input: remove unused code". + */ + +#include +#include +#include +#include +#include +#include + +#include "rfkill.h" + +enum rfkill_input_master_mode { + RFKILL_INPUT_MASTER_UNLOCK = 0, + RFKILL_INPUT_MASTER_RESTORE = 1, + RFKILL_INPUT_MASTER_UNBLOCKALL = 2, + NUM_RFKILL_INPUT_MASTER_MODES +}; + +/* Delay (in ms) between consecutive switch ops */ +#define RFKILL_OPS_DELAY 200 + +static enum rfkill_input_master_mode rfkill_master_switch_mode = + RFKILL_INPUT_MASTER_UNBLOCKALL; +module_param_named(master_switch_mode, rfkill_master_switch_mode, uint, 0); +MODULE_PARM_DESC(master_switch_mode, + "SW_RFKILL_ALL ON should: 0=do nothing (only unlock); 1=restore; 2=unblock all"); + +static spinlock_t rfkill_op_lock; +static bool rfkill_op_pending; +static unsigned long rfkill_sw_pending[BITS_TO_LONGS(NUM_RFKILL_TYPES)]; +static unsigned long rfkill_sw_state[BITS_TO_LONGS(NUM_RFKILL_TYPES)]; + +enum rfkill_sched_op { + RFKILL_GLOBAL_OP_EPO = 0, + RFKILL_GLOBAL_OP_RESTORE, + RFKILL_GLOBAL_OP_UNLOCK, + RFKILL_GLOBAL_OP_UNBLOCK, +}; + +static enum rfkill_sched_op rfkill_master_switch_op; +static enum rfkill_sched_op rfkill_op; + +static void __rfkill_handle_global_op(enum rfkill_sched_op op) +{ + unsigned int i; + + switch (op) { + case RFKILL_GLOBAL_OP_EPO: + rfkill_epo(); + break; + case RFKILL_GLOBAL_OP_RESTORE: + rfkill_restore_states(); + break; + case RFKILL_GLOBAL_OP_UNLOCK: + rfkill_remove_epo_lock(); + break; + case RFKILL_GLOBAL_OP_UNBLOCK: + rfkill_remove_epo_lock(); + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_switch_all(i, false); + break; + default: + /* memory corruption or bug, fail safely */ + rfkill_epo(); + WARN(1, "Unknown requested operation %d! " + "rfkill Emergency Power Off activated\n", + op); + } +} + +static void __rfkill_handle_normal_op(const enum rfkill_type type, + const bool complement) +{ + bool blocked; + + blocked = rfkill_get_global_sw_state(type); + if (complement) + blocked = !blocked; + + rfkill_switch_all(type, blocked); +} + +static void rfkill_op_handler(struct work_struct *work) +{ + unsigned int i; + bool c; + + spin_lock_irq(&rfkill_op_lock); + do { + if (rfkill_op_pending) { + enum rfkill_sched_op op = rfkill_op; + rfkill_op_pending = false; + memset(rfkill_sw_pending, 0, + sizeof(rfkill_sw_pending)); + spin_unlock_irq(&rfkill_op_lock); + + __rfkill_handle_global_op(op); + + spin_lock_irq(&rfkill_op_lock); + + /* + * handle global ops first -- during unlocked period + * we might have gotten a new global op. 
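+			 *
+			 * (Editor's note on the bookkeeping used below: a set
+			 * bit in rfkill_sw_pending means a toggle is queued
+			 * for that type, and the matching bit in
+			 * rfkill_sw_state says whether the current global
+			 * state should be complemented when it is processed.)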
+ */ + if (rfkill_op_pending) + continue; + } + + if (rfkill_is_epo_lock_active()) + continue; + + for (i = 0; i < NUM_RFKILL_TYPES; i++) { + if (__test_and_clear_bit(i, rfkill_sw_pending)) { + c = __test_and_clear_bit(i, rfkill_sw_state); + spin_unlock_irq(&rfkill_op_lock); + + __rfkill_handle_normal_op(i, c); + + spin_lock_irq(&rfkill_op_lock); + } + } + } while (rfkill_op_pending); + spin_unlock_irq(&rfkill_op_lock); +} + +static DECLARE_DELAYED_WORK(rfkill_op_work, rfkill_op_handler); +static unsigned long rfkill_last_scheduled; + +static unsigned long rfkill_ratelimit(const unsigned long last) +{ + const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY); + return (time_after(jiffies, last + delay)) ? 0 : delay; +} + +static void rfkill_schedule_ratelimited(void) +{ + if (delayed_work_pending(&rfkill_op_work)) + return; + schedule_delayed_work(&rfkill_op_work, + rfkill_ratelimit(rfkill_last_scheduled)); + rfkill_last_scheduled = jiffies; +} + +static void rfkill_schedule_global_op(enum rfkill_sched_op op) +{ + unsigned long flags; + + spin_lock_irqsave(&rfkill_op_lock, flags); + rfkill_op = op; + rfkill_op_pending = true; + if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) { + /* bypass the limiter for EPO */ + cancel_delayed_work(&rfkill_op_work); + schedule_delayed_work(&rfkill_op_work, 0); + rfkill_last_scheduled = jiffies; + } else + rfkill_schedule_ratelimited(); + spin_unlock_irqrestore(&rfkill_op_lock, flags); +} + +static void rfkill_schedule_toggle(enum rfkill_type type) +{ + unsigned long flags; + + if (rfkill_is_epo_lock_active()) + return; + + spin_lock_irqsave(&rfkill_op_lock, flags); + if (!rfkill_op_pending) { + __set_bit(type, rfkill_sw_pending); + __change_bit(type, rfkill_sw_state); + rfkill_schedule_ratelimited(); + } + spin_unlock_irqrestore(&rfkill_op_lock, flags); +} + +static void rfkill_schedule_evsw_rfkillall(int state) +{ + if (state) + rfkill_schedule_global_op(rfkill_master_switch_op); + else + rfkill_schedule_global_op(RFKILL_GLOBAL_OP_EPO); +} + +static void rfkill_event(struct input_handle *handle, unsigned int type, + unsigned int code, int data) +{ + if (type == EV_KEY && data == 1) { + switch (code) { + case KEY_WLAN: + rfkill_schedule_toggle(RFKILL_TYPE_WLAN); + break; + case KEY_BLUETOOTH: + rfkill_schedule_toggle(RFKILL_TYPE_BLUETOOTH); + break; + case KEY_UWB: + rfkill_schedule_toggle(RFKILL_TYPE_UWB); + break; + case KEY_WIMAX: + rfkill_schedule_toggle(RFKILL_TYPE_WIMAX); + break; + } + } else if (type == EV_SW && code == SW_RFKILL_ALL) + rfkill_schedule_evsw_rfkillall(data); +} + +static int rfkill_connect(struct input_handler *handler, struct input_dev *dev, + const struct input_device_id *id) +{ + struct input_handle *handle; + int error; + + handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->dev = dev; + handle->handler = handler; + handle->name = "rfkill"; + + /* causes rfkill_start() to be called */ + error = input_register_handle(handle); + if (error) + goto err_free_handle; + + error = input_open_device(handle); + if (error) + goto err_unregister_handle; + + return 0; + + err_unregister_handle: + input_unregister_handle(handle); + err_free_handle: + kfree(handle); + return error; +} + +static void rfkill_start(struct input_handle *handle) +{ + /* + * Take event_lock to guard against configuration changes, we + * should be able to deal with concurrency with rfkill_event() + * just fine (which event_lock will also avoid). 
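+	 *
+	 * (Editor's note: input_register_handle() in rfkill_connect()
+	 * causes rfkill_start() to run at connect time, so the initial
+	 * SW_RFKILL_ALL position is propagated before any events are
+	 * delivered.)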
+ */ + spin_lock_irq(&handle->dev->event_lock); + + if (test_bit(EV_SW, handle->dev->evbit) && + test_bit(SW_RFKILL_ALL, handle->dev->swbit)) + rfkill_schedule_evsw_rfkillall(test_bit(SW_RFKILL_ALL, + handle->dev->sw)); + + spin_unlock_irq(&handle->dev->event_lock); +} + +static void rfkill_disconnect(struct input_handle *handle) +{ + input_close_device(handle); + input_unregister_handle(handle); + kfree(handle); +} + +static const struct input_device_id rfkill_ids[] = { + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_WLAN)] = BIT_MASK(KEY_WLAN) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_BLUETOOTH)] = BIT_MASK(KEY_BLUETOOTH) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_UWB)] = BIT_MASK(KEY_UWB) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_WIMAX)] = BIT_MASK(KEY_WIMAX) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_SWBIT, + .evbit = { BIT(EV_SW) }, + .swbit = { [BIT_WORD(SW_RFKILL_ALL)] = BIT_MASK(SW_RFKILL_ALL) }, + }, + { } +}; + +static struct input_handler rfkill_handler = { + .name = "rfkill", + .event = rfkill_event, + .connect = rfkill_connect, + .start = rfkill_start, + .disconnect = rfkill_disconnect, + .id_table = rfkill_ids, +}; + +int __init rfkill_handler_init(void) +{ + switch (rfkill_master_switch_mode) { + case RFKILL_INPUT_MASTER_UNBLOCKALL: + rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNBLOCK; + break; + case RFKILL_INPUT_MASTER_RESTORE: + rfkill_master_switch_op = RFKILL_GLOBAL_OP_RESTORE; + break; + case RFKILL_INPUT_MASTER_UNLOCK: + rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNLOCK; + break; + default: + return -EINVAL; + } + + spin_lock_init(&rfkill_op_lock); + + /* Avoid delay at first schedule */ + rfkill_last_scheduled = + jiffies - msecs_to_jiffies(RFKILL_OPS_DELAY) - 1; + return input_register_handler(&rfkill_handler); +} + +void __exit rfkill_handler_exit(void) +{ + input_unregister_handler(&rfkill_handler); + cancel_delayed_work_sync(&rfkill_op_work); +} diff --git a/net/rfkill/rfkill-input.c b/net/rfkill/rfkill-input.c deleted file mode 100644 index 60a34f3b5f6..00000000000 --- a/net/rfkill/rfkill-input.c +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Input layer to RF Kill interface connector - * - * Copyright (c) 2007 Dmitry Torokhov - */ - -/* - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include "rfkill-input.h" - -MODULE_AUTHOR("Dmitry Torokhov "); -MODULE_DESCRIPTION("Input layer to RF switch connector"); -MODULE_LICENSE("GPL"); - -enum rfkill_input_master_mode { - RFKILL_INPUT_MASTER_DONOTHING = 0, - RFKILL_INPUT_MASTER_RESTORE = 1, - RFKILL_INPUT_MASTER_UNBLOCKALL = 2, - RFKILL_INPUT_MASTER_MAX, /* marker */ -}; - -/* Delay (in ms) between consecutive switch ops */ -#define RFKILL_OPS_DELAY 200 - -static enum rfkill_input_master_mode rfkill_master_switch_mode = - RFKILL_INPUT_MASTER_UNBLOCKALL; -module_param_named(master_switch_mode, rfkill_master_switch_mode, uint, 0); -MODULE_PARM_DESC(master_switch_mode, - "SW_RFKILL_ALL ON should: 0=do nothing; 1=restore; 2=unblock all"); - -enum rfkill_global_sched_op { - RFKILL_GLOBAL_OP_EPO = 0, - RFKILL_GLOBAL_OP_RESTORE, - RFKILL_GLOBAL_OP_UNLOCK, - RFKILL_GLOBAL_OP_UNBLOCK, -}; - -struct rfkill_task { - struct delayed_work dwork; - - /* ensures that task is serialized */ - struct mutex mutex; - - /* protects everything below */ - spinlock_t lock; - - /* pending regular switch operations (1=pending) */ - unsigned long sw_pending[BITS_TO_LONGS(RFKILL_TYPE_MAX)]; - - /* should the state be complemented (1=yes) */ - unsigned long sw_togglestate[BITS_TO_LONGS(RFKILL_TYPE_MAX)]; - - bool global_op_pending; - enum rfkill_global_sched_op op; - - /* last time it was scheduled */ - unsigned long last_scheduled; -}; - -static void __rfkill_handle_global_op(enum rfkill_global_sched_op op) -{ - unsigned int i; - - switch (op) { - case RFKILL_GLOBAL_OP_EPO: - rfkill_epo(); - break; - case RFKILL_GLOBAL_OP_RESTORE: - rfkill_restore_states(); - break; - case RFKILL_GLOBAL_OP_UNLOCK: - rfkill_remove_epo_lock(); - break; - case RFKILL_GLOBAL_OP_UNBLOCK: - rfkill_remove_epo_lock(); - for (i = 0; i < RFKILL_TYPE_MAX; i++) - rfkill_switch_all(i, RFKILL_STATE_UNBLOCKED); - break; - default: - /* memory corruption or bug, fail safely */ - rfkill_epo(); - WARN(1, "Unknown requested operation %d! 
" - "rfkill Emergency Power Off activated\n", - op); - } -} - -static void __rfkill_handle_normal_op(const enum rfkill_type type, - const bool c) -{ - enum rfkill_state state; - - state = rfkill_get_global_state(type); - if (c) - state = rfkill_state_complement(state); - - rfkill_switch_all(type, state); -} - -static void rfkill_task_handler(struct work_struct *work) -{ - struct rfkill_task *task = container_of(work, - struct rfkill_task, dwork.work); - bool doit = true; - - mutex_lock(&task->mutex); - - spin_lock_irq(&task->lock); - while (doit) { - if (task->global_op_pending) { - enum rfkill_global_sched_op op = task->op; - task->global_op_pending = false; - memset(task->sw_pending, 0, sizeof(task->sw_pending)); - spin_unlock_irq(&task->lock); - - __rfkill_handle_global_op(op); - - /* make sure we do at least one pass with - * !task->global_op_pending */ - spin_lock_irq(&task->lock); - continue; - } else if (!rfkill_is_epo_lock_active()) { - unsigned int i = 0; - - while (!task->global_op_pending && - i < RFKILL_TYPE_MAX) { - if (test_and_clear_bit(i, task->sw_pending)) { - bool c; - c = test_and_clear_bit(i, - task->sw_togglestate); - spin_unlock_irq(&task->lock); - - __rfkill_handle_normal_op(i, c); - - spin_lock_irq(&task->lock); - } - i++; - } - } - doit = task->global_op_pending; - } - spin_unlock_irq(&task->lock); - - mutex_unlock(&task->mutex); -} - -static struct rfkill_task rfkill_task = { - .dwork = __DELAYED_WORK_INITIALIZER(rfkill_task.dwork, - rfkill_task_handler), - .mutex = __MUTEX_INITIALIZER(rfkill_task.mutex), - .lock = __SPIN_LOCK_UNLOCKED(rfkill_task.lock), -}; - -static unsigned long rfkill_ratelimit(const unsigned long last) -{ - const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY); - return (time_after(jiffies, last + delay)) ? 0 : delay; -} - -static void rfkill_schedule_ratelimited(void) -{ - if (!delayed_work_pending(&rfkill_task.dwork)) { - schedule_delayed_work(&rfkill_task.dwork, - rfkill_ratelimit(rfkill_task.last_scheduled)); - rfkill_task.last_scheduled = jiffies; - } -} - -static void rfkill_schedule_global_op(enum rfkill_global_sched_op op) -{ - unsigned long flags; - - spin_lock_irqsave(&rfkill_task.lock, flags); - rfkill_task.op = op; - rfkill_task.global_op_pending = true; - if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) { - /* bypass the limiter for EPO */ - cancel_delayed_work(&rfkill_task.dwork); - schedule_delayed_work(&rfkill_task.dwork, 0); - rfkill_task.last_scheduled = jiffies; - } else - rfkill_schedule_ratelimited(); - spin_unlock_irqrestore(&rfkill_task.lock, flags); -} - -static void rfkill_schedule_toggle(enum rfkill_type type) -{ - unsigned long flags; - - if (rfkill_is_epo_lock_active()) - return; - - spin_lock_irqsave(&rfkill_task.lock, flags); - if (!rfkill_task.global_op_pending) { - set_bit(type, rfkill_task.sw_pending); - change_bit(type, rfkill_task.sw_togglestate); - rfkill_schedule_ratelimited(); - } - spin_unlock_irqrestore(&rfkill_task.lock, flags); -} - -static void rfkill_schedule_evsw_rfkillall(int state) -{ - if (state) { - switch (rfkill_master_switch_mode) { - case RFKILL_INPUT_MASTER_UNBLOCKALL: - rfkill_schedule_global_op(RFKILL_GLOBAL_OP_UNBLOCK); - break; - case RFKILL_INPUT_MASTER_RESTORE: - rfkill_schedule_global_op(RFKILL_GLOBAL_OP_RESTORE); - break; - case RFKILL_INPUT_MASTER_DONOTHING: - rfkill_schedule_global_op(RFKILL_GLOBAL_OP_UNLOCK); - break; - default: - /* memory corruption or driver bug! 
fail safely */ - rfkill_schedule_global_op(RFKILL_GLOBAL_OP_EPO); - WARN(1, "Unknown rfkill_master_switch_mode (%d), " - "driver bug or memory corruption detected!\n", - rfkill_master_switch_mode); - break; - } - } else - rfkill_schedule_global_op(RFKILL_GLOBAL_OP_EPO); -} - -static void rfkill_event(struct input_handle *handle, unsigned int type, - unsigned int code, int data) -{ - if (type == EV_KEY && data == 1) { - enum rfkill_type t; - - switch (code) { - case KEY_WLAN: - t = RFKILL_TYPE_WLAN; - break; - case KEY_BLUETOOTH: - t = RFKILL_TYPE_BLUETOOTH; - break; - case KEY_UWB: - t = RFKILL_TYPE_UWB; - break; - case KEY_WIMAX: - t = RFKILL_TYPE_WIMAX; - break; - default: - return; - } - rfkill_schedule_toggle(t); - return; - } else if (type == EV_SW) { - switch (code) { - case SW_RFKILL_ALL: - rfkill_schedule_evsw_rfkillall(data); - return; - default: - return; - } - } -} - -static int rfkill_connect(struct input_handler *handler, struct input_dev *dev, - const struct input_device_id *id) -{ - struct input_handle *handle; - int error; - - handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->dev = dev; - handle->handler = handler; - handle->name = "rfkill"; - - /* causes rfkill_start() to be called */ - error = input_register_handle(handle); - if (error) - goto err_free_handle; - - error = input_open_device(handle); - if (error) - goto err_unregister_handle; - - return 0; - - err_unregister_handle: - input_unregister_handle(handle); - err_free_handle: - kfree(handle); - return error; -} - -static void rfkill_start(struct input_handle *handle) -{ - /* Take event_lock to guard against configuration changes, we - * should be able to deal with concurrency with rfkill_event() - * just fine (which event_lock will also avoid). 
*/ - spin_lock_irq(&handle->dev->event_lock); - - if (test_bit(EV_SW, handle->dev->evbit)) { - if (test_bit(SW_RFKILL_ALL, handle->dev->swbit)) - rfkill_schedule_evsw_rfkillall(test_bit(SW_RFKILL_ALL, - handle->dev->sw)); - /* add resync for further EV_SW events here */ - } - - spin_unlock_irq(&handle->dev->event_lock); -} - -static void rfkill_disconnect(struct input_handle *handle) -{ - input_close_device(handle); - input_unregister_handle(handle); - kfree(handle); -} - -static const struct input_device_id rfkill_ids[] = { - { - .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT_MASK(EV_KEY) }, - .keybit = { [BIT_WORD(KEY_WLAN)] = BIT_MASK(KEY_WLAN) }, - }, - { - .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT_MASK(EV_KEY) }, - .keybit = { [BIT_WORD(KEY_BLUETOOTH)] = BIT_MASK(KEY_BLUETOOTH) }, - }, - { - .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT_MASK(EV_KEY) }, - .keybit = { [BIT_WORD(KEY_UWB)] = BIT_MASK(KEY_UWB) }, - }, - { - .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT_MASK(EV_KEY) }, - .keybit = { [BIT_WORD(KEY_WIMAX)] = BIT_MASK(KEY_WIMAX) }, - }, - { - .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_SWBIT, - .evbit = { BIT(EV_SW) }, - .swbit = { [BIT_WORD(SW_RFKILL_ALL)] = BIT_MASK(SW_RFKILL_ALL) }, - }, - { } -}; - -static struct input_handler rfkill_handler = { - .event = rfkill_event, - .connect = rfkill_connect, - .disconnect = rfkill_disconnect, - .start = rfkill_start, - .name = "rfkill", - .id_table = rfkill_ids, -}; - -static int __init rfkill_handler_init(void) -{ - if (rfkill_master_switch_mode >= RFKILL_INPUT_MASTER_MAX) - return -EINVAL; - - /* - * The penalty to not doing this is a possible RFKILL_OPS_DELAY delay - * at the first use. Acceptable, but if we can avoid it, why not? - */ - rfkill_task.last_scheduled = - jiffies - msecs_to_jiffies(RFKILL_OPS_DELAY) - 1; - return input_register_handler(&rfkill_handler); -} - -static void __exit rfkill_handler_exit(void) -{ - input_unregister_handler(&rfkill_handler); - cancel_delayed_work_sync(&rfkill_task.dwork); - rfkill_remove_epo_lock(); -} - -module_init(rfkill_handler_init); -module_exit(rfkill_handler_exit); diff --git a/net/rfkill/rfkill-input.h b/net/rfkill/rfkill-input.h deleted file mode 100644 index fe8df6b5b93..00000000000 --- a/net/rfkill/rfkill-input.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2007 Ivo van Doorn - */ - -/* - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. 
- */ - -#ifndef __RFKILL_INPUT_H -#define __RFKILL_INPUT_H - -void rfkill_switch_all(enum rfkill_type type, enum rfkill_state state); -void rfkill_epo(void); -void rfkill_restore_states(void); -void rfkill_remove_epo_lock(void); -bool rfkill_is_epo_lock_active(void); -enum rfkill_state rfkill_get_global_state(const enum rfkill_type type); - -#endif /* __RFKILL_INPUT_H */ diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c deleted file mode 100644 index 4f5a83183c9..00000000000 --- a/net/rfkill/rfkill.c +++ /dev/null @@ -1,855 +0,0 @@ -/* - * Copyright (C) 2006 - 2007 Ivo van Doorn - * Copyright (C) 2007 Dmitry Torokhov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* Get declaration of rfkill_switch_all() to shut up sparse. */ -#include "rfkill-input.h" - - -MODULE_AUTHOR("Ivo van Doorn "); -MODULE_VERSION("1.0"); -MODULE_DESCRIPTION("RF switch support"); -MODULE_LICENSE("GPL"); - -static LIST_HEAD(rfkill_list); /* list of registered rf switches */ -static DEFINE_MUTEX(rfkill_global_mutex); - -static unsigned int rfkill_default_state = RFKILL_STATE_UNBLOCKED; -module_param_named(default_state, rfkill_default_state, uint, 0444); -MODULE_PARM_DESC(default_state, - "Default initial state for all radio types, 0 = radio off"); - -struct rfkill_gsw_state { - enum rfkill_state current_state; - enum rfkill_state default_state; -}; - -static struct rfkill_gsw_state rfkill_global_states[RFKILL_TYPE_MAX]; -static unsigned long rfkill_states_lockdflt[BITS_TO_LONGS(RFKILL_TYPE_MAX)]; -static bool rfkill_epo_lock_active; - - -#ifdef CONFIG_RFKILL_LEDS -static void rfkill_led_trigger(struct rfkill *rfkill, - enum rfkill_state state) -{ - struct led_trigger *led = &rfkill->led_trigger; - - if (!led->name) - return; - if (state != RFKILL_STATE_UNBLOCKED) - led_trigger_event(led, LED_OFF); - else - led_trigger_event(led, LED_FULL); -} - -static void rfkill_led_trigger_activate(struct led_classdev *led) -{ - struct rfkill *rfkill = container_of(led->trigger, - struct rfkill, led_trigger); - - rfkill_led_trigger(rfkill, rfkill->state); -} -#else -static inline void rfkill_led_trigger(struct rfkill *rfkill, - enum rfkill_state state) -{ -} -#endif /* CONFIG_RFKILL_LEDS */ - -static void rfkill_uevent(struct rfkill *rfkill) -{ - kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE); -} - -static void update_rfkill_state(struct rfkill *rfkill) -{ - enum rfkill_state newstate, oldstate; - - if (rfkill->get_state) { - mutex_lock(&rfkill->mutex); - if (!rfkill->get_state(rfkill->data, &newstate)) { - oldstate = rfkill->state; - rfkill->state = newstate; - if (oldstate != newstate) - rfkill_uevent(rfkill); - } - mutex_unlock(&rfkill->mutex); - } - rfkill_led_trigger(rfkill, rfkill->state); -} - -/** - * rfkill_toggle_radio - wrapper for toggle_radio hook - 
* @rfkill: the rfkill struct to use - * @force: calls toggle_radio even if cache says it is not needed, - * and also makes sure notifications of the state will be - * sent even if it didn't change - * @state: the new state to call toggle_radio() with - * - * Calls rfkill->toggle_radio, enforcing the API for toggle_radio - * calls and handling all the red tape such as issuing notifications - * if the call is successful. - * - * Suspended devices are not touched at all, and -EAGAIN is returned. - * - * Note that the @force parameter cannot override a (possibly cached) - * state of RFKILL_STATE_HARD_BLOCKED. Any device making use of - * RFKILL_STATE_HARD_BLOCKED implements either get_state() or - * rfkill_force_state(), so the cache either is bypassed or valid. - * - * Note that we do call toggle_radio for RFKILL_STATE_SOFT_BLOCKED - * even if the radio is in RFKILL_STATE_HARD_BLOCKED state, so as to - * give the driver a hint that it should double-BLOCK the transmitter. - * - * Caller must have acquired rfkill->mutex. - */ -static int rfkill_toggle_radio(struct rfkill *rfkill, - enum rfkill_state state, - int force) -{ - int retval = 0; - enum rfkill_state oldstate, newstate; - - if (unlikely(rfkill->dev.power.power_state.event & PM_EVENT_SLEEP)) - return -EBUSY; - - oldstate = rfkill->state; - - if (rfkill->get_state && !force && - !rfkill->get_state(rfkill->data, &newstate)) { - rfkill->state = newstate; - } - - switch (state) { - case RFKILL_STATE_HARD_BLOCKED: - /* typically happens when refreshing hardware state, - * such as on resume */ - state = RFKILL_STATE_SOFT_BLOCKED; - break; - case RFKILL_STATE_UNBLOCKED: - /* force can't override this, only rfkill_force_state() can */ - if (rfkill->state == RFKILL_STATE_HARD_BLOCKED) - return -EPERM; - break; - case RFKILL_STATE_SOFT_BLOCKED: - /* nothing to do, we want to give drivers the hint to double - * BLOCK even a transmitter that is already in state - * RFKILL_STATE_HARD_BLOCKED */ - break; - default: - WARN(1, KERN_WARNING - "rfkill: illegal state %d passed as parameter " - "to rfkill_toggle_radio\n", state); - return -EINVAL; - } - - if (force || state != rfkill->state) { - retval = rfkill->toggle_radio(rfkill->data, state); - /* never allow a HARD->SOFT downgrade! */ - if (!retval && rfkill->state != RFKILL_STATE_HARD_BLOCKED) - rfkill->state = state; - } - - if (force || rfkill->state != oldstate) - rfkill_uevent(rfkill); - - rfkill_led_trigger(rfkill, rfkill->state); - return retval; -} - -/** - * __rfkill_switch_all - Toggle state of all switches of given type - * @type: type of interfaces to be affected - * @state: the new state - * - * This function toggles the state of all switches of given type, - * unless a specific switch is claimed by userspace (in which case, - * that switch is left alone) or suspended. - * - * Caller must have acquired rfkill_global_mutex. 
- */ -static void __rfkill_switch_all(const enum rfkill_type type, - const enum rfkill_state state) -{ - struct rfkill *rfkill; - - if (WARN((state >= RFKILL_STATE_MAX || type >= RFKILL_TYPE_MAX), - KERN_WARNING - "rfkill: illegal state %d or type %d " - "passed as parameter to __rfkill_switch_all\n", - state, type)) - return; - - rfkill_global_states[type].current_state = state; - list_for_each_entry(rfkill, &rfkill_list, node) { - if (rfkill->type == type) { - mutex_lock(&rfkill->mutex); - rfkill_toggle_radio(rfkill, state, 0); - mutex_unlock(&rfkill->mutex); - rfkill_led_trigger(rfkill, rfkill->state); - } - } -} - -/** - * rfkill_switch_all - Toggle state of all switches of given type - * @type: type of interfaces to be affected - * @state: the new state - * - * Acquires rfkill_global_mutex and calls __rfkill_switch_all(@type, @state). - * Please refer to __rfkill_switch_all() for details. - * - * Does nothing if the EPO lock is active. - */ -void rfkill_switch_all(enum rfkill_type type, enum rfkill_state state) -{ - mutex_lock(&rfkill_global_mutex); - if (!rfkill_epo_lock_active) - __rfkill_switch_all(type, state); - mutex_unlock(&rfkill_global_mutex); -} -EXPORT_SYMBOL(rfkill_switch_all); - -/** - * rfkill_epo - emergency power off all transmitters - * - * This kicks all non-suspended rfkill devices to RFKILL_STATE_SOFT_BLOCKED, - * ignoring everything in its path but rfkill_global_mutex and rfkill->mutex. - * - * The global state before the EPO is saved and can be restored later - * using rfkill_restore_states(). - */ -void rfkill_epo(void) -{ - struct rfkill *rfkill; - int i; - - mutex_lock(&rfkill_global_mutex); - - rfkill_epo_lock_active = true; - list_for_each_entry(rfkill, &rfkill_list, node) { - mutex_lock(&rfkill->mutex); - rfkill_toggle_radio(rfkill, RFKILL_STATE_SOFT_BLOCKED, 1); - mutex_unlock(&rfkill->mutex); - } - for (i = 0; i < RFKILL_TYPE_MAX; i++) { - rfkill_global_states[i].default_state = - rfkill_global_states[i].current_state; - rfkill_global_states[i].current_state = - RFKILL_STATE_SOFT_BLOCKED; - } - mutex_unlock(&rfkill_global_mutex); - rfkill_led_trigger(rfkill, rfkill->state); -} -EXPORT_SYMBOL_GPL(rfkill_epo); - -/** - * rfkill_restore_states - restore global states - * - * Restore (and sync switches to) the global state from the - * states in rfkill_default_states. This can undo the effects of - * a call to rfkill_epo(). - */ -void rfkill_restore_states(void) -{ - int i; - - mutex_lock(&rfkill_global_mutex); - - rfkill_epo_lock_active = false; - for (i = 0; i < RFKILL_TYPE_MAX; i++) - __rfkill_switch_all(i, rfkill_global_states[i].default_state); - mutex_unlock(&rfkill_global_mutex); -} -EXPORT_SYMBOL_GPL(rfkill_restore_states); - -/** - * rfkill_remove_epo_lock - unlock state changes - * - * Used by rfkill-input manually unlock state changes, when - * the EPO switch is deactivated. - */ -void rfkill_remove_epo_lock(void) -{ - mutex_lock(&rfkill_global_mutex); - rfkill_epo_lock_active = false; - mutex_unlock(&rfkill_global_mutex); -} -EXPORT_SYMBOL_GPL(rfkill_remove_epo_lock); - -/** - * rfkill_is_epo_lock_active - returns true EPO is active - * - * Returns 0 (false) if there is NOT an active EPO contidion, - * and 1 (true) if there is an active EPO contition, which - * locks all radios in one of the BLOCKED states. - * - * Can be called in atomic context. 
- */ -bool rfkill_is_epo_lock_active(void) -{ - return rfkill_epo_lock_active; -} -EXPORT_SYMBOL_GPL(rfkill_is_epo_lock_active); - -/** - * rfkill_get_global_state - returns global state for a type - * @type: the type to get the global state of - * - * Returns the current global state for a given wireless - * device type. - */ -enum rfkill_state rfkill_get_global_state(const enum rfkill_type type) -{ - return rfkill_global_states[type].current_state; -} -EXPORT_SYMBOL_GPL(rfkill_get_global_state); - -/** - * rfkill_force_state - Force the internal rfkill radio state - * @rfkill: pointer to the rfkill class to modify. - * @state: the current radio state the class should be forced to. - * - * This function updates the internal state of the radio cached - * by the rfkill class. It should be used when the driver gets - * a notification by the firmware/hardware of the current *real* - * state of the radio rfkill switch. - * - * Devices which are subject to external changes on their rfkill - * state (such as those caused by a hardware rfkill line) MUST - * have their driver arrange to call rfkill_force_state() as soon - * as possible after such a change. - * - * This function may not be called from an atomic context. - */ -int rfkill_force_state(struct rfkill *rfkill, enum rfkill_state state) -{ - enum rfkill_state oldstate; - - BUG_ON(!rfkill); - if (WARN((state >= RFKILL_STATE_MAX), - KERN_WARNING - "rfkill: illegal state %d passed as parameter " - "to rfkill_force_state\n", state)) - return -EINVAL; - - mutex_lock(&rfkill->mutex); - - oldstate = rfkill->state; - rfkill->state = state; - - if (state != oldstate) - rfkill_uevent(rfkill); - - mutex_unlock(&rfkill->mutex); - rfkill_led_trigger(rfkill, rfkill->state); - - return 0; -} -EXPORT_SYMBOL(rfkill_force_state); - -static ssize_t rfkill_name_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct rfkill *rfkill = to_rfkill(dev); - - return sprintf(buf, "%s\n", rfkill->name); -} - -static const char *rfkill_get_type_str(enum rfkill_type type) -{ - switch (type) { - case RFKILL_TYPE_WLAN: - return "wlan"; - case RFKILL_TYPE_BLUETOOTH: - return "bluetooth"; - case RFKILL_TYPE_UWB: - return "ultrawideband"; - case RFKILL_TYPE_WIMAX: - return "wimax"; - case RFKILL_TYPE_WWAN: - return "wwan"; - default: - BUG(); - } -} - -static ssize_t rfkill_type_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct rfkill *rfkill = to_rfkill(dev); - - return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type)); -} - -static ssize_t rfkill_state_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct rfkill *rfkill = to_rfkill(dev); - - update_rfkill_state(rfkill); - return sprintf(buf, "%d\n", rfkill->state); -} - -static ssize_t rfkill_state_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct rfkill *rfkill = to_rfkill(dev); - unsigned long state; - int error; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - error = strict_strtoul(buf, 0, &state); - if (error) - return error; - - /* RFKILL_STATE_HARD_BLOCKED is illegal here... */ - if (state != RFKILL_STATE_UNBLOCKED && - state != RFKILL_STATE_SOFT_BLOCKED) - return -EINVAL; - - error = mutex_lock_killable(&rfkill->mutex); - if (error) - return error; - - if (!rfkill_epo_lock_active) - error = rfkill_toggle_radio(rfkill, state, 0); - else - error = -EPERM; - - mutex_unlock(&rfkill->mutex); - - return error ? 
error : count; -} - -static ssize_t rfkill_claim_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", 0); -} - -static ssize_t rfkill_claim_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return -EOPNOTSUPP; -} - -static struct device_attribute rfkill_dev_attrs[] = { - __ATTR(name, S_IRUGO, rfkill_name_show, NULL), - __ATTR(type, S_IRUGO, rfkill_type_show, NULL), - __ATTR(state, S_IRUGO|S_IWUSR, rfkill_state_show, rfkill_state_store), - __ATTR(claim, S_IRUGO|S_IWUSR, rfkill_claim_show, rfkill_claim_store), - __ATTR_NULL -}; - -static void rfkill_release(struct device *dev) -{ - struct rfkill *rfkill = to_rfkill(dev); - - kfree(rfkill); - module_put(THIS_MODULE); -} - -#ifdef CONFIG_PM -static int rfkill_suspend(struct device *dev, pm_message_t state) -{ - struct rfkill *rfkill = to_rfkill(dev); - - /* mark class device as suspended */ - if (dev->power.power_state.event != state.event) - dev->power.power_state = state; - - /* store state for the resume handler */ - rfkill->state_for_resume = rfkill->state; - - return 0; -} - -static int rfkill_resume(struct device *dev) -{ - struct rfkill *rfkill = to_rfkill(dev); - enum rfkill_state newstate; - - if (dev->power.power_state.event != PM_EVENT_ON) { - mutex_lock(&rfkill->mutex); - - dev->power.power_state.event = PM_EVENT_ON; - - /* - * rfkill->state could have been modified before we got - * called, and won't be updated by rfkill_toggle_radio() - * in force mode. Sync it FIRST. - */ - if (rfkill->get_state && - !rfkill->get_state(rfkill->data, &newstate)) - rfkill->state = newstate; - - /* - * If we are under EPO, kick transmitter offline, - * otherwise restore to pre-suspend state. - * - * Issue a notification in any case - */ - rfkill_toggle_radio(rfkill, - rfkill_epo_lock_active ? - RFKILL_STATE_SOFT_BLOCKED : - rfkill->state_for_resume, - 1); - - mutex_unlock(&rfkill->mutex); - rfkill_led_trigger(rfkill, rfkill->state); - } - - return 0; -} -#else -#define rfkill_suspend NULL -#define rfkill_resume NULL -#endif - -static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - struct rfkill *rfkill = to_rfkill(dev); - int error; - - error = add_uevent_var(env, "RFKILL_NAME=%s", rfkill->name); - if (error) - return error; - error = add_uevent_var(env, "RFKILL_TYPE=%s", - rfkill_get_type_str(rfkill->type)); - if (error) - return error; - error = add_uevent_var(env, "RFKILL_STATE=%d", rfkill->state); - return error; -} - -static struct class rfkill_class = { - .name = "rfkill", - .dev_release = rfkill_release, - .dev_attrs = rfkill_dev_attrs, - .suspend = rfkill_suspend, - .resume = rfkill_resume, - .dev_uevent = rfkill_dev_uevent, -}; - -static int rfkill_check_duplicity(const struct rfkill *rfkill) -{ - struct rfkill *p; - unsigned long seen[BITS_TO_LONGS(RFKILL_TYPE_MAX)]; - - memset(seen, 0, sizeof(seen)); - - list_for_each_entry(p, &rfkill_list, node) { - if (WARN((p == rfkill), KERN_WARNING - "rfkill: illegal attempt to register " - "an already registered rfkill struct\n")) - return -EEXIST; - set_bit(p->type, seen); - } - - /* 0: first switch of its kind */ - return (test_bit(rfkill->type, seen)) ? 
1 : 0; -} - -static int rfkill_add_switch(struct rfkill *rfkill) -{ - int error; - - mutex_lock(&rfkill_global_mutex); - - error = rfkill_check_duplicity(rfkill); - if (error < 0) - goto unlock_out; - - if (!error) { - /* lock default after first use */ - set_bit(rfkill->type, rfkill_states_lockdflt); - rfkill_global_states[rfkill->type].current_state = - rfkill_global_states[rfkill->type].default_state; - } - - rfkill_toggle_radio(rfkill, - rfkill_global_states[rfkill->type].current_state, - 0); - - list_add_tail(&rfkill->node, &rfkill_list); - - error = 0; -unlock_out: - mutex_unlock(&rfkill_global_mutex); - - return error; -} - -static void rfkill_remove_switch(struct rfkill *rfkill) -{ - mutex_lock(&rfkill_global_mutex); - list_del_init(&rfkill->node); - mutex_unlock(&rfkill_global_mutex); - - mutex_lock(&rfkill->mutex); - rfkill_toggle_radio(rfkill, RFKILL_STATE_SOFT_BLOCKED, 1); - mutex_unlock(&rfkill->mutex); -} - -/** - * rfkill_allocate - allocate memory for rfkill structure. - * @parent: device that has rf switch on it - * @type: type of the switch (RFKILL_TYPE_*) - * - * This function should be called by the network driver when it needs - * rfkill structure. Once the structure is allocated the driver should - * finish its initialization by setting the name, private data, enable_radio - * and disable_radio methods and then register it with rfkill_register(). - * - * NOTE: If registration fails the structure should be freed by calling - * rfkill_free() otherwise rfkill_unregister() should be used. - */ -struct rfkill * __must_check rfkill_allocate(struct device *parent, - enum rfkill_type type) -{ - struct rfkill *rfkill; - struct device *dev; - - if (WARN((type >= RFKILL_TYPE_MAX), - KERN_WARNING - "rfkill: illegal type %d passed as parameter " - "to rfkill_allocate\n", type)) - return NULL; - - rfkill = kzalloc(sizeof(struct rfkill), GFP_KERNEL); - if (!rfkill) - return NULL; - - mutex_init(&rfkill->mutex); - INIT_LIST_HEAD(&rfkill->node); - rfkill->type = type; - - dev = &rfkill->dev; - dev->class = &rfkill_class; - dev->parent = parent; - device_initialize(dev); - - __module_get(THIS_MODULE); - - return rfkill; -} -EXPORT_SYMBOL(rfkill_allocate); - -/** - * rfkill_free - Mark rfkill structure for deletion - * @rfkill: rfkill structure to be destroyed - * - * Decrements reference count of the rfkill structure so it is destroyed. - * Note that rfkill_free() should _not_ be called after rfkill_unregister(). - */ -void rfkill_free(struct rfkill *rfkill) -{ - if (rfkill) - put_device(&rfkill->dev); -} -EXPORT_SYMBOL(rfkill_free); - -static void rfkill_led_trigger_register(struct rfkill *rfkill) -{ -#ifdef CONFIG_RFKILL_LEDS - int error; - - if (!rfkill->led_trigger.name) - rfkill->led_trigger.name = dev_name(&rfkill->dev); - if (!rfkill->led_trigger.activate) - rfkill->led_trigger.activate = rfkill_led_trigger_activate; - error = led_trigger_register(&rfkill->led_trigger); - if (error) - rfkill->led_trigger.name = NULL; -#endif /* CONFIG_RFKILL_LEDS */ -} - -static void rfkill_led_trigger_unregister(struct rfkill *rfkill) -{ -#ifdef CONFIG_RFKILL_LEDS - if (rfkill->led_trigger.name) { - led_trigger_unregister(&rfkill->led_trigger); - rfkill->led_trigger.name = NULL; - } -#endif -} - -/** - * rfkill_register - Register a rfkill structure. - * @rfkill: rfkill structure to be registered - * - * This function should be called by the network driver when the rfkill - * structure needs to be registered. 
Immediately upon registration the - * switch driver should be able to service calls to toggle_radio. - */ -int __must_check rfkill_register(struct rfkill *rfkill) -{ - static atomic_t rfkill_no = ATOMIC_INIT(0); - struct device *dev = &rfkill->dev; - int error; - - if (WARN((!rfkill || !rfkill->toggle_radio || - rfkill->type >= RFKILL_TYPE_MAX || - rfkill->state >= RFKILL_STATE_MAX), - KERN_WARNING - "rfkill: attempt to register a " - "badly initialized rfkill struct\n")) - return -EINVAL; - - dev_set_name(dev, "rfkill%ld", (long)atomic_inc_return(&rfkill_no) - 1); - - rfkill_led_trigger_register(rfkill); - - error = rfkill_add_switch(rfkill); - if (error) { - rfkill_led_trigger_unregister(rfkill); - return error; - } - - error = device_add(dev); - if (error) { - rfkill_remove_switch(rfkill); - rfkill_led_trigger_unregister(rfkill); - return error; - } - - return 0; -} -EXPORT_SYMBOL(rfkill_register); - -/** - * rfkill_unregister - Unregister a rfkill structure. - * @rfkill: rfkill structure to be unregistered - * - * This function should be called by the network driver during device - * teardown to destroy rfkill structure. Note that rfkill_free() should - * _not_ be called after rfkill_unregister(). - */ -void rfkill_unregister(struct rfkill *rfkill) -{ - BUG_ON(!rfkill); - device_del(&rfkill->dev); - rfkill_remove_switch(rfkill); - rfkill_led_trigger_unregister(rfkill); - put_device(&rfkill->dev); -} -EXPORT_SYMBOL(rfkill_unregister); - -/** - * rfkill_set_default - set initial value for a switch type - * @type - the type of switch to set the default state of - * @state - the new default state for that group of switches - * - * Sets the initial state rfkill should use for a given type. - * The following initial states are allowed: RFKILL_STATE_SOFT_BLOCKED - * and RFKILL_STATE_UNBLOCKED. - * - * This function is meant to be used by platform drivers for platforms - * that can save switch state across power down/reboot. - * - * The default state for each switch type can be changed exactly once. - * After a switch of that type is registered, the default state cannot - * be changed anymore. This guards against multiple drivers on the - * same platform trying to set the initial switch default state, which - * is not allowed. - * - * Returns -EPERM if the state has already been set once or is in use, - * so drivers likely want to either ignore or at most printk(KERN_NOTICE) - * if this function returns -EPERM. - * - * Returns 0 if the new default state was set, or an error if it - * could not be set. - */ -int rfkill_set_default(enum rfkill_type type, enum rfkill_state state) -{ - int error; - - if (WARN((type >= RFKILL_TYPE_MAX || - (state != RFKILL_STATE_SOFT_BLOCKED && - state != RFKILL_STATE_UNBLOCKED)), - KERN_WARNING - "rfkill: illegal state %d or type %d passed as " - "parameter to rfkill_set_default\n", state, type)) - return -EINVAL; - - mutex_lock(&rfkill_global_mutex); - - if (!test_and_set_bit(type, rfkill_states_lockdflt)) { - rfkill_global_states[type].default_state = state; - rfkill_global_states[type].current_state = state; - error = 0; - } else - error = -EPERM; - - mutex_unlock(&rfkill_global_mutex); - return error; -} -EXPORT_SYMBOL_GPL(rfkill_set_default); - -/* - * Rfkill module initialization/deinitialization. - */ -static int __init rfkill_init(void) -{ - int error; - int i; - - /* RFKILL_STATE_HARD_BLOCKED is illegal here... 
*/ - if (rfkill_default_state != RFKILL_STATE_SOFT_BLOCKED && - rfkill_default_state != RFKILL_STATE_UNBLOCKED) - return -EINVAL; - - for (i = 0; i < RFKILL_TYPE_MAX; i++) - rfkill_global_states[i].default_state = rfkill_default_state; - - error = class_register(&rfkill_class); - if (error) { - printk(KERN_ERR "rfkill: unable to register rfkill class\n"); - return error; - } - - return 0; -} - -static void __exit rfkill_exit(void) -{ - class_unregister(&rfkill_class); -} - -subsys_initcall(rfkill_init); -module_exit(rfkill_exit); diff --git a/net/rfkill/rfkill.h b/net/rfkill/rfkill.h new file mode 100644 index 00000000000..d1117cb6e4d --- /dev/null +++ b/net/rfkill/rfkill.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007 Ivo van Doorn + * Copyright 2009 Johannes Berg + */ + +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef __RFKILL_INPUT_H +#define __RFKILL_INPUT_H + +/* core code */ +void rfkill_switch_all(const enum rfkill_type type, bool blocked); +void rfkill_epo(void); +void rfkill_restore_states(void); +void rfkill_remove_epo_lock(void); +bool rfkill_is_epo_lock_active(void); +bool rfkill_get_global_sw_state(const enum rfkill_type type); + +/* input handler */ +int rfkill_handler_init(void); +void rfkill_handler_exit(void); + +#endif /* __RFKILL_INPUT_H */ diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig index 1b46747a5f5..0bdbb692820 100644 --- a/net/wimax/Kconfig +++ b/net/wimax/Kconfig @@ -1,23 +1,9 @@ # # WiMAX LAN device configuration # -# Note the ugly 'depends on' on WIMAX: that disallows RFKILL to be a -# module if WIMAX is to be linked in. The WiMAX code is done in such a -# way that it doesn't require an explicit dependency on RFKILL in -# case an embedded system wants to rip it out. -# -# As well, enablement of the RFKILL code means we need the INPUT layer -# support to inject events coming from hw rfkill switches. That -# dependency could be killed if input.h provided appropriate means to -# work when input is disabled. - -comment "WiMAX Wireless Broadband support requires CONFIG_INPUT enabled" - depends on INPUT = n && RFKILL != n menuconfig WIMAX tristate "WiMAX Wireless Broadband support" - depends on (y && RFKILL != m) || m - depends on (INPUT && RFKILL != n) || RFKILL = n help Select to configure support for devices that provide diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c index a3616e2ccb8..bb102e4aa3e 100644 --- a/net/wimax/op-rfkill.c +++ b/net/wimax/op-rfkill.c @@ -29,8 +29,8 @@ * A non-polled generic rfkill device is embedded into the WiMAX * subsystem's representation of a device. * - * FIXME: Need polled support? use a timer or add the implementation - * to the stack. + * FIXME: Need polled support? Let drivers provide a poll routine + * and hand it to rfkill ops then? 
* * All device drivers have to do is, after wimax_dev_init(), call * wimax_report_rfkill_hw() and wimax_report_rfkill_sw() to update @@ -43,7 +43,7 @@ * wimax_rfkill() Kernel calling wimax_rfkill() * __wimax_rf_toggle_radio() * - * wimax_rfkill_toggle_radio() RF-Kill subsystem calling + * wimax_rfkill_set_radio_block() RF-Kill subsystem calling * __wimax_rf_toggle_radio() * * __wimax_rf_toggle_radio() @@ -65,15 +65,11 @@ #include #include #include -#include #include "wimax-internal.h" #define D_SUBMODULE op_rfkill #include "debug-levels.h" -#if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE) - - /** * wimax_report_rfkill_hw - Reports changes in the hardware RF switch * @@ -99,7 +95,6 @@ void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev, int result; struct device *dev = wimax_dev_to_dev(wimax_dev); enum wimax_st wimax_state; - enum rfkill_state rfkill_state; d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); BUG_ON(state == WIMAX_RF_QUERY); @@ -112,16 +107,15 @@ void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev, if (state != wimax_dev->rf_hw) { wimax_dev->rf_hw = state; - rfkill_state = state == WIMAX_RF_ON ? - RFKILL_STATE_UNBLOCKED : RFKILL_STATE_SOFT_BLOCKED; if (wimax_dev->rf_hw == WIMAX_RF_ON && wimax_dev->rf_sw == WIMAX_RF_ON) wimax_state = WIMAX_ST_READY; else wimax_state = WIMAX_ST_RADIO_OFF; + + rfkill_set_hw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); + __wimax_state_change(wimax_dev, wimax_state); - input_report_key(wimax_dev->rfkill_input, KEY_WIMAX, - rfkill_state); } error_not_ready: mutex_unlock(&wimax_dev->mutex); @@ -174,6 +168,7 @@ void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev, else wimax_state = WIMAX_ST_RADIO_OFF; __wimax_state_change(wimax_dev, wimax_state); + rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); } error_not_ready: mutex_unlock(&wimax_dev->mutex); @@ -249,36 +244,31 @@ out_no_change: * * NOTE: This call will block until the operation is completed. 
*/ -static -int wimax_rfkill_toggle_radio(void *data, enum rfkill_state state) +static int wimax_rfkill_set_radio_block(void *data, bool blocked) { int result; struct wimax_dev *wimax_dev = data; struct device *dev = wimax_dev_to_dev(wimax_dev); enum wimax_rf_state rf_state; - d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); - switch (state) { - case RFKILL_STATE_SOFT_BLOCKED: + d_fnstart(3, dev, "(wimax_dev %p blocked %u)\n", wimax_dev, blocked); + rf_state = WIMAX_RF_ON; + if (blocked) rf_state = WIMAX_RF_OFF; - break; - case RFKILL_STATE_UNBLOCKED: - rf_state = WIMAX_RF_ON; - break; - default: - BUG(); - } mutex_lock(&wimax_dev->mutex); if (wimax_dev->state <= __WIMAX_ST_QUIESCING) - result = 0; /* just pretend it didn't happen */ + result = 0; else result = __wimax_rf_toggle_radio(wimax_dev, rf_state); mutex_unlock(&wimax_dev->mutex); - d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", - wimax_dev, state, result); + d_fnend(3, dev, "(wimax_dev %p blocked %u) = %d\n", + wimax_dev, blocked, result); return result; } +static const struct rfkill_ops wimax_rfkill_ops = { + .set_block = wimax_rfkill_set_radio_block, +}; /** * wimax_rfkill - Set the software RF switch state for a WiMAX device @@ -322,6 +312,7 @@ int wimax_rfkill(struct wimax_dev *wimax_dev, enum wimax_rf_state state) result = __wimax_rf_toggle_radio(wimax_dev, state); if (result < 0) goto error; + rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); break; case WIMAX_RF_QUERY: break; @@ -349,40 +340,20 @@ int wimax_rfkill_add(struct wimax_dev *wimax_dev) { int result; struct rfkill *rfkill; - struct input_dev *input_dev; struct device *dev = wimax_dev_to_dev(wimax_dev); d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); /* Initialize RF Kill */ result = -ENOMEM; - rfkill = rfkill_allocate(dev, RFKILL_TYPE_WIMAX); + rfkill = rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX, + &wimax_rfkill_ops, wimax_dev); if (rfkill == NULL) goto error_rfkill_allocate; + + d_printf(1, dev, "rfkill %p\n", rfkill); + wimax_dev->rfkill = rfkill; - rfkill->name = wimax_dev->name; - rfkill->state = RFKILL_STATE_UNBLOCKED; - rfkill->data = wimax_dev; - rfkill->toggle_radio = wimax_rfkill_toggle_radio; - - /* Initialize the input device for the hw key */ - input_dev = input_allocate_device(); - if (input_dev == NULL) - goto error_input_allocate; - wimax_dev->rfkill_input = input_dev; - d_printf(1, dev, "rfkill %p input %p\n", rfkill, input_dev); - - input_dev->name = wimax_dev->name; - /* FIXME: get a real device bus ID and stuff? do we care? */ - input_dev->id.bustype = BUS_HOST; - input_dev->id.vendor = 0xffff; - input_dev->evbit[0] = BIT(EV_KEY); - set_bit(KEY_WIMAX, input_dev->keybit); - - /* Register both */ - result = input_register_device(wimax_dev->rfkill_input); - if (result < 0) - goto error_input_register; result = rfkill_register(wimax_dev->rfkill); if (result < 0) goto error_rfkill_register; @@ -394,17 +365,8 @@ int wimax_rfkill_add(struct wimax_dev *wimax_dev) d_fnend(3, dev, "(wimax_dev %p) = 0\n", wimax_dev); return 0; - /* if rfkill_register() succeeds, can't use rfkill_free() any - * more, only rfkill_unregister() [it owns the refcount]; with - * the input device we have the same issue--hence the if. 
*/ error_rfkill_register: - input_unregister_device(wimax_dev->rfkill_input); - wimax_dev->rfkill_input = NULL; -error_input_register: - if (wimax_dev->rfkill_input) - input_free_device(wimax_dev->rfkill_input); -error_input_allocate: - rfkill_free(wimax_dev->rfkill); + rfkill_destroy(wimax_dev->rfkill); error_rfkill_allocate: d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); return result; @@ -423,45 +385,12 @@ void wimax_rfkill_rm(struct wimax_dev *wimax_dev) { struct device *dev = wimax_dev_to_dev(wimax_dev); d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); - rfkill_unregister(wimax_dev->rfkill); /* frees */ - input_unregister_device(wimax_dev->rfkill_input); + rfkill_unregister(wimax_dev->rfkill); + rfkill_destroy(wimax_dev->rfkill); d_fnend(3, dev, "(wimax_dev %p)\n", wimax_dev); } -#else /* #ifdef CONFIG_RFKILL */ - -void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ -} -EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw); - -void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ -} -EXPORT_SYMBOL_GPL(wimax_report_rfkill_sw); - -int wimax_rfkill(struct wimax_dev *wimax_dev, - enum wimax_rf_state state) -{ - return WIMAX_RF_ON << 1 | WIMAX_RF_ON; -} -EXPORT_SYMBOL_GPL(wimax_rfkill); - -int wimax_rfkill_add(struct wimax_dev *wimax_dev) -{ - return 0; -} - -void wimax_rfkill_rm(struct wimax_dev *wimax_dev) -{ -} - -#endif /* #ifdef CONFIG_RFKILL */ - - /* * Exporting to user space over generic netlink * -- cgit v1.2.3-70-g09d2 From c64fb01627e24725d1f9d535e4426475a4415753 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jun 2009 13:01:38 +0200 Subject: rfkill: create useful userspace interface The new code added by this patch will make rfkill create a misc character device /dev/rfkill that userspace can use to control rfkill soft blocks and get status of devices as well as events when the status changes. Using it is very simple -- when you open it you can read a number of times to get the initial state, and every further read blocks (you can poll) on getting the next event from the kernel. The same structure you read is also used when writing to it to change the soft block of a given device, all devices of a given type, or all devices. This also makes CONFIG_RFKILL_INPUT selectable again in order to be able to test without it present since its functionality can now be replaced by userspace entirely and distros and users may not want the input part of rfkill interfering with their userspace code. We will also write a userspace daemon to handle all that and consequently add the input code to the feature removal schedule. In order to have rfkilld support both kernels with and without CONFIG_RFKILL_INPUT (or new kernels after its eventual removal) we also add an ioctl (that only exists if rfkill-input is present) to disable rfkill-input. It is not very efficient, but at least gives the correct behaviour in all cases. Signed-off-by: Johannes Berg Acked-by: Marcel Holtmann Signed-off-by: John W. 
Linville --- Documentation/feature-removal-schedule.txt | 7 + include/linux/rfkill.h | 84 +++++--- net/rfkill/Kconfig | 4 +- net/rfkill/core.c | 330 ++++++++++++++++++++++++++++- 4 files changed, 394 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index de491a3e231..edb2f0b0761 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -437,3 +437,10 @@ Why: Superseded by tdfxfb. I2C/DDC support used to live in a separate driver but this caused driver conflicts. Who: Jean Delvare Krzysztof Helt + +--------------------------- + +What: CONFIG_RFKILL_INPUT +When: 2.6.33 +Why: Should be implemented in userspace, policy daemon. +Who: Johannes Berg diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 090852c8de7..7c116f6631b 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -22,34 +22,17 @@ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include /* define userspace visible states */ #define RFKILL_STATE_SOFT_BLOCKED 0 #define RFKILL_STATE_UNBLOCKED 1 #define RFKILL_STATE_HARD_BLOCKED 2 -/* and that's all userspace gets */ -#ifdef __KERNEL__ -/* don't allow anyone to use these in the kernel */ -enum rfkill_user_states { - RFKILL_USER_STATE_SOFT_BLOCKED = RFKILL_STATE_SOFT_BLOCKED, - RFKILL_USER_STATE_UNBLOCKED = RFKILL_STATE_UNBLOCKED, - RFKILL_USER_STATE_HARD_BLOCKED = RFKILL_STATE_HARD_BLOCKED, -}; -#undef RFKILL_STATE_SOFT_BLOCKED -#undef RFKILL_STATE_UNBLOCKED -#undef RFKILL_STATE_HARD_BLOCKED - -#include -#include -#include -#include -#include -#include - /** * enum rfkill_type - type of rfkill switch. * + * @RFKILL_TYPE_ALL: toggles all switches (userspace only) * @RFKILL_TYPE_WLAN: switch is on a 802.11 wireless network device. * @RFKILL_TYPE_BLUETOOTH: switch is on a bluetooth device. * @RFKILL_TYPE_UWB: switch is on a ultra wideband device. @@ -58,6 +41,7 @@ enum rfkill_user_states { * @NUM_RFKILL_TYPES: number of defined rfkill types */ enum rfkill_type { + RFKILL_TYPE_ALL = 0, RFKILL_TYPE_WLAN, RFKILL_TYPE_BLUETOOTH, RFKILL_TYPE_UWB, @@ -66,6 +50,62 @@ enum rfkill_type { NUM_RFKILL_TYPES, }; +/** + * enum rfkill_operation - operation types + * @RFKILL_OP_ADD: a device was added + * @RFKILL_OP_DEL: a device was removed + * @RFKILL_OP_CHANGE: a device's state changed -- userspace changes one device + * @RFKILL_OP_CHANGE_ALL: userspace changes all devices (of a type, or all) + */ +enum rfkill_operation { + RFKILL_OP_ADD = 0, + RFKILL_OP_DEL, + RFKILL_OP_CHANGE, + RFKILL_OP_CHANGE_ALL, +}; + +/** + * struct rfkill_event - events for userspace on /dev/rfkill + * @idx: index of dev rfkill + * @type: type of the rfkill struct + * @op: operation code + * @hard: hard state (0/1) + * @soft: soft state (0/1) + * + * Structure used for userspace communication on /dev/rfkill, + * used for events from the kernel and control to the kernel. 
+ */ +struct rfkill_event { + __u32 idx; + __u8 type; + __u8 op; + __u8 soft, hard; +} __packed; + +/* ioctl for turning off rfkill-input (if present) */ +#define RFKILL_IOC_MAGIC 'R' +#define RFKILL_IOC_NOINPUT 1 +#define RFKILL_IOCTL_NOINPUT _IO(RFKILL_IOC_MAGIC, RFKILL_IOC_NOINPUT) + +/* and that's all userspace gets */ +#ifdef __KERNEL__ +/* don't allow anyone to use these in the kernel */ +enum rfkill_user_states { + RFKILL_USER_STATE_SOFT_BLOCKED = RFKILL_STATE_SOFT_BLOCKED, + RFKILL_USER_STATE_UNBLOCKED = RFKILL_STATE_UNBLOCKED, + RFKILL_USER_STATE_HARD_BLOCKED = RFKILL_STATE_HARD_BLOCKED, +}; +#undef RFKILL_STATE_SOFT_BLOCKED +#undef RFKILL_STATE_UNBLOCKED +#undef RFKILL_STATE_HARD_BLOCKED + +#include +#include +#include +#include +#include +#include + /* this is opaque */ struct rfkill; @@ -84,11 +124,7 @@ struct rfkill; * the rfkill core query your driver before setting a requested * block. * @set_block: turn the transmitter on (blocked == false) or off - * (blocked == true) -- this is called only while the transmitter - * is not hard-blocked, but note that the core's view of whether - * the transmitter is hard-blocked might differ from your driver's - * view due to race conditions, so it is possible that it is still - * called at the same time as you are calling rfkill_set_hw_state(). + * (blocked == true) -- ignore and return 0 when hard blocked. * This callback must be assigned. */ struct rfkill_ops { diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig index b47f72fae05..fd7600d8ab1 100644 --- a/net/rfkill/Kconfig +++ b/net/rfkill/Kconfig @@ -18,7 +18,7 @@ config RFKILL_LEDS default y config RFKILL_INPUT - bool + bool "RF switch input support" depends on RFKILL depends on INPUT = y || RFKILL = INPUT - default y + default y if !EMBEDDED diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 30a6f8d819b..2230aa6b14f 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -28,6 +28,10 @@ #include #include #include +#include +#include +#include +#include #include "rfkill.h" @@ -49,6 +53,8 @@ struct rfkill { unsigned long state; + u32 idx; + bool registered; bool suspended; @@ -69,6 +75,18 @@ struct rfkill { }; #define to_rfkill(d) container_of(d, struct rfkill, dev) +struct rfkill_int_event { + struct list_head list; + struct rfkill_event ev; +}; + +struct rfkill_data { + struct list_head list; + struct list_head events; + struct mutex mtx; + wait_queue_head_t read_wait; + bool input_handler; +}; MODULE_AUTHOR("Ivo van Doorn "); @@ -90,6 +108,7 @@ MODULE_LICENSE("GPL"); */ static LIST_HEAD(rfkill_list); /* list of registered rf switches */ static DEFINE_MUTEX(rfkill_global_mutex); +static LIST_HEAD(rfkill_fds); /* list of open fds of /dev/rfkill */ static unsigned int rfkill_default_state = 1; module_param_named(default_state, rfkill_default_state, uint, 0444); @@ -171,12 +190,48 @@ static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) } #endif /* CONFIG_RFKILL_LEDS */ -static void rfkill_uevent(struct rfkill *rfkill) +static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill, + enum rfkill_operation op) +{ + unsigned long flags; + + ev->idx = rfkill->idx; + ev->type = rfkill->type; + ev->op = op; + + spin_lock_irqsave(&rfkill->lock, flags); + ev->hard = !!(rfkill->state & RFKILL_BLOCK_HW); + ev->soft = !!(rfkill->state & (RFKILL_BLOCK_SW | + RFKILL_BLOCK_SW_PREV)); + spin_unlock_irqrestore(&rfkill->lock, flags); +} + +static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op) +{ + struct rfkill_data *data; + struct 
rfkill_int_event *ev; + + list_for_each_entry(data, &rfkill_fds, list) { + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + continue; + rfkill_fill_event(&ev->ev, rfkill, op); + mutex_lock(&data->mtx); + list_add_tail(&ev->list, &data->events); + mutex_unlock(&data->mtx); + wake_up_interruptible(&data->read_wait); + } +} + +static void rfkill_event(struct rfkill *rfkill) { if (!rfkill->registered || rfkill->suspended) return; kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE); + + /* also send event to /dev/rfkill */ + rfkill_send_events(rfkill, RFKILL_OP_CHANGE); } static bool __rfkill_set_hw_state(struct rfkill *rfkill, @@ -260,9 +315,12 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); - rfkill_uevent(rfkill); + rfkill_event(rfkill); } +#ifdef CONFIG_RFKILL_INPUT +static atomic_t rfkill_input_disabled = ATOMIC_INIT(0); + /** * __rfkill_switch_all - Toggle state of all switches of given type * @type: type of interfaces to be affected @@ -299,6 +357,9 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) */ void rfkill_switch_all(enum rfkill_type type, bool blocked) { + if (atomic_read(&rfkill_input_disabled)) + return; + mutex_lock(&rfkill_global_mutex); if (!rfkill_epo_lock_active) @@ -321,6 +382,9 @@ void rfkill_epo(void) struct rfkill *rfkill; int i; + if (atomic_read(&rfkill_input_disabled)) + return; + mutex_lock(&rfkill_global_mutex); rfkill_epo_lock_active = true; @@ -331,6 +395,7 @@ void rfkill_epo(void) rfkill_global_states[i].def = rfkill_global_states[i].cur; rfkill_global_states[i].cur = true; } + mutex_unlock(&rfkill_global_mutex); } @@ -345,6 +410,9 @@ void rfkill_restore_states(void) { int i; + if (atomic_read(&rfkill_input_disabled)) + return; + mutex_lock(&rfkill_global_mutex); rfkill_epo_lock_active = false; @@ -361,6 +429,9 @@ void rfkill_restore_states(void) */ void rfkill_remove_epo_lock(void) { + if (atomic_read(&rfkill_input_disabled)) + return; + mutex_lock(&rfkill_global_mutex); rfkill_epo_lock_active = false; mutex_unlock(&rfkill_global_mutex); @@ -391,9 +462,12 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type) { return rfkill_global_states[type].cur; } +#endif void rfkill_set_global_sw_state(const enum rfkill_type type, bool blocked) { + BUG_ON(type == RFKILL_TYPE_ALL); + mutex_lock(&rfkill_global_mutex); /* don't allow unblock when epo */ @@ -537,6 +611,15 @@ static ssize_t rfkill_type_show(struct device *dev, return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type)); } +static ssize_t rfkill_idx_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%d\n", rfkill->idx); +} + static u8 user_state_from_blocked(unsigned long state) { if (state & RFKILL_BLOCK_HW) @@ -594,6 +677,7 @@ static ssize_t rfkill_claim_store(struct device *dev, static struct device_attribute rfkill_dev_attrs[] = { __ATTR(name, S_IRUGO, rfkill_name_show, NULL), __ATTR(type, S_IRUGO, rfkill_type_show, NULL), + __ATTR(index, S_IRUGO, rfkill_idx_show, NULL), __ATTR(state, S_IRUGO|S_IWUSR, rfkill_state_show, rfkill_state_store), __ATTR(claim, S_IRUGO|S_IWUSR, rfkill_claim_show, rfkill_claim_store), __ATTR_NULL @@ -708,7 +792,7 @@ struct rfkill * __must_check rfkill_alloc(const char *name, if (WARN_ON(!name)) return NULL; - if (WARN_ON(type >= NUM_RFKILL_TYPES)) + if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES)) return NULL; rfkill = 
kzalloc(sizeof(*rfkill), GFP_KERNEL); @@ -754,7 +838,9 @@ static void rfkill_uevent_work(struct work_struct *work) rfkill = container_of(work, struct rfkill, uevent_work); - rfkill_uevent(rfkill); + mutex_lock(&rfkill_global_mutex); + rfkill_event(rfkill); + mutex_unlock(&rfkill_global_mutex); } static void rfkill_sync_work(struct work_struct *work) @@ -785,6 +871,7 @@ int __must_check rfkill_register(struct rfkill *rfkill) goto unlock; } + rfkill->idx = rfkill_no; dev_set_name(dev, "rfkill%lu", rfkill_no); rfkill_no++; @@ -819,6 +906,7 @@ int __must_check rfkill_register(struct rfkill *rfkill) INIT_WORK(&rfkill->sync_work, rfkill_sync_work); schedule_work(&rfkill->sync_work); + rfkill_send_events(rfkill, RFKILL_OP_ADD); mutex_unlock(&rfkill_global_mutex); return 0; @@ -848,6 +936,7 @@ void rfkill_unregister(struct rfkill *rfkill) device_del(&rfkill->dev); mutex_lock(&rfkill_global_mutex); + rfkill_send_events(rfkill, RFKILL_OP_DEL); list_del_init(&rfkill->node); mutex_unlock(&rfkill_global_mutex); @@ -862,6 +951,227 @@ void rfkill_destroy(struct rfkill *rfkill) } EXPORT_SYMBOL(rfkill_destroy); +static int rfkill_fop_open(struct inode *inode, struct file *file) +{ + struct rfkill_data *data; + struct rfkill *rfkill; + struct rfkill_int_event *ev, *tmp; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + INIT_LIST_HEAD(&data->events); + mutex_init(&data->mtx); + init_waitqueue_head(&data->read_wait); + + mutex_lock(&rfkill_global_mutex); + mutex_lock(&data->mtx); + /* + * start getting events from elsewhere but hold mtx to get + * startup events added first + */ + list_add(&data->list, &rfkill_fds); + + list_for_each_entry(rfkill, &rfkill_list, node) { + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + goto free; + rfkill_fill_event(&ev->ev, rfkill, RFKILL_OP_ADD); + list_add_tail(&ev->list, &data->events); + } + mutex_unlock(&data->mtx); + mutex_unlock(&rfkill_global_mutex); + + file->private_data = data; + + return nonseekable_open(inode, file); + + free: + mutex_unlock(&data->mtx); + mutex_unlock(&rfkill_global_mutex); + mutex_destroy(&data->mtx); + list_for_each_entry_safe(ev, tmp, &data->events, list) + kfree(ev); + kfree(data); + return -ENOMEM; +} + +static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait) +{ + struct rfkill_data *data = file->private_data; + unsigned int res = POLLOUT | POLLWRNORM; + + poll_wait(file, &data->read_wait, wait); + + mutex_lock(&data->mtx); + if (!list_empty(&data->events)) + res = POLLIN | POLLRDNORM; + mutex_unlock(&data->mtx); + + return res; +} + +static bool rfkill_readable(struct rfkill_data *data) +{ + bool r; + + mutex_lock(&data->mtx); + r = !list_empty(&data->events); + mutex_unlock(&data->mtx); + + return r; +} + +static ssize_t rfkill_fop_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct rfkill_data *data = file->private_data; + struct rfkill_int_event *ev; + unsigned long sz; + int ret; + + mutex_lock(&data->mtx); + + while (list_empty(&data->events)) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + goto out; + } + mutex_unlock(&data->mtx); + ret = wait_event_interruptible(data->read_wait, + rfkill_readable(data)); + mutex_lock(&data->mtx); + + if (ret) + goto out; + } + + ev = list_first_entry(&data->events, struct rfkill_int_event, + list); + + sz = min_t(unsigned long, sizeof(ev->ev), count); + ret = sz; + if (copy_to_user(buf, &ev->ev, sz)) + ret = -EFAULT; + + list_del(&ev->list); + kfree(ev); + out: + mutex_unlock(&data->mtx); + return ret; +} + 
+static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct rfkill *rfkill; + struct rfkill_event ev; + + /* we don't need the 'hard' variable but accept it */ + if (count < sizeof(ev) - 1) + return -EINVAL; + + if (copy_from_user(&ev, buf, sizeof(ev) - 1)) + return -EFAULT; + + if (ev.op != RFKILL_OP_CHANGE && ev.op != RFKILL_OP_CHANGE_ALL) + return -EINVAL; + + if (ev.type >= NUM_RFKILL_TYPES) + return -EINVAL; + + mutex_lock(&rfkill_global_mutex); + + if (ev.op == RFKILL_OP_CHANGE_ALL) { + if (ev.type == RFKILL_TYPE_ALL) { + enum rfkill_type i; + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = ev.soft; + } else { + rfkill_global_states[ev.type].cur = ev.soft; + } + } + + list_for_each_entry(rfkill, &rfkill_list, node) { + if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL) + continue; + + if (rfkill->type != ev.type && ev.type != RFKILL_TYPE_ALL) + continue; + + rfkill_set_block(rfkill, ev.soft); + } + mutex_unlock(&rfkill_global_mutex); + + return count; +} + +static int rfkill_fop_release(struct inode *inode, struct file *file) +{ + struct rfkill_data *data = file->private_data; + struct rfkill_int_event *ev, *tmp; + + mutex_lock(&rfkill_global_mutex); + list_del(&data->list); + mutex_unlock(&rfkill_global_mutex); + + mutex_destroy(&data->mtx); + list_for_each_entry_safe(ev, tmp, &data->events, list) + kfree(ev); + +#ifdef CONFIG_RFKILL_INPUT + if (data->input_handler) + atomic_dec(&rfkill_input_disabled); +#endif + + kfree(data); + + return 0; +} + +#ifdef CONFIG_RFKILL_INPUT +static long rfkill_fop_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct rfkill_data *data = file->private_data; + + if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC) + return -ENOSYS; + + if (_IOC_NR(cmd) != RFKILL_IOC_NOINPUT) + return -ENOSYS; + + mutex_lock(&data->mtx); + + if (!data->input_handler) { + atomic_inc(&rfkill_input_disabled); + data->input_handler = true; + } + + mutex_unlock(&data->mtx); + + return 0; +} +#endif + +static const struct file_operations rfkill_fops = { + .open = rfkill_fop_open, + .read = rfkill_fop_read, + .write = rfkill_fop_write, + .poll = rfkill_fop_poll, + .release = rfkill_fop_release, +#ifdef CONFIG_RFKILL_INPUT + .unlocked_ioctl = rfkill_fop_ioctl, + .compat_ioctl = rfkill_fop_ioctl, +#endif +}; + +static struct miscdevice rfkill_miscdev = { + .name = "rfkill", + .fops = &rfkill_fops, + .minor = MISC_DYNAMIC_MINOR, +}; static int __init rfkill_init(void) { @@ -875,10 +1185,19 @@ static int __init rfkill_init(void) if (error) goto out; + error = misc_register(&rfkill_miscdev); + if (error) { + class_unregister(&rfkill_class); + goto out; + } + #ifdef CONFIG_RFKILL_INPUT error = rfkill_handler_init(); - if (error) + if (error) { + misc_deregister(&rfkill_miscdev); class_unregister(&rfkill_class); + goto out; + } #endif out: @@ -891,6 +1210,7 @@ static void __exit rfkill_exit(void) #ifdef CONFIG_RFKILL_INPUT rfkill_handler_exit(); #endif + misc_deregister(&rfkill_miscdev); class_unregister(&rfkill_class); } module_exit(rfkill_exit); -- cgit v1.2.3-70-g09d2 From 6081162e2ed78dfcf149b076b047078ab1445cc2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jun 2009 13:01:40 +0200 Subject: rfkill: add function to query state Sometimes it is necessary to know how the state is, and it is easier to query rfkill than keep track of it somewhere else, so add a function for that. This could later be expanded to return hard/soft block, but so far that isn't necessary. 
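A minimal sketch of the intended driver-side use (the mydrv structure, its rfkill pointer and the choice of error code are hypothetical, not part of this patch):

    static int mydrv_start_tx(struct mydrv *priv)
    {
    	/* rfkill_blocked() returns true if either the hard or the
    	 * soft block is currently set for this device */
    	if (rfkill_blocked(priv->rfkill))
    		return -EPERM;	/* error choice is up to the driver */

    	return mydrv_program_radio(priv);
    }
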
Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/rfkill.h | 12 ++++++++++++ net/rfkill/core.c | 13 +++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 7c116f6631b..ee3eddea856 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -261,6 +261,13 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw); * registered drivers? */ void rfkill_set_global_sw_state(const enum rfkill_type type, bool blocked); + +/** + * rfkill_blocked - query rfkill block + * + * @rfkill: rfkill struct to query + */ +bool rfkill_blocked(struct rfkill *rfkill); #else /* !RFKILL */ static inline struct rfkill * __must_check rfkill_alloc(const char *name, @@ -313,6 +320,11 @@ static inline void rfkill_set_global_sw_state(const enum rfkill_type type, bool blocked) { } + +static inline bool rfkill_blocked(struct rfkill *rfkill) +{ + return false; +} #endif /* RFKILL || RFKILL_MODULE */ diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 2230aa6b14f..91e9168b544 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -773,6 +773,19 @@ static struct class rfkill_class = { .resume = rfkill_resume, }; +bool rfkill_blocked(struct rfkill *rfkill) +{ + unsigned long flags; + u32 state; + + spin_lock_irqsave(&rfkill->lock, flags); + state = rfkill->state; + spin_unlock_irqrestore(&rfkill->lock, flags); + + return !!(state & RFKILL_BLOCK_ANY); +} +EXPORT_SYMBOL(rfkill_blocked); + struct rfkill * __must_check rfkill_alloc(const char *name, struct device *parent, -- cgit v1.2.3-70-g09d2 From e0a94c2a63f2644826069044649669b5e7ca75d3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 3 Jun 2009 16:04:31 -0400 Subject: security: use mmap_min_addr independently of security models This patch removes the dependency of mmap_min_addr on CONFIG_SECURITY. It also sets a default mmap_min_addr of 4096. mmapping of addresses below 4096 will only be possible for processes with CAP_SYS_RAWIO. 
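A rough userspace illustration of the new behaviour (this demo program is not part of the patch): without CAP_SYS_RAWIO, a fixed mapping below the default mmap_min_addr of 4096 should now fail with EACCES, since the mmap path rejects addresses below the limit.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
    	/* page zero lies below the default mmap_min_addr of 4096 */
    	void *p = mmap((void *)0, 4096, PROT_READ | PROT_WRITE,
    		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

    	if (p == MAP_FAILED)
    		printf("mmap at 0 failed: %s\n", strerror(errno));
    	else
    		printf("mapped page zero at %p\n", p);
    	return 0;
    }
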
Signed-off-by: Christoph Lameter Acked-by: Eric Paris Looks-ok-by: Linus Torvalds Signed-off-by: James Morris --- include/linux/mm.h | 2 -- include/linux/security.h | 2 ++ kernel/sysctl.c | 2 -- mm/Kconfig | 19 +++++++++++++++++++ mm/mmap.c | 3 +++ security/Kconfig | 22 +--------------------- security/security.c | 3 --- 7 files changed, 25 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index bff1f0d475c..0c21af6abff 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -580,12 +580,10 @@ static inline void set_page_links(struct page *page, enum zone_type zone, */ static inline unsigned long round_hint_to_min(unsigned long hint) { -#ifdef CONFIG_SECURITY hint &= PAGE_MASK; if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); -#endif return hint; } diff --git a/include/linux/security.h b/include/linux/security.h index d5fd6163606..5eff459b383 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2197,6 +2197,8 @@ static inline int security_file_mmap(struct file *file, unsigned long reqprot, unsigned long addr, unsigned long addr_only) { + if ((addr < mmap_min_addr) && !capable(CAP_SYS_RAWIO)) + return -EACCES; return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 149581fb48a..45bd711a242 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1237,7 +1237,6 @@ static struct ctl_table vm_table[] = { .strategy = &sysctl_jiffies, }, #endif -#ifdef CONFIG_SECURITY { .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", @@ -1246,7 +1245,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, -#endif #ifdef CONFIG_NUMA { .ctl_name = CTL_UNNUMBERED, diff --git a/mm/Kconfig b/mm/Kconfig index c2b57d81e15..71830ba7b98 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -226,6 +226,25 @@ config HAVE_MLOCKED_PAGE_BIT config MMU_NOTIFIER bool +config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + default 4096 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality would either need additional + permissions from either the LSM or the capabilities module or have + this protection disabled. + + This value can be changed after boot using the + /proc/sys/vm/mmap_min_addr tunable. + + config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" depends on !MMU diff --git a/mm/mmap.c b/mm/mmap.c index 6b7b1a95944..2b43fa1aa3c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -87,6 +87,9 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; +/* amount of vm to protect from userspace access */ +unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; + /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/security/Kconfig b/security/Kconfig index bb244774e9d..d23c839038f 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -110,28 +110,8 @@ config SECURITY_ROOTPLUG See for more information about this module. 
- - If you are unsure how to answer this question, answer N. - -config SECURITY_DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" - depends on SECURITY - default 0 - help - This is the portion of low virtual memory which should be protected - from userspace allocation. Keeping a user from writing to low pages - can help reduce the impact of kernel NULL pointer bugs. - - For most ia64, ppc64 and x86 users with lots of address space - a value of 65536 is reasonable and should cause no problems. - On arm and other archs it should not be higher than 32768. - Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. - - This value can be changed after boot using the - /proc/sys/vm/mmap_min_addr tunable. + If you are unsure how to answer this question, answer N. source security/selinux/Kconfig source security/smack/Kconfig diff --git a/security/security.c b/security/security.c index 5284255c5cd..dc7674fbfc7 100644 --- a/security/security.c +++ b/security/security.c @@ -26,9 +26,6 @@ extern void security_fixup_ops(struct security_operations *ops); struct security_operations *security_ops; /* Initialized to NULL */ -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR; - static inline int verify(struct security_operations *ops) { /* verify the security_operations structure exists */ -- cgit v1.2.3-70-g09d2 From 278b2513f76161a9cf1ebddd620dc9d1714fe573 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 3 Jun 2009 21:20:51 -0700 Subject: gso: Stop fraglists from escaping As it stands skb fraglists can get past the check in dev_queue_xmit if the skb is marked as GSO. In particular, if the packet doesn't have the proper checksums for GSO, but can otherwise be handled by the underlying device, we will not perform the fraglist check on it at all. If the underlying device cannot handle fraglists, then this will break. The fix is as simple as moving the fraglist check from the device check into skb_gso_ok. This has caused crashes with Xen when used together with GRO which can generate GSO packets with fraglists. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bbfabf3012b..2a801380b50 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1903,15 +1903,14 @@ static inline int net_gso_ok(int features, int gso_type) static inline int skb_gso_ok(struct sk_buff *skb, int features) { - return net_gso_ok(features, skb_shinfo(skb)->gso_type); + return net_gso_ok(features, skb_shinfo(skb)->gso_type) && + (!skb_shinfo(skb)->frag_list || (features & NETIF_F_FRAGLIST)); } static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { return skb_is_gso(skb) && (!skb_gso_ok(skb, dev->features) || - (skb_shinfo(skb)->frag_list && - !(dev->features & NETIF_F_FRAGLIST)) || unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); } -- cgit v1.2.3-70-g09d2 From a5e78820966e17c2316866e00047e4e7e5480f04 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Thu, 4 Jun 2009 16:54:42 +0200 Subject: netfilter: x_tables: added hook number into match extension parameter structure. 
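To sketch what this enables (a hypothetical match extension, assuming <linux/netfilter.h> and <linux/netfilter/x_tables.h>; not code from the patch): a match callback can now vary its decision by the hook it is evaluated in, e.g. matching only in LOCAL_IN.

    static bool mymatch_mt(const struct sk_buff *skb,
    		       const struct xt_match_param *par)
    {
    	/* only consider packets traversing the LOCAL_IN hook */
    	if (par->hooknum != NF_INET_LOCAL_IN)
    		return false;

    	/* real payload inspection would go here */
    	return true;
    }
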
Signed-off-by: Evgeniy Polyakov Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 6 ++++-- net/bridge/netfilter/ebtables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index c9efe039dc5..1030b759389 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -184,9 +184,10 @@ struct xt_counters_info * @matchinfo: per-match data * @fragoff: packet is a fragment, this is the data offset * @thoff: position of transport header relative to skb->data - * @hotdrop: drop packet if we had inspection problems + * @hook: hook number given packet came from * @family: Actual NFPROTO_* through which the function is invoked * (helpful when match->family == NFPROTO_UNSPEC) + * @hotdrop: drop packet if we had inspection problems */ struct xt_match_param { const struct net_device *in, *out; @@ -194,8 +195,9 @@ struct xt_match_param { const void *matchinfo; int fragoff; unsigned int thoff; - bool *hotdrop; + unsigned int hooknum; u_int8_t family; + bool *hotdrop; }; /** diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 24555834d43..37928d5f284 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -170,7 +170,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, mtpar.in = tgpar.in = in; mtpar.out = tgpar.out = out; mtpar.hotdrop = &hotdrop; - tgpar.hooknum = hook; + mtpar.hooknum = tgpar.hooknum = hook; read_lock_bh(&table->lock); private = table->private; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 7b35c0b3841..5bf7c3f185d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -343,7 +343,7 @@ ipt_do_table(struct sk_buff *skb, mtpar.in = tgpar.in = in; mtpar.out = tgpar.out = out; mtpar.family = tgpar.family = NFPROTO_IPV4; - tgpar.hooknum = hook; + mtpar.hooknum = tgpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); xt_info_rdlock_bh(); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 5164e0bf3bc..ced1f2c0cb6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -369,7 +369,7 @@ ip6t_do_table(struct sk_buff *skb, mtpar.in = tgpar.in = in; mtpar.out = tgpar.out = out; mtpar.family = tgpar.family = NFPROTO_IPV6; - tgpar.hooknum = hook; + mtpar.hooknum = tgpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); -- cgit v1.2.3-70-g09d2 From 60313ebed739b331e8e61079da27a11ee3b73a30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jun 2009 16:53:44 +0200 Subject: perf_counter: Add fork event Create a fork event so that we can easily clone the comm and dso maps without having to generate all those events. 
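For reference, a ring-buffer consumer would see the new record laid out as documented in the header change below; this struct is a hypothetical userspace mirror of that layout, not something the patch itself adds:

    struct fork_event_record {
    	struct perf_event_header header; /* header.type == PERF_EVENT_FORK */
    	__u32 pid;  /* the newly created task */
    	__u32 ppid; /* its parent */
    };
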
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++ kernel/fork.c | 4 +- kernel/perf_counter.c | 131 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 126 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 37d5541d74c..380247bdb91 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -276,6 +276,14 @@ enum perf_event_type { PERF_EVENT_THROTTLE = 5, PERF_EVENT_UNTHROTTLE = 6, + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * }; + */ + PERF_EVENT_FORK = 7, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -618,6 +626,7 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); extern void perf_counter_comm(struct task_struct *tsk); +extern void perf_counter_fork(struct task_struct *tsk); extern void perf_counter_task_migration(struct task_struct *task, int cpu); @@ -673,6 +682,7 @@ perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } static inline void perf_counter_comm(struct task_struct *tsk) { } +static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } static inline void perf_counter_task_migration(struct task_struct *task, int cpu) { } diff --git a/kernel/fork.c b/kernel/fork.c index b7d7a9f0bd7..f4466ca37ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1412,12 +1412,12 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); - } else { + } else if (!(clone_flags & CLONE_VM)) { /* * vfork will do an exec which will call * set_task_comm() */ - perf_counter_comm(p); + perf_counter_fork(p); } audit_finish_fork(p); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0bb03f15a5b..78c58623a0d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -40,9 +40,9 @@ static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_tracking __read_mostly; -static atomic_t nr_munmap_tracking __read_mostly; -static atomic_t nr_comm_tracking __read_mostly; +static atomic_t nr_mmap_counters __read_mostly; +static atomic_t nr_munmap_counters __read_mostly; +static atomic_t nr_comm_counters __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ @@ -1447,11 +1447,11 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_counters); if (counter->attr.mmap) - atomic_dec(&nr_mmap_tracking); + atomic_dec(&nr_mmap_counters); if (counter->attr.munmap) - atomic_dec(&nr_munmap_tracking); + atomic_dec(&nr_munmap_counters); if (counter->attr.comm) - atomic_dec(&nr_comm_tracking); + atomic_dec(&nr_comm_counters); if (counter->destroy) counter->destroy(counter); @@ -2475,6 +2475,105 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_end(&handle); } +/* + * fork tracking + */ + +struct perf_fork_event { + struct task_struct *task; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + } event; +}; + +static void perf_counter_fork_output(struct perf_counter *counter, + 
struct perf_fork_event *fork_event) +{ + struct perf_output_handle handle; + int size = fork_event->event.header.size; + struct task_struct *task = fork_event->task; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + fork_event->event.pid = perf_counter_pid(counter, task); + fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + + perf_output_put(&handle, fork_event->event); + perf_output_end(&handle); +} + +static int perf_counter_fork_match(struct perf_counter *counter) +{ + if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap) + return 1; + + return 0; +} + +static void perf_counter_fork_ctx(struct perf_counter_context *ctx, + struct perf_fork_event *fork_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_fork_match(counter)) + perf_counter_fork_output(counter, fork_event); + } + rcu_read_unlock(); +} + +static void perf_counter_fork_event(struct perf_fork_event *fork_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events end up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_fork_ctx(ctx, fork_event); + rcu_read_unlock(); +} + +void perf_counter_fork(struct task_struct *task) +{ + struct perf_fork_event fork_event; + + if (!atomic_read(&nr_comm_counters) && + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_munmap_counters)) + return; + + fork_event = (struct perf_fork_event){ + .task = task, + .event = { + .header = { + .type = PERF_EVENT_FORK, + .size = sizeof(fork_event.event), + }, + }, + }; + + perf_counter_fork_event(&fork_event); +} + /* * comm tracking */ @@ -2511,11 +2610,9 @@ static void perf_counter_comm_output(struct perf_counter *counter, perf_output_end(&handle); } -static int perf_counter_comm_match(struct perf_counter *counter, - struct perf_comm_event *comm_event) +static int perf_counter_comm_match(struct perf_counter *counter) { - if (counter->attr.comm && - comm_event->event.header.type == PERF_EVENT_COMM) + if (counter->attr.comm) return 1; return 0; @@ -2531,7 +2628,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter, comm_event)) + if (perf_counter_comm_match(counter)) perf_counter_comm_output(counter, comm_event); } rcu_read_unlock(); @@ -2570,7 +2667,7 @@ void perf_counter_comm(struct task_struct *task) { struct perf_comm_event comm_event; - if (!atomic_read(&nr_comm_tracking)) + if (!atomic_read(&nr_comm_counters)) return; comm_event = (struct perf_comm_event){ @@ -2708,7 +2805,7 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_mmap_tracking)) + if (!atomic_read(&nr_mmap_counters)) return; mmap_event = (struct perf_mmap_event){ @@ -2729,7 +2826,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_munmap_tracking)) + if (!atomic_read(&nr_munmap_counters)) return; mmap_event = (struct perf_mmap_event){ @@ -3427,11 +3524,11 @@ done: 
atomic_inc(&nr_counters); if (counter->attr.mmap) - atomic_inc(&nr_mmap_tracking); + atomic_inc(&nr_mmap_counters); if (counter->attr.munmap) - atomic_inc(&nr_munmap_tracking); + atomic_inc(&nr_munmap_counters); if (counter->attr.comm) - atomic_inc(&nr_comm_tracking); + atomic_inc(&nr_comm_counters); return counter; } -- cgit v1.2.3-70-g09d2 From d99e9446200c1ffab28cb0e39b76c34a2bfafd06 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jun 2009 17:08:58 +0200 Subject: perf_counter: Remove munmap stuff In name of keeping it simple, only track mmap events. Userspace will have to remove old overlapping maps when it encounters them. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 +---------- kernel/perf_counter.c | 38 +++----------------------------------- mm/mmap.c | 6 ------ 3 files changed, 4 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 380247bdb91..6ca403acd41 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -148,11 +148,10 @@ struct perf_counter_attr { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ mmap : 1, /* include mmap data */ - munmap : 1, /* include munmap data */ comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 52; + __reserved_1 : 53; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -246,7 +245,6 @@ enum perf_event_type { * }; */ PERF_EVENT_MMAP = 1, - PERF_EVENT_MUNMAP = 2, /* * struct { @@ -622,9 +620,6 @@ extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); extern void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -extern void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file); - extern void perf_counter_comm(struct task_struct *tsk); extern void perf_counter_fork(struct task_struct *tsk); @@ -677,10 +672,6 @@ static inline void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } -static inline void -perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } - static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 78c58623a0d..195712e20d0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -41,7 +41,6 @@ static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_munmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ @@ -1448,8 +1447,6 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_counters); if (counter->attr.mmap) atomic_dec(&nr_mmap_counters); - if (counter->attr.munmap) - atomic_dec(&nr_munmap_counters); if (counter->attr.comm) atomic_dec(&nr_comm_counters); @@ -2510,7 +2507,7 @@ static void perf_counter_fork_output(struct perf_counter *counter, static int perf_counter_fork_match(struct perf_counter *counter) { - if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap) + 
if (counter->attr.comm || counter->attr.mmap) return 1; return 0; @@ -2557,8 +2554,7 @@ void perf_counter_fork(struct task_struct *task) struct perf_fork_event fork_event; if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_munmap_counters)) + !atomic_read(&nr_mmap_counters)) return; fork_event = (struct perf_fork_event){ @@ -2722,12 +2718,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->attr.mmap && - mmap_event->event.header.type == PERF_EVENT_MMAP) - return 1; - - if (counter->attr.munmap && - mmap_event->event.header.type == PERF_EVENT_MUNMAP) + if (counter->attr.mmap) return 1; return 0; @@ -2821,27 +2812,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } -void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_munmap_counters)) - return; - - mmap_event = (struct perf_mmap_event){ - .file = file, - .event = { - .header = { .type = PERF_EVENT_MUNMAP, }, - .start = addr, - .len = len, - .pgoff = pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - /* * Log sample_period changes so that analyzing tools can re-normalize the * event flow. @@ -3525,8 +3495,6 @@ done: atomic_inc(&nr_counters); if (counter->attr.mmap) atomic_inc(&nr_mmap_counters); - if (counter->attr.munmap) - atomic_inc(&nr_munmap_counters); if (counter->attr.comm) atomic_inc(&nr_comm_counters); diff --git a/mm/mmap.c b/mm/mmap.c index 2c1c2cb0e2e..6451ce2854b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1756,12 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) do { long nrpages = vma_pages(vma); - if (vma->vm_flags & VM_EXEC) { - perf_counter_munmap(vma->vm_start, - nrpages << PAGE_SHIFT, - vma->vm_pgoff, vma->vm_file); - } - mm->total_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); -- cgit v1.2.3-70-g09d2 From 5926a295bb78272b3f648f62febecd19a1b6a6ca Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Thu, 4 Jun 2009 17:43:04 +0100 Subject: [ARM] 5541/1: serial/amba-pl011.c: add support for the modified port found in Nomadik The Nomadik 8815 SoC has a slightly modified version of the PL011 block. The patch uses the different ID value as a key to select a vendor structure that is used to keep track of the differences, as suggested by Russell King. 
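To make the dispatch explicit: the AMBA peripheral ID in the match
table carries a pointer to a struct vendor_data, so probe() picks up
per-vendor parameters without testing IDs by hand. A standalone
illustration of the idiom (plain C, placeholder ifls values — the real
ones come from the amba/serial.h macros; this is not part of the
patch):

  #include <stdio.h>

  struct vendor_data { unsigned int ifls; unsigned int fifosize; };

  /* placeholder trigger-level values, for illustration only */
  static const struct vendor_data vendor_arm = { .ifls = 0x12, .fifosize = 16 };
  static const struct vendor_data vendor_st  = { .ifls = 0x2d, .fifosize = 64 };

  struct amba_id { unsigned int id, mask; const void *data; };

  static const struct amba_id pl011_ids[] = {
          { .id = 0x00041011, .mask = 0x000fffff, .data = &vendor_arm },
          { .id = 0x00380802, .mask = 0x00ffffff, .data = &vendor_st  },
          { 0, 0, NULL },
  };

  static const struct vendor_data *pl011_match(unsigned int hw_id)
  {
          const struct amba_id *p;

          /* first entry whose masked ID matches wins */
          for (p = pl011_ids; p->mask; p++)
                  if ((hw_id & p->mask) == (p->id & p->mask))
                          return p->data;
          return NULL;
  }

  int main(void)
  {
          const struct vendor_data *v = pl011_match(0x00380802);

          if (v)  /* the Nomadik ID matches the ST entry: prints 64 */
                  printf("fifosize=%u\n", v->fifosize);
          return 0;
  }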
Signed-off-by: Alessandro Rubini Acked-by: Andrea Gallo Acked-by: Linus Walleij Signed-off-by: Russell King --- drivers/serial/amba-pl011.c | 30 +++++++++++++++++++++++++++--- include/linux/amba/serial.h | 3 +++ 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c index 4cfa1eb2689..8c5bda27736 100644 --- a/drivers/serial/amba-pl011.c +++ b/drivers/serial/amba-pl011.c @@ -70,6 +70,23 @@ struct uart_amba_port { struct clk *clk; unsigned int im; /* interrupt mask */ unsigned int old_status; + unsigned int ifls; /* vendor-specific */ +}; + +/* There is by now at least one vendor with differing details, so handle it */ +struct vendor_data { + unsigned int ifls; + unsigned int fifosize; +}; + +static struct vendor_data vendor_arm = { + .ifls = UART011_IFLS_RX4_8|UART011_IFLS_TX4_8, + .fifosize = 16, +}; + +static struct vendor_data vendor_st = { + .ifls = UART011_IFLS_RX_HALF|UART011_IFLS_TX_HALF, + .fifosize = 64, }; static void pl011_stop_tx(struct uart_port *port) @@ -360,8 +377,7 @@ static int pl011_startup(struct uart_port *port) if (retval) goto clk_dis; - writew(UART011_IFLS_RX4_8|UART011_IFLS_TX4_8, - uap->port.membase + UART011_IFLS); + writew(uap->ifls, uap->port.membase + UART011_IFLS); /* * Provoke TX FIFO interrupt into asserting. @@ -732,6 +748,7 @@ static struct uart_driver amba_reg = { static int pl011_probe(struct amba_device *dev, struct amba_id *id) { struct uart_amba_port *uap; + struct vendor_data *vendor = id->data; void __iomem *base; int i, ret; @@ -762,12 +779,13 @@ static int pl011_probe(struct amba_device *dev, struct amba_id *id) goto unmap; } + uap->ifls = vendor->ifls; uap->port.dev = &dev->dev; uap->port.mapbase = dev->res.start; uap->port.membase = base; uap->port.iotype = UPIO_MEM; uap->port.irq = dev->irq[0]; - uap->port.fifosize = 16; + uap->port.fifosize = vendor->fifosize; uap->port.ops = &amba_pl011_pops; uap->port.flags = UPF_BOOT_AUTOCONF; uap->port.line = i; @@ -812,6 +830,12 @@ static struct amba_id pl011_ids[] __initdata = { { .id = 0x00041011, .mask = 0x000fffff, + .data = &vendor_arm, + }, + { + .id = 0x00380802, + .mask = 0x00ffffff, + .data = &vendor_st, }, { 0, 0 }, }; diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index 48ee32a18ac..42949e210cb 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -114,6 +114,9 @@ #define UART011_IFLS_TX4_8 (2 << 0) #define UART011_IFLS_TX6_8 (3 << 0) #define UART011_IFLS_TX7_8 (4 << 0) +/* special values for ST vendor with deeper fifo */ +#define UART011_IFLS_RX_HALF (5 << 3) +#define UART011_IFLS_TX_HALF (5 << 0) #define UART011_OEIM (1 << 10) /* overrun error interrupt mask */ #define UART011_BEIM (1 << 9) /* break error interrupt mask */ -- cgit v1.2.3-70-g09d2 From c0683039207226afcffbe0fbf6a1caaee77a37b0 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 3 Jun 2009 17:43:14 +0100 Subject: [ARM] 5536/1: Move clk_add_alias() to arch/arm/common/clkdev.c This can be used for other arm platforms too as discussed on the linux-arm-kernel list. Also check the return value with IS_ERR and return PTR_ERR as suggested by Russell King. 
Signed-off-by: Tony Lindgren Signed-off-by: Russell King --- arch/arm/common/clkdev.c | 18 ++++++++++++++++++ arch/arm/mach-pxa/clock.c | 17 ----------------- include/linux/clk.h | 13 +++++++++++++ 3 files changed, 31 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/common/clkdev.c b/arch/arm/common/clkdev.c index 5589444ff43..f37afd9422f 100644 --- a/arch/arm/common/clkdev.c +++ b/arch/arm/common/clkdev.c @@ -135,6 +135,24 @@ struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id, } EXPORT_SYMBOL(clkdev_alloc); +int clk_add_alias(const char *alias, const char *alias_dev_name, char *id, + struct device *dev) +{ + struct clk *r = clk_get(dev, id); + struct clk_lookup *l; + + if (IS_ERR(r)) + return PTR_ERR(r); + + l = clkdev_alloc(r, alias, alias_dev_name); + clk_put(r); + if (!l) + return -ENODEV; + clkdev_add(l); + return 0; +} +EXPORT_SYMBOL(clk_add_alias); + /* * clkdev_drop - remove a clock dynamically allocated */ diff --git a/arch/arm/mach-pxa/clock.c b/arch/arm/mach-pxa/clock.c index db52d2c4791..49ae3829231 100644 --- a/arch/arm/mach-pxa/clock.c +++ b/arch/arm/mach-pxa/clock.c @@ -86,20 +86,3 @@ void clks_register(struct clk_lookup *clks, size_t num) for (i = 0; i < num; i++) clkdev_add(&clks[i]); } - -int clk_add_alias(const char *alias, const char *alias_dev_name, char *id, - struct device *dev) -{ - struct clk *r = clk_get(dev, id); - struct clk_lookup *l; - - if (!r) - return -ENODEV; - - l = clkdev_alloc(r, alias, alias_dev_name); - clk_put(r); - if (!l) - return -ENODEV; - clkdev_add(l); - return 0; -} diff --git a/include/linux/clk.h b/include/linux/clk.h index 1db9bbf444a..1d37f42ac29 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -142,4 +142,17 @@ struct clk *clk_get_parent(struct clk *clk); */ struct clk *clk_get_sys(const char *dev_id, const char *con_id); +/** + * clk_add_alias - add a new clock alias + * @alias: name for clock alias + * @alias_dev_name: device name + * @id: platform specific clock name + * @dev: device + * + * Allows using generic clock names for drivers by adding a new alias. + * Assumes clkdev, see clkdev.h for more info. + */ +int clk_add_alias(const char *alias, const char *alias_dev_name, char *id, + struct device *dev); + #endif -- cgit v1.2.3-70-g09d2 From 10662aa3083f869c645cc2abf5d66849001e2f5d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 5 Jun 2009 13:24:24 +0200 Subject: netfilter: xt_NFQUEUE: queue balancing support Adds support for specifying a range of queues instead of a single queue id. Flows will be distributed across the given range. This is useful for multicore systems: Instead of having a single application read packets from a queue, start multiple instances on queues x, x+1, .. x+n. Each instance can process flows independently. Packets for the same connection are put into the same queue. 
Signed-off-by: Holger Eitzenberger Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- include/linux/netfilter/xt_NFQUEUE.h | 5 ++ net/netfilter/xt_NFQUEUE.c | 93 ++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/xt_NFQUEUE.h b/include/linux/netfilter/xt_NFQUEUE.h index 982a89f7827..2584f4a777d 100644 --- a/include/linux/netfilter/xt_NFQUEUE.h +++ b/include/linux/netfilter/xt_NFQUEUE.h @@ -15,4 +15,9 @@ struct xt_NFQ_info { __u16 queuenum; }; +struct xt_NFQ_info_v1 { + __u16 queuenum; + __u16 queues_total; +}; + #endif /* _XT_NFQ_TARGET_H */ diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 6e0f84d4d05..498b45101df 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -11,6 +11,10 @@ #include #include +#include +#include +#include + #include #include #include @@ -23,6 +27,8 @@ MODULE_ALIAS("ipt_NFQUEUE"); MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); +static u32 jhash_initval __read_mostly; + static unsigned int nfqueue_tg(struct sk_buff *skb, const struct xt_target_param *par) { @@ -31,6 +37,72 @@ nfqueue_tg(struct sk_buff *skb, const struct xt_target_param *par) return NF_QUEUE_NR(tinfo->queuenum); } +static u32 hash_v4(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + u32 ipaddr; + + /* packets in either direction go into same queue */ + ipaddr = iph->saddr ^ iph->daddr; + + return jhash_2words(ipaddr, iph->protocol, jhash_initval); +} + +static unsigned int +nfqueue_tg4_v1(struct sk_buff *skb, const struct xt_target_param *par) +{ + const struct xt_NFQ_info_v1 *info = par->targinfo; + u32 queue = info->queuenum; + + if (info->queues_total > 1) + queue = hash_v4(skb) % info->queues_total + queue; + return NF_QUEUE_NR(queue); +} + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +static u32 hash_v6(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + u32 addr[4]; + + addr[0] = ip6h->saddr.s6_addr32[0] ^ ip6h->daddr.s6_addr32[0]; + addr[1] = ip6h->saddr.s6_addr32[1] ^ ip6h->daddr.s6_addr32[1]; + addr[2] = ip6h->saddr.s6_addr32[2] ^ ip6h->daddr.s6_addr32[2]; + addr[3] = ip6h->saddr.s6_addr32[3] ^ ip6h->daddr.s6_addr32[3]; + + return jhash2(addr, ARRAY_SIZE(addr), jhash_initval); +} + +static unsigned int +nfqueue_tg6_v1(struct sk_buff *skb, const struct xt_target_param *par) +{ + const struct xt_NFQ_info_v1 *info = par->targinfo; + u32 queue = info->queuenum; + + if (info->queues_total > 1) + queue = hash_v6(skb) % info->queues_total + queue; + return NF_QUEUE_NR(queue); +} +#endif + +static bool nfqueue_tg_v1_check(const struct xt_tgchk_param *par) +{ + const struct xt_NFQ_info_v1 *info = par->targinfo; + u32 maxid; + + if (info->queues_total == 0) { + pr_err("NFQUEUE: number of total queues is 0\n"); + return false; + } + maxid = info->queues_total - 1 + info->queuenum; + if (maxid > 0xffff) { + pr_err("NFQUEUE: number of queues (%u) out of range (got %u)\n", + info->queues_total, maxid); + return false; + } + return true; +} + static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", @@ -39,10 +111,31 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, + { + .name = "NFQUEUE", + .revision = 1, + .family = NFPROTO_IPV4, + .checkentry = nfqueue_tg_v1_check, + .target = nfqueue_tg4_v1, + .targetsize = sizeof(struct xt_NFQ_info_v1), + .me = THIS_MODULE, + }, 
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + { + .name = "NFQUEUE", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = nfqueue_tg_v1_check, + .target = nfqueue_tg6_v1, + .targetsize = sizeof(struct xt_NFQ_info_v1), + .me = THIS_MODULE, + }, +#endif }; static int __init nfqueue_tg_init(void) { + get_random_bytes(&jhash_initval, sizeof(jhash_initval)); return xt_register_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } -- cgit v1.2.3-70-g09d2 From 089dd79db9264dc0da602bad45d42f1b3e7d1e07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:04:55 +0200 Subject: perf_counter: Generate mmap events for install_special_mapping() In order to track the vdso also generate mmap events for install_special_mapping(). Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 14 ++++++++------ kernel/perf_counter.c | 34 ++++++++++++++++++++++------------ mm/mmap.c | 5 +++-- 3 files changed, 33 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6ca403acd41..40dc0e273d9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -617,8 +617,13 @@ static inline int is_software_counter(struct perf_counter *counter) extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); -extern void perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file); +extern void __perf_counter_mmap(struct vm_area_struct *vma); + +static inline void perf_counter_mmap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_EXEC) + __perf_counter_mmap(vma); +} extern void perf_counter_comm(struct task_struct *tsk); extern void perf_counter_fork(struct task_struct *tsk); @@ -668,10 +673,7 @@ static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { } -static inline void -perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } - +static inline void perf_counter_mmap(struct vm_area_struct *vma) { } static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a5d3e2aedd2..37a5a241ca7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2255,7 +2255,7 @@ out: } static void perf_output_copy(struct perf_output_handle *handle, - void *buf, unsigned int len) + const void *buf, unsigned int len) { unsigned int pages_mask; unsigned int offset; @@ -2681,9 +2681,10 @@ void perf_counter_comm(struct task_struct *task) */ struct perf_mmap_event { - struct file *file; - char *file_name; - int file_size; + struct vm_area_struct *vma; + + const char *file_name; + int file_size; struct { struct perf_event_header header; @@ -2744,11 +2745,12 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; - struct file *file = mmap_event->file; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + const char *name; if (file) { buf = kzalloc(PATH_MAX, GFP_KERNEL); @@ -2762,6 +2764,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) 
goto got_name; } } else { + name = arch_vma_name(mmap_event->vma); + if (name) + goto got_name; + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + name = strncpy(tmp, "//anon", sizeof(tmp)); goto got_name; } @@ -2791,8 +2802,7 @@ got_name: kfree(buf); } -void perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) +void __perf_counter_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -2800,12 +2810,12 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, return; mmap_event = (struct perf_mmap_event){ - .file = file, + .vma = vma, .event = { .header = { .type = PERF_EVENT_MMAP, }, - .start = addr, - .len = len, - .pgoff = pgoff, + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, }, }; diff --git a/mm/mmap.c b/mm/mmap.c index 6451ce2854b..8101de490c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1220,8 +1220,7 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: - if (vm_flags & VM_EXEC) - perf_counter_mmap(addr, len, pgoff, file); + perf_counter_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); @@ -2309,6 +2308,8 @@ int install_special_mapping(struct mm_struct *mm, mm->total_vm += len >> PAGE_SHIFT; + perf_counter_mmap(vma); + return 0; } -- cgit v1.2.3-70-g09d2 From 77c9a5daa9c4d9b37812c9c69c7bcbb3f9399c3c Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Fri, 5 Jun 2009 16:26:18 +0200 Subject: firewire: reorganize header files The three header files of firewire-core, i.e. "drivers/firewire/fw-device.h", "drivers/firewire/fw-topology.h", "drivers/firewire/fw-transaction.h", are replaced by "drivers/firewire/core.h", "include/linux/firewire.h". The latter includes everything which a firewire high-level driver (like firewire-sbp2) needs besides linux/firewire-constants.h, while core.h contains the rest which is needed by firewire-core itself and by low- level drivers (card drivers) like firewire-ohci. High-level drivers can now also reside outside of drivers/firewire without having to add drivers/firewire to the header file search path in makefiles. At least the firedtv driver will be such a driver. I also considered to spread the contents of core.h over several files, one for each .c file where the respective implementation resides. But it turned out that most core .c files will end up including most of the core .h files. Also, the combined core.h isn't unreasonably big, and it will lose more of its contents to linux/firewire.h anyway soon when more firewire drivers are added. (IP-over-1394, firedtv, and there are plans for one or two more.) Furthermore, fw-ohci.h is renamed to ohci.h. The name of core.h and ohci.h is chosen with regard to name changes of the .c files in a follow-up change. 
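For relocated or out-of-tree high-level drivers the visible change is
just the include block; sketched before/after (paths as named in this
log):

  /* before: private headers; the makefile had to add drivers/firewire/
   * to the header search path */
  #include "fw-device.h"
  #include "fw-transaction.h"

  /* after: one public header, usable from anywhere in the tree */
  #include <linux/firewire.h>
  #include <linux/firewire-constants.h>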
Signed-off-by: Stefan Richter --- drivers/firewire/core.h | 293 +++++++++++++++++++++++++ drivers/firewire/fw-card.c | 6 +- drivers/firewire/fw-cdev.c | 5 +- drivers/firewire/fw-device.c | 9 +- drivers/firewire/fw-device.h | 190 ---------------- drivers/firewire/fw-iso.c | 4 +- drivers/firewire/fw-ohci.c | 5 +- drivers/firewire/fw-ohci.h | 157 -------------- drivers/firewire/fw-sbp2.c | 4 +- drivers/firewire/fw-topology.c | 10 +- drivers/firewire/fw-topology.h | 77 ------- drivers/firewire/fw-transaction.c | 8 +- drivers/firewire/fw-transaction.h | 446 -------------------------------------- drivers/firewire/ohci.h | 157 ++++++++++++++ include/linux/firewire.h | 350 ++++++++++++++++++++++++++++++ 15 files changed, 830 insertions(+), 891 deletions(-) create mode 100644 drivers/firewire/core.h delete mode 100644 drivers/firewire/fw-device.h delete mode 100644 drivers/firewire/fw-ohci.h delete mode 100644 drivers/firewire/fw-topology.h delete mode 100644 drivers/firewire/fw-transaction.h create mode 100644 drivers/firewire/ohci.h create mode 100644 include/linux/firewire.h (limited to 'include/linux') diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h new file mode 100644 index 00000000000..273f0ab8292 --- /dev/null +++ b/drivers/firewire/core.h @@ -0,0 +1,293 @@ +#ifndef _FIREWIRE_CORE_H +#define _FIREWIRE_CORE_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct device; +struct fw_card; +struct fw_device; +struct fw_iso_buffer; +struct fw_iso_context; +struct fw_iso_packet; +struct fw_node; +struct fw_packet; + + +/* -card */ + +/* bitfields within the PHY registers */ +#define PHY_LINK_ACTIVE 0x80 +#define PHY_CONTENDER 0x40 +#define PHY_BUS_RESET 0x40 +#define PHY_BUS_SHORT_RESET 0x40 + +#define BANDWIDTH_AVAILABLE_INITIAL 4915 +#define BROADCAST_CHANNEL_INITIAL (1 << 31 | 31) +#define BROADCAST_CHANNEL_VALID (1 << 30) + +struct fw_card_driver { + /* + * Enable the given card with the given initial config rom. + * This function is expected to activate the card, and either + * enable the PHY or set the link_on bit and initiate a bus + * reset. + */ + int (*enable)(struct fw_card *card, u32 *config_rom, size_t length); + + int (*update_phy_reg)(struct fw_card *card, int address, + int clear_bits, int set_bits); + + /* + * Update the config rom for an enabled card. This function + * should change the config rom that is presented on the bus + * an initiate a bus reset. + */ + int (*set_config_rom)(struct fw_card *card, + u32 *config_rom, size_t length); + + void (*send_request)(struct fw_card *card, struct fw_packet *packet); + void (*send_response)(struct fw_card *card, struct fw_packet *packet); + /* Calling cancel is valid once a packet has been submitted. */ + int (*cancel_packet)(struct fw_card *card, struct fw_packet *packet); + + /* + * Allow the specified node ID to do direct DMA out and in of + * host memory. The card will disable this for all node when + * a bus reset happens, so driver need to reenable this after + * bus reset. Returns 0 on success, -ENODEV if the card + * doesn't support this, -ESTALE if the generation doesn't + * match. 
+ */ + int (*enable_phys_dma)(struct fw_card *card, + int node_id, int generation); + + u64 (*get_bus_time)(struct fw_card *card); + + struct fw_iso_context * + (*allocate_iso_context)(struct fw_card *card, + int type, int channel, size_t header_size); + void (*free_iso_context)(struct fw_iso_context *ctx); + + int (*start_iso)(struct fw_iso_context *ctx, + s32 cycle, u32 sync, u32 tags); + + int (*queue_iso)(struct fw_iso_context *ctx, + struct fw_iso_packet *packet, + struct fw_iso_buffer *buffer, + unsigned long payload); + + int (*stop_iso)(struct fw_iso_context *ctx); +}; + +void fw_card_initialize(struct fw_card *card, + const struct fw_card_driver *driver, struct device *device); +int fw_card_add(struct fw_card *card, + u32 max_receive, u32 link_speed, u64 guid); +void fw_core_remove_card(struct fw_card *card); +int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset); +int fw_compute_block_crc(u32 *block); +void fw_schedule_bm_work(struct fw_card *card, unsigned long delay); + +struct fw_descriptor { + struct list_head link; + size_t length; + u32 immediate; + u32 key; + const u32 *data; +}; + +int fw_core_add_descriptor(struct fw_descriptor *desc); +void fw_core_remove_descriptor(struct fw_descriptor *desc); + + +/* -cdev */ + +extern const struct file_operations fw_device_ops; + +void fw_device_cdev_update(struct fw_device *device); +void fw_device_cdev_remove(struct fw_device *device); + + +/* -device */ + +extern struct rw_semaphore fw_device_rwsem; +extern struct idr fw_device_idr; +extern int fw_cdev_major; + +struct fw_device *fw_device_get_by_devt(dev_t devt); +void fw_device_set_broadcast_channel(struct fw_device *device, int generation); +void fw_node_event(struct fw_card *card, struct fw_node *node, int event); + + +/* -iso */ + +/* + * The iso packet format allows for an immediate header/payload part + * stored in 'header' immediately after the packet info plus an + * indirect payload part that is pointer to by the 'payload' field. + * Applications can use one or the other or both to implement simple + * low-bandwidth streaming (e.g. audio) or more advanced + * scatter-gather streaming (e.g. assembling video frame automatically). + */ +struct fw_iso_packet { + u16 payload_length; /* Length of indirect payload. */ + u32 interrupt:1; /* Generate interrupt on this packet */ + u32 skip:1; /* Set to not send packet at all. */ + u32 tag:2; + u32 sy:4; + u32 header_length:8; /* Length of immediate header. */ + u32 header[0]; +}; + +#define FW_ISO_CONTEXT_TRANSMIT 0 +#define FW_ISO_CONTEXT_RECEIVE 1 + +#define FW_ISO_CONTEXT_MATCH_TAG0 1 +#define FW_ISO_CONTEXT_MATCH_TAG1 2 +#define FW_ISO_CONTEXT_MATCH_TAG2 4 +#define FW_ISO_CONTEXT_MATCH_TAG3 8 +#define FW_ISO_CONTEXT_MATCH_ALL_TAGS 15 + +/* + * An iso buffer is just a set of pages mapped for DMA in the + * specified direction. Since the pages are to be used for DMA, they + * are not mapped into the kernel virtual address space. We store the + * DMA address in the page private. The helper function + * fw_iso_buffer_map() will map the pages into a given vma. 
+ */ +struct fw_iso_buffer { + enum dma_data_direction direction; + struct page **pages; + int page_count; +}; + +typedef void (*fw_iso_callback_t)(struct fw_iso_context *context, + u32 cycle, size_t header_length, + void *header, void *data); + +struct fw_iso_context { + struct fw_card *card; + int type; + int channel; + int speed; + size_t header_size; + fw_iso_callback_t callback; + void *callback_data; +}; + +int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card, + int page_count, enum dma_data_direction direction); +int fw_iso_buffer_map(struct fw_iso_buffer *buffer, struct vm_area_struct *vma); +void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, struct fw_card *card); + +struct fw_iso_context *fw_iso_context_create(struct fw_card *card, + int type, int channel, int speed, size_t header_size, + fw_iso_callback_t callback, void *callback_data); +int fw_iso_context_queue(struct fw_iso_context *ctx, + struct fw_iso_packet *packet, + struct fw_iso_buffer *buffer, + unsigned long payload); +int fw_iso_context_start(struct fw_iso_context *ctx, + int cycle, int sync, int tags); +int fw_iso_context_stop(struct fw_iso_context *ctx); +void fw_iso_context_destroy(struct fw_iso_context *ctx); + +void fw_iso_resource_manage(struct fw_card *card, int generation, + u64 channels_mask, int *channel, int *bandwidth, bool allocate); + + +/* -topology */ + +enum { + FW_NODE_CREATED, + FW_NODE_UPDATED, + FW_NODE_DESTROYED, + FW_NODE_LINK_ON, + FW_NODE_LINK_OFF, + FW_NODE_INITIATED_RESET, +}; + +struct fw_node { + u16 node_id; + u8 color; + u8 port_count; + u8 link_on:1; + u8 initiated_reset:1; + u8 b_path:1; + u8 phy_speed:2; /* As in the self ID packet. */ + u8 max_speed:2; /* Minimum of all phy-speeds on the path from the + * local node to this node. */ + u8 max_depth:4; /* Maximum depth to any leaf node */ + u8 max_hops:4; /* Max hops in this sub tree */ + atomic_t ref_count; + + /* For serializing node topology into a list. */ + struct list_head link; + + /* Upper layer specific data. */ + void *data; + + struct fw_node *ports[0]; +}; + +static inline struct fw_node *fw_node_get(struct fw_node *node) +{ + atomic_inc(&node->ref_count); + + return node; +} + +static inline void fw_node_put(struct fw_node *node) +{ + if (atomic_dec_and_test(&node->ref_count)) + kfree(node); +} + +void fw_core_handle_bus_reset(struct fw_card *card, int node_id, + int generation, int self_id_count, u32 *self_ids); +void fw_destroy_nodes(struct fw_card *card); + +/* + * Check whether new_generation is the immediate successor of old_generation. + * Take counter roll-over at 255 (as per OHCI) into account. 
+ */ +static inline bool is_next_generation(int new_generation, int old_generation) +{ + return (new_generation & 0xff) == ((old_generation + 1) & 0xff); +} + + +/* -transaction */ + +#define TCODE_IS_READ_REQUEST(tcode) (((tcode) & ~1) == 4) +#define TCODE_IS_BLOCK_PACKET(tcode) (((tcode) & 1) != 0) +#define TCODE_IS_REQUEST(tcode) (((tcode) & 2) == 0) +#define TCODE_IS_RESPONSE(tcode) (((tcode) & 2) != 0) +#define TCODE_HAS_REQUEST_DATA(tcode) (((tcode) & 12) != 4) +#define TCODE_HAS_RESPONSE_DATA(tcode) (((tcode) & 12) != 0) + +#define LOCAL_BUS 0xffc0 + +void fw_core_handle_request(struct fw_card *card, struct fw_packet *request); +void fw_core_handle_response(struct fw_card *card, struct fw_packet *packet); +void fw_fill_response(struct fw_packet *response, u32 *request_header, + int rcode, void *payload, size_t length); +void fw_flush_transactions(struct fw_card *card); +void fw_send_phy_config(struct fw_card *card, + int node_id, int generation, int gap_count); + +static inline int fw_stream_packet_destination_id(int tag, int channel, int sy) +{ + return tag << 14 | channel << 8 | sy; +} + +#endif /* _FIREWIRE_CORE_H */ diff --git a/drivers/firewire/fw-card.c b/drivers/firewire/fw-card.c index b6f55e262e7..ba6cd70b851 100644 --- a/drivers/firewire/fw-card.c +++ b/drivers/firewire/fw-card.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -34,9 +36,7 @@ #include #include -#include "fw-device.h" -#include "fw-topology.h" -#include "fw-transaction.h" +#include "core.h" int fw_compute_block_crc(u32 *block) { diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 8a5e6ae2552..042c0454047 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -41,9 +42,7 @@ #include #include -#include "fw-device.h" -#include "fw-topology.h" -#include "fw-transaction.h" +#include "core.h" struct client { u32 version; diff --git a/drivers/firewire/fw-device.c b/drivers/firewire/fw-device.c index 238acac5bad..65d84dd6c1d 100644 --- a/drivers/firewire/fw-device.c +++ b/drivers/firewire/fw-device.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -39,9 +41,7 @@ #include #include -#include "fw-device.h" -#include "fw-topology.h" -#include "fw-transaction.h" +#include "core.h" void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 * p) { @@ -94,8 +94,9 @@ static int fw_unit_match(struct device *dev, struct device_driver *drv) return 0; device = fw_device(unit->device.parent); + id = container_of(drv, struct fw_driver, driver)->id_table; - for (id = fw_driver(drv)->id_table; id->match_flags != 0; id++) { + for (; id->match_flags != 0; id++) { if (match_unit_directory(unit->directory, id->match_flags, id)) return 1; diff --git a/drivers/firewire/fw-device.h b/drivers/firewire/fw-device.h deleted file mode 100644 index e973c4361f4..00000000000 --- a/drivers/firewire/fw-device.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (C) 2005-2006 Kristian Hoegsberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ - -#ifndef __fw_device_h -#define __fw_device_h - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -enum fw_device_state { - FW_DEVICE_INITIALIZING, - FW_DEVICE_RUNNING, - FW_DEVICE_GONE, - FW_DEVICE_SHUTDOWN, -}; - -struct fw_attribute_group { - struct attribute_group *groups[2]; - struct attribute_group group; - struct attribute *attrs[12]; -}; - -struct fw_node; -struct fw_card; - -/* - * Note, fw_device.generation always has to be read before fw_device.node_id. - * Use SMP memory barriers to ensure this. Otherwise requests will be sent - * to an outdated node_id if the generation was updated in the meantime due - * to a bus reset. - * - * Likewise, fw-core will take care to update .node_id before .generation so - * that whenever fw_device.generation is current WRT the actual bus generation, - * fw_device.node_id is guaranteed to be current too. - * - * The same applies to fw_device.card->node_id vs. fw_device.generation. - * - * fw_device.config_rom and fw_device.config_rom_length may be accessed during - * the lifetime of any fw_unit belonging to the fw_device, before device_del() - * was called on the last fw_unit. Alternatively, they may be accessed while - * holding fw_device_rwsem. - */ -struct fw_device { - atomic_t state; - struct fw_node *node; - int node_id; - int generation; - unsigned max_speed; - struct fw_card *card; - struct device device; - - struct mutex client_list_mutex; - struct list_head client_list; - - u32 *config_rom; - size_t config_rom_length; - int config_rom_retries; - unsigned is_local:1; - unsigned cmc:1; - unsigned bc_implemented:2; - - struct delayed_work work; - struct fw_attribute_group attribute_group; -}; - -static inline struct fw_device *fw_device(struct device *dev) -{ - return container_of(dev, struct fw_device, device); -} - -static inline int fw_device_is_shutdown(struct fw_device *device) -{ - return atomic_read(&device->state) == FW_DEVICE_SHUTDOWN; -} - -static inline struct fw_device *fw_device_get(struct fw_device *device) -{ - get_device(&device->device); - - return device; -} - -static inline void fw_device_put(struct fw_device *device) -{ - put_device(&device->device); -} - -struct fw_device *fw_device_get_by_devt(dev_t devt); -int fw_device_enable_phys_dma(struct fw_device *device); -void fw_device_set_broadcast_channel(struct fw_device *device, int generation); - -void fw_device_cdev_update(struct fw_device *device); -void fw_device_cdev_remove(struct fw_device *device); - -extern struct rw_semaphore fw_device_rwsem; -extern struct idr fw_device_idr; -extern int fw_cdev_major; - -/* - * fw_unit.directory must not be accessed after device_del(&fw_unit.device). 
- */ -struct fw_unit { - struct device device; - u32 *directory; - struct fw_attribute_group attribute_group; -}; - -static inline struct fw_unit *fw_unit(struct device *dev) -{ - return container_of(dev, struct fw_unit, device); -} - -static inline struct fw_unit *fw_unit_get(struct fw_unit *unit) -{ - get_device(&unit->device); - - return unit; -} - -static inline void fw_unit_put(struct fw_unit *unit) -{ - put_device(&unit->device); -} - -#define CSR_OFFSET 0x40 -#define CSR_LEAF 0x80 -#define CSR_DIRECTORY 0xc0 - -#define CSR_DESCRIPTOR 0x01 -#define CSR_VENDOR 0x03 -#define CSR_HARDWARE_VERSION 0x04 -#define CSR_NODE_CAPABILITIES 0x0c -#define CSR_UNIT 0x11 -#define CSR_SPECIFIER_ID 0x12 -#define CSR_VERSION 0x13 -#define CSR_DEPENDENT_INFO 0x14 -#define CSR_MODEL 0x17 -#define CSR_INSTANCE 0x18 -#define CSR_DIRECTORY_ID 0x20 - -struct fw_csr_iterator { - u32 *p; - u32 *end; -}; - -void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 *p); -int fw_csr_iterator_next(struct fw_csr_iterator *ci, - int *key, int *value); - -struct fw_driver { - struct device_driver driver; - /* Called when the parent device sits through a bus reset. */ - void (*update)(struct fw_unit *unit); - const struct ieee1394_device_id *id_table; -}; - -static inline struct fw_driver *fw_driver(struct device_driver *drv) -{ - return container_of(drv, struct fw_driver, driver); -} - -extern const struct file_operations fw_device_ops; - -#endif /* __fw_device_h */ diff --git a/drivers/firewire/fw-iso.c b/drivers/firewire/fw-iso.c index 0ff3e9c42eb..28076c892d7 100644 --- a/drivers/firewire/fw-iso.c +++ b/drivers/firewire/fw-iso.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -30,8 +31,7 @@ #include -#include "fw-topology.h" -#include "fw-transaction.h" +#include "core.h" /* * Isochronous DMA context management diff --git a/drivers/firewire/fw-ohci.c b/drivers/firewire/fw-ohci.c index d296d12909d..ecddd11b797 100644 --- a/drivers/firewire/fw-ohci.c +++ b/drivers/firewire/fw-ohci.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -45,8 +46,8 @@ #include #endif -#include "fw-ohci.h" -#include "fw-transaction.h" +#include "core.h" +#include "ohci.h" #define DESCRIPTOR_OUTPUT_MORE 0 #define DESCRIPTOR_OUTPUT_LAST (1 << 12) diff --git a/drivers/firewire/fw-ohci.h b/drivers/firewire/fw-ohci.h deleted file mode 100644 index a2fbb6240ca..00000000000 --- a/drivers/firewire/fw-ohci.h +++ /dev/null @@ -1,157 +0,0 @@ -#ifndef __fw_ohci_h -#define __fw_ohci_h - -/* OHCI register map */ - -#define OHCI1394_Version 0x000 -#define OHCI1394_GUID_ROM 0x004 -#define OHCI1394_ATRetries 0x008 -#define OHCI1394_CSRData 0x00C -#define OHCI1394_CSRCompareData 0x010 -#define OHCI1394_CSRControl 0x014 -#define OHCI1394_ConfigROMhdr 0x018 -#define OHCI1394_BusID 0x01C -#define OHCI1394_BusOptions 0x020 -#define OHCI1394_GUIDHi 0x024 -#define OHCI1394_GUIDLo 0x028 -#define OHCI1394_ConfigROMmap 0x034 -#define OHCI1394_PostedWriteAddressLo 0x038 -#define OHCI1394_PostedWriteAddressHi 0x03C -#define OHCI1394_VendorID 0x040 -#define OHCI1394_HCControlSet 0x050 -#define OHCI1394_HCControlClear 0x054 -#define OHCI1394_HCControl_BIBimageValid 0x80000000 -#define OHCI1394_HCControl_noByteSwapData 0x40000000 -#define OHCI1394_HCControl_programPhyEnable 0x00800000 -#define OHCI1394_HCControl_aPhyEnhanceEnable 0x00400000 -#define OHCI1394_HCControl_LPS 0x00080000 -#define OHCI1394_HCControl_postedWriteEnable 0x00040000 -#define OHCI1394_HCControl_linkEnable 0x00020000 -#define 
OHCI1394_HCControl_softReset 0x00010000 -#define OHCI1394_SelfIDBuffer 0x064 -#define OHCI1394_SelfIDCount 0x068 -#define OHCI1394_SelfIDCount_selfIDError 0x80000000 -#define OHCI1394_IRMultiChanMaskHiSet 0x070 -#define OHCI1394_IRMultiChanMaskHiClear 0x074 -#define OHCI1394_IRMultiChanMaskLoSet 0x078 -#define OHCI1394_IRMultiChanMaskLoClear 0x07C -#define OHCI1394_IntEventSet 0x080 -#define OHCI1394_IntEventClear 0x084 -#define OHCI1394_IntMaskSet 0x088 -#define OHCI1394_IntMaskClear 0x08C -#define OHCI1394_IsoXmitIntEventSet 0x090 -#define OHCI1394_IsoXmitIntEventClear 0x094 -#define OHCI1394_IsoXmitIntMaskSet 0x098 -#define OHCI1394_IsoXmitIntMaskClear 0x09C -#define OHCI1394_IsoRecvIntEventSet 0x0A0 -#define OHCI1394_IsoRecvIntEventClear 0x0A4 -#define OHCI1394_IsoRecvIntMaskSet 0x0A8 -#define OHCI1394_IsoRecvIntMaskClear 0x0AC -#define OHCI1394_InitialBandwidthAvailable 0x0B0 -#define OHCI1394_InitialChannelsAvailableHi 0x0B4 -#define OHCI1394_InitialChannelsAvailableLo 0x0B8 -#define OHCI1394_FairnessControl 0x0DC -#define OHCI1394_LinkControlSet 0x0E0 -#define OHCI1394_LinkControlClear 0x0E4 -#define OHCI1394_LinkControl_rcvSelfID (1 << 9) -#define OHCI1394_LinkControl_rcvPhyPkt (1 << 10) -#define OHCI1394_LinkControl_cycleTimerEnable (1 << 20) -#define OHCI1394_LinkControl_cycleMaster (1 << 21) -#define OHCI1394_LinkControl_cycleSource (1 << 22) -#define OHCI1394_NodeID 0x0E8 -#define OHCI1394_NodeID_idValid 0x80000000 -#define OHCI1394_NodeID_nodeNumber 0x0000003f -#define OHCI1394_NodeID_busNumber 0x0000ffc0 -#define OHCI1394_PhyControl 0x0EC -#define OHCI1394_PhyControl_Read(addr) (((addr) << 8) | 0x00008000) -#define OHCI1394_PhyControl_ReadDone 0x80000000 -#define OHCI1394_PhyControl_ReadData(r) (((r) & 0x00ff0000) >> 16) -#define OHCI1394_PhyControl_Write(addr, data) (((addr) << 8) | (data) | 0x00004000) -#define OHCI1394_PhyControl_WriteDone 0x00004000 -#define OHCI1394_IsochronousCycleTimer 0x0F0 -#define OHCI1394_AsReqFilterHiSet 0x100 -#define OHCI1394_AsReqFilterHiClear 0x104 -#define OHCI1394_AsReqFilterLoSet 0x108 -#define OHCI1394_AsReqFilterLoClear 0x10C -#define OHCI1394_PhyReqFilterHiSet 0x110 -#define OHCI1394_PhyReqFilterHiClear 0x114 -#define OHCI1394_PhyReqFilterLoSet 0x118 -#define OHCI1394_PhyReqFilterLoClear 0x11C -#define OHCI1394_PhyUpperBound 0x120 - -#define OHCI1394_AsReqTrContextBase 0x180 -#define OHCI1394_AsReqTrContextControlSet 0x180 -#define OHCI1394_AsReqTrContextControlClear 0x184 -#define OHCI1394_AsReqTrCommandPtr 0x18C - -#define OHCI1394_AsRspTrContextBase 0x1A0 -#define OHCI1394_AsRspTrContextControlSet 0x1A0 -#define OHCI1394_AsRspTrContextControlClear 0x1A4 -#define OHCI1394_AsRspTrCommandPtr 0x1AC - -#define OHCI1394_AsReqRcvContextBase 0x1C0 -#define OHCI1394_AsReqRcvContextControlSet 0x1C0 -#define OHCI1394_AsReqRcvContextControlClear 0x1C4 -#define OHCI1394_AsReqRcvCommandPtr 0x1CC - -#define OHCI1394_AsRspRcvContextBase 0x1E0 -#define OHCI1394_AsRspRcvContextControlSet 0x1E0 -#define OHCI1394_AsRspRcvContextControlClear 0x1E4 -#define OHCI1394_AsRspRcvCommandPtr 0x1EC - -/* Isochronous transmit registers */ -#define OHCI1394_IsoXmitContextBase(n) (0x200 + 16 * (n)) -#define OHCI1394_IsoXmitContextControlSet(n) (0x200 + 16 * (n)) -#define OHCI1394_IsoXmitContextControlClear(n) (0x204 + 16 * (n)) -#define OHCI1394_IsoXmitCommandPtr(n) (0x20C + 16 * (n)) - -/* Isochronous receive registers */ -#define OHCI1394_IsoRcvContextBase(n) (0x400 + 32 * (n)) -#define OHCI1394_IsoRcvContextControlSet(n) (0x400 + 32 * (n)) -#define 
OHCI1394_IsoRcvContextControlClear(n) (0x404 + 32 * (n)) -#define OHCI1394_IsoRcvCommandPtr(n) (0x40C + 32 * (n)) -#define OHCI1394_IsoRcvContextMatch(n) (0x410 + 32 * (n)) - -/* Interrupts Mask/Events */ -#define OHCI1394_reqTxComplete 0x00000001 -#define OHCI1394_respTxComplete 0x00000002 -#define OHCI1394_ARRQ 0x00000004 -#define OHCI1394_ARRS 0x00000008 -#define OHCI1394_RQPkt 0x00000010 -#define OHCI1394_RSPkt 0x00000020 -#define OHCI1394_isochTx 0x00000040 -#define OHCI1394_isochRx 0x00000080 -#define OHCI1394_postedWriteErr 0x00000100 -#define OHCI1394_lockRespErr 0x00000200 -#define OHCI1394_selfIDComplete 0x00010000 -#define OHCI1394_busReset 0x00020000 -#define OHCI1394_regAccessFail 0x00040000 -#define OHCI1394_phy 0x00080000 -#define OHCI1394_cycleSynch 0x00100000 -#define OHCI1394_cycle64Seconds 0x00200000 -#define OHCI1394_cycleLost 0x00400000 -#define OHCI1394_cycleInconsistent 0x00800000 -#define OHCI1394_unrecoverableError 0x01000000 -#define OHCI1394_cycleTooLong 0x02000000 -#define OHCI1394_phyRegRcvd 0x04000000 -#define OHCI1394_masterIntEnable 0x80000000 - -#define OHCI1394_evt_no_status 0x0 -#define OHCI1394_evt_long_packet 0x2 -#define OHCI1394_evt_missing_ack 0x3 -#define OHCI1394_evt_underrun 0x4 -#define OHCI1394_evt_overrun 0x5 -#define OHCI1394_evt_descriptor_read 0x6 -#define OHCI1394_evt_data_read 0x7 -#define OHCI1394_evt_data_write 0x8 -#define OHCI1394_evt_bus_reset 0x9 -#define OHCI1394_evt_timeout 0xa -#define OHCI1394_evt_tcode_err 0xb -#define OHCI1394_evt_reserved_b 0xc -#define OHCI1394_evt_reserved_c 0xd -#define OHCI1394_evt_unknown 0xe -#define OHCI1394_evt_flushed 0xf - -#define OHCI1394_phy_tcode 0xe - -#endif /* __fw_ohci_h */ diff --git a/drivers/firewire/fw-sbp2.c b/drivers/firewire/fw-sbp2.c index 027b91f4a6e..d41cb6e455b 100644 --- a/drivers/firewire/fw-sbp2.c +++ b/drivers/firewire/fw-sbp2.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -58,9 +59,6 @@ #include #include -#include "fw-device.h" -#include "fw-transaction.h" - /* * So far only bridges from Oxford Semiconductor are known to support * concurrent logins. Depending on firmware, four or two concurrent logins diff --git a/drivers/firewire/fw-topology.c b/drivers/firewire/fw-topology.c index 6d0ea1bb7e2..fddf2b35893 100644 --- a/drivers/firewire/fw-topology.c +++ b/drivers/firewire/fw-topology.c @@ -20,6 +20,8 @@ #include #include +#include +#include #include #include #include @@ -31,8 +33,7 @@ #include #include -#include "fw-topology.h" -#include "fw-transaction.h" +#include "core.h" #define SELF_ID_PHY_ID(q) (((q) >> 24) & 0x3f) #define SELF_ID_EXTENDED(q) (((q) >> 23) & 0x01) @@ -45,6 +46,11 @@ #define SELF_ID_EXT_SEQUENCE(q) (((q) >> 20) & 0x07) +#define SELFID_PORT_CHILD 0x3 +#define SELFID_PORT_PARENT 0x2 +#define SELFID_PORT_NCONN 0x1 +#define SELFID_PORT_NONE 0x0 + static u32 *count_ports(u32 *sid, int *total_port_count, int *child_port_count) { u32 q; diff --git a/drivers/firewire/fw-topology.h b/drivers/firewire/fw-topology.h deleted file mode 100644 index 3c497bb4fae..00000000000 --- a/drivers/firewire/fw-topology.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (C) 2003-2006 Kristian Hoegsberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ - -#ifndef __fw_topology_h -#define __fw_topology_h - -#include -#include - -#include - -enum { - FW_NODE_CREATED, - FW_NODE_UPDATED, - FW_NODE_DESTROYED, - FW_NODE_LINK_ON, - FW_NODE_LINK_OFF, - FW_NODE_INITIATED_RESET, -}; - -struct fw_node { - u16 node_id; - u8 color; - u8 port_count; - u8 link_on : 1; - u8 initiated_reset : 1; - u8 b_path : 1; - u8 phy_speed : 2; /* As in the self ID packet. */ - u8 max_speed : 2; /* Minimum of all phy-speeds on the path from the - * local node to this node. */ - u8 max_depth : 4; /* Maximum depth to any leaf node */ - u8 max_hops : 4; /* Max hops in this sub tree */ - atomic_t ref_count; - - /* For serializing node topology into a list. */ - struct list_head link; - - /* Upper layer specific data. */ - void *data; - - struct fw_node *ports[0]; -}; - -static inline struct fw_node *fw_node_get(struct fw_node *node) -{ - atomic_inc(&node->ref_count); - - return node; -} - -static inline void fw_node_put(struct fw_node *node) -{ - if (atomic_dec_and_test(&node->ref_count)) - kfree(node); -} - -struct fw_card; -void fw_destroy_nodes(struct fw_card *card); - -int fw_compute_block_crc(u32 *block); - -#endif /* __fw_topology_h */ diff --git a/drivers/firewire/fw-transaction.c b/drivers/firewire/fw-transaction.c index 7008214e333..9a6ce9ab2a6 100644 --- a/drivers/firewire/fw-transaction.c +++ b/drivers/firewire/fw-transaction.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -38,8 +39,7 @@ #include -#include "fw-device.h" /* for fw_device_ops */ -#include "fw-transaction.h" +#include "core.h" #define HEADER_PRI(pri) ((pri) << 0) #define HEADER_TCODE(tcode) ((tcode) << 4) @@ -64,6 +64,10 @@ #define HEADER_DESTINATION_IS_BROADCAST(q) \ (((q) & HEADER_DESTINATION(0x3f)) == HEADER_DESTINATION(0x3f)) +#define PHY_PACKET_CONFIG 0x0 +#define PHY_PACKET_LINK_ON 0x1 +#define PHY_PACKET_SELF_ID 0x2 + #define PHY_CONFIG_GAP_COUNT(gap_count) (((gap_count) << 16) | (1 << 22)) #define PHY_CONFIG_ROOT_ID(node_id) ((((node_id) & 0x3f) << 24) | (1 << 23)) #define PHY_IDENTIFIER(id) ((id) << 30) diff --git a/drivers/firewire/fw-transaction.h b/drivers/firewire/fw-transaction.h deleted file mode 100644 index dfa799068f8..00000000000 --- a/drivers/firewire/fw-transaction.h +++ /dev/null @@ -1,446 +0,0 @@ -/* - * Copyright (C) 2003-2006 Kristian Hoegsberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- */ - -#ifndef __fw_transaction_h -#define __fw_transaction_h - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define TCODE_IS_READ_REQUEST(tcode) (((tcode) & ~1) == 4) -#define TCODE_IS_BLOCK_PACKET(tcode) (((tcode) & 1) != 0) -#define TCODE_IS_REQUEST(tcode) (((tcode) & 2) == 0) -#define TCODE_IS_RESPONSE(tcode) (((tcode) & 2) != 0) -#define TCODE_HAS_REQUEST_DATA(tcode) (((tcode) & 12) != 4) -#define TCODE_HAS_RESPONSE_DATA(tcode) (((tcode) & 12) != 0) - -#define LOCAL_BUS 0xffc0 - -#define SELFID_PORT_CHILD 0x3 -#define SELFID_PORT_PARENT 0x2 -#define SELFID_PORT_NCONN 0x1 -#define SELFID_PORT_NONE 0x0 - -#define PHY_PACKET_CONFIG 0x0 -#define PHY_PACKET_LINK_ON 0x1 -#define PHY_PACKET_SELF_ID 0x2 - -/* Bit fields _within_ the PHY registers. */ -#define PHY_LINK_ACTIVE 0x80 -#define PHY_CONTENDER 0x40 -#define PHY_BUS_RESET 0x40 -#define PHY_BUS_SHORT_RESET 0x40 - -#define CSR_REGISTER_BASE 0xfffff0000000ULL - -/* register offsets relative to CSR_REGISTER_BASE */ -#define CSR_STATE_CLEAR 0x0 -#define CSR_STATE_SET 0x4 -#define CSR_NODE_IDS 0x8 -#define CSR_RESET_START 0xc -#define CSR_SPLIT_TIMEOUT_HI 0x18 -#define CSR_SPLIT_TIMEOUT_LO 0x1c -#define CSR_CYCLE_TIME 0x200 -#define CSR_BUS_TIME 0x204 -#define CSR_BUSY_TIMEOUT 0x210 -#define CSR_BUS_MANAGER_ID 0x21c -#define CSR_BANDWIDTH_AVAILABLE 0x220 -#define CSR_CHANNELS_AVAILABLE 0x224 -#define CSR_CHANNELS_AVAILABLE_HI 0x224 -#define CSR_CHANNELS_AVAILABLE_LO 0x228 -#define CSR_BROADCAST_CHANNEL 0x234 -#define CSR_CONFIG_ROM 0x400 -#define CSR_CONFIG_ROM_END 0x800 -#define CSR_FCP_COMMAND 0xB00 -#define CSR_FCP_RESPONSE 0xD00 -#define CSR_FCP_END 0xF00 -#define CSR_TOPOLOGY_MAP 0x1000 -#define CSR_TOPOLOGY_MAP_END 0x1400 -#define CSR_SPEED_MAP 0x2000 -#define CSR_SPEED_MAP_END 0x3000 - -#define BANDWIDTH_AVAILABLE_INITIAL 4915 -#define BROADCAST_CHANNEL_INITIAL (1 << 31 | 31) -#define BROADCAST_CHANNEL_VALID (1 << 30) - -#define fw_notify(s, args...) printk(KERN_NOTICE KBUILD_MODNAME ": " s, ## args) -#define fw_error(s, args...) printk(KERN_ERR KBUILD_MODNAME ": " s, ## args) - -static inline void fw_memcpy_from_be32(void *_dst, void *_src, size_t size) -{ - u32 *dst = _dst; - __be32 *src = _src; - int i; - - for (i = 0; i < size / 4; i++) - dst[i] = be32_to_cpu(src[i]); -} - -static inline void fw_memcpy_to_be32(void *_dst, void *_src, size_t size) -{ - fw_memcpy_from_be32(_dst, _src, size); -} - -struct fw_card; -struct fw_packet; -struct fw_node; -struct fw_request; - -struct fw_descriptor { - struct list_head link; - size_t length; - u32 immediate; - u32 key; - const u32 *data; -}; - -int fw_core_add_descriptor(struct fw_descriptor *desc); -void fw_core_remove_descriptor(struct fw_descriptor *desc); - -typedef void (*fw_packet_callback_t)(struct fw_packet *packet, - struct fw_card *card, int status); - -typedef void (*fw_transaction_callback_t)(struct fw_card *card, int rcode, - void *data, size_t length, - void *callback_data); - -/* - * Important note: The callback must guarantee that either fw_send_response() - * or kfree() is called on the @request. 
- */ -typedef void (*fw_address_callback_t)(struct fw_card *card, - struct fw_request *request, - int tcode, int destination, int source, - int generation, int speed, - unsigned long long offset, - void *data, size_t length, - void *callback_data); - -struct fw_packet { - int speed; - int generation; - u32 header[4]; - size_t header_length; - void *payload; - size_t payload_length; - dma_addr_t payload_bus; - u32 timestamp; - - /* - * This callback is called when the packet transmission has - * completed; for successful transmission, the status code is - * the ack received from the destination, otherwise it's a - * negative errno: ENOMEM, ESTALE, ETIMEDOUT, ENODEV, EIO. - * The callback can be called from tasklet context and thus - * must never block. - */ - fw_packet_callback_t callback; - int ack; - struct list_head link; - void *driver_data; -}; - -struct fw_transaction { - int node_id; /* The generation is implied; it is always the current. */ - int tlabel; - int timestamp; - struct list_head link; - - struct fw_packet packet; - - /* - * The data passed to the callback is valid only during the - * callback. - */ - fw_transaction_callback_t callback; - void *callback_data; -}; - -struct fw_address_handler { - u64 offset; - size_t length; - fw_address_callback_t address_callback; - void *callback_data; - struct list_head link; -}; - -struct fw_address_region { - u64 start; - u64 end; -}; - -extern const struct fw_address_region fw_high_memory_region; - -int fw_core_add_address_handler(struct fw_address_handler *handler, - const struct fw_address_region *region); -void fw_core_remove_address_handler(struct fw_address_handler *handler); -void fw_fill_response(struct fw_packet *response, u32 *request_header, - int rcode, void *payload, size_t length); -void fw_send_response(struct fw_card *card, - struct fw_request *request, int rcode); - -extern struct bus_type fw_bus_type; - -struct fw_card { - const struct fw_card_driver *driver; - struct device *device; - struct kref kref; - struct completion done; - - int node_id; - int generation; - int current_tlabel, tlabel_mask; - struct list_head transaction_list; - struct timer_list flush_timer; - unsigned long reset_jiffies; - - unsigned long long guid; - unsigned max_receive; - int link_speed; - int config_rom_generation; - - spinlock_t lock; /* Take this lock when handling the lists in - * this struct. */ - struct fw_node *local_node; - struct fw_node *root_node; - struct fw_node *irm_node; - u8 color; /* must be u8 to match the definition in struct fw_node */ - int gap_count; - bool beta_repeaters_present; - - int index; - - struct list_head link; - - /* Work struct for BM duties. */ - struct delayed_work work; - int bm_retries; - int bm_generation; - - bool broadcast_channel_allocated; - u32 broadcast_channel; - u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; -}; - -static inline struct fw_card *fw_card_get(struct fw_card *card) -{ - kref_get(&card->kref); - - return card; -} - -void fw_card_release(struct kref *kref); - -static inline void fw_card_put(struct fw_card *card) -{ - kref_put(&card->kref, fw_card_release); -} - -extern void fw_schedule_bm_work(struct fw_card *card, unsigned long delay); - -/* - * Check whether new_generation is the immediate successor of old_generation. - * Take counter roll-over at 255 (as per to OHCI) into account. 
- */ -static inline bool is_next_generation(int new_generation, int old_generation) -{ - return (new_generation & 0xff) == ((old_generation + 1) & 0xff); -} - -/* - * The iso packet format allows for an immediate header/payload part - * stored in 'header' immediately after the packet info plus an - * indirect payload part that is pointed to by the 'payload' field. - * Applications can use one or the other or both to implement simple - * low-bandwidth streaming (e.g. audio) or more advanced - * scatter-gather streaming (e.g. assembling video frames automatically). - */ - -struct fw_iso_packet { - u16 payload_length; /* Length of indirect payload. */ - u32 interrupt : 1; /* Generate interrupt on this packet */ - u32 skip : 1; /* Set to not send packet at all. */ - u32 tag : 2; - u32 sy : 4; - u32 header_length : 8; /* Length of immediate header. */ - u32 header[0]; -}; - -#define FW_ISO_CONTEXT_TRANSMIT 0 -#define FW_ISO_CONTEXT_RECEIVE 1 - -#define FW_ISO_CONTEXT_MATCH_TAG0 1 -#define FW_ISO_CONTEXT_MATCH_TAG1 2 -#define FW_ISO_CONTEXT_MATCH_TAG2 4 -#define FW_ISO_CONTEXT_MATCH_TAG3 8 -#define FW_ISO_CONTEXT_MATCH_ALL_TAGS 15 - -struct fw_iso_context; - -typedef void (*fw_iso_callback_t)(struct fw_iso_context *context, - u32 cycle, size_t header_length, - void *header, void *data); - -/* - * An iso buffer is just a set of pages mapped for DMA in the - * specified direction. Since the pages are to be used for DMA, they - * are not mapped into the kernel virtual address space. We store the - * DMA address in the page private. The helper function - * fw_iso_buffer_map() will map the pages into a given vma. - */ - -struct fw_iso_buffer { - enum dma_data_direction direction; - struct page **pages; - int page_count; -}; - -struct fw_iso_context { - struct fw_card *card; - int type; - int channel; - int speed; - size_t header_size; - fw_iso_callback_t callback; - void *callback_data; -}; - -int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card, - int page_count, enum dma_data_direction direction); -int fw_iso_buffer_map(struct fw_iso_buffer *buffer, struct vm_area_struct *vma); -void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, struct fw_card *card); - -struct fw_iso_context *fw_iso_context_create(struct fw_card *card, - int type, int channel, int speed, size_t header_size, - fw_iso_callback_t callback, void *callback_data); -int fw_iso_context_queue(struct fw_iso_context *ctx, - struct fw_iso_packet *packet, - struct fw_iso_buffer *buffer, - unsigned long payload); -int fw_iso_context_start(struct fw_iso_context *ctx, - int cycle, int sync, int tags); -int fw_iso_context_stop(struct fw_iso_context *ctx); -void fw_iso_context_destroy(struct fw_iso_context *ctx); - -void fw_iso_resource_manage(struct fw_card *card, int generation, - u64 channels_mask, int *channel, int *bandwidth, bool allocate); - -struct fw_card_driver { - /* - * Enable the given card with the given initial config rom. - * This function is expected to activate the card, and either - * enable the PHY or set the link_on bit and initiate a bus - * reset. - */ - int (*enable)(struct fw_card *card, u32 *config_rom, size_t length); - - int (*update_phy_reg)(struct fw_card *card, int address, - int clear_bits, int set_bits); - - /* - * Update the config rom for an enabled card. This function - * should change the config rom that is presented on the bus - * and initiate a bus reset.
- */ - int (*set_config_rom)(struct fw_card *card, - u32 *config_rom, size_t length); - - void (*send_request)(struct fw_card *card, struct fw_packet *packet); - void (*send_response)(struct fw_card *card, struct fw_packet *packet); - /* Calling cancel is valid once a packet has been submitted. */ - int (*cancel_packet)(struct fw_card *card, struct fw_packet *packet); - - /* - * Allow the specified node ID to do direct DMA out and in of - * host memory. The card will disable this for all nodes when - * a bus reset happens, so drivers need to re-enable this after - * a bus reset. Returns 0 on success, -ENODEV if the card - * doesn't support this, -ESTALE if the generation doesn't - * match. - */ - int (*enable_phys_dma)(struct fw_card *card, - int node_id, int generation); - - u64 (*get_bus_time)(struct fw_card *card); - - struct fw_iso_context * - (*allocate_iso_context)(struct fw_card *card, - int type, int channel, size_t header_size); - void (*free_iso_context)(struct fw_iso_context *ctx); - - int (*start_iso)(struct fw_iso_context *ctx, - s32 cycle, u32 sync, u32 tags); - - int (*queue_iso)(struct fw_iso_context *ctx, - struct fw_iso_packet *packet, - struct fw_iso_buffer *buffer, - unsigned long payload); - - int (*stop_iso)(struct fw_iso_context *ctx); -}; - -int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset); - -void fw_send_request(struct fw_card *card, struct fw_transaction *t, - int tcode, int destination_id, int generation, int speed, - unsigned long long offset, void *payload, size_t length, - fw_transaction_callback_t callback, void *callback_data); -int fw_cancel_transaction(struct fw_card *card, - struct fw_transaction *transaction); -void fw_flush_transactions(struct fw_card *card); -int fw_run_transaction(struct fw_card *card, int tcode, int destination_id, - int generation, int speed, unsigned long long offset, - void *payload, size_t length); -void fw_send_phy_config(struct fw_card *card, - int node_id, int generation, int gap_count); - -static inline int fw_stream_packet_destination_id(int tag, int channel, int sy) -{ - return tag << 14 | channel << 8 | sy; -} - -/* - * Called by the topology code to inform the device code of node - * activity; found, lost, or updated nodes.
- */ -void fw_node_event(struct fw_card *card, struct fw_node *node, int event); - -/* API used by card level drivers */ - -void fw_card_initialize(struct fw_card *card, - const struct fw_card_driver *driver, struct device *device); -int fw_card_add(struct fw_card *card, - u32 max_receive, u32 link_speed, u64 guid); -void fw_core_remove_card(struct fw_card *card); -void fw_core_handle_bus_reset(struct fw_card *card, int node_id, - int generation, int self_id_count, u32 *self_ids); -void fw_core_handle_request(struct fw_card *card, struct fw_packet *request); -void fw_core_handle_response(struct fw_card *card, struct fw_packet *packet); - -extern int fw_irm_set_broadcast_channel_register(struct device *dev, - void *data); - -#endif /* __fw_transaction_h */ diff --git a/drivers/firewire/ohci.h b/drivers/firewire/ohci.h new file mode 100644 index 00000000000..ba492d85c51 --- /dev/null +++ b/drivers/firewire/ohci.h @@ -0,0 +1,157 @@ +#ifndef _FIREWIRE_OHCI_H +#define _FIREWIRE_OHCI_H + +/* OHCI register map */ + +#define OHCI1394_Version 0x000 +#define OHCI1394_GUID_ROM 0x004 +#define OHCI1394_ATRetries 0x008 +#define OHCI1394_CSRData 0x00C +#define OHCI1394_CSRCompareData 0x010 +#define OHCI1394_CSRControl 0x014 +#define OHCI1394_ConfigROMhdr 0x018 +#define OHCI1394_BusID 0x01C +#define OHCI1394_BusOptions 0x020 +#define OHCI1394_GUIDHi 0x024 +#define OHCI1394_GUIDLo 0x028 +#define OHCI1394_ConfigROMmap 0x034 +#define OHCI1394_PostedWriteAddressLo 0x038 +#define OHCI1394_PostedWriteAddressHi 0x03C +#define OHCI1394_VendorID 0x040 +#define OHCI1394_HCControlSet 0x050 +#define OHCI1394_HCControlClear 0x054 +#define OHCI1394_HCControl_BIBimageValid 0x80000000 +#define OHCI1394_HCControl_noByteSwapData 0x40000000 +#define OHCI1394_HCControl_programPhyEnable 0x00800000 +#define OHCI1394_HCControl_aPhyEnhanceEnable 0x00400000 +#define OHCI1394_HCControl_LPS 0x00080000 +#define OHCI1394_HCControl_postedWriteEnable 0x00040000 +#define OHCI1394_HCControl_linkEnable 0x00020000 +#define OHCI1394_HCControl_softReset 0x00010000 +#define OHCI1394_SelfIDBuffer 0x064 +#define OHCI1394_SelfIDCount 0x068 +#define OHCI1394_SelfIDCount_selfIDError 0x80000000 +#define OHCI1394_IRMultiChanMaskHiSet 0x070 +#define OHCI1394_IRMultiChanMaskHiClear 0x074 +#define OHCI1394_IRMultiChanMaskLoSet 0x078 +#define OHCI1394_IRMultiChanMaskLoClear 0x07C +#define OHCI1394_IntEventSet 0x080 +#define OHCI1394_IntEventClear 0x084 +#define OHCI1394_IntMaskSet 0x088 +#define OHCI1394_IntMaskClear 0x08C +#define OHCI1394_IsoXmitIntEventSet 0x090 +#define OHCI1394_IsoXmitIntEventClear 0x094 +#define OHCI1394_IsoXmitIntMaskSet 0x098 +#define OHCI1394_IsoXmitIntMaskClear 0x09C +#define OHCI1394_IsoRecvIntEventSet 0x0A0 +#define OHCI1394_IsoRecvIntEventClear 0x0A4 +#define OHCI1394_IsoRecvIntMaskSet 0x0A8 +#define OHCI1394_IsoRecvIntMaskClear 0x0AC +#define OHCI1394_InitialBandwidthAvailable 0x0B0 +#define OHCI1394_InitialChannelsAvailableHi 0x0B4 +#define OHCI1394_InitialChannelsAvailableLo 0x0B8 +#define OHCI1394_FairnessControl 0x0DC +#define OHCI1394_LinkControlSet 0x0E0 +#define OHCI1394_LinkControlClear 0x0E4 +#define OHCI1394_LinkControl_rcvSelfID (1 << 9) +#define OHCI1394_LinkControl_rcvPhyPkt (1 << 10) +#define OHCI1394_LinkControl_cycleTimerEnable (1 << 20) +#define OHCI1394_LinkControl_cycleMaster (1 << 21) +#define OHCI1394_LinkControl_cycleSource (1 << 22) +#define OHCI1394_NodeID 0x0E8 +#define OHCI1394_NodeID_idValid 0x80000000 +#define OHCI1394_NodeID_nodeNumber 0x0000003f +#define OHCI1394_NodeID_busNumber 0x0000ffc0 
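/*
 * Illustrative aside, not part of the patch: with the masks above, a value
 * 'reg' read from the NodeID register decodes as sketched below ('reg' is a
 * hypothetical variable; the shift follows from the busNumber mask
 * 0x0000ffc0, i.e. bits 15:6). Both fields are meaningful only while
 * OHCI1394_NodeID_idValid is set in 'reg'.
 */
	node = reg & OHCI1394_NodeID_nodeNumber;       /* bits 5:0  */
	bus  = (reg & OHCI1394_NodeID_busNumber) >> 6; /* bits 15:6 */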
+#define OHCI1394_PhyControl 0x0EC +#define OHCI1394_PhyControl_Read(addr) (((addr) << 8) | 0x00008000) +#define OHCI1394_PhyControl_ReadDone 0x80000000 +#define OHCI1394_PhyControl_ReadData(r) (((r) & 0x00ff0000) >> 16) +#define OHCI1394_PhyControl_Write(addr, data) (((addr) << 8) | (data) | 0x00004000) +#define OHCI1394_PhyControl_WriteDone 0x00004000 +#define OHCI1394_IsochronousCycleTimer 0x0F0 +#define OHCI1394_AsReqFilterHiSet 0x100 +#define OHCI1394_AsReqFilterHiClear 0x104 +#define OHCI1394_AsReqFilterLoSet 0x108 +#define OHCI1394_AsReqFilterLoClear 0x10C +#define OHCI1394_PhyReqFilterHiSet 0x110 +#define OHCI1394_PhyReqFilterHiClear 0x114 +#define OHCI1394_PhyReqFilterLoSet 0x118 +#define OHCI1394_PhyReqFilterLoClear 0x11C +#define OHCI1394_PhyUpperBound 0x120 + +#define OHCI1394_AsReqTrContextBase 0x180 +#define OHCI1394_AsReqTrContextControlSet 0x180 +#define OHCI1394_AsReqTrContextControlClear 0x184 +#define OHCI1394_AsReqTrCommandPtr 0x18C + +#define OHCI1394_AsRspTrContextBase 0x1A0 +#define OHCI1394_AsRspTrContextControlSet 0x1A0 +#define OHCI1394_AsRspTrContextControlClear 0x1A4 +#define OHCI1394_AsRspTrCommandPtr 0x1AC + +#define OHCI1394_AsReqRcvContextBase 0x1C0 +#define OHCI1394_AsReqRcvContextControlSet 0x1C0 +#define OHCI1394_AsReqRcvContextControlClear 0x1C4 +#define OHCI1394_AsReqRcvCommandPtr 0x1CC + +#define OHCI1394_AsRspRcvContextBase 0x1E0 +#define OHCI1394_AsRspRcvContextControlSet 0x1E0 +#define OHCI1394_AsRspRcvContextControlClear 0x1E4 +#define OHCI1394_AsRspRcvCommandPtr 0x1EC + +/* Isochronous transmit registers */ +#define OHCI1394_IsoXmitContextBase(n) (0x200 + 16 * (n)) +#define OHCI1394_IsoXmitContextControlSet(n) (0x200 + 16 * (n)) +#define OHCI1394_IsoXmitContextControlClear(n) (0x204 + 16 * (n)) +#define OHCI1394_IsoXmitCommandPtr(n) (0x20C + 16 * (n)) + +/* Isochronous receive registers */ +#define OHCI1394_IsoRcvContextBase(n) (0x400 + 32 * (n)) +#define OHCI1394_IsoRcvContextControlSet(n) (0x400 + 32 * (n)) +#define OHCI1394_IsoRcvContextControlClear(n) (0x404 + 32 * (n)) +#define OHCI1394_IsoRcvCommandPtr(n) (0x40C + 32 * (n)) +#define OHCI1394_IsoRcvContextMatch(n) (0x410 + 32 * (n)) + +/* Interrupts Mask/Events */ +#define OHCI1394_reqTxComplete 0x00000001 +#define OHCI1394_respTxComplete 0x00000002 +#define OHCI1394_ARRQ 0x00000004 +#define OHCI1394_ARRS 0x00000008 +#define OHCI1394_RQPkt 0x00000010 +#define OHCI1394_RSPkt 0x00000020 +#define OHCI1394_isochTx 0x00000040 +#define OHCI1394_isochRx 0x00000080 +#define OHCI1394_postedWriteErr 0x00000100 +#define OHCI1394_lockRespErr 0x00000200 +#define OHCI1394_selfIDComplete 0x00010000 +#define OHCI1394_busReset 0x00020000 +#define OHCI1394_regAccessFail 0x00040000 +#define OHCI1394_phy 0x00080000 +#define OHCI1394_cycleSynch 0x00100000 +#define OHCI1394_cycle64Seconds 0x00200000 +#define OHCI1394_cycleLost 0x00400000 +#define OHCI1394_cycleInconsistent 0x00800000 +#define OHCI1394_unrecoverableError 0x01000000 +#define OHCI1394_cycleTooLong 0x02000000 +#define OHCI1394_phyRegRcvd 0x04000000 +#define OHCI1394_masterIntEnable 0x80000000 + +#define OHCI1394_evt_no_status 0x0 +#define OHCI1394_evt_long_packet 0x2 +#define OHCI1394_evt_missing_ack 0x3 +#define OHCI1394_evt_underrun 0x4 +#define OHCI1394_evt_overrun 0x5 +#define OHCI1394_evt_descriptor_read 0x6 +#define OHCI1394_evt_data_read 0x7 +#define OHCI1394_evt_data_write 0x8 +#define OHCI1394_evt_bus_reset 0x9 +#define OHCI1394_evt_timeout 0xa +#define OHCI1394_evt_tcode_err 0xb +#define OHCI1394_evt_reserved_b 0xc +#define 
OHCI1394_evt_reserved_c 0xd +#define OHCI1394_evt_unknown 0xe +#define OHCI1394_evt_flushed 0xf + +#define OHCI1394_phy_tcode 0xe + +#endif /* _FIREWIRE_OHCI_H */ diff --git a/include/linux/firewire.h b/include/linux/firewire.h new file mode 100644 index 00000000000..e979f9b22cb --- /dev/null +++ b/include/linux/firewire.h @@ -0,0 +1,350 @@ +#ifndef _LINUX_FIREWIRE_H +#define _LINUX_FIREWIRE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define fw_notify(s, args...) printk(KERN_NOTICE KBUILD_MODNAME ": " s, ## args) +#define fw_error(s, args...) printk(KERN_ERR KBUILD_MODNAME ": " s, ## args) + +static inline void fw_memcpy_from_be32(void *_dst, void *_src, size_t size) +{ + u32 *dst = _dst; + __be32 *src = _src; + int i; + + for (i = 0; i < size / 4; i++) + dst[i] = be32_to_cpu(src[i]); +} + +static inline void fw_memcpy_to_be32(void *_dst, void *_src, size_t size) +{ + fw_memcpy_from_be32(_dst, _src, size); +} +#define CSR_REGISTER_BASE 0xfffff0000000ULL + +/* register offsets are relative to CSR_REGISTER_BASE */ +#define CSR_STATE_CLEAR 0x0 +#define CSR_STATE_SET 0x4 +#define CSR_NODE_IDS 0x8 +#define CSR_RESET_START 0xc +#define CSR_SPLIT_TIMEOUT_HI 0x18 +#define CSR_SPLIT_TIMEOUT_LO 0x1c +#define CSR_CYCLE_TIME 0x200 +#define CSR_BUS_TIME 0x204 +#define CSR_BUSY_TIMEOUT 0x210 +#define CSR_BUS_MANAGER_ID 0x21c +#define CSR_BANDWIDTH_AVAILABLE 0x220 +#define CSR_CHANNELS_AVAILABLE 0x224 +#define CSR_CHANNELS_AVAILABLE_HI 0x224 +#define CSR_CHANNELS_AVAILABLE_LO 0x228 +#define CSR_BROADCAST_CHANNEL 0x234 +#define CSR_CONFIG_ROM 0x400 +#define CSR_CONFIG_ROM_END 0x800 +#define CSR_FCP_COMMAND 0xB00 +#define CSR_FCP_RESPONSE 0xD00 +#define CSR_FCP_END 0xF00 +#define CSR_TOPOLOGY_MAP 0x1000 +#define CSR_TOPOLOGY_MAP_END 0x1400 +#define CSR_SPEED_MAP 0x2000 +#define CSR_SPEED_MAP_END 0x3000 + +#define CSR_OFFSET 0x40 +#define CSR_LEAF 0x80 +#define CSR_DIRECTORY 0xc0 + +#define CSR_DESCRIPTOR 0x01 +#define CSR_VENDOR 0x03 +#define CSR_HARDWARE_VERSION 0x04 +#define CSR_NODE_CAPABILITIES 0x0c +#define CSR_UNIT 0x11 +#define CSR_SPECIFIER_ID 0x12 +#define CSR_VERSION 0x13 +#define CSR_DEPENDENT_INFO 0x14 +#define CSR_MODEL 0x17 +#define CSR_INSTANCE 0x18 +#define CSR_DIRECTORY_ID 0x20 + +struct fw_csr_iterator { + u32 *p; + u32 *end; +}; + +void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 *p); +int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value); + +extern struct bus_type fw_bus_type; + +struct fw_card_driver; +struct fw_node; + +struct fw_card { + const struct fw_card_driver *driver; + struct device *device; + struct kref kref; + struct completion done; + + int node_id; + int generation; + int current_tlabel, tlabel_mask; + struct list_head transaction_list; + struct timer_list flush_timer; + unsigned long reset_jiffies; + + unsigned long long guid; + unsigned max_receive; + int link_speed; + int config_rom_generation; + + spinlock_t lock; /* Take this lock when handling the lists in + * this struct. */ + struct fw_node *local_node; + struct fw_node *root_node; + struct fw_node *irm_node; + u8 color; /* must be u8 to match the definition in struct fw_node */ + int gap_count; + bool beta_repeaters_present; + + int index; + + struct list_head link; + + /* Work struct for BM duties. 
*/ + struct delayed_work work; + int bm_retries; + int bm_generation; + + bool broadcast_channel_allocated; + u32 broadcast_channel; + u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; +}; + +static inline struct fw_card *fw_card_get(struct fw_card *card) +{ + kref_get(&card->kref); + + return card; +} + +void fw_card_release(struct kref *kref); + +static inline void fw_card_put(struct fw_card *card) +{ + kref_put(&card->kref, fw_card_release); +} + +struct fw_attribute_group { + struct attribute_group *groups[2]; + struct attribute_group group; + struct attribute *attrs[12]; +}; + +enum fw_device_state { + FW_DEVICE_INITIALIZING, + FW_DEVICE_RUNNING, + FW_DEVICE_GONE, + FW_DEVICE_SHUTDOWN, +}; + +/* + * Note, fw_device.generation always has to be read before fw_device.node_id. + * Use SMP memory barriers to ensure this. Otherwise requests will be sent + * to an outdated node_id if the generation was updated in the meantime due + * to a bus reset. + * + * Likewise, fw-core will take care to update .node_id before .generation so + * that whenever fw_device.generation is current WRT the actual bus generation, + * fw_device.node_id is guaranteed to be current too. + * + * The same applies to fw_device.card->node_id vs. fw_device.generation. + * + * fw_device.config_rom and fw_device.config_rom_length may be accessed during + * the lifetime of any fw_unit belonging to the fw_device, before device_del() + * was called on the last fw_unit. Alternatively, they may be accessed while + * holding fw_device_rwsem. + */ +struct fw_device { + atomic_t state; + struct fw_node *node; + int node_id; + int generation; + unsigned max_speed; + struct fw_card *card; + struct device device; + + struct mutex client_list_mutex; + struct list_head client_list; + + u32 *config_rom; + size_t config_rom_length; + int config_rom_retries; + unsigned is_local:1; + unsigned cmc:1; + unsigned bc_implemented:2; + + struct delayed_work work; + struct fw_attribute_group attribute_group; +}; + +static inline struct fw_device *fw_device(struct device *dev) +{ + return container_of(dev, struct fw_device, device); +} + +static inline int fw_device_is_shutdown(struct fw_device *device) +{ + return atomic_read(&device->state) == FW_DEVICE_SHUTDOWN; +} + +static inline struct fw_device *fw_device_get(struct fw_device *device) +{ + get_device(&device->device); + + return device; +} + +static inline void fw_device_put(struct fw_device *device) +{ + put_device(&device->device); +} + +int fw_device_enable_phys_dma(struct fw_device *device); + +/* + * fw_unit.directory must not be accessed after device_del(&fw_unit.device). + */ +struct fw_unit { + struct device device; + u32 *directory; + struct fw_attribute_group attribute_group; +}; + +static inline struct fw_unit *fw_unit(struct device *dev) +{ + return container_of(dev, struct fw_unit, device); +} + +static inline struct fw_unit *fw_unit_get(struct fw_unit *unit) +{ + get_device(&unit->device); + + return unit; +} + +static inline void fw_unit_put(struct fw_unit *unit) +{ + put_device(&unit->device); +} + +struct ieee1394_device_id; + +struct fw_driver { + struct device_driver driver; + /* Called when the parent device sits through a bus reset. 
*/ + void (*update)(struct fw_unit *unit); + const struct ieee1394_device_id *id_table; +}; + +struct fw_packet; +struct fw_request; + +typedef void (*fw_packet_callback_t)(struct fw_packet *packet, + struct fw_card *card, int status); +typedef void (*fw_transaction_callback_t)(struct fw_card *card, int rcode, + void *data, size_t length, + void *callback_data); +/* + * Important note: The callback must guarantee that either fw_send_response() + * or kfree() is called on the @request. + */ +typedef void (*fw_address_callback_t)(struct fw_card *card, + struct fw_request *request, + int tcode, int destination, int source, + int generation, int speed, + unsigned long long offset, + void *data, size_t length, + void *callback_data); + +struct fw_packet { + int speed; + int generation; + u32 header[4]; + size_t header_length; + void *payload; + size_t payload_length; + dma_addr_t payload_bus; + u32 timestamp; + + /* + * This callback is called when the packet transmission has + * completed; for successful transmission, the status code is + * the ack received from the destination, otherwise it's a + * negative errno: ENOMEM, ESTALE, ETIMEDOUT, ENODEV, EIO. + * The callback can be called from tasklet context and thus + * must never block. + */ + fw_packet_callback_t callback; + int ack; + struct list_head link; + void *driver_data; +}; + +struct fw_transaction { + int node_id; /* The generation is implied; it is always the current. */ + int tlabel; + int timestamp; + struct list_head link; + + struct fw_packet packet; + + /* + * The data passed to the callback is valid only during the + * callback. + */ + fw_transaction_callback_t callback; + void *callback_data; +}; + +struct fw_address_handler { + u64 offset; + size_t length; + fw_address_callback_t address_callback; + void *callback_data; + struct list_head link; +}; + +struct fw_address_region { + u64 start; + u64 end; +}; + +extern const struct fw_address_region fw_high_memory_region; + +int fw_core_add_address_handler(struct fw_address_handler *handler, + const struct fw_address_region *region); +void fw_core_remove_address_handler(struct fw_address_handler *handler); +void fw_send_response(struct fw_card *card, + struct fw_request *request, int rcode); +void fw_send_request(struct fw_card *card, struct fw_transaction *t, + int tcode, int destination_id, int generation, int speed, + unsigned long long offset, void *payload, size_t length, + fw_transaction_callback_t callback, void *callback_data); +int fw_cancel_transaction(struct fw_card *card, + struct fw_transaction *transaction); +int fw_run_transaction(struct fw_card *card, int tcode, int destination_id, + int generation, int speed, unsigned long long offset, + void *payload, size_t length); + +#endif /* _LINUX_FIREWIRE_H */ -- cgit v1.2.3-70-g09d2 From 1b8e69662e1a086878bf930a6042daf7f8a076cc Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 5 Jun 2009 14:37:23 +0000 Subject: pnp: add PNP resource range checking function Add a PNP resource range check function, indicating whether a resource has been assigned to any device. 
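A minimal sketch of a caller, assuming linux/pnp.h is included; the function name and port range are purely illustrative:

	static int example_check_legacy_ports(void)
	{
		/* hypothetical legacy I/O window */
		resource_size_t start = 0x2e8, end = 0x2ef;

		/* bail out if PNP has already assigned this range to a device */
		if (pnp_range_reserved(start, end))
			return -EBUSY;

		/* ... otherwise it is safe to go on and request the region ... */
		return 0;
	}

As the diff below shows, pnp_range_reserved() returns 1 on a conflict and 0 otherwise, and the !CONFIG_PNP stub always reports the range as free.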
Signed-off-by: Bjorn Helgaas [apw@canonical.com: fixed up exports et al] Signed-off-by: Andy Whitcroft Signed-off-by: Eric Anholt --- drivers/pnp/resource.c | 18 ++++++++++++++++++ include/linux/pnp.h | 2 ++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/pnp/resource.c b/drivers/pnp/resource.c index f604061d2bb..ba976542788 100644 --- a/drivers/pnp/resource.c +++ b/drivers/pnp/resource.c @@ -638,6 +638,24 @@ int pnp_possible_config(struct pnp_dev *dev, int type, resource_size_t start, } EXPORT_SYMBOL(pnp_possible_config); +int pnp_range_reserved(resource_size_t start, resource_size_t end) +{ + struct pnp_dev *dev; + struct pnp_resource *pnp_res; + resource_size_t *dev_start, *dev_end; + + pnp_for_each_dev(dev) { + list_for_each_entry(pnp_res, &dev->resources, list) { + dev_start = &pnp_res->res.start; + dev_end = &pnp_res->res.end; + if (ranged_conflict(&start, &end, dev_start, dev_end)) + return 1; + } + } + return 0; +} +EXPORT_SYMBOL(pnp_range_reserved); + /* format is: pnp_reserve_irq=irq1[,irq2] .... */ static int __init pnp_setup_reserve_irq(char *str) { diff --git a/include/linux/pnp.h b/include/linux/pnp.h index ca3c8877302..b063c7328ba 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -446,6 +446,7 @@ int pnp_start_dev(struct pnp_dev *dev); int pnp_stop_dev(struct pnp_dev *dev); int pnp_activate_dev(struct pnp_dev *dev); int pnp_disable_dev(struct pnp_dev *dev); +int pnp_range_reserved(resource_size_t start, resource_size_t end); /* protocol helpers */ int pnp_is_active(struct pnp_dev *dev); @@ -476,6 +477,7 @@ static inline int pnp_start_dev(struct pnp_dev *dev) { return -ENODEV; } static inline int pnp_stop_dev(struct pnp_dev *dev) { return -ENODEV; } static inline int pnp_activate_dev(struct pnp_dev *dev) { return -ENODEV; } static inline int pnp_disable_dev(struct pnp_dev *dev) { return -ENODEV; } +static inline int pnp_range_reserved(resource_size_t start, resource_size_t end) { return 0;} /* protocol helpers */ static inline int pnp_is_active(struct pnp_dev *dev) { return 0; } -- cgit v1.2.3-70-g09d2 From ac4bcf889469ffbca88f234d3184452886a47905 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:44:52 +0200 Subject: perf_counter: Change PERF_SAMPLE_CONFIG into PERF_SAMPLE_ID The purpose of PERF_SAMPLE_CONFIG was to identify the counters; since then we've added counter ids, so use those instead.
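To illustrate, a sample emitted with, say, sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ID now carries the counter id where the raw attr.config used to go; a reader-side sketch of the resulting layout (fields appear in sample_type bit order):

	struct sample {
		struct perf_event_header	header;
		u64				ip;	/* PERF_SAMPLE_IP */
		u64				id;	/* counter->id, unique per counter */
	};

Unlike attr.config, which two counters can legitimately share, the id is unique, so samples can be demultiplexed unambiguously.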
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 40dc0e273d9..9cea32a0655 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -104,7 +104,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_ADDR = 1U << 3, PERF_SAMPLE_GROUP = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_CONFIG = 1U << 6, + PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 37a5a241ca7..e75b91a76a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2392,8 +2392,8 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_CONFIG) { - header.type |= PERF_SAMPLE_CONFIG; + if (sample_type & PERF_SAMPLE_ID) { + header.type |= PERF_SAMPLE_ID; header.size += sizeof(u64); } @@ -2439,8 +2439,8 @@ static void perf_counter_output(struct perf_counter *counter, if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, addr); - if (sample_type & PERF_SAMPLE_CONFIG) - perf_output_put(&handle, counter->attr.config); + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(&handle, counter->id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); -- cgit v1.2.3-70-g09d2 From 689802b2d0536e72281dc959ab9cb34fb3c304cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 15:05:43 +0200 Subject: perf_counter: Add PERF_SAMPLE_PERIOD In order to allow easy tracking of the period, also provide means of adding it to the sample data. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9cea32a0655..6bc25003973 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -106,6 +106,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, }; /* @@ -260,6 +261,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; + * u64 id; * u64 sample_period; * }; */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e75b91a76a5..f8390668c39 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2404,6 +2404,11 @@ static void perf_counter_output(struct perf_counter *counter, cpu_entry.cpu = raw_smp_processor_id(); } + if (sample_type & PERF_SAMPLE_PERIOD) { + header.type |= PERF_SAMPLE_PERIOD; + header.size += sizeof(u64); + } + if (sample_type & PERF_SAMPLE_GROUP) { header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + @@ -2445,6 +2450,9 @@ static void perf_counter_output(struct perf_counter *counter, if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(&handle, counter->hw.sample_period); + /* * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 
*/ @@ -2835,6 +2843,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) struct { struct perf_event_header header; u64 time; + u64 id; u64 period; } freq_event = { .header = { @@ -2843,6 +2852,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) .size = sizeof(freq_event), }, .time = sched_clock(), + .id = counter->id, .period = period, }; -- cgit v1.2.3-70-g09d2 From 6a24ed6c6082ec65d19331a4bfa30c0512a1a822 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 18:01:29 +0200 Subject: perf_counter: Fix frequency adjustment for < HZ Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ kernel/perf_counter.c | 32 +++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6bc25003973..4f9d39ecdc0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -373,6 +373,9 @@ struct hw_perf_counter { u64 sample_period; atomic64_t period_left; u64 interrupts; + + u64 freq_count; + u64 freq_interrupts; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f8390668c39..47c92fb927f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1187,8 +1187,9 @@ static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; + struct hw_perf_counter *hwc; u64 interrupts, sample_period; - u64 events, period; + u64 events, period, freq; s64 delta; spin_lock(&ctx->lock); @@ -1196,8 +1197,10 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; - interrupts = counter->hw.interrupts; - counter->hw.interrupts = 0; + hwc = &counter->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); @@ -1208,20 +1211,35 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (!counter->attr.freq || !counter->attr.sample_freq) continue; - events = HZ * interrupts * counter->hw.sample_period; + if (counter->attr.sample_freq < HZ) { + freq = counter->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + events = freq * interrupts * hwc->sample_period; period = div64_u64(events, counter->attr.sample_freq); - delta = (s64)(1 + period - counter->hw.sample_period); + delta = (s64)(1 + period - hwc->sample_period); delta >>= 1; - sample_period = counter->hw.sample_period + delta; + sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; perf_log_period(counter, sample_period); - counter->hw.sample_period = sample_period; + hwc->sample_period = sample_period; } spin_unlock(&ctx->lock); } -- cgit v1.2.3-70-g09d2 From ee8f37688966ab1438d0cf42e0cb7c6595d9592c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 5 May 2009 11:04:19 +0300 Subject: mtd: OneNAND: add support for OneNAND manufactured by Numonyx In addition to adding the Numonyx manufacturer code, this patch also ensures 'sync. 
write' is disabled when reading identification data - something that the Numonyx chip objects to, but the Samsung chip seems to ignore. Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/onenand/onenand_base.c | 3 ++- include/linux/mtd/onenand.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 30d6999e5f9..2346857a275 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -2576,6 +2576,7 @@ static void onenand_print_device_info(int device, int version) static const struct onenand_manufacturers onenand_manuf_ids[] = { {ONENAND_MFR_SAMSUNG, "Samsung"}, + {ONENAND_MFR_NUMONYX, "Numonyx"}, }; /** @@ -2621,7 +2622,7 @@ static int onenand_probe(struct mtd_info *mtd) /* Save system configuration 1 */ syscfg = this->read_word(this->base + ONENAND_REG_SYS_CFG1); /* Clear Sync. Burst Read mode to read BootRAM */ - this->write_word((syscfg & ~ONENAND_SYS_CFG1_SYNC_READ), this->base + ONENAND_REG_SYS_CFG1); + this->write_word((syscfg & ~ONENAND_SYS_CFG1_SYNC_READ & ~ONENAND_SYS_CFG1_SYNC_WRITE), this->base + ONENAND_REG_SYS_CFG1); /* Send the command for reading device ID from BootRAM */ this->write_word(ONENAND_CMD_READID, this->base + ONENAND_BOOTRAM); diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 9aa2a9149b5..0fa3ac4ad57 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -176,6 +176,7 @@ struct onenand_chip { * OneNAND Flash Manufacturer ID Codes */ #define ONENAND_MFR_SAMSUNG 0xec +#define ONENAND_MFR_NUMONYX 0x20 /** * struct onenand_manufacturers - NAND Flash Manufacturer ID Structure -- cgit v1.2.3-70-g09d2 From d6fed9e9fc5eefae5be0ecf222bac7e7496e8e74 Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Mon, 11 May 2009 19:28:01 +0100 Subject: mtd: extend plat_nand for (read|write)_buf This patch adds (write|read)_buf callbacks to plat_nand. The NAND on the TS-7800 provisioned by the FPGA allows readw() and readl() to be used, which gives a 2.5x speed-up. To be able to use this from the plat_nand driver, a hook for read_buf (and also write_buf whilst we are in there) needs to be made available. This patch adds the hook. Signed-off-by: Alexander Clouter Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/plat_nand.c | 2 ++ include/linux/mtd/nand.h | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/plat_nand.c b/drivers/mtd/nand/plat_nand.c index 28ffd4e8bb2..47a2105b967 100644 --- a/drivers/mtd/nand/plat_nand.c +++ b/drivers/mtd/nand/plat_nand.c @@ -61,6 +61,8 @@ static int __devinit plat_nand_probe(struct platform_device *pdev) data->chip.cmd_ctrl = pdata->ctrl.cmd_ctrl; data->chip.dev_ready = pdata->ctrl.dev_ready; data->chip.select_chip = pdata->ctrl.select_chip; + data->chip.write_buf = pdata->ctrl.write_buf; + data->chip.read_buf = pdata->ctrl.read_buf; data->chip.chip_delay = pdata->chip.chip_delay; data->chip.options |= pdata->chip.options; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7efb9be3466..0e35375ea79 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -584,6 +584,8 @@ struct platform_nand_chip { * @select_chip: platform specific chip select function * @cmd_ctrl: platform specific function for controlling * ALE/CLE/nCE.
Also used to write command and address + * @write_buf: platform specific function for write buffer + * @read_buf: platform specific function for read buffer * @priv: private data to transport driver specific settings * * All fields are optional and depend on the hardware driver requirements @@ -594,6 +596,10 @@ struct platform_nand_ctrl { void (*select_chip)(struct mtd_info *mtd, int chip); void (*cmd_ctrl)(struct mtd_info *mtd, int dat, unsigned int ctrl); + void (*write_buf)(struct mtd_info *mtd, + const uint8_t *buf, int len); + void (*read_buf)(struct mtd_info *mtd, + uint8_t *buf, int len); void *priv; }; -- cgit v1.2.3-70-g09d2 From bf95efd41b1a760128eb25402791b0a4941eb655 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 12 May 2009 13:46:58 -0700 Subject: mtd: plat_nand: add platform probe/remove callbacks Add optional probe and remove callbacks to the plat_nand driver. Some platforms may require additional setup, such as configuring the memory controller, before the nand device can be accessed. This patch provides an optional callback to handle this setup as well as a callback to teardown the setup. Signed-off-by: H Hartley Sweeten Tested-by: Alexander Clouter Signed-off-by: Andrew Morton Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/plat_nand.c | 13 +++++++++++-- include/linux/mtd/nand.h | 7 +++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/plat_nand.c b/drivers/mtd/nand/plat_nand.c index 47a2105b967..22e0ce78841 100644 --- a/drivers/mtd/nand/plat_nand.c +++ b/drivers/mtd/nand/plat_nand.c @@ -72,6 +72,13 @@ static int __devinit plat_nand_probe(struct platform_device *pdev) platform_set_drvdata(pdev, data); + /* Handle any platform specific setup */ + if (pdata->ctrl.probe) { + res = pdata->ctrl.probe(pdev); + if (res) + goto out; + } + /* Scan to find existance of the device */ if (nand_scan(&data->mtd, 1)) { res = -ENXIO; @@ -101,6 +108,8 @@ static int __devinit plat_nand_probe(struct platform_device *pdev) nand_release(&data->mtd); out: + if (pdata->ctrl.remove) + pdata->ctrl.remove(pdev); platform_set_drvdata(pdev, NULL); iounmap(data->io_base); kfree(data); @@ -113,15 +122,15 @@ out: static int __devexit plat_nand_remove(struct platform_device *pdev) { struct plat_nand_data *data = platform_get_drvdata(pdev); -#ifdef CONFIG_MTD_PARTITIONS struct platform_nand_data *pdata = pdev->dev.platform_data; -#endif nand_release(&data->mtd); #ifdef CONFIG_MTD_PARTITIONS if (data->parts && data->parts != pdata->chip.partitions) kfree(data->parts); #endif + if (pdata->ctrl.remove) + pdata->ctrl.remove(pdev); iounmap(data->io_base); kfree(data); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 0e35375ea79..7f2d6935655 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -577,8 +577,13 @@ struct platform_nand_chip { void *priv; }; +/* Keep gcc happy */ +struct platform_device; + /** * struct platform_nand_ctrl - controller level device structure + * @probe: platform specific function to probe/setup hardware + * @remove: platform specific function to remove/teardown hardware * @hwcontrol: platform specific hardware control structure * @dev_ready: platform specific function to read ready/busy pin * @select_chip: platform specific chip select function @@ -591,6 +596,8 @@ struct platform_nand_chip { * All fields are optional and depend on the hardware driver requirements */ struct platform_nand_ctrl { + int (*probe)(struct platform_device 
*pdev); + void (*remove)(struct platform_device *pdev); void (*hwcontrol)(struct mtd_info *mtd, int cmd); int (*dev_ready)(struct mtd_info *mtd); void (*select_chip)(struct mtd_info *mtd, int chip); -- cgit v1.2.3-70-g09d2 From f36e20c01ad0104688f2eaebdf2213e749929c97 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 12 May 2009 13:46:59 -0700 Subject: mtd: plat_nand: allow platform to set partitions Add optional callback to allow platform to initialize partitions. Static partitions on a nand device could vary depending on the size of the device. This patch allows an optional platform callback to be used to setup this partition information at runtime. Scan order is: 1) chip.part_probe_types 2) chip.set_parts 3) chip.partitions 4) full mtd device (fallback for no partitions) Some of the existing nand drivers could possibly be replaced by the plat_nand driver by using this patch. These include autcpu12.c and ts7250.c drivers. Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/plat_nand.c | 2 ++ include/linux/mtd/nand.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/plat_nand.c b/drivers/mtd/nand/plat_nand.c index 22e0ce78841..4e16c6f5bdd 100644 --- a/drivers/mtd/nand/plat_nand.c +++ b/drivers/mtd/nand/plat_nand.c @@ -95,6 +95,8 @@ static int __devinit plat_nand_probe(struct platform_device *pdev) return 0; } } + if (pdata->chip.set_parts) + pdata->chip.set_parts(data->mtd.size, &pdata->chip); if (pdata->chip.partitions) { data->parts = pdata->chip.partitions; res = add_mtd_partitions(&data->mtd, data->parts, diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7f2d6935655..4030ebada49 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -563,6 +563,7 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len, * @options: Option flags, e.g. 16bit buswidth * @ecclayout: ecc layout info structure * @part_probe_types: NULL-terminated array of probe types + * @set_parts: platform specific function to set partitions * @priv: hardware controller specific settings */ struct platform_nand_chip { @@ -574,6 +575,8 @@ struct platform_nand_chip { int chip_delay; unsigned int options; const char **part_probe_types; + void (*set_parts)(uint64_t size, + struct platform_nand_chip *chip); void *priv; }; -- cgit v1.2.3-70-g09d2 From 2ac6bf4ddc87c3b6b609f8fa82f6ebbffeac12f4 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 5 Jun 2009 10:36:24 -0700 Subject: IB/mlx4: Add strong ordering to local inval and fast reg work requests The ConnectX Programmer's Reference Manual states that the "SO" bit must be set when posting Fast Register and Local Invalidate send work requests. When this bit is set, the work request will be executed only after all previous work requests on the send queue have been executed. 
(If the bit is not set, Fast Register and Local Invalidate WQEs may begin execution too early, which violates the defined semantics for these operations) This fixes the issue with NFS/RDMA reported in Signed-off-by: Jack Morgenstein Cc: Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/qp.c | 4 ++++ include/linux/mlx4/qp.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 20724aee76f..c4a02648c8a 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1585,12 +1585,16 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case IB_WR_LOCAL_INV: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); set_local_inv_seg(wqe, wr->ex.invalidate_rkey); wqe += sizeof (struct mlx4_wqe_local_inval_seg); size += sizeof (struct mlx4_wqe_local_inval_seg) / 16; break; case IB_WR_FAST_REG_MR: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); set_fmr_seg(wqe, wr); wqe += sizeof (struct mlx4_wqe_fmr_seg); size += sizeof (struct mlx4_wqe_fmr_seg) / 16; diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index bf8f11982da..9f29d86e5dc 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -165,6 +165,7 @@ enum { MLX4_WQE_CTRL_IP_CSUM = 1 << 4, MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5, MLX4_WQE_CTRL_INS_VLAN = 1 << 6, + MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7, }; struct mlx4_wqe_ctrl_seg { -- cgit v1.2.3-70-g09d2 From 5988af2319781bc8e0ce418affec4e09cfa77907 Mon Sep 17 00:00:00 2001 From: Rohit Hagargundgi Date: Tue, 12 May 2009 13:46:57 -0700 Subject: mtd: Flex-OneNAND support Add support for Samsung Flex-OneNAND devices. Flex-OneNAND combines SLC and MLC technologies into a single device. SLC area provides increased reliability and speed, suitable for storing code such as bootloader, kernel and root file system. MLC area provides high density and is suitable for storing user data. SLC and MLC regions can be configured through kernel parameter. [akpm@linux-foundation.org: export flexoand_region and onenand_addr] Signed-off-by: Rohit Hagargundgi Signed-off-by: Kyungmin Park Cc: Vishak G Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- Documentation/kernel-parameters.txt | 10 + drivers/mtd/onenand/onenand_base.c | 857 ++++++++++++++++++++++++++++++++---- drivers/mtd/onenand/onenand_bbt.c | 14 +- drivers/mtd/onenand/onenand_sim.c | 81 +++- include/linux/mtd/onenand.h | 18 + include/linux/mtd/onenand_regs.h | 20 +- 6 files changed, 913 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e87bdbfbcc7..12df135f8af 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1380,6 +1380,16 @@ and is between 256 and 4096 characters. It is defined in the file mtdparts= [MTD] See drivers/mtd/cmdlinepart.c. + onenand.bdry= [HW,MTD] Flex-OneNAND Boundary Configuration + + Format: [die0_boundary][,die0_lock][,die1_boundary][,die1_lock] + + boundary - index of last SLC block on Flex-OneNAND. + The remaining blocks are configured as MLC blocks. + lock - Configure if Flex-OneNAND boundary should be locked. + Once locked, the boundary cannot be changed. + 1 indicates lock status, 0 indicates unlock status. 
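For example, a kernel booted with the purely illustrative setting

	onenand.bdry=31,1,31,1

would configure blocks 0 through 31 of each die as SLC, leave the remaining blocks as MLC, and lock both boundaries against later change.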
+ mtdset= [ARM] ARM/S3C2412 JIVE boot control diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 2346857a275..8d4c9c25373 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -9,6 +9,10 @@ * auto-placement support, read-while load support, various fixes * Copyright (C) Nokia Corporation, 2007 * + * Vishak G , Rohit Hagargundgi + * Flex-OneNAND support + * Copyright (C) Samsung Electronics, 2008 + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -27,6 +31,30 @@ #include +/* Default Flex-OneNAND boundary and lock respectively */ +static int flex_bdry[MAX_DIES * 2] = { -1, 0, -1, 0 }; + +/** + * onenand_oob_128 - oob info for Flex-Onenand with 4KB page + * For now, we expose only 64 out of 80 ecc bytes + */ +static struct nand_ecclayout onenand_oob_128 = { + .eccbytes = 64, + .eccpos = { + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 102, 103, 104, 105 + }, + .oobfree = { + {2, 4}, {18, 4}, {34, 4}, {50, 4}, + {66, 4}, {82, 4}, {98, 4}, {114, 4} + } +}; + /** * onenand_oob_64 - oob info for large (2KB) page */ @@ -65,6 +93,14 @@ static const unsigned char ffchars[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 48 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 64 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 80 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 96 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 112 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 128 */ }; /** @@ -170,6 +206,70 @@ static int onenand_buffer_address(int dataram1, int sectors, int count) return ((bsa << ONENAND_BSA_SHIFT) | bsc); } +/** + * flexonenand_block- For given address return block number + * @param this - OneNAND device structure + * @param addr - Address for which block number is needed + */ +static unsigned flexonenand_block(struct onenand_chip *this, loff_t addr) +{ + unsigned boundary, blk, die = 0; + + if (ONENAND_IS_DDP(this) && addr >= this->diesize[0]) { + die = 1; + addr -= this->diesize[0]; + } + + boundary = this->boundary[die]; + + blk = addr >> (this->erase_shift - 1); + if (blk > boundary) + blk = (blk + boundary + 1) >> 1; + + blk += die ? 
this->density_mask : 0; + return blk; +} + +inline unsigned onenand_block(struct onenand_chip *this, loff_t addr) +{ + if (!FLEXONENAND(this)) + return addr >> this->erase_shift; + return flexonenand_block(this, addr); +} + +/** + * flexonenand_addr - Return address of the block + * @this: OneNAND device structure + * @block: Block number on Flex-OneNAND + * + * Return address of the block + */ +static loff_t flexonenand_addr(struct onenand_chip *this, int block) +{ + loff_t ofs = 0; + int die = 0, boundary; + + if (ONENAND_IS_DDP(this) && block >= this->density_mask) { + block -= this->density_mask; + die = 1; + ofs = this->diesize[0]; + } + + boundary = this->boundary[die]; + ofs += (loff_t)block << (this->erase_shift - 1); + if (block > (boundary + 1)) + ofs += (loff_t)(block - boundary - 1) << (this->erase_shift - 1); + return ofs; +} + +loff_t onenand_addr(struct onenand_chip *this, int block) +{ + if (!FLEXONENAND(this)) + return (loff_t)block << this->erase_shift; + return flexonenand_addr(this, block); +} +EXPORT_SYMBOL(onenand_addr); + /** * onenand_get_density - [DEFAULT] Get OneNAND density * @param dev_id OneNAND device ID @@ -182,6 +282,22 @@ static inline int onenand_get_density(int dev_id) return (density & ONENAND_DEVICE_DENSITY_MASK); } +/** + * flexonenand_region - [Flex-OneNAND] Return erase region of addr + * @param mtd MTD device structure + * @param addr address whose erase region needs to be identified + */ +int flexonenand_region(struct mtd_info *mtd, loff_t addr) +{ + int i; + + for (i = 0; i < mtd->numeraseregions; i++) + if (addr < mtd->eraseregions[i].offset) + break; + return i - 1; +} +EXPORT_SYMBOL(flexonenand_region); + /** * onenand_command - [DEFAULT] Send command to OneNAND device * @param mtd MTD device structure @@ -207,16 +323,28 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le page = -1; break; + case FLEXONENAND_CMD_PI_ACCESS: + /* addr contains die index */ + block = addr * this->density_mask; + page = -1; + break; + case ONENAND_CMD_ERASE: case ONENAND_CMD_BUFFERRAM: case ONENAND_CMD_OTP_ACCESS: - block = (int) (addr >> this->erase_shift); + block = onenand_block(this, addr); page = -1; break; + case FLEXONENAND_CMD_READ_PI: + cmd = ONENAND_CMD_READ; + block = addr * this->density_mask; + page = 0; + break; + default: - block = (int) (addr >> this->erase_shift); - page = (int) (addr >> this->page_shift); + block = onenand_block(this, addr); + page = (int) (addr - onenand_addr(this, block)) >> this->page_shift; if (ONENAND_IS_2PLANE(this)) { /* Make the even block number */ @@ -236,7 +364,7 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le value = onenand_bufferram_address(this, block); this->write_word(value, this->base + ONENAND_REG_START_ADDRESS2); - if (ONENAND_IS_2PLANE(this)) + if (ONENAND_IS_MLC(this) || ONENAND_IS_2PLANE(this)) /* It is always BufferRAM0 */ ONENAND_SET_BUFFERRAM0(this); else @@ -258,13 +386,18 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le if (page != -1) { /* Now we use page size operation */ - int sectors = 4, count = 4; + int sectors = 0, count = 0; int dataram; switch (cmd) { + case FLEXONENAND_CMD_RECOVER_LSB: case ONENAND_CMD_READ: case ONENAND_CMD_READOOB: - dataram = ONENAND_SET_NEXT_BUFFERRAM(this); + if (ONENAND_IS_MLC(this)) + /* It is always BufferRAM0 */ + dataram = ONENAND_SET_BUFFERRAM0(this); + else + dataram = ONENAND_SET_NEXT_BUFFERRAM(this); break; default: @@ -292,6 +425,30 @@ static int 
onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le return 0; } +/** + * onenand_read_ecc - return ecc status + * @param this onenand chip structure + */ +static inline int onenand_read_ecc(struct onenand_chip *this) +{ + int ecc, i, result = 0; + + if (!FLEXONENAND(this)) + return this->read_word(this->base + ONENAND_REG_ECC_STATUS); + + for (i = 0; i < 4; i++) { + ecc = this->read_word(this->base + ONENAND_REG_ECC_STATUS + i); + if (likely(!ecc)) + continue; + if (ecc & FLEXONENAND_UNCORRECTABLE_ERROR) + return ONENAND_ECC_2BIT_ALL; + else + result = ONENAND_ECC_1BIT_ALL; + } + + return result; +} + /** * onenand_wait - [DEFAULT] wait until the command is done * @param mtd MTD device structure @@ -331,14 +488,14 @@ static int onenand_wait(struct mtd_info *mtd, int state) * power off recovery (POR) test, it should read ECC status first */ if (interrupt & ONENAND_INT_READ) { - int ecc = this->read_word(this->base + ONENAND_REG_ECC_STATUS); + int ecc = onenand_read_ecc(this); if (ecc) { if (ecc & ONENAND_ECC_2BIT_ALL) { printk(KERN_ERR "onenand_wait: ECC error = 0x%04x\n", ecc); mtd->ecc_stats.failed++; return -EBADMSG; } else if (ecc & ONENAND_ECC_1BIT_ALL) { - printk(KERN_INFO "onenand_wait: correctable ECC error = 0x%04x\n", ecc); + printk(KERN_DEBUG "onenand_wait: correctable ECC error = 0x%04x\n", ecc); mtd->ecc_stats.corrected++; } } @@ -656,7 +813,7 @@ static int onenand_check_bufferram(struct mtd_info *mtd, loff_t addr) if (found && ONENAND_IS_DDP(this)) { /* Select DataRAM for DDP */ - int block = (int) (addr >> this->erase_shift); + int block = onenand_block(this, addr); int value = onenand_bufferram_address(this, block); this->write_word(value, this->base + ONENAND_REG_START_ADDRESS2); } @@ -815,6 +972,149 @@ static int onenand_transfer_auto_oob(struct mtd_info *mtd, uint8_t *buf, int col return 0; } +/** + * onenand_recover_lsb - [Flex-OneNAND] Recover LSB page data + * @param mtd MTD device structure + * @param addr address to recover + * @param status return value from onenand_wait / onenand_bbt_wait + * + * MLC NAND Flash cell has paired pages - LSB page and MSB page. LSB page has + * lower page address and MSB page has higher page address in paired pages. + * If power off occurs during MSB page program, the paired LSB page data can + * become corrupt. LSB page recovery read is a way to read LSB page though page + * data are corrupted. When uncorrectable error occurs as a result of LSB page + * read after power up, issue LSB page recovery read. 
+ */ +static int onenand_recover_lsb(struct mtd_info *mtd, loff_t addr, int status) +{ + struct onenand_chip *this = mtd->priv; + int i; + + /* Recovery is only for Flex-OneNAND */ + if (!FLEXONENAND(this)) + return status; + + /* check if we failed due to uncorrectable error */ + if (status != -EBADMSG && status != ONENAND_BBT_READ_ECC_ERROR) + return status; + + /* check if address lies in MLC region */ + i = flexonenand_region(mtd, addr); + if (mtd->eraseregions[i].erasesize < (1 << this->erase_shift)) + return status; + + /* We are attempting to reread, so decrement stats.failed + * which was incremented by onenand_wait due to read failure + */ + printk(KERN_INFO "onenand_recover_lsb: Attempting to recover from uncorrectable read\n"); + mtd->ecc_stats.failed--; + + /* Issue the LSB page recovery command */ + this->command(mtd, FLEXONENAND_CMD_RECOVER_LSB, addr, this->writesize); + return this->wait(mtd, FL_READING); +} + +/** + * onenand_mlc_read_ops_nolock - MLC OneNAND read main and/or out-of-band + * @param mtd MTD device structure + * @param from offset to read from + * @param ops: oob operation description structure + * + * MLC OneNAND / Flex-OneNAND has 4KB page size and 4KB dataram. + * So, read-while-load is not present. + */ +static int onenand_mlc_read_ops_nolock(struct mtd_info *mtd, loff_t from, + struct mtd_oob_ops *ops) +{ + struct onenand_chip *this = mtd->priv; + struct mtd_ecc_stats stats; + size_t len = ops->len; + size_t ooblen = ops->ooblen; + u_char *buf = ops->datbuf; + u_char *oobbuf = ops->oobbuf; + int read = 0, column, thislen; + int oobread = 0, oobcolumn, thisooblen, oobsize; + int ret = 0; + int writesize = this->writesize; + + DEBUG(MTD_DEBUG_LEVEL3, "onenand_mlc_read_ops_nolock: from = 0x%08x, len = %i\n", (unsigned int) from, (int) len); + + if (ops->mode == MTD_OOB_AUTO) + oobsize = this->ecclayout->oobavail; + else + oobsize = mtd->oobsize; + + oobcolumn = from & (mtd->oobsize - 1); + + /* Do not allow reads past end of device */ + if (from + len > mtd->size) { + printk(KERN_ERR "onenand_mlc_read_ops_nolock: Attempt read beyond end of device\n"); + ops->retlen = 0; + ops->oobretlen = 0; + return -EINVAL; + } + + stats = mtd->ecc_stats; + + while (read < len) { + cond_resched(); + + thislen = min_t(int, writesize, len - read); + + column = from & (writesize - 1); + if (column + thislen > writesize) + thislen = writesize - column; + + if (!onenand_check_bufferram(mtd, from)) { + this->command(mtd, ONENAND_CMD_READ, from, writesize); + + ret = this->wait(mtd, FL_READING); + if (unlikely(ret)) + ret = onenand_recover_lsb(mtd, from, ret); + onenand_update_bufferram(mtd, from, !ret); + if (ret == -EBADMSG) + ret = 0; + } + + this->read_bufferram(mtd, ONENAND_DATARAM, buf, column, thislen); + if (oobbuf) { + thisooblen = oobsize - oobcolumn; + thisooblen = min_t(int, thisooblen, ooblen - oobread); + + if (ops->mode == MTD_OOB_AUTO) + onenand_transfer_auto_oob(mtd, oobbuf, oobcolumn, thisooblen); + else + this->read_bufferram(mtd, ONENAND_SPARERAM, oobbuf, oobcolumn, thisooblen); + oobread += thisooblen; + oobbuf += thisooblen; + oobcolumn = 0; + } + + read += thislen; + if (read == len) + break; + + from += thislen; + buf += thislen; + } + + /* + * Return success, if no ECC failures, else -EBADMSG + * fs driver will take care of that, because + * retlen == desired len and result == -EBADMSG + */ + ops->retlen = read; + ops->oobretlen = oobread; + + if (ret) + return ret; + + if (mtd->ecc_stats.failed - stats.failed) + return -EBADMSG; + + return 
mtd->ecc_stats.corrected - stats.corrected ? -EUCLEAN : 0; +} + /** * onenand_read_ops_nolock - [OneNAND Interface] OneNAND read main and/or out-of-band * @param mtd MTD device structure @@ -962,7 +1262,7 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from, size_t len = ops->ooblen; mtd_oob_mode_t mode = ops->mode; u_char *buf = ops->oobbuf; - int ret = 0; + int ret = 0, readcmd; from += ops->ooboffs; @@ -993,17 +1293,22 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from, stats = mtd->ecc_stats; + readcmd = ONENAND_IS_MLC(this) ? ONENAND_CMD_READ : ONENAND_CMD_READOOB; + while (read < len) { cond_resched(); thislen = oobsize - column; thislen = min_t(int, thislen, len); - this->command(mtd, ONENAND_CMD_READOOB, from, mtd->oobsize); + this->command(mtd, readcmd, from, mtd->oobsize); onenand_update_bufferram(mtd, from, 0); ret = this->wait(mtd, FL_READING); + if (unlikely(ret)) + ret = onenand_recover_lsb(mtd, from, ret); + if (ret && ret != -EBADMSG) { printk(KERN_ERR "onenand_read_oob_nolock: read failed = 0x%x\n", ret); break; @@ -1053,6 +1358,7 @@ static int onenand_read_oob_nolock(struct mtd_info *mtd, loff_t from, static int onenand_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { + struct onenand_chip *this = mtd->priv; struct mtd_oob_ops ops = { .len = len, .ooblen = 0, @@ -1062,7 +1368,9 @@ static int onenand_read(struct mtd_info *mtd, loff_t from, size_t len, int ret; onenand_get_device(mtd, FL_READING); - ret = onenand_read_ops_nolock(mtd, from, &ops); + ret = ONENAND_IS_MLC(this) ? + onenand_mlc_read_ops_nolock(mtd, from, &ops) : + onenand_read_ops_nolock(mtd, from, &ops); onenand_release_device(mtd); *retlen = ops.retlen; @@ -1080,6 +1388,7 @@ static int onenand_read(struct mtd_info *mtd, loff_t from, size_t len, static int onenand_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { + struct onenand_chip *this = mtd->priv; int ret; switch (ops->mode) { @@ -1094,7 +1403,9 @@ static int onenand_read_oob(struct mtd_info *mtd, loff_t from, onenand_get_device(mtd, FL_READING); if (ops->datbuf) - ret = onenand_read_ops_nolock(mtd, from, ops); + ret = ONENAND_IS_MLC(this) ? + onenand_mlc_read_ops_nolock(mtd, from, ops) : + onenand_read_ops_nolock(mtd, from, ops); else ret = onenand_read_oob_nolock(mtd, from, ops); onenand_release_device(mtd); @@ -1128,11 +1439,11 @@ static int onenand_bbt_wait(struct mtd_info *mtd, int state) ctrl = this->read_word(this->base + ONENAND_REG_CTRL_STATUS); if (interrupt & ONENAND_INT_READ) { - int ecc = this->read_word(this->base + ONENAND_REG_ECC_STATUS); + int ecc = onenand_read_ecc(this); if (ecc & ONENAND_ECC_2BIT_ALL) { printk(KERN_INFO "onenand_bbt_wait: ecc error = 0x%04x" ", controller error 0x%04x\n", ecc, ctrl); - return ONENAND_BBT_READ_ERROR; + return ONENAND_BBT_READ_ECC_ERROR; } } else { printk(KERN_ERR "onenand_bbt_wait: read timeout!" @@ -1163,7 +1474,7 @@ int onenand_bbt_read_oob(struct mtd_info *mtd, loff_t from, { struct onenand_chip *this = mtd->priv; int read = 0, thislen, column; - int ret = 0; + int ret = 0, readcmd; size_t len = ops->ooblen; u_char *buf = ops->oobbuf; @@ -1183,17 +1494,22 @@ int onenand_bbt_read_oob(struct mtd_info *mtd, loff_t from, column = from & (mtd->oobsize - 1); + readcmd = ONENAND_IS_MLC(this) ? 
ONENAND_CMD_READ : ONENAND_CMD_READOOB; + while (read < len) { cond_resched(); thislen = mtd->oobsize - column; thislen = min_t(int, thislen, len); - this->command(mtd, ONENAND_CMD_READOOB, from, mtd->oobsize); + this->command(mtd, readcmd, from, mtd->oobsize); onenand_update_bufferram(mtd, from, 0); ret = onenand_bbt_wait(mtd, FL_READING); + if (unlikely(ret)) + ret = onenand_recover_lsb(mtd, from, ret); + if (ret) break; @@ -1230,9 +1546,11 @@ static int onenand_verify_oob(struct mtd_info *mtd, const u_char *buf, loff_t to { struct onenand_chip *this = mtd->priv; u_char *oob_buf = this->oob_buf; - int status, i; + int status, i, readcmd; + + readcmd = ONENAND_IS_MLC(this) ? ONENAND_CMD_READ : ONENAND_CMD_READOOB; - this->command(mtd, ONENAND_CMD_READOOB, to, mtd->oobsize); + this->command(mtd, readcmd, to, mtd->oobsize); onenand_update_bufferram(mtd, to, 0); status = this->wait(mtd, FL_READING); if (status) @@ -1633,7 +1951,7 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to, { struct onenand_chip *this = mtd->priv; int column, ret = 0, oobsize; - int written = 0; + int written = 0, oobcmd; u_char *oobbuf; size_t len = ops->ooblen; const u_char *buf = ops->oobbuf; @@ -1675,6 +1993,8 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to, oobbuf = this->oob_buf; + oobcmd = ONENAND_IS_MLC(this) ? ONENAND_CMD_PROG : ONENAND_CMD_PROGOOB; + /* Loop until all data write */ while (written < len) { int thislen = min_t(int, oobsize, len - written); @@ -1692,7 +2012,14 @@ static int onenand_write_oob_nolock(struct mtd_info *mtd, loff_t to, memcpy(oobbuf + column, buf, thislen); this->write_bufferram(mtd, ONENAND_SPARERAM, oobbuf, 0, mtd->oobsize); - this->command(mtd, ONENAND_CMD_PROGOOB, to, mtd->oobsize); + if (ONENAND_IS_MLC(this)) { + /* Set main area of DataRAM to 0xff*/ + memset(this->page_buf, 0xff, mtd->writesize); + this->write_bufferram(mtd, ONENAND_DATARAM, + this->page_buf, 0, mtd->writesize); + } + + this->command(mtd, oobcmd, to, mtd->oobsize); onenand_update_bufferram(mtd, to, 0); if (ONENAND_IS_2PLANE(this)) { @@ -1815,29 +2142,48 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) { struct onenand_chip *this = mtd->priv; unsigned int block_size; - loff_t addr; - int len; - int ret = 0; + loff_t addr = instr->addr; + loff_t len = instr->len; + int ret = 0, i; + struct mtd_erase_region_info *region = NULL; + loff_t region_end = 0; DEBUG(MTD_DEBUG_LEVEL3, "onenand_erase: start = 0x%012llx, len = %llu\n", (unsigned long long) instr->addr, (unsigned long long) instr->len); - block_size = (1 << this->erase_shift); - - /* Start address must align on block boundary */ - if (unlikely(instr->addr & (block_size - 1))) { - printk(KERN_ERR "onenand_erase: Unaligned address\n"); + /* Do not allow erase past end of device */ + if (unlikely((len + addr) > mtd->size)) { + printk(KERN_ERR "onenand_erase: Erase past end of device\n"); return -EINVAL; } - /* Length must align on block boundary */ - if (unlikely(instr->len & (block_size - 1))) { - printk(KERN_ERR "onenand_erase: Length not block aligned\n"); - return -EINVAL; + if (FLEXONENAND(this)) { + /* Find the eraseregion of this address */ + i = flexonenand_region(mtd, addr); + region = &mtd->eraseregions[i]; + + block_size = region->erasesize; + region_end = region->offset + region->erasesize * region->numblocks; + + /* Start address within region must align on block boundary. + * Erase region's start offset is always block start address. 
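+ * Checking (addr - region->offset) against the region's block size
+ * is therefore sufficient.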
+ */ + if (unlikely((addr - region->offset) & (block_size - 1))) { + printk(KERN_ERR "onenand_erase: Unaligned address\n"); + return -EINVAL; + } + } else { + block_size = 1 << this->erase_shift; + + /* Start address must align on block boundary */ + if (unlikely(addr & (block_size - 1))) { + printk(KERN_ERR "onenand_erase: Unaligned address\n"); + return -EINVAL; + } } - /* Do not allow erase past end of device */ - if (unlikely((instr->len + instr->addr) > mtd->size)) { - printk(KERN_ERR "onenand_erase: Erase past end of device\n"); + /* Length must align on block boundary */ + if (unlikely(len & (block_size - 1))) { + printk(KERN_ERR "onenand_erase: Length not block aligned\n"); return -EINVAL; } @@ -1847,9 +2193,6 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) onenand_get_device(mtd, FL_ERASING); /* Loop throught the pages */ - len = instr->len; - addr = instr->addr; - instr->state = MTD_ERASING; while (len) { @@ -1869,7 +2212,8 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) ret = this->wait(mtd, FL_ERASING); /* Check, if it is write protected */ if (ret) { - printk(KERN_ERR "onenand_erase: Failed erase, block %d\n", (unsigned) (addr >> this->erase_shift)); + printk(KERN_ERR "onenand_erase: Failed erase, block %d\n", + onenand_block(this, addr)); instr->state = MTD_ERASE_FAILED; instr->fail_addr = addr; goto erase_exit; @@ -1877,6 +2221,22 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) len -= block_size; addr += block_size; + + if (addr == region_end) { + if (!len) + break; + region++; + + block_size = region->erasesize; + region_end = region->offset + region->erasesize * region->numblocks; + + if (len & (block_size - 1)) { + /* FIXME: This should be handled at MTD partitioning level. */ + printk(KERN_ERR "onenand_erase: Unaligned address\n"); + goto erase_exit; + } + } + } instr->state = MTD_ERASE_DONE; @@ -1955,13 +2315,17 @@ static int onenand_default_block_markbad(struct mtd_info *mtd, loff_t ofs) int block; /* Get block number */ - block = ((int) ofs) >> bbm->bbt_erase_shift; + block = onenand_block(this, ofs); if (bbm->bbt) bbm->bbt[block >> 2] |= 0x01 << ((block & 0x03) << 1); /* We write two bytes, so we dont have to mess with 16 bit access */ ofs += mtd->oobsize + (bbm->badblockpos & ~0x01); - return onenand_write_oob_nolock(mtd, ofs, &ops); + /* FIXME : What to do when marking SLC block in partition + * with MLC erasesize? For now, it is not advisable to + * create partitions containing both SLC and MLC regions. 
+ */ + return onenand_write_oob_nolock(mtd, ofs, &ops); } /** @@ -2005,8 +2369,8 @@ static int onenand_do_lock_cmd(struct mtd_info *mtd, loff_t ofs, size_t len, int int start, end, block, value, status; int wp_status_mask; - start = ofs >> this->erase_shift; - end = len >> this->erase_shift; + start = onenand_block(this, ofs); + end = onenand_block(this, ofs + len) - 1; if (cmd == ONENAND_CMD_LOCK) wp_status_mask = ONENAND_WP_LS; @@ -2018,7 +2382,7 @@ static int onenand_do_lock_cmd(struct mtd_info *mtd, loff_t ofs, size_t len, int /* Set start block address */ this->write_word(start, this->base + ONENAND_REG_START_BLOCK_ADDRESS); /* Set end block address */ - this->write_word(start + end - 1, this->base + ONENAND_REG_END_BLOCK_ADDRESS); + this->write_word(end, this->base + ONENAND_REG_END_BLOCK_ADDRESS); /* Write lock command */ this->command(mtd, cmd, 0, 0); @@ -2039,7 +2403,7 @@ static int onenand_do_lock_cmd(struct mtd_info *mtd, loff_t ofs, size_t len, int } /* Block lock scheme */ - for (block = start; block < start + end; block++) { + for (block = start; block < end + 1; block++) { /* Set block address */ value = onenand_block_address(this, block); this->write_word(value, this->base + ONENAND_REG_START_ADDRESS1); @@ -2147,7 +2511,7 @@ static void onenand_unlock_all(struct mtd_info *mtd) { struct onenand_chip *this = mtd->priv; loff_t ofs = 0; - size_t len = this->chipsize; + loff_t len = mtd->size; if (this->options & ONENAND_HAS_UNLOCK_ALL) { /* Set start block address */ @@ -2168,7 +2532,7 @@ static void onenand_unlock_all(struct mtd_info *mtd) return; /* Workaround for all block unlock in DDP */ - if (ONENAND_IS_DDP(this)) { + if (ONENAND_IS_DDP(this) && !FLEXONENAND(this)) { /* All blocks on another chip */ ofs = this->chipsize >> 1; len = this->chipsize >> 1; @@ -2210,7 +2574,9 @@ static int do_otp_read(struct mtd_info *mtd, loff_t from, size_t len, this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0); this->wait(mtd, FL_OTPING); - ret = onenand_read_ops_nolock(mtd, from, &ops); + ret = ONENAND_IS_MLC(this) ? + onenand_mlc_read_ops_nolock(mtd, from, &ops) : + onenand_read_ops_nolock(mtd, from, &ops); /* Exit OTP access mode */ this->command(mtd, ONENAND_CMD_RESET, 0, 0); @@ -2277,21 +2643,32 @@ static int do_otp_lock(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct onenand_chip *this = mtd->priv; - struct mtd_oob_ops ops = { - .mode = MTD_OOB_PLACE, - .ooblen = len, - .oobbuf = buf, - .ooboffs = 0, - }; + struct mtd_oob_ops ops; int ret; /* Enter OTP access mode */ this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0); this->wait(mtd, FL_OTPING); - ret = onenand_write_oob_nolock(mtd, from, &ops); - - *retlen = ops.oobretlen; + if (FLEXONENAND(this)) { + /* + * For Flex-OneNAND, we write lock mark to 1st word of sector 4 of + * main area of page 49. + */ + ops.len = mtd->writesize; + ops.ooblen = 0; + ops.datbuf = buf; + ops.oobbuf = NULL; + ret = onenand_write_ops_nolock(mtd, mtd->writesize * 49, &ops); + *retlen = ops.retlen; + } else { + ops.mode = MTD_OOB_PLACE; + ops.ooblen = len; + ops.oobbuf = buf; + ops.ooboffs = 0; + ret = onenand_write_oob_nolock(mtd, from, &ops); + *retlen = ops.oobretlen; + } /* Exit OTP access mode */ this->command(mtd, ONENAND_CMD_RESET, 0, 0); @@ -2475,27 +2852,34 @@ static int onenand_lock_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len) { struct onenand_chip *this = mtd->priv; - u_char *oob_buf = this->oob_buf; + u_char *buf = FLEXONENAND(this) ? 
this->page_buf : this->oob_buf; size_t retlen; int ret; - memset(oob_buf, 0xff, mtd->oobsize); + memset(buf, 0xff, FLEXONENAND(this) ? this->writesize + : mtd->oobsize); /* * Note: OTP lock operation * OTP block : 0xXXFC * 1st block : 0xXXF3 (If chip support) * Both : 0xXXF0 (If chip support) */ - oob_buf[ONENAND_OTP_LOCK_OFFSET] = 0xFC; + if (FLEXONENAND(this)) + buf[FLEXONENAND_OTP_LOCK_OFFSET] = 0xFC; + else + buf[ONENAND_OTP_LOCK_OFFSET] = 0xFC; /* * Write lock mark to 8th word of sector0 of page0 of the spare0. * We write 16 bytes spare area instead of 2 bytes. + * For Flex-OneNAND, we write lock mark to 1st word of sector 4 of + * main area of page 49. */ + from = 0; - len = 16; + len = FLEXONENAND(this) ? mtd->writesize : 16; - ret = onenand_otp_walk(mtd, from, len, &retlen, oob_buf, do_otp_lock, MTD_OTP_USER); + ret = onenand_otp_walk(mtd, from, len, &retlen, buf, do_otp_lock, MTD_OTP_USER); return ret ? : retlen; } @@ -2542,6 +2926,14 @@ static void onenand_check_features(struct mtd_info *mtd) break; } + if (ONENAND_IS_MLC(this)) + this->options &= ~ONENAND_HAS_2PLANE; + + if (FLEXONENAND(this)) { + this->options &= ~ONENAND_HAS_CONT_LOCK; + this->options |= ONENAND_HAS_UNLOCK_ALL; + } + if (this->options & ONENAND_HAS_CONT_LOCK) printk(KERN_DEBUG "Lock scheme is Continuous Lock\n"); if (this->options & ONENAND_HAS_UNLOCK_ALL) @@ -2559,14 +2951,16 @@ static void onenand_check_features(struct mtd_info *mtd) */ static void onenand_print_device_info(int device, int version) { - int vcc, demuxed, ddp, density; + int vcc, demuxed, ddp, density, flexonenand; vcc = device & ONENAND_DEVICE_VCC_MASK; demuxed = device & ONENAND_DEVICE_IS_DEMUX; ddp = device & ONENAND_DEVICE_IS_DDP; density = onenand_get_density(device); - printk(KERN_INFO "%sOneNAND%s %dMB %sV 16-bit (0x%02x)\n", - demuxed ? "" : "Muxed ", + flexonenand = device & DEVICE_IS_FLEXONENAND; + printk(KERN_INFO "%s%sOneNAND%s %dMB %sV 16-bit (0x%02x)\n", + demuxed ? "" : "Muxed ", + flexonenand ? "Flex-" : "", ddp ? "(DDP)" : "", (16 << density), vcc ? "2.65/3.3" : "1.8", @@ -2605,6 +2999,280 @@ static int onenand_check_maf(int manuf) return (i == size); } +/** +* flexonenand_get_boundary - Reads the SLC boundary +* @param onenand_info - onenand info structure +**/ +static int flexonenand_get_boundary(struct mtd_info *mtd) +{ + struct onenand_chip *this = mtd->priv; + unsigned die, bdry; + int ret, syscfg, locked; + + /* Disable ECC */ + syscfg = this->read_word(this->base + ONENAND_REG_SYS_CFG1); + this->write_word((syscfg | 0x0100), this->base + ONENAND_REG_SYS_CFG1); + + for (die = 0; die < this->dies; die++) { + this->command(mtd, FLEXONENAND_CMD_PI_ACCESS, die, 0); + this->wait(mtd, FL_SYNCING); + + this->command(mtd, FLEXONENAND_CMD_READ_PI, die, 0); + ret = this->wait(mtd, FL_READING); + + bdry = this->read_word(this->base + ONENAND_DATARAM); + if ((bdry >> FLEXONENAND_PI_UNLOCK_SHIFT) == 3) + locked = 0; + else + locked = 1; + this->boundary[die] = bdry & FLEXONENAND_PI_MASK; + + this->command(mtd, ONENAND_CMD_RESET, 0, 0); + ret = this->wait(mtd, FL_RESETING); + + printk(KERN_INFO "Die %d boundary: %d%s\n", die, + this->boundary[die], locked ? 
"(Locked)" : "(Unlocked)"); + } + + /* Enable ECC */ + this->write_word(syscfg, this->base + ONENAND_REG_SYS_CFG1); + return 0; +} + +/** + * flexonenand_get_size - Fill up fields in onenand_chip and mtd_info + * boundary[], diesize[], mtd->size, mtd->erasesize + * @param mtd - MTD device structure + */ +static void flexonenand_get_size(struct mtd_info *mtd) +{ + struct onenand_chip *this = mtd->priv; + int die, i, eraseshift, density; + int blksperdie, maxbdry; + loff_t ofs; + + density = onenand_get_density(this->device_id); + blksperdie = ((loff_t)(16 << density) << 20) >> (this->erase_shift); + blksperdie >>= ONENAND_IS_DDP(this) ? 1 : 0; + maxbdry = blksperdie - 1; + eraseshift = this->erase_shift - 1; + + mtd->numeraseregions = this->dies << 1; + + /* This fills up the device boundary */ + flexonenand_get_boundary(mtd); + die = ofs = 0; + i = -1; + for (; die < this->dies; die++) { + if (!die || this->boundary[die-1] != maxbdry) { + i++; + mtd->eraseregions[i].offset = ofs; + mtd->eraseregions[i].erasesize = 1 << eraseshift; + mtd->eraseregions[i].numblocks = + this->boundary[die] + 1; + ofs += mtd->eraseregions[i].numblocks << eraseshift; + eraseshift++; + } else { + mtd->numeraseregions -= 1; + mtd->eraseregions[i].numblocks += + this->boundary[die] + 1; + ofs += (this->boundary[die] + 1) << (eraseshift - 1); + } + if (this->boundary[die] != maxbdry) { + i++; + mtd->eraseregions[i].offset = ofs; + mtd->eraseregions[i].erasesize = 1 << eraseshift; + mtd->eraseregions[i].numblocks = maxbdry ^ + this->boundary[die]; + ofs += mtd->eraseregions[i].numblocks << eraseshift; + eraseshift--; + } else + mtd->numeraseregions -= 1; + } + + /* Expose MLC erase size except when all blocks are SLC */ + mtd->erasesize = 1 << this->erase_shift; + if (mtd->numeraseregions == 1) + mtd->erasesize >>= 1; + + printk(KERN_INFO "Device has %d eraseregions\n", mtd->numeraseregions); + for (i = 0; i < mtd->numeraseregions; i++) + printk(KERN_INFO "[offset: 0x%08x, erasesize: 0x%05x," + " numblocks: %04u]\n", + (unsigned int) mtd->eraseregions[i].offset, + mtd->eraseregions[i].erasesize, + mtd->eraseregions[i].numblocks); + + for (die = 0, mtd->size = 0; die < this->dies; die++) { + this->diesize[die] = (loff_t)blksperdie << this->erase_shift; + this->diesize[die] -= (loff_t)(this->boundary[die] + 1) + << (this->erase_shift - 1); + mtd->size += this->diesize[die]; + } +} + +/** + * flexonenand_check_blocks_erased - Check if blocks are erased + * @param mtd_info - mtd info structure + * @param start - first erase block to check + * @param end - last erase block to check + * + * Converting an unerased block from MLC to SLC + * causes byte values to change. Since both data and its ECC + * have changed, reads on the block give uncorrectable error. + * This might lead to the block being detected as bad. + * + * Avoid this by ensuring that the block to be converted is + * erased. 
+ */ +static int flexonenand_check_blocks_erased(struct mtd_info *mtd, int start, int end) +{ + struct onenand_chip *this = mtd->priv; + int i, ret; + int block; + struct mtd_oob_ops ops = { + .mode = MTD_OOB_PLACE, + .ooboffs = 0, + .ooblen = mtd->oobsize, + .datbuf = NULL, + .oobbuf = this->oob_buf, + }; + loff_t addr; + + printk(KERN_DEBUG "Check blocks from %d to %d\n", start, end); + + for (block = start; block <= end; block++) { + addr = flexonenand_addr(this, block); + if (onenand_block_isbad_nolock(mtd, addr, 0)) + continue; + + /* + * Since main area write results in ECC write to spare, + * it is sufficient to check only ECC bytes for change. + */ + ret = onenand_read_oob_nolock(mtd, addr, &ops); + if (ret) + return ret; + + for (i = 0; i < mtd->oobsize; i++) + if (this->oob_buf[i] != 0xff) + break; + + if (i != mtd->oobsize) { + printk(KERN_WARNING "Block %d not erased.\n", block); + return 1; + } + } + + return 0; +} + +/** + * flexonenand_set_boundary - Writes the SLC boundary + * @param mtd - mtd info structure + */ +int flexonenand_set_boundary(struct mtd_info *mtd, int die, + int boundary, int lock) +{ + struct onenand_chip *this = mtd->priv; + int ret, density, blksperdie, old, new, thisboundary; + loff_t addr; + + /* Change only once for SDP Flex-OneNAND */ + if (die && (!ONENAND_IS_DDP(this))) + return 0; + + /* boundary value of -1 indicates no required change */ + if (boundary < 0 || boundary == this->boundary[die]) + return 0; + + density = onenand_get_density(this->device_id); + blksperdie = ((16 << density) << 20) >> this->erase_shift; + blksperdie >>= ONENAND_IS_DDP(this) ? 1 : 0; + + if (boundary >= blksperdie) { + printk(KERN_ERR "flexonenand_set_boundary: Invalid boundary value. " + "Boundary not changed.\n"); + return -EINVAL; + } + + /* Check if converting blocks are erased */ + old = this->boundary[die] + (die * this->density_mask); + new = boundary + (die * this->density_mask); + ret = flexonenand_check_blocks_erased(mtd, min(old, new) + 1, max(old, new)); + if (ret) { + printk(KERN_ERR "flexonenand_set_boundary: Please erase blocks before boundary change\n"); + return ret; + } + + this->command(mtd, FLEXONENAND_CMD_PI_ACCESS, die, 0); + this->wait(mtd, FL_SYNCING); + + /* Check is boundary is locked */ + this->command(mtd, FLEXONENAND_CMD_READ_PI, die, 0); + ret = this->wait(mtd, FL_READING); + + thisboundary = this->read_word(this->base + ONENAND_DATARAM); + if ((thisboundary >> FLEXONENAND_PI_UNLOCK_SHIFT) != 3) { + printk(KERN_ERR "flexonenand_set_boundary: boundary locked\n"); + ret = 1; + goto out; + } + + printk(KERN_INFO "flexonenand_set_boundary: Changing die %d boundary: %d%s\n", + die, boundary, lock ? "(Locked)" : "(Unlocked)"); + + addr = die ? this->diesize[0] : 0; + + boundary &= FLEXONENAND_PI_MASK; + boundary |= lock ? 
0 : (3 << FLEXONENAND_PI_UNLOCK_SHIFT); + + this->command(mtd, ONENAND_CMD_ERASE, addr, 0); + ret = this->wait(mtd, FL_ERASING); + if (ret) { + printk(KERN_ERR "flexonenand_set_boundary: Failed PI erase for Die %d\n", die); + goto out; + } + + this->write_word(boundary, this->base + ONENAND_DATARAM); + this->command(mtd, ONENAND_CMD_PROG, addr, 0); + ret = this->wait(mtd, FL_WRITING); + if (ret) { + printk(KERN_ERR "flexonenand_set_boundary: Failed PI write for Die %d\n", die); + goto out; + } + + this->command(mtd, FLEXONENAND_CMD_PI_UPDATE, die, 0); + ret = this->wait(mtd, FL_WRITING); +out: + this->write_word(ONENAND_CMD_RESET, this->base + ONENAND_REG_COMMAND); + this->wait(mtd, FL_RESETING); + if (!ret) + /* Recalculate device size on boundary change*/ + flexonenand_get_size(mtd); + + return ret; +} + +/** + * flexonenand_setup - capture Flex-OneNAND boundary and lock + * values passed as kernel parameters + * @param s kernel parameter string + */ +static int flexonenand_setup(char *s) +{ + int ints[5], i; + + s = get_options(s, 5, ints); + + for (i = 0; i < ints[0]; i++) + flex_bdry[i] = ints[i + 1]; + + return 1; +} + +__setup("onenand.bdry=", flexonenand_setup); + /** * onenand_probe - [OneNAND Interface] Probe the OneNAND device * @param mtd MTD device structure @@ -2647,6 +3315,7 @@ static int onenand_probe(struct mtd_info *mtd) maf_id = this->read_word(this->base + ONENAND_REG_MANUFACTURER_ID); dev_id = this->read_word(this->base + ONENAND_REG_DEVICE_ID); ver_id = this->read_word(this->base + ONENAND_REG_VERSION_ID); + this->technology = this->read_word(this->base + ONENAND_REG_TECHNOLOGY); /* Check OneNAND device */ if (maf_id != bram_maf_id || dev_id != bram_dev_id) @@ -2658,29 +3327,55 @@ static int onenand_probe(struct mtd_info *mtd) this->version_id = ver_id; density = onenand_get_density(dev_id); + if (FLEXONENAND(this)) { + this->dies = ONENAND_IS_DDP(this) ? 2 : 1; + /* Maximum possible erase regions */ + mtd->numeraseregions = this->dies << 1; + mtd->eraseregions = kzalloc(sizeof(struct mtd_erase_region_info) + * (this->dies << 1), GFP_KERNEL); + if (!mtd->eraseregions) + return -ENOMEM; + } + + /* + * For Flex-OneNAND, chipsize represents maximum possible device size. + * mtd->size represents the actual device size. + */ this->chipsize = (16 << density) << 20; - /* Set density mask. it is used for DDP */ - if (ONENAND_IS_DDP(this)) - this->density_mask = (1 << (density + 6)); - else - this->density_mask = 0; /* OneNAND page size & block size */ /* The data buffer size is equal to page size */ mtd->writesize = this->read_word(this->base + ONENAND_REG_DATA_BUFFER_SIZE); + /* We use the full BufferRAM */ + if (ONENAND_IS_MLC(this)) + mtd->writesize <<= 1; + mtd->oobsize = mtd->writesize >> 5; /* Pages per a block are always 64 in OneNAND */ mtd->erasesize = mtd->writesize << 6; + /* + * Flex-OneNAND SLC area has 64 pages per block. + * Flex-OneNAND MLC area has 128 pages per block. + * Expose MLC erase size to find erase_shift and page_mask. + */ + if (FLEXONENAND(this)) + mtd->erasesize <<= 1; this->erase_shift = ffs(mtd->erasesize) - 1; this->page_shift = ffs(mtd->writesize) - 1; this->page_mask = (1 << (this->erase_shift - this->page_shift)) - 1; + /* Set density mask. 
it is used for DDP */ + if (ONENAND_IS_DDP(this)) + this->density_mask = this->chipsize >> (this->erase_shift + 1); /* It's real page size */ this->writesize = mtd->writesize; /* REVIST: Multichip handling */ - mtd->size = this->chipsize; + if (FLEXONENAND(this)) + flexonenand_get_size(mtd); + else + mtd->size = this->chipsize; /* Check OneNAND features */ onenand_check_features(mtd); @@ -2735,7 +3430,7 @@ static void onenand_resume(struct mtd_info *mtd) */ int onenand_scan(struct mtd_info *mtd, int maxchips) { - int i; + int i, ret; struct onenand_chip *this = mtd->priv; if (!this->read_word) @@ -2797,6 +3492,10 @@ int onenand_scan(struct mtd_info *mtd, int maxchips) * Allow subpage writes up to oobsize. */ switch (mtd->oobsize) { + case 128: + this->ecclayout = &onenand_oob_128; + mtd->subpage_sft = 0; + break; case 64: this->ecclayout = &onenand_oob_64; mtd->subpage_sft = 2; @@ -2862,7 +3561,16 @@ int onenand_scan(struct mtd_info *mtd, int maxchips) /* Unlock whole block */ onenand_unlock_all(mtd); - return this->scan_bbt(mtd); + ret = this->scan_bbt(mtd); + if ((!FLEXONENAND(this)) || ret) + return ret; + + /* Change Flex-OneNAND boundaries if required */ + for (i = 0; i < MAX_DIES; i++) + flexonenand_set_boundary(mtd, i, flex_bdry[2 * i], + flex_bdry[(2 * i) + 1]); + + return 0; } /** @@ -2891,6 +3599,7 @@ void onenand_release(struct mtd_info *mtd) kfree(this->page_buf); if (this->options & ONENAND_OOBBUF_ALLOC) kfree(this->oob_buf); + kfree(mtd->eraseregions); } EXPORT_SYMBOL_GPL(onenand_scan); diff --git a/drivers/mtd/onenand/onenand_bbt.c b/drivers/mtd/onenand/onenand_bbt.c index 2f53b51c680..a91fcac1af0 100644 --- a/drivers/mtd/onenand/onenand_bbt.c +++ b/drivers/mtd/onenand/onenand_bbt.c @@ -63,6 +63,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr loff_t from; size_t readlen, ooblen; struct mtd_oob_ops ops; + int rgn; printk(KERN_INFO "Scanning device for bad blocks\n"); @@ -76,7 +77,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr /* Note that numblocks is 2 * (real numblocks) here; * see i += 2 below as it makses shifting and masking less painful */ - numblocks = mtd->size >> (bbm->bbt_erase_shift - 1); + numblocks = this->chipsize >> (bbm->bbt_erase_shift - 1); startblock = 0; from = 0; @@ -106,7 +107,12 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr } } i += 2; - from += (1 << bbm->bbt_erase_shift); + + if (FLEXONENAND(this)) { + rgn = flexonenand_region(mtd, from); + from += mtd->eraseregions[rgn].erasesize; + } else + from += (1 << bbm->bbt_erase_shift); } return 0; @@ -143,7 +149,7 @@ static int onenand_isbad_bbt(struct mtd_info *mtd, loff_t offs, int allowbbt) uint8_t res; /* Get block number * 2 */ - block = (int) (offs >> (bbm->bbt_erase_shift - 1)); + block = (int) (onenand_block(this, offs) << 1); res = (bbm->bbt[block >> 3] >> (block & 0x06)) & 0x03; DEBUG(MTD_DEBUG_LEVEL2, "onenand_isbad_bbt: bbt info for offs 0x%08x: (block %d) 0x%02x\n", @@ -178,7 +184,7 @@ int onenand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd) struct bbm_info *bbm = this->bbm; int len, ret = 0; - len = mtd->size >> (this->erase_shift + 2); + len = this->chipsize >> (this->erase_shift + 2); /* Allocate memory (2bit per block) and clear the memory bad block table */ bbm->bbt = kzalloc(len, GFP_KERNEL); if (!bbm->bbt) { diff --git a/drivers/mtd/onenand/onenand_sim.c b/drivers/mtd/onenand/onenand_sim.c index d64200b7c94..f6e3c8aebd3 100644 --- a/drivers/mtd/onenand/onenand_sim.c 
+++ b/drivers/mtd/onenand/onenand_sim.c @@ -6,6 +6,10 @@ * Copyright © 2005-2007 Samsung Electronics * Kyungmin Park * + * Vishak G , Rohit Hagargundgi + * Flex-OneNAND simulator support + * Copyright (C) Samsung Electronics, 2008 + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -24,16 +28,38 @@ #ifndef CONFIG_ONENAND_SIM_MANUFACTURER #define CONFIG_ONENAND_SIM_MANUFACTURER 0xec #endif + #ifndef CONFIG_ONENAND_SIM_DEVICE_ID #define CONFIG_ONENAND_SIM_DEVICE_ID 0x04 #endif + +#define CONFIG_FLEXONENAND ((CONFIG_ONENAND_SIM_DEVICE_ID >> 9) & 1) + #ifndef CONFIG_ONENAND_SIM_VERSION_ID #define CONFIG_ONENAND_SIM_VERSION_ID 0x1e #endif +#ifndef CONFIG_ONENAND_SIM_TECHNOLOGY_ID +#define CONFIG_ONENAND_SIM_TECHNOLOGY_ID CONFIG_FLEXONENAND +#endif + +/* Initial boundary values for Flex-OneNAND Simulator */ +#ifndef CONFIG_FLEXONENAND_SIM_DIE0_BOUNDARY +#define CONFIG_FLEXONENAND_SIM_DIE0_BOUNDARY 0x01 +#endif + +#ifndef CONFIG_FLEXONENAND_SIM_DIE1_BOUNDARY +#define CONFIG_FLEXONENAND_SIM_DIE1_BOUNDARY 0x01 +#endif + static int manuf_id = CONFIG_ONENAND_SIM_MANUFACTURER; static int device_id = CONFIG_ONENAND_SIM_DEVICE_ID; static int version_id = CONFIG_ONENAND_SIM_VERSION_ID; +static int technology_id = CONFIG_ONENAND_SIM_TECHNOLOGY_ID; +static int boundary[] = { + CONFIG_FLEXONENAND_SIM_DIE0_BOUNDARY, + CONFIG_FLEXONENAND_SIM_DIE1_BOUNDARY, +}; struct onenand_flash { void __iomem *base; @@ -57,12 +83,18 @@ struct onenand_flash { (writew(v, this->base + ONENAND_REG_WP_STATUS)) /* It has all 0xff chars */ -#define MAX_ONENAND_PAGESIZE (2048 + 64) +#define MAX_ONENAND_PAGESIZE (4096 + 128) static unsigned char *ffchars; +#if CONFIG_FLEXONENAND +#define PARTITION_NAME "Flex-OneNAND simulator partition" +#else +#define PARTITION_NAME "OneNAND simulator partition" +#endif + static struct mtd_partition os_partitions[] = { { - .name = "OneNAND simulator partition", + .name = PARTITION_NAME, .offset = 0, .size = MTDPART_SIZ_FULL, }, @@ -104,6 +136,7 @@ static void onenand_lock_handle(struct onenand_chip *this, int cmd) switch (cmd) { case ONENAND_CMD_UNLOCK: + case ONENAND_CMD_UNLOCK_ALL: if (block_lock_scheme) ONENAND_SET_WP_STATUS(ONENAND_WP_US, this); else @@ -228,10 +261,12 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd, { struct mtd_info *mtd = &info->mtd; struct onenand_flash *flash = this->priv; - int main_offset, spare_offset; + int main_offset, spare_offset, die = 0; void __iomem *src; void __iomem *dest; unsigned int i; + static int pi_operation; + int erasesize, rgn; if (dataram) { main_offset = mtd->writesize; @@ -241,10 +276,27 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd, spare_offset = 0; } + if (pi_operation) { + die = readw(this->base + ONENAND_REG_START_ADDRESS2); + die >>= ONENAND_DDP_SHIFT; + } + switch (cmd) { + case FLEXONENAND_CMD_PI_ACCESS: + pi_operation = 1; + break; + + case ONENAND_CMD_RESET: + pi_operation = 0; + break; + case ONENAND_CMD_READ: src = ONENAND_CORE(flash) + offset; dest = ONENAND_MAIN_AREA(this, main_offset); + if (pi_operation) { + writew(boundary[die], this->base + ONENAND_DATARAM); + break; + } memcpy(dest, src, mtd->writesize); /* Fall through */ @@ -257,6 +309,10 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd, case ONENAND_CMD_PROG: src = ONENAND_MAIN_AREA(this, main_offset); dest = ONENAND_CORE(flash) + offset; + if (pi_operation) { + 
boundary[die] = readw(this->base + ONENAND_DATARAM); + break; + } /* To handle partial write */ for (i = 0; i < (1 << mtd->subpage_sft); i++) { int off = i * this->subpagesize; @@ -284,9 +340,18 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd, break; case ONENAND_CMD_ERASE: - memset(ONENAND_CORE(flash) + offset, 0xff, mtd->erasesize); + if (pi_operation) + break; + + if (FLEXONENAND(this)) { + rgn = flexonenand_region(mtd, offset); + erasesize = mtd->eraseregions[rgn].erasesize; + } else + erasesize = mtd->erasesize; + + memset(ONENAND_CORE(flash) + offset, 0xff, erasesize); memset(ONENAND_CORE_SPARE(flash, this, offset), 0xff, - (mtd->erasesize >> 5)); + (erasesize >> 5)); break; default: @@ -339,7 +404,7 @@ static void onenand_command_handle(struct onenand_chip *this, int cmd) } if (block != -1) - offset += block << this->erase_shift; + offset = onenand_addr(this, block); if (page != -1) offset += page << this->page_shift; @@ -390,6 +455,7 @@ static int __init flash_init(struct onenand_flash *flash) } density = device_id >> ONENAND_DEVICE_DENSITY_SHIFT; + density &= ONENAND_DEVICE_DENSITY_MASK; size = ((16 << 20) << density); ONENAND_CORE(flash) = vmalloc(size + (size >> 5)); @@ -405,8 +471,9 @@ static int __init flash_init(struct onenand_flash *flash) writew(manuf_id, flash->base + ONENAND_REG_MANUFACTURER_ID); writew(device_id, flash->base + ONENAND_REG_DEVICE_ID); writew(version_id, flash->base + ONENAND_REG_VERSION_ID); + writew(technology_id, flash->base + ONENAND_REG_TECHNOLOGY); - if (density < 2) + if (density < 2 && (!CONFIG_FLEXONENAND)) buffer_size = 0x0400; /* 1KiB page */ else buffer_size = 0x0800; /* 2KiB page */ diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 0fa3ac4ad57..9aab82c1c74 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -17,6 +17,7 @@ #include #include +#define MAX_DIES 2 #define MAX_BUFFERRAM 2 /* Scan and identify a OneNAND device */ @@ -51,7 +52,12 @@ struct onenand_bufferram { /** * struct onenand_chip - OneNAND Private Flash Chip Data * @base: [BOARDSPECIFIC] address to access OneNAND + * @dies: [INTERN][FLEX-ONENAND] number of dies on chip + * @boundary: [INTERN][FLEX-ONENAND] Boundary of the dies + * @diesize: [INTERN][FLEX-ONENAND] Size of the dies * @chipsize: [INTERN] the size of one chip for multichip arrays + * FIXME For Flex-OneNAND, chipsize holds maximum possible + * device size ie when all blocks are considered MLC * @device_id: [INTERN] device ID * @density_mask: chip density, used for DDP devices * @verstion_id: [INTERN] version ID @@ -92,9 +98,13 @@ struct onenand_bufferram { */ struct onenand_chip { void __iomem *base; + unsigned dies; + unsigned boundary[MAX_DIES]; + loff_t diesize[MAX_DIES]; unsigned int chipsize; unsigned int device_id; unsigned int version_id; + unsigned int technology; unsigned int density_mask; unsigned int options; @@ -145,6 +155,8 @@ struct onenand_chip { #define ONENAND_SET_BUFFERRAM0(this) (this->bufferram_index = 0) #define ONENAND_SET_BUFFERRAM1(this) (this->bufferram_index = 1) +#define FLEXONENAND(this) \ + (this->device_id & DEVICE_IS_FLEXONENAND) #define ONENAND_GET_SYS_CFG1(this) \ (this->read_word(this->base + ONENAND_REG_SYS_CFG1)) #define ONENAND_SET_SYS_CFG1(v, this) \ @@ -153,6 +165,9 @@ struct onenand_chip { #define ONENAND_IS_DDP(this) \ (this->device_id & ONENAND_DEVICE_IS_DDP) +#define ONENAND_IS_MLC(this) \ + (this->technology & ONENAND_TECHNOLOGY_IS_MLC) + #ifdef CONFIG_MTD_ONENAND_2X_PROGRAM #define 
ONENAND_IS_2PLANE(this) \ (this->options & ONENAND_HAS_2PLANE) @@ -190,5 +205,8 @@ struct onenand_manufacturers { int onenand_bbt_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops); +unsigned onenand_block(struct onenand_chip *this, loff_t addr); +loff_t onenand_addr(struct onenand_chip *this, int block); +int flexonenand_region(struct mtd_info *mtd, loff_t addr); #endif /* __LINUX_MTD_ONENAND_H */ diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h index 0c6bbe28f38..86a6bbef646 100644 --- a/include/linux/mtd/onenand_regs.h +++ b/include/linux/mtd/onenand_regs.h @@ -67,6 +67,9 @@ /* * Device ID Register F001h (R) */ +#define DEVICE_IS_FLEXONENAND (1 << 9) +#define FLEXONENAND_PI_MASK (0x3ff) +#define FLEXONENAND_PI_UNLOCK_SHIFT (14) #define ONENAND_DEVICE_DENSITY_MASK (0xf) #define ONENAND_DEVICE_DENSITY_SHIFT (4) #define ONENAND_DEVICE_IS_DDP (1 << 3) @@ -83,6 +86,11 @@ */ #define ONENAND_VERSION_PROCESS_SHIFT (8) +/* + * Technology Register F006h (R) + */ +#define ONENAND_TECHNOLOGY_IS_MLC (1 << 0) + /* * Start Address 1 F100h (R/W) & Start Address 2 F101h (R/W) */ @@ -93,7 +101,8 @@ /* * Start Address 8 F107h (R/W) */ -#define ONENAND_FPA_MASK (0x3f) +/* Note: It's actually 0x3f in case of SLC */ +#define ONENAND_FPA_MASK (0x7f) #define ONENAND_FPA_SHIFT (2) #define ONENAND_FSA_MASK (0x03) @@ -105,7 +114,8 @@ #define ONENAND_BSA_BOOTRAM (0 << 2) #define ONENAND_BSA_DATARAM0 (2 << 2) #define ONENAND_BSA_DATARAM1 (3 << 2) -#define ONENAND_BSC_MASK (0x03) +/* Note: It's actually 0x03 in case of SLC */ +#define ONENAND_BSC_MASK (0x07) /* * Command Register F220h (R/W) @@ -124,9 +134,13 @@ #define ONENAND_CMD_RESET (0xF0) #define ONENAND_CMD_OTP_ACCESS (0x65) #define ONENAND_CMD_READID (0x90) +#define FLEXONENAND_CMD_PI_UPDATE (0x05) +#define FLEXONENAND_CMD_PI_ACCESS (0x66) +#define FLEXONENAND_CMD_RECOVER_LSB (0x05) /* NOTE: Those are not *REAL* commands */ #define ONENAND_CMD_BUFFERRAM (0x1978) +#define FLEXONENAND_CMD_READ_PI (0x1985) /* * System Configuration 1 Register F221h (R, R/W) @@ -192,10 +206,12 @@ #define ONENAND_ECC_1BIT_ALL (0x5555) #define ONENAND_ECC_2BIT (1 << 1) #define ONENAND_ECC_2BIT_ALL (0xAAAA) +#define FLEXONENAND_UNCORRECTABLE_ERROR (0x1010) /* * One-Time Programmable (OTP) */ +#define FLEXONENAND_OTP_LOCK_OFFSET (2048) #define ONENAND_OTP_LOCK_OFFSET (14) #endif /* __ONENAND_REG_H */ -- cgit v1.2.3-70-g09d2 From 31bb999ee73748068ddc271dd99b22dcc418efe3 Mon Sep 17 00:00:00 2001 From: Kyungmin Park Date: Tue, 12 May 2009 13:46:57 -0700 Subject: mtd: onenand: add bbt_wait & unlock_all as replaceable for some platform Add bbt_wait & unlock_all as replaceable functions, for platforms such as s3c64xx: s3c64xx has its own OneNAND controller and a different interface. Signed-off-by: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/onenand/onenand_base.c | 12 ++++++++++-- include/linux/mtd/onenand.h | 5 +++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 8d4c9c25373..864327ed7fb 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1506,7 +1506,7 @@ int onenand_bbt_read_oob(struct mtd_info *mtd, loff_t from, onenand_update_bufferram(mtd, from, 0); - ret = onenand_bbt_wait(mtd, FL_READING); + ret = this->bbt_wait(mtd, FL_READING); if (unlikely(ret)) ret = onenand_recover_lsb(mtd, from, ret); @@ -2527,6 +2527,10 @@ static void 
onenand_unlock_all(struct mtd_info *mtd) & ONENAND_CTRL_ONGO) continue; + /* Don't check lock status */ + if (this->options & ONENAND_SKIP_UNLOCK_CHECK) + return; + /* Check lock status */ if (onenand_check_lock_status(this)) return; @@ -3442,6 +3446,10 @@ int onenand_scan(struct mtd_info *mtd, int maxchips) this->command = onenand_command; if (!this->wait) onenand_setup_wait(mtd); + if (!this->bbt_wait) + this->bbt_wait = onenand_bbt_wait; + if (!this->unlock_all) + this->unlock_all = onenand_unlock_all; if (!this->read_bufferram) this->read_bufferram = onenand_read_bufferram; @@ -3559,7 +3567,7 @@ int onenand_scan(struct mtd_info *mtd, int maxchips) mtd->owner = THIS_MODULE; /* Unlock whole block */ - onenand_unlock_all(mtd); + this->unlock_all(mtd); ret = this->scan_bbt(mtd); if ((!FLEXONENAND(this)) || ret) diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 9aab82c1c74..8ed87337438 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -74,6 +74,8 @@ struct onenand_bufferram { * @command: [REPLACEABLE] hardware specific function for writing * commands to the chip * @wait: [REPLACEABLE] hardware specific function for wait on ready + * @bbt_wait: [REPLACEABLE] hardware specific function for bbt wait on ready + * @unlock_all: [REPLACEABLE] hardware specific function for unlock all * @read_bufferram: [REPLACEABLE] hardware specific function for BufferRAM Area * @write_bufferram: [REPLACEABLE] hardware specific function for BufferRAM Area * @read_word: [REPLACEABLE] hardware specific function for read @@ -118,6 +120,8 @@ struct onenand_chip { int (*command)(struct mtd_info *mtd, int cmd, loff_t address, size_t len); int (*wait)(struct mtd_info *mtd, int state); + int (*bbt_wait)(struct mtd_info *mtd, int state); + void (*unlock_all)(struct mtd_info *mtd); int (*read_bufferram)(struct mtd_info *mtd, int area, unsigned char *buffer, int offset, size_t count); int (*write_bufferram)(struct mtd_info *mtd, int area, @@ -184,6 +188,7 @@ struct onenand_chip { #define ONENAND_HAS_CONT_LOCK (0x0001) #define ONENAND_HAS_UNLOCK_ALL (0x0002) #define ONENAND_HAS_2PLANE (0x0004) +#define ONENAND_SKIP_UNLOCK_CHECK (0x0100) #define ONENAND_PAGEBUF_ALLOC (0x1000) #define ONENAND_OOBBUF_ALLOC (0x2000) -- cgit v1.2.3-70-g09d2 From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Jun 2009 09:58:57 +0200 Subject: perf_counter: Separate out attr->type from attr->config Counter type is a frequently used value and we do a lot of bit juggling by encoding and decoding it from attr->config. Clean this up by creating a separate attr->type field. Also clean up the various similarly complex user-space bits all around counter attribute management. The net improvement is significant, and it will be easier to add a new major type (which is what triggered this cleanup). (This changes the ABI, all tools are adapted.) (PowerPC build-tested.) 
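For illustration, a minimal before/after sketch of the user-space setup
(EID() is the old helper, removed below, that packed the type and event
id into attr->config; the other names are as used by the tools in this
series):

	/* before: type encoded inside the config word */
	attr.config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES);
	fd = sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);

	/* after: type is a first-class attribute field */
	struct perf_counter_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_CPU_CYCLES,
	};
	fd = sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);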
Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-record.c | 105 ++++++++++------------ Documentation/perf_counter/builtin-stat.c | 76 +++++++--------- Documentation/perf_counter/builtin-top.c | 67 +++++--------- Documentation/perf_counter/perf.h | 2 - Documentation/perf_counter/util/parse-events.c | 120 ++++++++++++++----------- Documentation/perf_counter/util/parse-events.h | 7 +- arch/powerpc/kernel/perf_counter.c | 6 +- arch/x86/kernel/cpu/perf_counter.c | 8 +- include/linux/perf_counter.h | 65 +++----------- kernel/perf_counter.c | 14 ++- 10 files changed, 196 insertions(+), 274 deletions(-) (limited to 'include/linux') diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index c22ea0c7472..130fd88266b 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -20,10 +20,10 @@ #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) #define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) -static long default_interval = 100000; -static long event_count[MAX_COUNTERS]; - static int fd[MAX_NR_CPUS][MAX_COUNTERS]; + +static long default_interval = 100000; + static int nr_cpus = 0; static unsigned int page_size; static unsigned int mmap_pages = 128; @@ -38,22 +38,44 @@ static int inherit = 1; static int force = 0; static int append_file = 0; -const unsigned int default_count[] = { - 1000000, - 1000000, - 10000, - 10000, - 1000000, - 10000, +static long samples; +static struct timeval last_read; +static struct timeval this_read; + +static __u64 bytes_written; + +static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; + +static int nr_poll; +static int nr_cpu; + +struct mmap_event { + struct perf_event_header header; + __u32 pid; + __u32 tid; + __u64 start; + __u64 len; + __u64 pgoff; + char filename[PATH_MAX]; +}; + +struct comm_event { + struct perf_event_header header; + __u32 pid; + __u32 tid; + char comm[16]; }; + struct mmap_data { - int counter; - void *base; - unsigned int mask; - unsigned int prev; + int counter; + void *base; + unsigned int mask; + unsigned int prev; }; +static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; + static unsigned int mmap_read_head(struct mmap_data *md) { struct perf_counter_mmap_page *pc = md->base; @@ -65,11 +87,6 @@ static unsigned int mmap_read_head(struct mmap_data *md) return head; } -static long samples; -static struct timeval last_read, this_read; - -static __u64 bytes_written; - static void mmap_read(struct mmap_data *md) { unsigned int head = mmap_read_head(md); @@ -157,29 +174,6 @@ static void sig_handler(int sig) done = 1; } -static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; -static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; - -static int nr_poll; -static int nr_cpu; - -struct mmap_event { - struct perf_event_header header; - __u32 pid; - __u32 tid; - __u64 start; - __u64 len; - __u64 pgoff; - char filename[PATH_MAX]; -}; - -struct comm_event { - struct perf_event_header header; - __u32 pid; - __u32 tid; - char comm[16]; -}; - static void pid_synthesize_comm_event(pid_t pid, int full) { struct comm_event comm_ev; @@ -341,24 +335,21 @@ static int group_fd; static void create_counter(int counter, int cpu, pid_t pid) { - struct perf_counter_attr attr; + struct perf_counter_attr *attr = attrs + counter; int track = 1; - memset(&attr, 0, sizeof(attr)); - 
attr.config = event_id[counter]; - attr.sample_period = event_count[counter]; - attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD; + attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD; if (freq) { - attr.freq = 1; - attr.sample_freq = freq; + attr->freq = 1; + attr->sample_freq = freq; } - attr.mmap = track; - attr.comm = track; - attr.inherit = (cpu < 0) && inherit; + attr->mmap = track; + attr->comm = track; + attr->inherit = (cpu < 0) && inherit; track = 0; /* only the first counter needs these */ - fd[nr_cpu][counter] = sys_perf_counter_open(&attr, pid, cpu, group_fd, 0); + fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0); if (fd[nr_cpu][counter] < 0) { int err = errno; @@ -542,16 +533,14 @@ int cmd_record(int argc, const char **argv, const char *prefix) if (!argc && target_pid == -1 && !system_wide) usage_with_options(record_usage, options); - if (!nr_counters) { + if (!nr_counters) nr_counters = 1; - event_id[0] = 0; - } for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) + if (attrs[counter].sample_period) continue; - event_count[counter] = default_interval; + attrs[counter].sample_period = default_interval; } return __cmd_record(argc, argv); diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 4fc0d80440e..9711e552423 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -44,23 +44,22 @@ #include -static int system_wide = 0; -static int inherit = 1; +static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { -static __u64 default_event_id[MAX_COUNTERS] = { - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS }, - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES }, }; -static int default_interval = 100000; -static int event_count[MAX_COUNTERS]; +static int system_wide = 0; +static int inherit = 1; + static int fd[MAX_NR_CPUS][MAX_COUNTERS]; static int target_pid = -1; @@ -86,22 +85,16 @@ static __u64 walltime_nsecs; static void create_perfstat_counter(int counter) { - struct perf_counter_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.config = event_id[counter]; - attr.sample_type = 0; - attr.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL; - attr.exclude_user = event_mask[counter] & EVENT_MASK_USER; + struct perf_counter_attr *attr = attrs + counter; if (scale) - attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING; + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; if (system_wide) { int cpu; for (cpu = 0; cpu < nr_cpus; cpu ++) { - 
fd[cpu][counter] = sys_perf_counter_open(&attr, -1, cpu, -1, 0); + fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0); if (fd[cpu][counter] < 0) { printf("perfstat error: syscall returned with %d (%s)\n", fd[cpu][counter], strerror(errno)); @@ -109,10 +102,10 @@ static void create_perfstat_counter(int counter) } } } else { - attr.inherit = inherit; - attr.disabled = 1; + attr->inherit = inherit; + attr->disabled = 1; - fd[0][counter] = sys_perf_counter_open(&attr, 0, -1, -1, 0); + fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0); if (fd[0][counter] < 0) { printf("perfstat error: syscall returned with %d (%s)\n", fd[0][counter], strerror(errno)); @@ -126,9 +119,13 @@ static void create_perfstat_counter(int counter) */ static inline int nsec_counter(int counter) { - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK)) + if (attrs[counter].type != PERF_TYPE_SOFTWARE) + return 0; + + if (attrs[counter].config == PERF_COUNT_CPU_CLOCK) return 1; - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) + + if (attrs[counter].config == PERF_COUNT_TASK_CLOCK) return 1; return 0; @@ -177,7 +174,8 @@ static void read_counter(int counter) /* * Save the full runtime - to allow normalization during printout: */ - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) + if (attrs[counter].type == PERF_TYPE_SOFTWARE && + attrs[counter].config == PERF_COUNT_TASK_CLOCK) runtime_nsecs = count[0]; } @@ -203,8 +201,8 @@ static void print_counter(int counter) fprintf(stderr, " %14.6f %-20s", msecs, event_name(counter)); - if (event_id[counter] == - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { + if (attrs[counter].type == PERF_TYPE_SOFTWARE && + attrs[counter].config == PERF_COUNT_TASK_CLOCK) { fprintf(stderr, " # %11.3f CPU utilization factor", (double)count[0] / (double)walltime_nsecs); @@ -300,8 +298,6 @@ static char events_help_msg[EVENTS_HELP_MAX]; static const struct option options[] = { OPT_CALLBACK('e', "event", NULL, "event", events_help_msg, parse_events), - OPT_INTEGER('c', "count", &default_interval, - "event period to sample"), OPT_BOOLEAN('i', "inherit", &inherit, "child tasks inherit counters"), OPT_INTEGER('p', "pid", &target_pid, @@ -315,27 +311,19 @@ static const struct option options[] = { int cmd_stat(int argc, const char **argv, const char *prefix) { - int counter; - page_size = sysconf(_SC_PAGE_SIZE); create_events_help(events_help_msg); - memcpy(event_id, default_event_id, sizeof(default_event_id)); + + memcpy(attrs, default_attrs, sizeof(attrs)); argc = parse_options(argc, argv, options, stat_usage, 0); if (!argc) usage_with_options(stat_usage, options); - if (!nr_counters) { + if (!nr_counters) nr_counters = 8; - } - - for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) - continue; - event_count[counter] = default_interval; - } nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); assert(nr_cpus <= MAX_NR_CPUS); assert(nr_cpus >= 0); diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index b2f480b5a13..98a6d53e17b 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -48,22 +48,11 @@ #include #include -static int system_wide = 0; +static int fd[MAX_NR_CPUS][MAX_COUNTERS]; -static __u64 default_event_id[MAX_COUNTERS] = { - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), - 
EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), +static int system_wide = 0; - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), -}; -static int default_interval = 100000; -static int event_count[MAX_COUNTERS]; -static int fd[MAX_NR_CPUS][MAX_COUNTERS]; +static int default_interval = 100000; static __u64 count_filter = 5; static int print_entries = 15; @@ -85,15 +74,6 @@ static int delay_secs = 2; static int zero; static int dump_symtab; -static const unsigned int default_count[] = { - 1000000, - 1000000, - 10000, - 10000, - 1000000, - 10000, -}; - /* * Symbols */ @@ -112,7 +92,7 @@ struct sym_entry { struct sym_entry *sym_filter_entry; -struct dso *kernel_dso; +struct dso *kernel_dso; /* * Symbols will be added here in record_ip and will get out @@ -213,7 +193,7 @@ static void print_sym_table(void) 100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec))); if (nr_counters == 1) { - printf("%d", event_count[0]); + printf("%Ld", attrs[0].sample_period); if (freq) printf("Hz "); else @@ -421,10 +401,10 @@ static void process_event(uint64_t ip, int counter) } struct mmap_data { - int counter; - void *base; - unsigned int mask; - unsigned int prev; + int counter; + void *base; + unsigned int mask; + unsigned int prev; }; static unsigned int mmap_read_head(struct mmap_data *md) @@ -539,7 +519,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; static int __cmd_top(void) { - struct perf_counter_attr attr; + struct perf_counter_attr *attr; pthread_t thread; int i, counter, group_fd, nr_poll = 0; unsigned int cpu; @@ -553,13 +533,12 @@ static int __cmd_top(void) if (target_pid == -1 && profile_cpu == -1) cpu = i; - memset(&attr, 0, sizeof(attr)); - attr.config = event_id[counter]; - attr.sample_period = event_count[counter]; - attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; - attr.freq = freq; + attr = attrs + counter; - fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0); + attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; + attr->freq = freq; + + fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0); if (fd[i][counter] < 0) { int err = errno; @@ -670,7 +649,6 @@ int cmd_top(int argc, const char **argv, const char *prefix) page_size = sysconf(_SC_PAGE_SIZE); create_events_help(events_help_msg); - memcpy(event_id, default_event_id, sizeof(default_event_id)); argc = parse_options(argc, argv, options, top_usage, 0); if (argc) @@ -688,19 +666,22 @@ int cmd_top(int argc, const char **argv, const char *prefix) profile_cpu = -1; } - if (!nr_counters) { + if (!nr_counters) nr_counters = 1; - event_id[0] = 0; - } if (delay_secs < 1) delay_secs = 1; + parse_symbols(); + + /* + * Fill in the ones not specifically initialized via -c: + */ for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) + if (attrs[counter].sample_period) continue; - event_count[counter] = default_interval; + attrs[counter].sample_period = default_interval; } nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); @@ -710,7 +691,5 @@ int cmd_top(int argc, const char **argv, const char *prefix) if (target_pid != -1 || profile_cpu != -1) nr_cpus = 1; - parse_symbols(); - return __cmd_top(); } diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h index 10622a48b40..af0a5046d74 100644 --- a/Documentation/perf_counter/perf.h +++ b/Documentation/perf_counter/perf.h 
@@ -64,6 +64,4 @@ sys_perf_counter_open(struct perf_counter_attr *attr_uptr, #define MAX_COUNTERS 256 #define MAX_NR_CPUS 256 -#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) - #endif diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c index 2fdfd1d923f..eb56bd99657 100644 --- a/Documentation/perf_counter/util/parse-events.c +++ b/Documentation/perf_counter/util/parse-events.c @@ -6,37 +6,39 @@ #include "exec_cmd.h" #include "string.h" -int nr_counters; +int nr_counters; -__u64 event_id[MAX_COUNTERS] = { }; -int event_mask[MAX_COUNTERS]; +struct perf_counter_attr attrs[MAX_COUNTERS]; struct event_symbol { - __u64 event; - char *symbol; + __u8 type; + __u64 config; + char *symbol; }; +#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y + static struct event_symbol event_symbols[] = { - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, - - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, + { C(HARDWARE, CPU_CYCLES), "cpu-cycles", }, + { C(HARDWARE, CPU_CYCLES), "cycles", }, + { C(HARDWARE, INSTRUCTIONS), "instructions", }, + { C(HARDWARE, CACHE_REFERENCES), "cache-references", }, + { C(HARDWARE, CACHE_MISSES), "cache-misses", }, + { C(HARDWARE, BRANCH_INSTRUCTIONS), "branch-instructions", }, + { C(HARDWARE, BRANCH_INSTRUCTIONS), "branches", }, + { C(HARDWARE, BRANCH_MISSES), "branch-misses", }, + { C(HARDWARE, BUS_CYCLES), "bus-cycles", }, + + { C(SOFTWARE, CPU_CLOCK), "cpu-clock", }, + { C(SOFTWARE, TASK_CLOCK), "task-clock", }, + { C(SOFTWARE, PAGE_FAULTS), "page-faults", }, + { C(SOFTWARE, PAGE_FAULTS), "faults", }, + { C(SOFTWARE, PAGE_FAULTS_MIN), "minor-faults", }, + { C(SOFTWARE, PAGE_FAULTS_MAJ), "major-faults", }, + { C(SOFTWARE, CONTEXT_SWITCHES), "context-switches", }, + { C(SOFTWARE, CONTEXT_SWITCHES), "cs", }, + { C(SOFTWARE, CPU_MIGRATIONS), "cpu-migrations", }, + { C(SOFTWARE, CPU_MIGRATIONS), "migrations", }, }; #define __PERF_COUNTER_FIELD(config, name) \ @@ -67,27 +69,26 @@ static char *sw_event_names[] = { "major faults", }; -char *event_name(int ctr) +char *event_name(int counter) { - __u64 config = event_id[ctr]; - int type = PERF_COUNTER_TYPE(config); - int id = PERF_COUNTER_ID(config); + __u64 config = attrs[counter].config; + int 
type = attrs[counter].type; static char buf[32]; - if (PERF_COUNTER_RAW(config)) { - sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config)); + if (attrs[counter].type == PERF_TYPE_RAW) { + sprintf(buf, "raw 0x%llx", config); return buf; } switch (type) { case PERF_TYPE_HARDWARE: - if (id < PERF_HW_EVENTS_MAX) - return hw_event_names[id]; + if (config < PERF_HW_EVENTS_MAX) + return hw_event_names[config]; return "unknown-hardware"; case PERF_TYPE_SOFTWARE: - if (id < PERF_SW_EVENTS_MAX) - return sw_event_names[id]; + if (config < PERF_SW_EVENTS_MAX) + return sw_event_names[config]; return "unknown-software"; default: @@ -101,15 +102,19 @@ char *event_name(int ctr) * Each event can have multiple symbolic names. * Symbolic names are (almost) exactly matched. */ -static __u64 match_event_symbols(const char *str) +static int match_event_symbols(const char *str, struct perf_counter_attr *attr) { __u64 config, id; int type; unsigned int i; const char *sep, *pstr; - if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) - return config | PERF_COUNTER_RAW_MASK; + if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) { + attr->type = PERF_TYPE_RAW; + attr->config = config; + + return 0; + } pstr = str; sep = strchr(pstr, ':'); @@ -121,35 +126,45 @@ static __u64 match_event_symbols(const char *str) if (sep) { pstr = sep + 1; if (strchr(pstr, 'k')) - event_mask[nr_counters] |= EVENT_MASK_USER; + attr->exclude_user = 1; if (strchr(pstr, 'u')) - event_mask[nr_counters] |= EVENT_MASK_KERNEL; + attr->exclude_kernel = 1; } - return EID(type, id); + attr->type = type; + attr->config = id; + + return 0; } for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { if (!strncmp(str, event_symbols[i].symbol, - strlen(event_symbols[i].symbol))) - return event_symbols[i].event; + strlen(event_symbols[i].symbol))) { + + attr->type = event_symbols[i].type; + attr->config = event_symbols[i].config; + + return 0; + } } - return ~0ULL; + return -EINVAL; } int parse_events(const struct option *opt, const char *str, int unset) { - __u64 config; + struct perf_counter_attr attr; + int ret; + memset(&attr, 0, sizeof(attr)); again: if (nr_counters == MAX_COUNTERS) return -1; - config = match_event_symbols(str); - if (config == ~0ULL) - return -1; + ret = match_event_symbols(str, &attr); + if (ret < 0) + return ret; - event_id[nr_counters] = config; + attrs[nr_counters] = attr; nr_counters++; str = strstr(str, ","); @@ -168,7 +183,6 @@ void create_events_help(char *events_help_msg) { unsigned int i; char *str; - __u64 e; str = events_help_msg; @@ -178,9 +192,8 @@ void create_events_help(char *events_help_msg) for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { int type, id; - e = event_symbols[i].event; - type = PERF_COUNTER_TYPE(e); - id = PERF_COUNTER_ID(e); + type = event_symbols[i].type; + id = event_symbols[i].config; if (i) str += sprintf(str, "|"); @@ -191,4 +204,3 @@ void create_events_help(char *events_help_msg) str += sprintf(str, "|rNNN]"); } - diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h index 0da306bb902..542971c495b 100644 --- a/Documentation/perf_counter/util/parse-events.h +++ b/Documentation/perf_counter/util/parse-events.h @@ -3,12 +3,9 @@ * Parse symbolic events/counts passed in as options: */ -extern int nr_counters; -extern __u64 event_id[MAX_COUNTERS]; -extern int event_mask[MAX_COUNTERS]; +extern int nr_counters; -#define EVENT_MASK_KERNEL 1 -#define EVENT_MASK_USER 2 +extern struct perf_counter_attr attrs[MAX_COUNTERS]; extern char *event_name(int ctr); 
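To make the change concrete, here is a minimal sketch (not part of the patch; C99, error handling omitted) of registering one counter the old way and the new way. The constants, attrs[] and nr_counters are the ones above; everything else is illustrative:

	/* old scheme: major type and event id packed into one __u64 */
	event_id[nr_counters] = EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES);

	/* new scheme: an explicit attribute structure per counter */
	struct perf_counter_attr attr;

	memset(&attr, 0, sizeof(attr));		/* needs <string.h> */
	attr.type   = PERF_TYPE_HARDWARE;	/* major type, now its own field */
	attr.config = PERF_COUNT_CPU_CYCLES;	/* type-specific event id */
	attr.exclude_kernel = 1;		/* 'u' modifier, formerly EVENT_MASK_KERNEL */

	attrs[nr_counters] = attr;
	nr_counters++;

Splitting the type out of the config word frees all 64 config bits for raw events and removes the shift/mask macros deleted above.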
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 232b00a36f7..4786ad9a288 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -867,13 +867,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (!perf_event_raw(&counter->attr)) { - ev = perf_event_id(&counter->attr); + if (counter->attr.type != PERF_TYPE_RAW) { + ev = counter->attr.config; if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { - ev = perf_event_config(&counter->attr); + ev = counter->attr.config; } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8f53f3a7da2..430e048f285 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(attr)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); } else { - if (perf_event_id(attr) >= x86_pmu.max_events) + if (attr->config >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(attr)); + hwc->config |= x86_pmu.event_map(attr->config); } counter->destroy = hw_perf_counter_destroy; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4f9d39ecdc0..f794c69b34c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,26 +73,6 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -#define __PERF_COUNTER_MASK(name) \ - (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ - PERF_COUNTER_##name##_SHIFT) - -#define PERF_COUNTER_RAW_BITS 1 -#define PERF_COUNTER_RAW_SHIFT 63 -#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) - -#define PERF_COUNTER_CONFIG_BITS 63 -#define PERF_COUNTER_CONFIG_SHIFT 0 -#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) - -#define PERF_COUNTER_TYPE_BITS 7 -#define PERF_COUNTER_TYPE_SHIFT 56 -#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) - -#define PERF_COUNTER_EVENT_BITS 56 -#define PERF_COUNTER_EVENT_SHIFT 0 -#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) - /* * Bits that can be set in attr.sample_type to request information * in the overflow packets. @@ -125,10 +105,13 @@ enum perf_counter_read_format { */ struct perf_counter_attr { /* - * The MSB of the config word signifies if the rest contains cpu - * specific (raw) counter configuration data, if unset, the next - * 7 bits are an event type and the rest of the bits are the event - * identifier. + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + __u32 __reserved_1; + + /* + * Type specific configuration information. 
*/ __u64 config; @@ -152,12 +135,11 @@ struct perf_counter_attr { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 53; + __reserved_2 : 53; __u32 wakeup_events; /* wakeup every n events */ - __u32 __reserved_2; + __u32 __reserved_3; - __u64 __reserved_3; __u64 __reserved_4; }; @@ -278,8 +260,8 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; - * u32 pid, ppid; + * struct perf_event_header header; + * u32 pid, ppid; * }; */ PERF_EVENT_FORK = 7, @@ -331,27 +313,6 @@ enum perf_event_type { struct task_struct; -static inline u64 perf_event_raw(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_RAW_MASK; -} - -static inline u64 perf_event_config(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_CONFIG_MASK; -} - -static inline u64 perf_event_type(struct perf_counter_attr *attr) -{ - return (attr->config & PERF_COUNTER_TYPE_MASK) >> - PERF_COUNTER_TYPE_SHIFT; -} - -static inline u64 perf_event_id(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_EVENT_MASK; -} - /** * struct hw_perf_counter - performance counter hardware details: */ @@ -616,8 +577,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !perf_event_raw(&counter->attr) && - perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE; + return (counter->attr.type != PERF_TYPE_RAW) && + (counter->attr.type != PERF_TYPE_HARDWARE); } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 47c92fb927f..75ae76796df 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3091,14 +3091,12 @@ static int perf_swcounter_match(struct perf_counter *counter, enum perf_event_types type, u32 event, struct pt_regs *regs) { - u64 event_config; - - event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event; - if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->attr.config != event_config) + if (counter->attr.type != type) + return 0; + if (counter->attr.config != event) return 0; if (regs) { @@ -3403,7 +3401,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->attr)) { + switch (counter->attr.config) { case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3496,12 +3494,12 @@ perf_counter_alloc(struct perf_counter_attr *attr, if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(attr)) { + if (attr->type == PERF_TYPE_RAW) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(attr)) { + switch (attr->type) { case PERF_TYPE_HARDWARE: pmu = hw_perf_counter_init(counter); break; -- cgit v1.2.3-70-g09d2 From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2009 20:22:46 +0200 Subject: perf_counter: Implement generalized cache event types Extend generic event enumeration with the PERF_TYPE_HW_CACHE method. This is a 3-dimensional space: { L1-D, L1-I, L2, ITLB, DTLB, BPU } x { load, store, prefetch } x { accesses, misses } User-space passes in the 3 coordinates and the kernel provides a counter. (if the hardware supports that type and if the combination makes sense.) Combinations that make no sense produce a -EINVAL. 
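As an illustration (a user-space sketch, not part of this commit), the three coordinates are packed into attr.config one byte each: cache unit in bits 0-7, op in bits 8-15, result in bits 16-23, exactly as the set_ext_hw_attr() hunk below decodes them:

	/* request 'L1-data read misses' with the enums this commit adds */
	struct perf_counter_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type   = PERF_TYPE_HW_CACHE;
	attr.config = PERF_COUNT_HW_CACHE_L1D |
		      (PERF_COUNT_HW_CACHE_OP_READ     <<  8) |
		      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);

An ITLB prefetch composed the same way hits a -1 entry in the Nehalem table below and is rejected with that -EINVAL.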
Combinations that are not supported by the hardware produce -ENOTSUP. Extend the tools to deal with this, and rewrite the event symbol parsing code with various popular aliases for the units and access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are both valid aliases. ( x86 is supported for now, with the Nehalem event table filled in, and with Core2 and Atom having placeholder tables. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/util/parse-events.c | 104 ++++++++++++- arch/x86/kernel/cpu/perf_counter.c | 201 ++++++++++++++++++++++++- include/linux/perf_counter.h | 34 +++++ kernel/perf_counter.c | 1 + 4 files changed, 329 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c index eb56bd99657..de9a77c4715 100644 --- a/Documentation/perf_counter/util/parse-events.c +++ b/Documentation/perf_counter/util/parse-events.c @@ -6,6 +6,8 @@ #include "exec_cmd.h" #include "string.h" +extern char *strcasestr(const char *haystack, const char *needle); + int nr_counters; struct perf_counter_attr attrs[MAX_COUNTERS]; @@ -17,6 +19,7 @@ struct event_symbol { }; #define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y +#define CR(x, y) .type = PERF_TYPE_##x, .config = y static struct event_symbol event_symbols[] = { { C(HARDWARE, CPU_CYCLES), "cpu-cycles", }, @@ -69,6 +72,28 @@ static char *sw_event_names[] = { "major faults", }; +#define MAX_ALIASES 8 + +static char *hw_cache [][MAX_ALIASES] = { + { "l1-d" , "l1d" , "l1", "l1-data-cache" }, + { "l1-i" , "l1i" , "l1-instruction-cache" }, + { "l2" , }, + { "dtlb", }, + { "itlb", }, + { "bpu" , "btb", "branch-cache", NULL }, +}; + +static char *hw_cache_op [][MAX_ALIASES] = { + { "read" , "load" }, + { "write" , "store" }, + { "prefetch" , "speculative-read", "speculative-load" }, +}; + +static char *hw_cache_result [][MAX_ALIASES] = { + { "access", "ops" }, + { "miss", }, +}; + char *event_name(int counter) { __u64 config = attrs[counter].config; @@ -86,6 +111,30 @@ char *event_name(int counter) return hw_event_names[config]; return "unknown-hardware"; + case PERF_TYPE_HW_CACHE: { + __u8 cache_type, cache_op, cache_result; + static char name[100]; + + cache_type = (config >> 0) & 0xff; + if (cache_type > PERF_COUNT_HW_CACHE_MAX) + return "unknown-ext-hardware-cache-type"; + + cache_op = (config >> 8) & 0xff; + if (cache_type > PERF_COUNT_HW_CACHE_OP_MAX) + return "unknown-ext-hardware-cache-op-type"; + + cache_result = (config >> 16) & 0xff; + if (cache_type > PERF_COUNT_HW_CACHE_RESULT_MAX) + return "unknown-ext-hardware-cache-result-type"; + + sprintf(name, "%s:%s:%s", + hw_cache[cache_type][0], + hw_cache_op[cache_op][0], + hw_cache_result[cache_result][0]); + + return name; + } + case PERF_TYPE_SOFTWARE: if (config < PERF_SW_EVENTS_MAX) return sw_event_names[config]; @@ -98,11 +147,60 @@ char *event_name(int counter) return "unknown"; } +static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size) +{ + int i, j; + + for (i = 0; i < size; i++) { + for (j = 0; j < MAX_ALIASES; j++) { + if (!names[i][j]) + break; + if (strcasestr(str, names[i][j])) + return i; + } + } + + return 0; +} + +static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr) +{ + __u8 cache_type = -1, cache_op = 0, cache_result = 0; + + cache_type = 
parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX); + /* + * No fallback - if we cannot get a clear cache type + * then bail out: + */ + if (cache_type == -1) + return -EINVAL; + + cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX); + /* + * Fall back to reads: + */ + if (cache_type == -1) + cache_type = PERF_COUNT_HW_CACHE_OP_READ; + + cache_result = parse_aliases(str, hw_cache_result, + PERF_COUNT_HW_CACHE_RESULT_MAX); + /* + * Fall back to accesses: + */ + if (cache_result == -1) + cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS; + + attr->config = cache_type | (cache_op << 8) | (cache_result << 16); + attr->type = PERF_TYPE_HW_CACHE; + + return 0; +} + /* * Each event can have multiple symbolic names. * Symbolic names are (almost) exactly matched. */ -static int match_event_symbols(const char *str, struct perf_counter_attr *attr) +static int parse_event_symbols(const char *str, struct perf_counter_attr *attr) { __u64 config, id; int type; @@ -147,7 +245,7 @@ static int match_event_symbols(const char *str, struct perf_counter_attr *attr) } } - return -EINVAL; + return parse_generic_hw_symbols(str, attr); } int parse_events(const struct option *opt, const char *str, int unset) @@ -160,7 +258,7 @@ again: if (nr_counters == MAX_COUNTERS) return -1; - ret = match_event_symbols(str, &attr); + ret = parse_event_symbols(str, &attr); if (ret < 0) return ret; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 430e048f285..e86679fa521 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -83,6 +83,128 @@ static u64 intel_pmu_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. 
+ */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ + [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL @@ -246,6 +368,39 @@ static inline int x86_pmu_initialized(void) return x86_pmu.handle_irq != NULL; } +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + 
return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + /* * Setup the hardware configuration for a given attr_type */ @@ -288,22 +443,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->sample_period); + counter->destroy = hw_perf_counter_destroy; /* * Raw event type provide the config in the event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); - } else { - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - /* - * The generic map: - */ - hwc->config |= x86_pmu.event_map(attr->config); + return 0; } - counter->destroy = hw_perf_counter_destroy; + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + /* + * The generic map: + */ + hwc->config |= x86_pmu.event_map(attr->config); return 0; } @@ -989,6 +1147,33 @@ static int intel_pmu_init(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + /* + * Nehalem: + */ + switch (boot_cpu_data.x86_model) { + case 17: + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Core2 event tables\n"); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Nehalem/Corei7 event tables\n"); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Atom event tables\n"); + break; + } return 0; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f794c69b34c..3586df840f6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -28,6 +28,7 @@ enum perf_event_types { PERF_TYPE_HARDWARE = 0, PERF_TYPE_SOFTWARE = 1, PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, /* * available TYPE space, raw is the max value. @@ -55,6 +56,39 @@ enum attr_ids { PERF_HW_EVENTS_MAX = 7, }; +/* + * Generalized hardware cache counters: + * + * { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum hw_cache_id { + PERF_COUNT_HW_CACHE_L1D, + PERF_COUNT_HW_CACHE_L1I, + PERF_COUNT_HW_CACHE_L2, + PERF_COUNT_HW_CACHE_DTLB, + PERF_COUNT_HW_CACHE_ITLB, + PERF_COUNT_HW_CACHE_BPU, + + PERF_COUNT_HW_CACHE_MAX, +}; + +enum hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ, + PERF_COUNT_HW_CACHE_OP_WRITE, + PERF_COUNT_HW_CACHE_OP_PREFETCH, + + PERF_COUNT_HW_CACHE_OP_MAX, +}; + +enum hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS, + PERF_COUNT_HW_CACHE_RESULT_MISS, + + PERF_COUNT_HW_CACHE_RESULT_MAX, +}; + /* * Special "software" counters provided by the kernel, even if the hardware * does not support performance counters. 
These counters measure various diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 75ae76796df..5eacaaf3f9c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3501,6 +3501,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, switch (attr->type) { case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: pmu = hw_perf_counter_init(counter); break; -- cgit v1.2.3-70-g09d2 From e5110d011e03030926872457f05e49e3d5031737 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sat, 6 Jun 2009 18:35:27 +0200 Subject: firewire: add parent-of-unit accessor Retrieval of an fw_unit's parent is a common pattern in high-level code. Wrap it up as device = fw_parent_device(unit). Signed-off-by: Stefan Richter --- drivers/firewire/core-device.c | 4 ++-- drivers/firewire/sbp2.c | 37 +++++++++++++++++++++---------------- include/linux/firewire.h | 5 +++++ 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 65d84dd6c1d..3f4e646367b 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -93,7 +93,7 @@ static int fw_unit_match(struct device *dev, struct device_driver *drv) if (!is_fw_unit(dev)) return 0; - device = fw_device(unit->device.parent); + device = fw_parent_device(unit); id = container_of(drv, struct fw_driver, driver)->id_table; for (; id->match_flags != 0; id++) { @@ -114,7 +114,7 @@ static int fw_unit_match(struct device *dev, struct device_driver *drv) static int get_modalias(struct fw_unit *unit, char *buffer, size_t buffer_size) { - struct fw_device *device = fw_device(unit->device.parent); + struct fw_device *device = fw_parent_device(unit); struct fw_csr_iterator ci; int key, value; diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index d41cb6e455b..2353643721c 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -180,6 +180,11 @@ struct sbp2_target { int blocked; /* ditto */ }; +static struct fw_device *target_device(struct sbp2_target *tgt) +{ + return fw_parent_device(tgt->unit); +} + /* Impossible login_id, to detect logout attempt before successful login */ #define INVALID_LOGIN_ID 0x10000 @@ -488,7 +493,7 @@ static void complete_transaction(struct fw_card *card, int rcode, static void sbp2_send_orb(struct sbp2_orb *orb, struct sbp2_logical_unit *lu, int node_id, int generation, u64 offset) { - struct fw_device *device = fw_device(lu->tgt->unit->device.parent); + struct fw_device *device = target_device(lu->tgt); unsigned long flags; orb->pointer.high = 0; @@ -510,7 +515,7 @@ static void sbp2_send_orb(struct sbp2_orb *orb, struct sbp2_logical_unit *lu, static int sbp2_cancel_orbs(struct sbp2_logical_unit *lu) { - struct fw_device *device = fw_device(lu->tgt->unit->device.parent); + struct fw_device *device = target_device(lu->tgt); struct sbp2_orb *orb, *next; struct list_head list; unsigned long flags; @@ -548,7 +553,7 @@ static int sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id, int generation, int function, int lun_or_login_id, void *response) { - struct fw_device *device = fw_device(lu->tgt->unit->device.parent); + struct fw_device *device = target_device(lu->tgt); struct sbp2_management_orb *orb; unsigned int timeout; int retval = -ENOMEM; @@ -644,7 +649,7 @@ static int sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id, static void sbp2_agent_reset(struct sbp2_logical_unit *lu) { - struct fw_device *device = fw_device(lu->tgt->unit->device.parent); + struct 
fw_device *device = target_device(lu->tgt); __be32 d = 0; fw_run_transaction(device->card, TCODE_WRITE_QUADLET_REQUEST, @@ -661,7 +666,7 @@ static void complete_agent_reset_write_no_wait(struct fw_card *card, static void sbp2_agent_reset_no_wait(struct sbp2_logical_unit *lu) { - struct fw_device *device = fw_device(lu->tgt->unit->device.parent); + struct fw_device *device = target_device(lu->tgt); struct fw_transaction *t; static __be32 d; @@ -700,7 +705,7 @@ static inline void sbp2_allow_block(struct sbp2_logical_unit *lu) static void sbp2_conditionally_block(struct sbp2_logical_unit *lu) { struct sbp2_target *tgt = lu->tgt; - struct fw_card *card = fw_device(tgt->unit->device.parent)->card; + struct fw_card *card = target_device(tgt)->card; struct Scsi_Host *shost = container_of((void *)tgt, struct Scsi_Host, hostdata[0]); unsigned long flags; @@ -724,7 +729,7 @@ static void sbp2_conditionally_block(struct sbp2_logical_unit *lu) static void sbp2_conditionally_unblock(struct sbp2_logical_unit *lu) { struct sbp2_target *tgt = lu->tgt; - struct fw_card *card = fw_device(tgt->unit->device.parent)->card; + struct fw_card *card = target_device(tgt)->card; struct Scsi_Host *shost = container_of((void *)tgt, struct Scsi_Host, hostdata[0]); unsigned long flags; @@ -749,7 +754,7 @@ static void sbp2_conditionally_unblock(struct sbp2_logical_unit *lu) */ static void sbp2_unblock(struct sbp2_target *tgt) { - struct fw_card *card = fw_device(tgt->unit->device.parent)->card; + struct fw_card *card = target_device(tgt)->card; struct Scsi_Host *shost = container_of((void *)tgt, struct Scsi_Host, hostdata[0]); unsigned long flags; @@ -779,7 +784,7 @@ static void sbp2_release_target(struct kref *kref) struct Scsi_Host *shost = container_of((void *)tgt, struct Scsi_Host, hostdata[0]); struct scsi_device *sdev; - struct fw_device *device = fw_device(tgt->unit->device.parent); + struct fw_device *device = target_device(tgt); /* prevent deadlocks */ sbp2_unblock(tgt); @@ -852,7 +857,7 @@ static void sbp2_queue_work(struct sbp2_logical_unit *lu, unsigned long delay) */ static void sbp2_set_busy_timeout(struct sbp2_logical_unit *lu) { - struct fw_device *device = fw_device(lu->tgt->unit->device.parent); + struct fw_device *device = target_device(lu->tgt); __be32 d = cpu_to_be32(SBP2_CYCLE_LIMIT | SBP2_RETRY_LIMIT); fw_run_transaction(device->card, TCODE_WRITE_QUADLET_REQUEST, @@ -868,7 +873,7 @@ static void sbp2_login(struct work_struct *work) struct sbp2_logical_unit *lu = container_of(work, struct sbp2_logical_unit, work.work); struct sbp2_target *tgt = lu->tgt; - struct fw_device *device = fw_device(tgt->unit->device.parent); + struct fw_device *device = target_device(tgt); struct Scsi_Host *shost; struct scsi_device *sdev; struct sbp2_login_response response; @@ -1116,7 +1121,7 @@ static struct scsi_host_template scsi_driver_template; static int sbp2_probe(struct device *dev) { struct fw_unit *unit = fw_unit(dev); - struct fw_device *device = fw_device(unit->device.parent); + struct fw_device *device = fw_parent_device(unit); struct sbp2_target *tgt; struct sbp2_logical_unit *lu; struct Scsi_Host *shost; @@ -1197,7 +1202,7 @@ static void sbp2_reconnect(struct work_struct *work) struct sbp2_logical_unit *lu = container_of(work, struct sbp2_logical_unit, work.work); struct sbp2_target *tgt = lu->tgt; - struct fw_device *device = fw_device(tgt->unit->device.parent); + struct fw_device *device = target_device(tgt); int generation, node_id, local_node_id; if (fw_device_is_shutdown(device)) @@ -1249,7 +1254,7 @@ 
static void sbp2_update(struct fw_unit *unit) struct sbp2_target *tgt = unit->device.driver_data; struct sbp2_logical_unit *lu; - fw_device_enable_phys_dma(fw_device(unit->device.parent)); + fw_device_enable_phys_dma(fw_parent_device(unit)); /* * Fw-core serializes sbp2_update() against sbp2_remove(). @@ -1342,7 +1347,7 @@ static void complete_command_orb(struct sbp2_orb *base_orb, { struct sbp2_command_orb *orb = container_of(base_orb, struct sbp2_command_orb, base); - struct fw_device *device = fw_device(orb->lu->tgt->unit->device.parent); + struct fw_device *device = target_device(orb->lu->tgt); int result; if (status != NULL) { @@ -1449,7 +1454,7 @@ static int sbp2_map_scatterlist(struct sbp2_command_orb *orb, static int sbp2_scsi_queuecommand(struct scsi_cmnd *cmd, scsi_done_fn_t done) { struct sbp2_logical_unit *lu = cmd->device->hostdata; - struct fw_device *device = fw_device(lu->tgt->unit->device.parent); + struct fw_device *device = target_device(lu->tgt); struct sbp2_command_orb *orb; int generation, retval = SCSI_MLQUEUE_HOST_BUSY; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index e979f9b22cb..a69aea0394e 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -248,6 +248,11 @@ static inline void fw_unit_put(struct fw_unit *unit) put_device(&unit->device); } +static inline struct fw_device *fw_parent_device(struct fw_unit *unit) +{ + return fw_device(unit->device.parent); +} + struct ieee1394_device_id; struct fw_driver { -- cgit v1.2.3-70-g09d2 From a93958ac980f0ce594ad90657ecbc595ff157a40 Mon Sep 17 00:00:00 2001 From: Ayaz Abdulla Date: Sun, 7 Jun 2009 03:54:37 -0700 Subject: removal of forcedeth device ids This patch removes the forcedeth device ids from pci_ids.h The forcedeth driver uses the device id constants directly in its source file. [ Need to keep PCI_DEVICE_ID_NVIDIA_NVENET_15 in order to keep drivers/pci/quirks.c building -DaveM ] Signed-off-by: Ayaz Abdulla Signed-off-by: David S. 
Miller --- include/linux/pci_ids.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 12db06cf0e2..b87c51aea14 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1066,8 +1066,6 @@ #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_SMBUS 0x0034 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE 0x0035 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_SATA 0x0036 -#define PCI_DEVICE_ID_NVIDIA_NVENET_10 0x0037 -#define PCI_DEVICE_ID_NVIDIA_NVENET_11 0x0038 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_SATA2 0x003e #define PCI_DEVICE_ID_NVIDIA_GEFORCE_6800_ULTRA 0x0040 #define PCI_DEVICE_ID_NVIDIA_GEFORCE_6800 0x0041 @@ -1078,21 +1076,16 @@ #define PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE 0x0053 #define PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_SATA 0x0054 #define PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_SATA2 0x0055 -#define PCI_DEVICE_ID_NVIDIA_NVENET_8 0x0056 -#define PCI_DEVICE_ID_NVIDIA_NVENET_9 0x0057 #define PCI_DEVICE_ID_NVIDIA_CK804_AUDIO 0x0059 #define PCI_DEVICE_ID_NVIDIA_CK804_PCIE 0x005d #define PCI_DEVICE_ID_NVIDIA_NFORCE2_SMBUS 0x0064 #define PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE 0x0065 -#define PCI_DEVICE_ID_NVIDIA_NVENET_2 0x0066 #define PCI_DEVICE_ID_NVIDIA_MCP2_MODEM 0x0069 #define PCI_DEVICE_ID_NVIDIA_MCP2_AUDIO 0x006a #define PCI_DEVICE_ID_NVIDIA_NFORCE2S_SMBUS 0x0084 #define PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE 0x0085 -#define PCI_DEVICE_ID_NVIDIA_NVENET_4 0x0086 #define PCI_DEVICE_ID_NVIDIA_MCP2S_MODEM 0x0089 #define PCI_DEVICE_ID_NVIDIA_CK8_AUDIO 0x008a -#define PCI_DEVICE_ID_NVIDIA_NVENET_5 0x008c #define PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA 0x008e #define PCI_DEVICE_ID_NVIDIA_GEFORCE_7800_GT 0x0090 #define PCI_DEVICE_ID_NVIDIA_GEFORCE_7800_GTX 0x0091 @@ -1108,15 +1101,12 @@ #define PCI_DEVICE_ID_NVIDIA_NFORCE3 0x00d1 #define PCI_DEVICE_ID_NVIDIA_NFORCE3_SMBUS 0x00d4 #define PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE 0x00d5 -#define PCI_DEVICE_ID_NVIDIA_NVENET_3 0x00d6 #define PCI_DEVICE_ID_NVIDIA_MCP3_MODEM 0x00d9 #define PCI_DEVICE_ID_NVIDIA_MCP3_AUDIO 0x00da -#define PCI_DEVICE_ID_NVIDIA_NVENET_7 0x00df #define PCI_DEVICE_ID_NVIDIA_NFORCE3S 0x00e1 #define PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA 0x00e3 #define PCI_DEVICE_ID_NVIDIA_NFORCE3S_SMBUS 0x00e4 #define PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE 0x00e5 -#define PCI_DEVICE_ID_NVIDIA_NVENET_6 0x00e6 #define PCI_DEVICE_ID_NVIDIA_CK8S_AUDIO 0x00ea #define PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2 0x00ee #define PCIE_DEVICE_ID_NVIDIA_GEFORCE_6800_ALT1 0x00f0 @@ -1176,7 +1166,6 @@ #define PCI_DEVICE_ID_NVIDIA_NFORCE_SMBUS 0x01b4 #define PCI_DEVICE_ID_NVIDIA_NFORCE_IDE 0x01bc #define PCI_DEVICE_ID_NVIDIA_MCP1_MODEM 0x01c1 -#define PCI_DEVICE_ID_NVIDIA_NVENET_1 0x01c3 #define PCI_DEVICE_ID_NVIDIA_NFORCE2 0x01e0 #define PCI_DEVICE_ID_NVIDIA_GEFORCE3 0x0200 #define PCI_DEVICE_ID_NVIDIA_GEFORCE3_1 0x0201 @@ -1199,8 +1188,6 @@ #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE 0x036E #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA 0x037E #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA2 0x037F -#define PCI_DEVICE_ID_NVIDIA_NVENET_12 0x0268 -#define PCI_DEVICE_ID_NVIDIA_NVENET_13 0x0269 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_TI_4800 0x0280 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_TI_4800_8X 0x0281 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_TI_4800SE 0x0282 @@ -1247,46 +1234,21 @@ #define PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5700_2 0x0348 #define PCI_DEVICE_ID_NVIDIA_QUADRO_FX_GO1000 0x034C #define PCI_DEVICE_ID_NVIDIA_QUADRO_FX_1100 0x034E -#define PCI_DEVICE_ID_NVIDIA_NVENET_14 
0x0372 #define PCI_DEVICE_ID_NVIDIA_NVENET_15 0x0373 -#define PCI_DEVICE_ID_NVIDIA_NVENET_16 0x03E5 -#define PCI_DEVICE_ID_NVIDIA_NVENET_17 0x03E6 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA 0x03E7 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SMBUS 0x03EB #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE 0x03EC -#define PCI_DEVICE_ID_NVIDIA_NVENET_18 0x03EE -#define PCI_DEVICE_ID_NVIDIA_NVENET_19 0x03EF #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA2 0x03F6 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA3 0x03F7 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_SMBUS 0x0446 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE 0x0448 -#define PCI_DEVICE_ID_NVIDIA_NVENET_20 0x0450 -#define PCI_DEVICE_ID_NVIDIA_NVENET_21 0x0451 -#define PCI_DEVICE_ID_NVIDIA_NVENET_22 0x0452 -#define PCI_DEVICE_ID_NVIDIA_NVENET_23 0x0453 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_SMBUS 0x0542 -#define PCI_DEVICE_ID_NVIDIA_NVENET_24 0x054C -#define PCI_DEVICE_ID_NVIDIA_NVENET_25 0x054D -#define PCI_DEVICE_ID_NVIDIA_NVENET_26 0x054E -#define PCI_DEVICE_ID_NVIDIA_NVENET_27 0x054F -#define PCI_DEVICE_ID_NVIDIA_NVENET_28 0x07DC -#define PCI_DEVICE_ID_NVIDIA_NVENET_29 0x07DD -#define PCI_DEVICE_ID_NVIDIA_NVENET_30 0x07DE -#define PCI_DEVICE_ID_NVIDIA_NVENET_31 0x07DF #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE 0x0560 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE 0x056C #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP78S_SMBUS 0x0752 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE 0x0759 -#define PCI_DEVICE_ID_NVIDIA_NVENET_32 0x0760 -#define PCI_DEVICE_ID_NVIDIA_NVENET_33 0x0761 -#define PCI_DEVICE_ID_NVIDIA_NVENET_34 0x0762 -#define PCI_DEVICE_ID_NVIDIA_NVENET_35 0x0763 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_SMBUS 0x07D8 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP79_SMBUS 0x0AA2 -#define PCI_DEVICE_ID_NVIDIA_NVENET_36 0x0AB0 -#define PCI_DEVICE_ID_NVIDIA_NVENET_37 0x0AB1 -#define PCI_DEVICE_ID_NVIDIA_NVENET_38 0x0AB2 -#define PCI_DEVICE_ID_NVIDIA_NVENET_39 0x0AB3 #define PCI_VENDOR_ID_IMS 0x10e0 #define PCI_DEVICE_ID_IMS_TT128 0x9128 -- cgit v1.2.3-70-g09d2 From db429e9ec0f9dee2d8e50c154f04f29f880fc9d6 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 7 Jun 2009 13:52:52 +0200 Subject: partitions: add ->set_capacity block device method * Add ->set_capacity block device method and use it in rescan_partitions() to attempt enabling native capacity of the device upon detecting the partition which exceeds device capacity. * Add GENHD_FL_NATIVE_CAPACITY flag to try limit attempts of enabling native capacity during partition scan. Together with the consecutive patch implementing ->set_capacity method in ide-gd device driver this allows automatic disabling of Host Protected Area (HPA) if any partitions overlapping HPA are detected. Cc: Robert Hancock Cc: Frans Pop Cc: "Andries E. 
Brouwer" Acked-by: Al Viro Emphatically-Acked-by: Alan Cox Signed-off-by: Bartlomiej Zolnierkiewicz --- fs/partitions/check.c | 43 ++++++++++++++++++++++++++++++++----------- include/linux/blkdev.h | 2 ++ include/linux/genhd.h | 1 + 3 files changed, 35 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 137a708bb21..4bc2c43fa08 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -546,28 +546,49 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) /* add partitions */ for (p = 1; p < state->limit; p++) { - sector_t size = state->parts[p].size; - sector_t from = state->parts[p].from; + sector_t size, from; +try_scan: + size = state->parts[p].size; if (!size) continue; + + from = state->parts[p].from; if (from >= get_capacity(disk)) { printk(KERN_WARNING "%s: p%d ignored, start %llu is behind the end of the disk\n", disk->disk_name, p, (unsigned long long) from); continue; } + if (from + size > get_capacity(disk)) { - /* - * we can not ignore partitions of broken tables - * created by for example camera firmware, but we - * limit them to the end of the disk to avoid - * creating invalid block devices - */ + struct block_device_operations *bdops = disk->fops; + unsigned long long capacity; + printk(KERN_WARNING - "%s: p%d size %llu exceeds device capacity, " - "limited to end of disk\n", + "%s: p%d size %llu exceeds device capacity, ", disk->disk_name, p, (unsigned long long) size); - size = get_capacity(disk) - from; + + if (bdops->set_capacity && + (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { + printk(KERN_CONT "enabling native capacity\n"); + capacity = bdops->set_capacity(disk, ~0ULL); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + if (capacity > get_capacity(disk)) { + set_capacity(disk, capacity); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + } + goto try_scan; + } else { + /* + * we can not ignore partitions of broken tables + * created by for example camera firmware, but + * we limit them to the end of the disk to avoid + * creating invalid block devices + */ + printk(KERN_CONT "limited to end of disk\n"); + size = get_capacity(disk) - from; + } } part = add_partition(disk, p, from, size, state->parts[p].flags); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6f841fb1be3..a2d7298be35 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1106,6 +1106,8 @@ struct block_device_operations { int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); int (*media_changed) (struct gendisk *); + unsigned long long (*set_capacity) (struct gendisk *, + unsigned long long); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); struct module *owner; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 634c53028fb..239e24b081a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -113,6 +113,7 @@ struct hd_struct { #define GENHD_FL_UP 16 #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ +#define GENHD_FL_NATIVE_CAPACITY 128 #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) -- cgit v1.2.3-70-g09d2 From e957b60d1583022a0f7c03267d37fcae2ddb78b1 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 7 Jun 2009 13:52:52 +0200 Subject: ide-gd: implement block device ->set_capacity method (v2) * Use ->probed_capacity to store native 
device capacity for ATA disks. * Add ->set_capacity method to struct ide_disk_ops. * Implement disk device ->set_capacity method for ATA disks. * Implement block device ->set_capacity method. v2: * Check if LBA and HPA are supported in ide_disk_set_capacity(). * According to the spec the SET MAX ADDRESS command shall be immediately preceded by a READ NATIVE MAX ADDRESS command. * Add ide_disk_hpa_{get_native,set}_capacity() helpers. Together with the previous patch adding ->set_capacity block device method this allows automatic disabling of Host Protected Area (HPA) if any partitions overlapping HPA are detected. Cc: Robert Hancock Cc: Frans Pop Cc: "Andries E. Brouwer" Cc: Al Viro Emphatically-Acked-by: Alan Cox Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-disk.c | 67 +++++++++++++++++++++++++++++++++++++++++--------- drivers/ide/ide-gd.c | 14 +++++++++++ include/linux/ide.h | 4 +-- 3 files changed, 72 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index a9fbe2c3121..61a6d354622 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -302,14 +302,12 @@ static const struct drive_list_entry hpa_list[] = { { NULL, NULL } }; -static void idedisk_check_hpa(ide_drive_t *drive) +static u64 ide_disk_hpa_get_native_capacity(ide_drive_t *drive, int lba48) { - unsigned long long capacity, set_max; - int lba48 = ata_id_lba48_enabled(drive->id); + u64 capacity, set_max; capacity = drive->capacity64; - - set_max = idedisk_read_native_max_address(drive, lba48); + set_max = idedisk_read_native_max_address(drive, lba48); if (ide_in_drive_list(drive->id, hpa_list)) { /* @@ -320,9 +318,31 @@ static void idedisk_check_hpa(ide_drive_t *drive) set_max--; } + return set_max; +} + +static u64 ide_disk_hpa_set_capacity(ide_drive_t *drive, u64 set_max, int lba48) +{ + set_max = idedisk_set_max_address(drive, set_max, lba48); + if (set_max) + drive->capacity64 = set_max; + + return set_max; +} + +static void idedisk_check_hpa(ide_drive_t *drive) +{ + u64 capacity, set_max; + int lba48 = ata_id_lba48_enabled(drive->id); + + capacity = drive->capacity64; + set_max = ide_disk_hpa_get_native_capacity(drive, lba48); + if (set_max <= capacity) return; + drive->probed_capacity = set_max; + printk(KERN_INFO "%s: Host Protected Area detected.\n" "\tcurrent capacity is %llu sectors (%llu MB)\n" "\tnative capacity is %llu sectors (%llu MB)\n", @@ -330,13 +350,10 @@ static void idedisk_check_hpa(ide_drive_t *drive) capacity, sectors_to_MB(capacity), set_max, sectors_to_MB(set_max)); - set_max = idedisk_set_max_address(drive, set_max, lba48); - - if (set_max) { - drive->capacity64 = set_max; + set_max = ide_disk_hpa_set_capacity(drive, set_max, lba48); + if (set_max) printk(KERN_INFO "%s: Host Protected Area disabled.\n", drive->name); - } } static int ide_disk_get_capacity(ide_drive_t *drive) @@ -358,6 +375,8 @@ static int ide_disk_get_capacity(ide_drive_t *drive) drive->capacity64 = drive->cyl * drive->head * drive->sect; } + drive->probed_capacity = drive->capacity64; + if (lba) { drive->dev_flags |= IDE_DFLAG_LBA; @@ -376,7 +395,7 @@ static int ide_disk_get_capacity(ide_drive_t *drive) "%llu sectors (%llu MB)\n", drive->name, (unsigned long long)drive->capacity64, sectors_to_MB(drive->capacity64)); - drive->capacity64 = 1ULL << 28; + drive->probed_capacity = drive->capacity64 = 1ULL << 28; } if ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && @@ -392,6 +411,31 @@ static int ide_disk_get_capacity(ide_drive_t *drive) 
return 0; } +static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity) +{ + u64 set = min(capacity, drive->probed_capacity); + u16 *id = drive->id; + int lba48 = ata_id_lba48_enabled(id); + + if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 || + ata_id_hpa_enabled(id) == 0) + goto out; + + /* + * according to the spec the SET MAX ADDRESS command shall be + * immediately preceded by a READ NATIVE MAX ADDRESS command + */ + capacity = ide_disk_hpa_get_native_capacity(drive, lba48); + if (capacity == 0) + goto out; + + set = ide_disk_hpa_set_capacity(drive, set, lba48); + if (set) + return set; +out: + return drive->capacity64; +} + static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) { ide_drive_t *drive = q->queuedata; @@ -741,6 +785,7 @@ static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk, const struct ide_disk_ops ide_ata_disk_ops = { .check = ide_disk_check, + .set_capacity = ide_disk_set_capacity, .get_capacity = ide_disk_get_capacity, .setup = ide_disk_setup, .flush = ide_disk_flush, diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index 4b6b71e2cdf..214119026b3 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -287,6 +287,19 @@ static int ide_gd_media_changed(struct gendisk *disk) return ret; } +static unsigned long long ide_gd_set_capacity(struct gendisk *disk, + unsigned long long capacity) +{ + struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); + ide_drive_t *drive = idkp->drive; + const struct ide_disk_ops *disk_ops = drive->disk_ops; + + if (disk_ops->set_capacity) + return disk_ops->set_capacity(drive, capacity); + + return drive->capacity64; +} + static int ide_gd_revalidate_disk(struct gendisk *disk) { struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); @@ -315,6 +328,7 @@ static struct block_device_operations ide_gd_ops = { .locked_ioctl = ide_gd_ioctl, .getgeo = ide_gd_getgeo, .media_changed = ide_gd_media_changed, + .set_capacity = ide_gd_set_capacity, .revalidate_disk = ide_gd_revalidate_disk }; diff --git a/include/linux/ide.h b/include/linux/ide.h index 9fed365a598..e96ace12872 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -397,6 +397,7 @@ struct ide_drive_s; struct ide_disk_ops { int (*check)(struct ide_drive_s *, const char *); int (*get_capacity)(struct ide_drive_s *); + u64 (*set_capacity)(struct ide_drive_s *, u64); void (*setup)(struct ide_drive_s *); void (*flush)(struct ide_drive_s *); int (*init_media)(struct ide_drive_s *, struct gendisk *); @@ -568,8 +569,7 @@ struct ide_drive_s { unsigned int drive_data; /* used by set_pio_mode/dev_select() */ unsigned int failures; /* current failure count */ unsigned int max_failures; /* maximum allowed failure count */ - u64 probed_capacity;/* initial reported media capacity (ide-cd only currently) */ - + u64 probed_capacity;/* initial/native media capacity */ u64 capacity64; /* total number of sectors */ int lun; /* logical unit */ -- cgit v1.2.3-70-g09d2 From 075affcbe01d4d7cefcd0e30a98df1253bcf8d92 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 7 Jun 2009 13:52:52 +0200 Subject: ide: preserve Host Protected Area by default (v2) From the perspective of most users of recent systems, disabling Host Protected Area (HPA) can break vendor RAID formats, GPT partitions and risks corrupting firmware or overwriting vendor system recovery tools. 
Unfortunately the original (kernels < 2.6.30) behavior (unconditionally disabling HPA and using full disk capacity) was introduced at the time when the main use of HPA was to make the drive look small enough for the BIOS to allow the system to boot with large capacity drives. Thus to allow the maximum compatibility with the existing setups (using HPA and partitioned with HPA disabled) we automatically disable HPA if any partitions overlapping HPA are detected. Additionally HPA can also be disabled using the "nohpa" module parameter (i.e. "ide_core.nohpa=0.0" to disable HPA on /dev/hda). v2: Fix ->resume HPA support. While at it: - remove stale "idebus=" entry from Documentation/kernel-parameters.txt Cc: Robert Hancock Cc: Frans Pop Cc: "Andries E. Brouwer" Cc: Al Viro Acked-by: Sergei Shtylyov [patch description was based on input from Alan Cox and Frans Pop] Emphatically-Acked-by: Alan Cox Signed-off-by: Bartlomiej Zolnierkiewicz --- Documentation/ide/ide.txt | 2 ++ Documentation/kernel-parameters.txt | 7 ++----- drivers/ide/ide-disk.c | 8 +++++++- drivers/ide/ide.c | 10 ++++++++++ include/linux/ide.h | 2 ++ 5 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ide/ide.txt b/Documentation/ide/ide.txt index 0c78f4b1d9d..e77bebfa7b0 100644 --- a/Documentation/ide/ide.txt +++ b/Documentation/ide/ide.txt @@ -216,6 +216,8 @@ Other kernel parameters for ide_core are: * "noflush=[interface_number.device_number]" to disable flush requests +* "nohpa=[interface_number.device_number]" to disable Host Protected Area + * "noprobe=[interface_number.device_number]" to skip probing * "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a19f021f081..e58c91ca802 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -835,11 +835,8 @@ and is between 256 and 4096 characters. It is defined in the file ide-core.nodma= [HW] (E)IDE subsystem Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc - .vlb_clock .pci_clock .noflush .noprobe .nowerr .cdrom - .chs .ignore_cable are additional options - See Documentation/ide/ide.txt. - - idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed + .vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr + .cdrom .chs .ignore_cable are additional options See Documentation/ide/ide.txt. 
ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 61a6d354622..3d92c9d54d4 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -350,6 +350,9 @@ static void idedisk_check_hpa(ide_drive_t *drive) capacity, sectors_to_MB(capacity), set_max, sectors_to_MB(set_max)); + if ((drive->dev_flags & IDE_DFLAG_NOHPA) == 0) + return; + set_max = ide_disk_hpa_set_capacity(drive, set_max, lba48); if (set_max) printk(KERN_INFO "%s: Host Protected Area disabled.\n", @@ -430,8 +433,11 @@ static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity) goto out; set = ide_disk_hpa_set_capacity(drive, set, lba48); - if (set) + if (set) { + /* needed for ->resume to disable HPA */ + drive->dev_flags |= IDE_DFLAG_NOHPA; return set; + } out: return drive->capacity64; } diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 92c9b90931e..16d056939f9 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -211,6 +211,11 @@ static unsigned int ide_noflush; module_param_call(noflush, ide_set_dev_param_mask, NULL, &ide_noflush, 0); MODULE_PARM_DESC(noflush, "disable flush requests for a device"); +static unsigned int ide_nohpa; + +module_param_call(nohpa, ide_set_dev_param_mask, NULL, &ide_nohpa, 0); +MODULE_PARM_DESC(nohpa, "disable Host Protected Area for a device"); + static unsigned int ide_noprobe; module_param_call(noprobe, ide_set_dev_param_mask, NULL, &ide_noprobe, 0); @@ -281,6 +286,11 @@ static void ide_dev_apply_params(ide_drive_t *drive, u8 unit) drive->name); drive->dev_flags |= IDE_DFLAG_NOFLUSH; } + if (ide_nohpa & (1 << i)) { + printk(KERN_INFO "ide: disabling Host Protected Area for %s\n", + drive->name); + drive->dev_flags |= IDE_DFLAG_NOHPA; + } if (ide_noprobe & (1 << i)) { printk(KERN_INFO "ide: skipping probe for %s\n", drive->name); drive->dev_flags |= IDE_DFLAG_NOPROBE; diff --git a/include/linux/ide.h b/include/linux/ide.h index e96ace12872..45dce3b4c88 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -475,6 +475,8 @@ enum { IDE_DFLAG_NICE1 = (1 << 5), /* device is physically present */ IDE_DFLAG_PRESENT = (1 << 6), + /* disable Host Protected Area */ + IDE_DFLAG_NOHPA = (1 << 7), /* id read from device (synthetic if not set) */ IDE_DFLAG_ID_READ = (1 << 8), IDE_DFLAG_NOPROBE = (1 << 9), -- cgit v1.2.3-70-g09d2 From 8bc1e5aa06a2a9a425c4a6795fc564cba1521487 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 7 Jun 2009 15:37:09 +0200 Subject: ide: respect quirk_drives[] list on all controllers * Add ide_check_nien_quirk_list() helper to the core code and then use it in ide_port_tune_devices(). * Remove no longer needed ->quirkproc methods from hpt366.c and pdc202xx_{new,old}.c. 
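The driver-side effect, sketched with a hypothetical host driver (the foo_* names are invented for illustration): since ide_port_tune_devices() now calls ide_check_nien_quirk_list() for every present device, a port_ops needs no private copy of the list:

	static const struct ide_port_ops foo_port_ops = {
		.set_pio_mode	= foo_set_pio_mode,
		.set_dma_mode	= foo_set_dma_mode,
		/* no .quirkproc: the core marks quirky drives itself now */
	};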
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/hpt366.c | 27 --------------------------- drivers/ide/ide-iops.c | 25 +++++++++++++++++++++++++ drivers/ide/ide-probe.c | 2 ++ drivers/ide/pdc202xx_new.c | 26 -------------------------- drivers/ide/pdc202xx_old.c | 27 --------------------------- include/linux/ide.h | 1 + 6 files changed, 28 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index cb04523e31c..a2e9f6c65a9 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -138,18 +138,6 @@ #undef HPT_RESET_STATE_ENGINE #undef HPT_DELAY_INTERRUPT -static const char *quirk_drives[] = { - "QUANTUM FIREBALLlct08 08", - "QUANTUM FIREBALLP KA6.4", - "QUANTUM FIREBALLP KA9.1", - "QUANTUM FIREBALLP KX13.6", - "QUANTUM FIREBALLP KX20.5", - "QUANTUM FIREBALLP KX27.3", - "QUANTUM FIREBALLP LM20.4", - "QUANTUM FIREBALLP LM20.5", - NULL -}; - static const char *bad_ata100_5[] = { "IBM-DTLA-307075", "IBM-DTLA-307060", @@ -733,20 +721,6 @@ static void hpt3xx_set_pio_mode(ide_drive_t *drive, const u8 pio) hpt3xx_set_mode(drive, XFER_PIO_0 + pio); } -static void hpt3xx_quirkproc(ide_drive_t *drive) -{ - char *m = (char *)&drive->id[ATA_ID_PROD]; - const char **list = quirk_drives; - - while (*list) - if (strstr(m, *list++)) { - drive->quirk_list = 2; - return; - } - - drive->quirk_list = 0; -} - static void hpt3xx_maskproc(ide_drive_t *drive, int mask) { ide_hwif_t *hwif = drive->hwif; @@ -1408,7 +1382,6 @@ static int __devinit hpt36x_init(struct pci_dev *dev, struct pci_dev *dev2) static const struct ide_port_ops hpt3xx_port_ops = { .set_pio_mode = hpt3xx_set_pio_mode, .set_dma_mode = hpt3xx_set_mode, - .quirkproc = hpt3xx_quirkproc, .maskproc = hpt3xx_maskproc, .mdma_filter = hpt3xx_mdma_filter, .udma_filter = hpt3xx_udma_filter, diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 8dff623f9da..c55349537c2 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -282,6 +282,31 @@ no_80w: return 0; } +static const char *nien_quirk_list[] = { + "QUANTUM FIREBALLlct08 08", + "QUANTUM FIREBALLP KA6.4", + "QUANTUM FIREBALLP KA9.1", + "QUANTUM FIREBALLP KX13.6", + "QUANTUM FIREBALLP KX20.5", + "QUANTUM FIREBALLP KX27.3", + "QUANTUM FIREBALLP LM20.4", + "QUANTUM FIREBALLP LM20.5", + NULL +}; + +void ide_check_nien_quirk_list(ide_drive_t *drive) +{ + const char **list, *m = (char *)&drive->id[ATA_ID_PROD]; + + for (list = nien_quirk_list; *list != NULL; list++) + if (strstr(m, *list) != NULL) { + drive->quirk_list = 2; + return; + } + + drive->quirk_list = 0; +} + int ide_driveid_update(ide_drive_t *drive) { u16 *id; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 89574b0bd56..28f95cb41c2 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -732,6 +732,8 @@ static void ide_port_tune_devices(ide_hwif_t *hwif) int i; ide_port_for_each_present_dev(i, drive, hwif) { + ide_check_nien_quirk_list(drive); + if (port_ops && port_ops->quirkproc) port_ops->quirkproc(drive); } diff --git a/drivers/ide/pdc202xx_new.c b/drivers/ide/pdc202xx_new.c index b68906c3c17..65ba8239e7b 100644 --- a/drivers/ide/pdc202xx_new.c +++ b/drivers/ide/pdc202xx_new.c @@ -40,18 +40,6 @@ #define DBG(fmt, args...) 
#endif -static const char *pdc_quirk_drives[] = { - "QUANTUM FIREBALLlct08 08", - "QUANTUM FIREBALLP KA6.4", - "QUANTUM FIREBALLP KA9.1", - "QUANTUM FIREBALLP LM20.4", - "QUANTUM FIREBALLP KX13.6", - "QUANTUM FIREBALLP KX20.5", - "QUANTUM FIREBALLP KX27.3", - "QUANTUM FIREBALLP LM20.5", - NULL -}; - static u8 max_dma_rate(struct pci_dev *pdev) { u8 mode; @@ -200,19 +188,6 @@ static u8 pdcnew_cable_detect(ide_hwif_t *hwif) return ATA_CBL_PATA80; } -static void pdcnew_quirkproc(ide_drive_t *drive) -{ - const char **list, *m = (char *)&drive->id[ATA_ID_PROD]; - - for (list = pdc_quirk_drives; *list != NULL; list++) - if (strstr(m, *list) != NULL) { - drive->quirk_list = 2; - return; - } - - drive->quirk_list = 0; -} - static void pdcnew_reset(ide_drive_t *drive) { /* @@ -473,7 +448,6 @@ static struct pci_dev * __devinit pdc20270_get_dev2(struct pci_dev *dev) static const struct ide_port_ops pdcnew_port_ops = { .set_pio_mode = pdcnew_set_pio_mode, .set_dma_mode = pdcnew_set_dma_mode, - .quirkproc = pdcnew_quirkproc, .resetproc = pdcnew_reset, .cable_detect = pdcnew_cable_detect, }; diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index 4980dd7b2e2..fe01db679a3 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -23,18 +23,6 @@ #define PDC202XX_DEBUG_DRIVE_INFO 0 -static const char *pdc_quirk_drives[] = { - "QUANTUM FIREBALLlct08 08", - "QUANTUM FIREBALLP KA6.4", - "QUANTUM FIREBALLP KA9.1", - "QUANTUM FIREBALLP LM20.4", - "QUANTUM FIREBALLP KX13.6", - "QUANTUM FIREBALLP KX20.5", - "QUANTUM FIREBALLP KX27.3", - "QUANTUM FIREBALLP LM20.5", - NULL -}; - static void pdc_old_disable_66MHz_clock(ide_hwif_t *); static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed) @@ -151,19 +139,6 @@ static void pdc_old_disable_66MHz_clock(ide_hwif_t *hwif) outb(clock & ~(hwif->channel ? 0x08 : 0x02), clock_reg); } -static void pdc202xx_quirkproc(ide_drive_t *drive) -{ - const char **list, *m = (char *)&drive->id[ATA_ID_PROD]; - - for (list = pdc_quirk_drives; *list != NULL; list++) - if (strstr(m, *list) != NULL) { - drive->quirk_list = 2; - return; - } - - drive->quirk_list = 0; -} - static void pdc202xx_dma_start(ide_drive_t *drive) { if (drive->current_speed > XFER_UDMA_2) @@ -256,13 +231,11 @@ static void __devinit pdc202ata4_fixup_irq(struct pci_dev *dev, static const struct ide_port_ops pdc20246_port_ops = { .set_pio_mode = pdc202xx_set_pio_mode, .set_dma_mode = pdc202xx_set_mode, - .quirkproc = pdc202xx_quirkproc, }; static const struct ide_port_ops pdc2026x_port_ops = { .set_pio_mode = pdc202xx_set_pio_mode, .set_dma_mode = pdc202xx_set_mode, - .quirkproc = pdc202xx_quirkproc, .cable_detect = pdc2026x_cable_detect, }; diff --git a/include/linux/ide.h b/include/linux/ide.h index c8f7b967371..6caaae0c774 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1453,6 +1453,7 @@ static inline void ide_acpi_set_state(ide_hwif_t *hwif, int on) {} void ide_register_region(struct gendisk *); void ide_unregister_region(struct gendisk *); +void ide_check_nien_quirk_list(ide_drive_t *); void ide_undecoded_slave(ide_drive_t *); void ide_port_apply_params(ide_hwif_t *); -- cgit v1.2.3-70-g09d2 From 734affdcae20af4fec95e46a64fb29f063a15c19 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 7 Jun 2009 15:37:10 +0200 Subject: ide: add IDE_DFLAG_NIEN_QUIRK device flag Add IDE_DFLAG_NIEN_QUIRK device flag and use it instead of drive->quirk_list. There should be no functional changes caused by this patch. 
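A before/after sketch of a typical test site (both forms are taken from the ide_config_drive_speed() hunk below):

	/* before: magic u8 value, 2 meaning 'do not toggle nIEN for this device' */
	if (drive->quirk_list == 2)
		tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);

	/* after: a single bit in the existing dev_flags mask */
	if (drive->dev_flags & IDE_DFLAG_NIEN_QUIRK)
		tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);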
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/hpt366.c | 2 +- drivers/ide/ide-eh.c | 5 +++-- drivers/ide/ide-io.c | 8 ++++++-- drivers/ide/ide-iops.c | 6 ++---- include/linux/ide.h | 2 +- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index a2e9f6c65a9..7ce68ef6b90 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -727,7 +727,7 @@ static void hpt3xx_maskproc(ide_drive_t *drive, int mask) struct pci_dev *dev = to_pci_dev(hwif->dev); struct hpt_info *info = hpt3xx_get_info(hwif->dev); - if (drive->quirk_list == 0) + if ((drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) == 0) return; if (info->chip_type >= HPT370) { diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 39d589254d4..2b914197961 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -407,8 +407,9 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) /* more than enough time */ udelay(10); /* clear SRST, leave nIEN (unless device is on the quirk list) */ - tp_ops->write_devctl(hwif, (drive->quirk_list == 2 ? 0 : ATA_NIEN) | - ATA_DEVCTL_OBS); + tp_ops->write_devctl(hwif, + ((drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) ? 0 : ATA_NIEN) | + ATA_DEVCTL_OBS); /* more than enough time */ udelay(10); hwif->poll_timeout = jiffies + WAIT_WORSTCASE; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9654bd34cf5..243cf6561e7 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -488,11 +488,15 @@ repeat: if ((hwif->host->host_flags & IDE_HFLAG_SERIALIZE) && hwif != prev_port) { + ide_drive_t *cur_dev = + prev_port ? prev_port->cur_dev : NULL; + /* * set nIEN for previous port, drives in the - * quirk_list may not like intr setups/cleanups + * quirk list may not like intr setups/cleanups */ - if (prev_port && prev_port->cur_dev->quirk_list == 0) + if (cur_dev && + (cur_dev->dev_flags & IDE_DFLAG_NIEN_QUIRK) == 0) prev_port->tp_ops->write_devctl(prev_port, ATA_NIEN | ATA_DEVCTL_OBS); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index c55349537c2..fa047150a1c 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -300,11 +300,9 @@ void ide_check_nien_quirk_list(ide_drive_t *drive) for (list = nien_quirk_list; *list != NULL; list++) if (strstr(m, *list) != NULL) { - drive->quirk_list = 2; + drive->dev_flags |= IDE_DFLAG_NIEN_QUIRK; return; } - - drive->quirk_list = 0; } int ide_driveid_update(ide_drive_t *drive) @@ -389,7 +387,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES); - if (drive->quirk_list == 2) + if (drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); error = __ide_wait_stat(drive, drive->ready_stat, diff --git a/include/linux/ide.h b/include/linux/ide.h index 6caaae0c774..a6c6a2fad7c 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -499,6 +499,7 @@ enum { /* write protect */ IDE_DFLAG_WP = (1 << 29), IDE_DFLAG_FORMAT_IN_PROGRESS = (1 << 30), + IDE_DFLAG_NIEN_QUIRK = (1 << 31), }; struct ide_drive_s { @@ -530,7 +531,6 @@ struct ide_drive_s { u8 waiting_for_dma; /* dma currently in progress */ u8 dma; /* atapi dma flag */ - u8 quirk_list; /* considered quirky, set for a specific host */ u8 init_speed; /* transfer rate set at boot */ u8 current_speed; /* current transfer rate set */ u8 desired_speed; /* desired transfer rate set */ -- cgit v1.2.3-70-g09d2 From eae3f29cc73f83cc3f1891d3ad40021b5172c630 Mon Sep 
17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Jun 2009 04:03:35 +0000 Subject: net: num_dma_maps is not used Get rid of num_dma_maps in struct skb_shared_info, as it seems unused. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 --- net/core/skb_dma_map.c | 1 - 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7305da92be8..7485058125e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -194,9 +194,6 @@ struct skb_shared_info { unsigned short gso_type; __be32 ip6_frag_id; union skb_shared_tx tx_flags; -#ifdef CONFIG_HAS_DMA - unsigned int num_dma_maps; -#endif struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; skb_frag_t frags[MAX_SKB_FRAGS]; diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c index 86234923a3b..7adb623ef66 100644 --- a/net/core/skb_dma_map.c +++ b/net/core/skb_dma_map.c @@ -30,7 +30,6 @@ int skb_dma_map(struct device *dev, struct sk_buff *skb, goto unwind; sp->dma_maps[i + 1] = map; } - sp->num_dma_maps = i + 1; return 0; -- cgit v1.2.3-70-g09d2 From 042a53a9e437feaf2230dd2cadcecfae9c7bfe05 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Jun 2009 04:04:16 +0000 Subject: net: skb_shared_info optimization skb_dma_unmap() is quite expensive for small packets, because we use two different cache lines from skb_shared_info. One to access nr_frags, one to access dma_maps[0] Instead of dma_maps being an array of MAX_SKB_FRAGS + 1 elements, let dma_head alone in a new dma_head field, close to nr_frags, to reduce cache lines misses. Tested on my dev machine (bnx2 & tg3 adapters), nice speedup ! Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/bnx2.c | 6 +++--- drivers/net/e1000/e1000_main.c | 4 ++-- drivers/net/e1000e/netdev.c | 4 ++-- drivers/net/igb/igb_main.c | 5 ++--- drivers/net/igbvf/netdev.c | 5 ++--- drivers/net/ixgb/ixgb_main.c | 4 ++-- drivers/net/ixgbe/ixgbe_main.c | 4 ++-- drivers/net/tg3.c | 10 +++++----- include/linux/skbuff.h | 5 ++++- net/core/skb_dma_map.c | 12 ++++++------ 10 files changed, 30 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index f53017250e0..391d2d47089 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -5487,7 +5487,7 @@ bnx2_run_loopback(struct bnx2 *bp, int loopback_mode) dev_kfree_skb(skb); return -EIO; } - map = skb_shinfo(skb)->dma_maps[0]; + map = skb_shinfo(skb)->dma_head; REG_WR(bp, BNX2_HC_COMMAND, bp->hc_cmd | BNX2_HC_COMMAND_COAL_NOW_WO_INT); @@ -6167,7 +6167,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) } sp = skb_shinfo(skb); - mapping = sp->dma_maps[0]; + mapping = sp->dma_head; tx_buf = &txr->tx_buf_ring[ring_prod]; tx_buf->skb = skb; @@ -6191,7 +6191,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) txbd = &txr->tx_desc_ring[ring_prod]; len = frag->size; - mapping = sp->dma_maps[i + 1]; + mapping = sp->dma_maps[i]; txbd->tx_bd_haddr_hi = (u64) mapping >> 32; txbd->tx_bd_haddr_lo = (u64) mapping & 0xffffffff; diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 05e87a59f1c..8d36743c814 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2998,7 +2998,7 @@ static int e1000_tx_map(struct e1000_adapter *adapter, size -= 4; buffer_info->length = size; - buffer_info->dma = map[0] + offset; + buffer_info->dma = skb_shinfo(skb)->dma_head + offset; buffer_info->time_stamp = 
jiffies; buffer_info->next_to_watch = i; @@ -3039,7 +3039,7 @@ static int e1000_tx_map(struct e1000_adapter *adapter, size -= 4; buffer_info->length = size; - buffer_info->dma = map[f + 1] + offset; + buffer_info->dma = map[f] + offset; buffer_info->time_stamp = jiffies; buffer_info->next_to_watch = i; diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c index 38694c79edc..9043f1b845f 100644 --- a/drivers/net/e1000e/netdev.c +++ b/drivers/net/e1000e/netdev.c @@ -3916,7 +3916,7 @@ static int e1000_tx_map(struct e1000_adapter *adapter, buffer_info->length = size; buffer_info->time_stamp = jiffies; buffer_info->next_to_watch = i; - buffer_info->dma = map[0] + offset; + buffer_info->dma = skb_shinfo(skb)->dma_head + offset; count++; len -= size; @@ -3947,7 +3947,7 @@ static int e1000_tx_map(struct e1000_adapter *adapter, buffer_info->length = size; buffer_info->time_stamp = jiffies; buffer_info->next_to_watch = i; - buffer_info->dma = map[f + 1] + offset; + buffer_info->dma = map[f] + offset; len -= size; offset += size; diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c index 958b2879da4..ea17319624a 100644 --- a/drivers/net/igb/igb_main.c +++ b/drivers/net/igb/igb_main.c @@ -3139,8 +3139,7 @@ static inline int igb_tx_map_adv(struct igb_adapter *adapter, /* set time_stamp *before* dma to help avoid a possible race */ buffer_info->time_stamp = jiffies; buffer_info->next_to_watch = i; - buffer_info->dma = map[count]; - count++; + buffer_info->dma = skb_shinfo(skb)->dma_head; for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) { struct skb_frag_struct *frag; @@ -3164,7 +3163,7 @@ static inline int igb_tx_map_adv(struct igb_adapter *adapter, tx_ring->buffer_info[i].skb = skb; tx_ring->buffer_info[first].next_to_watch = i; - return count; + return count + 1; } static inline void igb_tx_queue_adv(struct igb_adapter *adapter, diff --git a/drivers/net/igbvf/netdev.c b/drivers/net/igbvf/netdev.c index 5f7ba1a4990..22aadb7884f 100644 --- a/drivers/net/igbvf/netdev.c +++ b/drivers/net/igbvf/netdev.c @@ -2119,8 +2119,7 @@ static inline int igbvf_tx_map_adv(struct igbvf_adapter *adapter, /* set time_stamp *before* dma to help avoid a possible race */ buffer_info->time_stamp = jiffies; buffer_info->next_to_watch = i; - buffer_info->dma = map[count]; - count++; + buffer_info->dma = skb_shinfo(skb)->dma_head; for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) { struct skb_frag_struct *frag; @@ -2144,7 +2143,7 @@ static inline int igbvf_tx_map_adv(struct igbvf_adapter *adapter, tx_ring->buffer_info[i].skb = skb; tx_ring->buffer_info[first].next_to_watch = i; - return count; + return count + 1; } static inline void igbvf_tx_queue_adv(struct igbvf_adapter *adapter, diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c index 6eb7f37a113..9c897cf86b9 100644 --- a/drivers/net/ixgb/ixgb_main.c +++ b/drivers/net/ixgb/ixgb_main.c @@ -1300,7 +1300,7 @@ ixgb_tx_map(struct ixgb_adapter *adapter, struct sk_buff *skb, buffer_info->length = size; WARN_ON(buffer_info->dma != 0); buffer_info->time_stamp = jiffies; - buffer_info->dma = map[0] + offset; + buffer_info->dma = skb_shinfo(skb)->dma_head + offset; pci_map_single(adapter->pdev, skb->data + offset, size, @@ -1340,7 +1340,7 @@ ixgb_tx_map(struct ixgb_adapter *adapter, struct sk_buff *skb, buffer_info->length = size; buffer_info->time_stamp = jiffies; - buffer_info->dma = map[f + 1] + offset; + buffer_info->dma = map[f] + offset; buffer_info->next_to_watch = 0; len -= size; diff --git a/drivers/net/ixgbe/ixgbe_main.c 
b/drivers/net/ixgbe/ixgbe_main.c index d36003cbb6d..09994e920d5 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -4837,7 +4837,7 @@ static int ixgbe_tx_map(struct ixgbe_adapter *adapter, size = min(len, (uint)IXGBE_MAX_DATA_PER_TXD); tx_buffer_info->length = size; - tx_buffer_info->dma = map[0] + offset; + tx_buffer_info->dma = skb_shinfo(skb)->dma_head + offset; tx_buffer_info->time_stamp = jiffies; tx_buffer_info->next_to_watch = i; @@ -4869,7 +4869,7 @@ static int ixgbe_tx_map(struct ixgbe_adapter *adapter, size = min(len, (uint)IXGBE_MAX_DATA_PER_TXD); tx_buffer_info->length = size; - tx_buffer_info->dma = map[f + 1] + offset; + tx_buffer_info->dma = map[f] + offset; tx_buffer_info->time_stamp = jiffies; tx_buffer_info->next_to_watch = i; diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index a39b534fb43..46a3f86125b 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -5021,7 +5021,7 @@ static int tigon3_dma_hwbug_workaround(struct tg3 *tp, struct sk_buff *skb, /* New SKB is guaranteed to be linear. */ entry = *start; ret = skb_dma_map(&tp->pdev->dev, new_skb, DMA_TO_DEVICE); - new_addr = skb_shinfo(new_skb)->dma_maps[0]; + new_addr = skb_shinfo(new_skb)->dma_head; /* Make sure new skb does not cross any 4G boundaries. * Drop the packet if it does. @@ -5155,7 +5155,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) sp = skb_shinfo(skb); - mapping = sp->dma_maps[0]; + mapping = sp->dma_head; tp->tx_buffers[entry].skb = skb; @@ -5173,7 +5173,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; len = frag->size; - mapping = sp->dma_maps[i + 1]; + mapping = sp->dma_maps[i]; tp->tx_buffers[entry].skb = NULL; tg3_set_txd(tp, entry, mapping, len, @@ -5331,7 +5331,7 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev) sp = skb_shinfo(skb); - mapping = sp->dma_maps[0]; + mapping = sp->dma_head; tp->tx_buffers[entry].skb = skb; @@ -5356,7 +5356,7 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev) skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; len = frag->size; - mapping = sp->dma_maps[i + 1]; + mapping = sp->dma_maps[i]; tp->tx_buffers[entry].skb = NULL; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7485058125e..aad484cd586 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -189,6 +189,9 @@ struct skb_shared_info { atomic_t dataref; unsigned short nr_frags; unsigned short gso_size; +#ifdef CONFIG_HAS_DMA + dma_addr_t dma_head; +#endif /* Warning: this field is not always filled in (UFO)! 
*/ unsigned short gso_segs; unsigned short gso_type; @@ -198,7 +201,7 @@ struct skb_shared_info { struct skb_shared_hwtstamps hwtstamps; skb_frag_t frags[MAX_SKB_FRAGS]; #ifdef CONFIG_HAS_DMA - dma_addr_t dma_maps[MAX_SKB_FRAGS + 1]; + dma_addr_t dma_maps[MAX_SKB_FRAGS]; #endif /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c index 7adb623ef66..79687dfd695 100644 --- a/net/core/skb_dma_map.c +++ b/net/core/skb_dma_map.c @@ -20,7 +20,7 @@ int skb_dma_map(struct device *dev, struct sk_buff *skb, if (dma_mapping_error(dev, map)) goto out_err; - sp->dma_maps[0] = map; + sp->dma_head = map; for (i = 0; i < sp->nr_frags; i++) { skb_frag_t *fp = &sp->frags[i]; @@ -28,7 +28,7 @@ int skb_dma_map(struct device *dev, struct sk_buff *skb, fp->size, dir); if (dma_mapping_error(dev, map)) goto unwind; - sp->dma_maps[i + 1] = map; + sp->dma_maps[i] = map; } return 0; @@ -37,10 +37,10 @@ unwind: while (--i >= 0) { skb_frag_t *fp = &sp->frags[i]; - dma_unmap_page(dev, sp->dma_maps[i + 1], + dma_unmap_page(dev, sp->dma_maps[i], fp->size, dir); } - dma_unmap_single(dev, sp->dma_maps[0], + dma_unmap_single(dev, sp->dma_head, skb_headlen(skb), dir); out_err: return -ENOMEM; @@ -53,12 +53,12 @@ void skb_dma_unmap(struct device *dev, struct sk_buff *skb, struct skb_shared_info *sp = skb_shinfo(skb); int i; - dma_unmap_single(dev, sp->dma_maps[0], + dma_unmap_single(dev, sp->dma_head, skb_headlen(skb), dir); for (i = 0; i < sp->nr_frags; i++) { skb_frag_t *fp = &sp->frags[i]; - dma_unmap_page(dev, sp->dma_maps[i + 1], + dma_unmap_page(dev, sp->dma_maps[i], fp->size, dir); } } -- cgit v1.2.3-70-g09d2 From 4e329972052c3649367b91de783f6293b8653cb2 Mon Sep 17 00:00:00 2001 From: Tilman Schmidt Date: Sun, 7 Jun 2009 09:09:23 +0000 Subject: isdn: rename capi_ctr_reseted() to capi_ctr_down() Change the name of the Kernel CAPI exported function capi_ctr_reseted() to something representing its purpose better. Impact: renaming, no functional change Signed-off-by: Tilman Schmidt Signed-off-by: David S. Miller --- Documentation/isdn/INTERFACE.CAPI | 4 ++-- drivers/isdn/capi/kcapi.c | 8 ++++---- drivers/isdn/hardware/avm/b1.c | 2 +- drivers/isdn/hardware/avm/b1dma.c | 2 +- drivers/isdn/hardware/avm/c4.c | 4 ++-- drivers/isdn/hardware/avm/t1isa.c | 2 +- drivers/isdn/hysdn/hycapi.c | 4 ++-- include/linux/isdn/capilli.h | 2 +- net/bluetooth/cmtp/capi.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI index 786d619b36e..46315381616 100644 --- a/Documentation/isdn/INTERFACE.CAPI +++ b/Documentation/isdn/INTERFACE.CAPI @@ -45,7 +45,7 @@ From then on, Kernel CAPI may call the registered callback functions for the device. If the device becomes unusable for any reason (shutdown, disconnect ...), the -driver has to call capi_ctr_reseted(). This will prevent further calls to the +driver has to call capi_ctr_down(). This will prevent further calls to the callback functions by Kernel CAPI. 
@@ -166,7 +166,7 @@ int detach_capi_ctr(struct capi_ctr *ctrlr) register/unregister a device (controller) with Kernel CAPI void capi_ctr_ready(struct capi_ctr *ctrlr) -void capi_ctr_reseted(struct capi_ctr *ctrlr) +void capi_ctr_down(struct capi_ctr *ctrlr) signal controller ready/not ready void capi_ctr_suspend_output(struct capi_ctr *ctrlr) diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index f33170368cd..57d26360f64 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -377,14 +377,14 @@ void capi_ctr_ready(struct capi_ctr * card) EXPORT_SYMBOL(capi_ctr_ready); /** - * capi_ctr_reseted() - signal CAPI controller reset + * capi_ctr_down() - signal CAPI controller not ready * @card: controller descriptor structure. * * Called by hardware driver to signal that the controller is down and * unavailable for use. */ -void capi_ctr_reseted(struct capi_ctr * card) +void capi_ctr_down(struct capi_ctr * card) { u16 appl; @@ -413,7 +413,7 @@ void capi_ctr_reseted(struct capi_ctr * card) notify_push(KCI_CONTRDOWN, card->cnr, 0, 0); } -EXPORT_SYMBOL(capi_ctr_reseted); +EXPORT_SYMBOL(capi_ctr_down); /** * capi_ctr_suspend_output() - suspend controller @@ -517,7 +517,7 @@ EXPORT_SYMBOL(attach_capi_ctr); int detach_capi_ctr(struct capi_ctr *card) { if (card->cardstate != CARD_DETECTED) - capi_ctr_reseted(card); + capi_ctr_down(card); ncards--; diff --git a/drivers/isdn/hardware/avm/b1.c b/drivers/isdn/hardware/avm/b1.c index abf05ec3176..a7c0083e78a 100644 --- a/drivers/isdn/hardware/avm/b1.c +++ b/drivers/isdn/hardware/avm/b1.c @@ -330,7 +330,7 @@ void b1_reset_ctr(struct capi_ctr *ctrl) spin_lock_irqsave(&card->lock, flags); capilib_release(&cinfo->ncci_head); spin_unlock_irqrestore(&card->lock, flags); - capi_ctr_reseted(ctrl); + capi_ctr_down(ctrl); } void b1_register_appl(struct capi_ctr *ctrl, diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c index da34b98e3de..0e84aaae43f 100644 --- a/drivers/isdn/hardware/avm/b1dma.c +++ b/drivers/isdn/hardware/avm/b1dma.c @@ -759,7 +759,7 @@ void b1dma_reset_ctr(struct capi_ctr *ctrl) memset(cinfo->version, 0, sizeof(cinfo->version)); capilib_release(&cinfo->ncci_head); spin_unlock_irqrestore(&card->lock, flags); - capi_ctr_reseted(ctrl); + capi_ctr_down(ctrl); } /* ------------------------------------------------------------- */ diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c index 9df1d3f66c8..6833301a45f 100644 --- a/drivers/isdn/hardware/avm/c4.c +++ b/drivers/isdn/hardware/avm/c4.c @@ -681,7 +681,7 @@ static irqreturn_t c4_handle_interrupt(avmcard *card) spin_lock_irqsave(&card->lock, flags); capilib_release(&cinfo->ncci_head); spin_unlock_irqrestore(&card->lock, flags); - capi_ctr_reseted(&cinfo->capi_ctrl); + capi_ctr_down(&cinfo->capi_ctrl); } card->nlogcontr = 0; return IRQ_HANDLED; @@ -909,7 +909,7 @@ static void c4_reset_ctr(struct capi_ctr *ctrl) for (i=0; i < card->nr_controllers; i++) { cinfo = &card->ctrlinfo[i]; memset(cinfo->version, 0, sizeof(cinfo->version)); - capi_ctr_reseted(&cinfo->capi_ctrl); + capi_ctr_down(&cinfo->capi_ctrl); } card->nlogcontr = 0; } diff --git a/drivers/isdn/hardware/avm/t1isa.c b/drivers/isdn/hardware/avm/t1isa.c index e7724493738..1c53fd49adb 100644 --- a/drivers/isdn/hardware/avm/t1isa.c +++ b/drivers/isdn/hardware/avm/t1isa.c @@ -339,7 +339,7 @@ static void t1isa_reset_ctr(struct capi_ctr *ctrl) spin_lock_irqsave(&card->lock, flags); capilib_release(&cinfo->ncci_head); spin_unlock_irqrestore(&card->lock, flags); - 
capi_ctr_reseted(ctrl); + capi_ctr_down(ctrl); } static void t1isa_remove(struct pci_dev *pdev) diff --git a/drivers/isdn/hysdn/hycapi.c b/drivers/isdn/hysdn/hycapi.c index 53f6ad1235d..4ffaa14b9fc 100644 --- a/drivers/isdn/hysdn/hycapi.c +++ b/drivers/isdn/hysdn/hycapi.c @@ -67,7 +67,7 @@ hycapi_reset_ctr(struct capi_ctr *ctrl) printk(KERN_NOTICE "HYCAPI hycapi_reset_ctr\n"); #endif capilib_release(&cinfo->ncci_head); - capi_ctr_reseted(ctrl); + capi_ctr_down(ctrl); } /****************************** @@ -347,7 +347,7 @@ int hycapi_capi_stop(hysdn_card *card) if(cinfo) { ctrl = &cinfo->capi_ctrl; /* ctrl->suspend_output(ctrl); */ - capi_ctr_reseted(ctrl); + capi_ctr_down(ctrl); } return 0; } diff --git a/include/linux/isdn/capilli.h b/include/linux/isdn/capilli.h index 35e9b0fd014..7acb87a4487 100644 --- a/include/linux/isdn/capilli.h +++ b/include/linux/isdn/capilli.h @@ -79,7 +79,7 @@ int attach_capi_ctr(struct capi_ctr *); int detach_capi_ctr(struct capi_ctr *); void capi_ctr_ready(struct capi_ctr * card); -void capi_ctr_reseted(struct capi_ctr * card); +void capi_ctr_down(struct capi_ctr * card); void capi_ctr_suspend_output(struct capi_ctr * card); void capi_ctr_resume_output(struct capi_ctr * card); void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb); diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index 78958c0f9a4..97f8d68d574 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -382,7 +382,7 @@ static void cmtp_reset_ctr(struct capi_ctr *ctrl) BT_DBG("ctrl %p", ctrl); - capi_ctr_reseted(ctrl); + capi_ctr_down(ctrl); atomic_inc(&session->terminate); cmtp_schedule(session); -- cgit v1.2.3-70-g09d2 From 11eeef41d5f63c7d2f7fdfcc733eb7fb137cc384 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Mon, 8 Jun 2009 17:01:51 +0200 Subject: netfilter: passive OS fingerprint xtables match Passive OS fingerprinting netfilter module allows to passively detect remote OS and perform various netfilter actions based on that knowledge. This module compares some data (WS, MSS, options and it's order, ttl, df and others) from packets with SYN bit set with dynamically loaded OS fingerprints. Fingerprint matching rules can be downloaded from OpenBSD source tree or found in archive and loaded via netfilter netlink subsystem into the kernel via special util found in archive. Archive contains library file (also attached), which was shipped with iptables extensions some time ago (at least when ipt_osf existed in patch-o-matic). Following changes were made in this release: * added NLM_F_CREATE/NLM_F_EXCL checks * dropped _rcu list traversing helpers in the protected add/remove calls * dropped unneded structures, debug prints, obscure comment and check Fingerprints can be downloaded from http://www.openbsd.org/cgi-bin/cvsweb/src/etc/pf.os or can be found in archive Example usage: -d switch removes fingerprints Please consider for inclusion. Thank you. 
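For illustration, once fingerprints have been loaded a rule can match on
the detected genre; the option names below come from the userspace
iptables extension shipped in the archive and are shown here as an
assumption, not as part of this kernel patch:

	iptables -I INPUT -p tcp -m osf --genre Linux --ttl 2 --log 0 -j ACCEPT

Here --ttl 2 selects XT_OSF_TTL_NOCHECK (skip the TTL comparison) and
--log 0 selects XT_OSF_LOGLEVEL_ALL, per the xt_osf.h constants added
below.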
Passive OS fingerprint homepage (archives, examples): http://www.ioremap.net/projects/osf Signed-off-by: Evgeniy Polyakov Signed-off-by: Patrick McHardy --- include/linux/netfilter/Kbuild | 1 + include/linux/netfilter/nfnetlink.h | 3 +- include/linux/netfilter/xt_osf.h | 133 +++++++++++ net/netfilter/Kconfig | 13 ++ net/netfilter/Makefile | 1 + net/netfilter/xt_osf.c | 428 ++++++++++++++++++++++++++++++++++++ 6 files changed, 578 insertions(+), 1 deletion(-) create mode 100644 include/linux/netfilter/xt_osf.h create mode 100644 net/netfilter/xt_osf.c (limited to 'include/linux') diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index af9d2fb9721..2aea50399c0 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -33,6 +33,7 @@ header-y += xt_limit.h header-y += xt_mac.h header-y += xt_mark.h header-y += xt_multiport.h +header-y += xt_osf.h header-y += xt_owner.h header-y += xt_pkttype.h header-y += xt_quota.h diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 2214e516146..bff4d5741d9 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -46,7 +46,8 @@ struct nfgenmsg { #define NFNL_SUBSYS_CTNETLINK_EXP 2 #define NFNL_SUBSYS_QUEUE 3 #define NFNL_SUBSYS_ULOG 4 -#define NFNL_SUBSYS_COUNT 5 +#define NFNL_SUBSYS_OSF 5 +#define NFNL_SUBSYS_COUNT 6 #ifdef __KERNEL__ diff --git a/include/linux/netfilter/xt_osf.h b/include/linux/netfilter/xt_osf.h new file mode 100644 index 00000000000..fd2272e0959 --- /dev/null +++ b/include/linux/netfilter/xt_osf.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2003+ Evgeniy Polyakov + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _XT_OSF_H +#define _XT_OSF_H + +#define MAXGENRELEN 32 + +#define XT_OSF_GENRE (1<<0) +#define XT_OSF_TTL (1<<1) +#define XT_OSF_LOG (1<<2) +#define XT_OSF_INVERT (1<<3) + +#define XT_OSF_LOGLEVEL_ALL 0 /* log all matched fingerprints */ +#define XT_OSF_LOGLEVEL_FIRST 1 /* log only the first matced fingerprint */ +#define XT_OSF_LOGLEVEL_ALL_KNOWN 2 /* do not log unknown packets */ + +#define XT_OSF_TTL_TRUE 0 /* True ip and fingerprint TTL comparison */ +#define XT_OSF_TTL_LESS 1 /* Check if ip TTL is less than fingerprint one */ +#define XT_OSF_TTL_NOCHECK 2 /* Do not compare ip and fingerprint TTL at all */ + +struct xt_osf_info { + char genre[MAXGENRELEN]; + __u32 len; + __u32 flags; + __u32 loglevel; + __u32 ttl; +}; + +/* + * Wildcard MSS (kind of). + * It is used to implement a state machine for the different wildcard values + * of the MSS and window sizes. 
+ */ +struct xt_osf_wc { + __u32 wc; + __u32 val; +}; + +/* + * This struct represents IANA options + * http://www.iana.org/assignments/tcp-parameters + */ +struct xt_osf_opt { + __u16 kind, length; + struct xt_osf_wc wc; +}; + +struct xt_osf_user_finger { + struct xt_osf_wc wss; + + __u8 ttl, df; + __u16 ss, mss; + __u16 opt_num; + + char genre[MAXGENRELEN]; + char version[MAXGENRELEN]; + char subtype[MAXGENRELEN]; + + /* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */ + struct xt_osf_opt opt[MAX_IPOPTLEN]; +}; + +struct xt_osf_nlmsg { + struct xt_osf_user_finger f; + struct iphdr ip; + struct tcphdr tcp; +}; + +/* Defines for IANA option kinds */ + +enum iana_options { + OSFOPT_EOL = 0, /* End of options */ + OSFOPT_NOP, /* NOP */ + OSFOPT_MSS, /* Maximum segment size */ + OSFOPT_WSO, /* Window scale option */ + OSFOPT_SACKP, /* SACK permitted */ + OSFOPT_SACK, /* SACK */ + OSFOPT_ECHO, + OSFOPT_ECHOREPLY, + OSFOPT_TS, /* Timestamp option */ + OSFOPT_POCP, /* Partial Order Connection Permitted */ + OSFOPT_POSP, /* Partial Order Service Profile */ + + /* Others are not used in the current OSF */ + OSFOPT_EMPTY = 255, +}; + +/* + * Initial window size option state machine: multiple of mss, mtu or + * plain numeric value. Can also be made as plain numeric value which + * is not a multiple of specified value. + */ +enum xt_osf_window_size_options { + OSF_WSS_PLAIN = 0, + OSF_WSS_MSS, + OSF_WSS_MTU, + OSF_WSS_MODULO, + OSF_WSS_MAX, +}; + +/* + * Add/remove fingerprint from the kernel. + */ +enum xt_osf_msg_types { + OSF_MSG_ADD, + OSF_MSG_REMOVE, + OSF_MSG_MAX, +}; + +enum xt_osf_attr_type { + OSF_ATTR_UNSPEC, + OSF_ATTR_FINGER, + OSF_ATTR_MAX, +}; + +#endif /* _XT_OSF_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index cb3ad741ebf..79ba47f042c 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -917,6 +917,19 @@ config NETFILTER_XT_MATCH_U32 Details and examples are in the kernel module source. +config NETFILTER_XT_MATCH_OSF + tristate '"osf" Passive OS fingerprint match' + depends on NETFILTER_ADVANCED && NETFILTER_NETLINK + help + This option selects the Passive OS Fingerprinting match module + that allows to passively match the remote operating system by + analyzing incoming TCP SYN packets. + + Rules and loading software can be downloaded from + http://www.ioremap.net/projects/osf + + To compile it as a module, choose M here. If unsure, say N. 
+ endif # NETFILTER_XTABLES endmenu diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 6282060fbda..49f62ee4e9f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -77,6 +77,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o +obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c new file mode 100644 index 00000000000..863e40977a4 --- /dev/null +++ b/net/netfilter/xt_osf.c @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2003+ Evgeniy Polyakov + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +struct xt_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct xt_osf_user_finger finger; +}; + +enum osf_fmatch_states { + /* Packet does not match the fingerprint */ + FMATCH_WRONG = 0, + /* Packet matches the fingerprint */ + FMATCH_OK, + /* Options do not match the fingerprint, but header does */ + FMATCH_OPT_WRONG, +}; + +/* + * Indexed by dont-fragment bit. + * It is the only constant value in the fingerprint. + */ +static struct list_head xt_osf_fingers[2]; + +static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = { + [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) }, +}; + +static void xt_osf_finger_free_rcu(struct rcu_head *rcu_head) +{ + struct xt_osf_finger *f = container_of(rcu_head, struct xt_osf_finger, rcu_head); + + kfree(f); +} + +static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nlattr *osf_attrs[]) +{ + struct xt_osf_user_finger *f; + struct xt_osf_finger *kf = NULL, *sf; + int err = 0; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL); + if (!kf) + return -ENOMEM; + + memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger)); + + list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) + continue; + + kfree(kf); + kf = NULL; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + err = -EEXIST; + break; + } + + /* + * We are protected by nfnl mutex. 
+ */ + if (kf) + list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]); + + return err; +} + +static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nlattr *osf_attrs[]) +{ + struct xt_osf_user_finger *f; + struct xt_osf_finger *sf; + int err = ENOENT; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) + continue; + + /* + * We are protected by nfnl mutex. + */ + list_del_rcu(&sf->finger_entry); + call_rcu(&sf->rcu_head, xt_osf_finger_free_rcu); + + err = 0; + break; + } + + return err; +} + +static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = { + [OSF_MSG_ADD] = { + .call = xt_osf_add_callback, + .attr_count = OSF_ATTR_MAX, + .policy = xt_osf_policy, + }, + [OSF_MSG_REMOVE] = { + .call = xt_osf_remove_callback, + .attr_count = OSF_ATTR_MAX, + .policy = xt_osf_policy, + }, +}; + +static const struct nfnetlink_subsystem xt_osf_nfnetlink = { + .name = "osf", + .subsys_id = NFNL_SUBSYS_OSF, + .cb_count = OSF_MSG_MAX, + .cb = xt_osf_nfnetlink_callbacks, +}; + +static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info, + unsigned char f_ttl) +{ + const struct iphdr *ip = ip_hdr(skb); + + if (info->flags & XT_OSF_TTL) { + if (info->ttl == XT_OSF_TTL_TRUE) + return ip->ttl == f_ttl; + if (info->ttl == XT_OSF_TTL_NOCHECK) + return 1; + else if (ip->ttl <= f_ttl) + return 1; + else { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + int ret = 0; + + for_ifa(in_dev) { + if (inet_ifa_match(ip->saddr, ifa)) { + ret = (ip->ttl == f_ttl); + break; + } + } + endfor_ifa(in_dev); + + return ret; + } + } + + return ip->ttl == f_ttl; +} + +static bool xt_osf_match_packet(const struct sk_buff *skb, + const struct xt_match_param *p) +{ + const struct xt_osf_info *info = p->matchinfo; + const struct iphdr *ip = ip_hdr(skb); + const struct tcphdr *tcp; + struct tcphdr _tcph; + int fmatch = FMATCH_WRONG, fcount = 0; + unsigned int optsize = 0, check_WSS = 0; + u16 window, totlen, mss = 0; + bool df; + const unsigned char *optp = NULL, *_optp = NULL; + unsigned char opts[MAX_IPOPTLEN]; + const struct xt_osf_finger *kf; + const struct xt_osf_user_finger *f; + + if (!info) + return false; + + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + if (!tcp) + return false; + + if (!tcp->syn) + return false; + + totlen = ntohs(ip->tot_len); + df = ntohs(ip->frag_off) & IP_DF; + window = ntohs(tcp->window); + + if (tcp->doff * 4 > sizeof(struct tcphdr)) { + optsize = tcp->doff * 4 - sizeof(struct tcphdr); + + _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + + sizeof(struct tcphdr), optsize, opts); + } + + rcu_read_lock(); + list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) { + f = &kf->finger; + + if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre)) + continue; + + optp = _optp; + fmatch = FMATCH_WRONG; + + if (totlen == f->ss && xt_osf_ttl(skb, info, f->ttl)) { + int foptsize, optnum; + + /* + * Should not happen if userspace parser was written correctly. 
+ */ + if (f->wss.wc >= OSF_WSS_MAX) + continue; + + /* Check options */ + + foptsize = 0; + for (optnum = 0; optnum < f->opt_num; ++optnum) + foptsize += f->opt[optnum].length; + + if (foptsize > MAX_IPOPTLEN || + optsize > MAX_IPOPTLEN || + optsize != foptsize) + continue; + + check_WSS = f->wss.wc; + + for (optnum = 0; optnum < f->opt_num; ++optnum) { + if (f->opt[optnum].kind == (*optp)) { + __u32 len = f->opt[optnum].length; + const __u8 *optend = optp + len; + int loop_cont = 0; + + fmatch = FMATCH_OK; + + switch (*optp) { + case OSFOPT_MSS: + mss = optp[3]; + mss <<= 8; + mss |= optp[2]; + + mss = ntohs(mss); + break; + case OSFOPT_TS: + loop_cont = 1; + break; + } + + optp = optend; + } else + fmatch = FMATCH_OPT_WRONG; + + if (fmatch != FMATCH_OK) + break; + } + + if (fmatch != FMATCH_OPT_WRONG) { + fmatch = FMATCH_WRONG; + + switch (check_WSS) { + case OSF_WSS_PLAIN: + if (f->wss.val == 0 || window == f->wss.val) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MSS: + /* + * Some smart modems decrease mangle MSS to + * SMART_MSS_2, so we check standard, decreased + * and the one provided in the fingerprint MSS + * values. + */ +#define SMART_MSS_1 1460 +#define SMART_MSS_2 1448 + if (window == f->wss.val * mss || + window == f->wss.val * SMART_MSS_1 || + window == f->wss.val * SMART_MSS_2) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MTU: + if (window == f->wss.val * (mss + 40) || + window == f->wss.val * (SMART_MSS_1 + 40) || + window == f->wss.val * (SMART_MSS_2 + 40)) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MODULO: + if ((window % f->wss.val) == 0) + fmatch = FMATCH_OK; + break; + } + } + + if (fmatch != FMATCH_OK) + continue; + + fcount++; + + if (info->flags & XT_OSF_LOG) + nf_log_packet(p->hooknum, 0, skb, p->in, p->out, NULL, + "%s [%s:%s] : %pi4:%d -> %pi4:%d hops=%d\n", + f->genre, f->version, f->subtype, + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest), + f->ttl - ip->ttl); + + if ((info->flags & XT_OSF_LOG) && + info->loglevel == XT_OSF_LOGLEVEL_FIRST) + break; + } + } + rcu_read_unlock(); + + if (!fcount && (info->flags & XT_OSF_LOG)) + nf_log_packet(p->hooknum, 0, skb, p->in, p->out, NULL, + "Remote OS is not known: %pi4:%u -> %pi4:%u\n", + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest)); + + if (fcount) + fmatch = FMATCH_OK; + + return fmatch == FMATCH_OK; +} + +static struct xt_match xt_osf_match = { + .name = "osf", + .revision = 0, + .family = NFPROTO_IPV4, + .proto = IPPROTO_TCP, + .hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_FORWARD), + .match = xt_osf_match_packet, + .matchsize = sizeof(struct xt_osf_info), + .me = THIS_MODULE, +}; + +static int __init xt_osf_init(void) +{ + int err = -EINVAL; + int i; + + for (i=0; ifinger_entry); + call_rcu(&f->rcu_head, xt_osf_finger_free_rcu); + } + } + rcu_read_unlock(); + + rcu_barrier(); +} + +module_init(xt_osf_init); +module_exit(xt_osf_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Evgeniy Polyakov "); +MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); -- cgit v1.2.3-70-g09d2 From 226c7ffe74474257b4b87bd38ae8ba0030cf65e2 Mon Sep 17 00:00:00 2001 From: Joe Eykholt Date: Wed, 6 May 2009 10:52:51 -0700 Subject: [SCSI] net, libfcoe: Add the FCoE Initialization Protocol ethertype FIP is the FCoE Initialization Protocol and this patch adds the protocol ethertype to the kernel's list of ethertypes. Signed-off-by: Joe Eykholt Signed-off-by: Robert Love Acked-by: David S. 
Miller Signed-off-by: James Bottomley --- include/linux/if_ether.h | 1 + include/scsi/fc/fc_fip.h | 7 ------- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index cfe4fe1b713..60e8934d10b 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -79,6 +79,7 @@ #define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ #define ETH_P_TIPC 0x88CA /* TIPC */ #define ETH_P_FCOE 0x8906 /* Fibre Channel over Ethernet */ +#define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ /* diff --git a/include/scsi/fc/fc_fip.h b/include/scsi/fc/fc_fip.h index 0627a9ae634..3d138c1fcf8 100644 --- a/include/scsi/fc/fc_fip.h +++ b/include/scsi/fc/fc_fip.h @@ -22,13 +22,6 @@ * http://www.t11.org/ftp/t11/pub/fc/bb-5/08-543v1.pdf */ -/* - * The FIP ethertype eventually goes in net/if_ether.h. - */ -#ifndef ETH_P_FIP -#define ETH_P_FIP 0x8914 /* FIP Ethertype */ -#endif - #define FIP_DEF_PRI 128 /* default selection priority */ #define FIP_DEF_FC_MAP 0x0efc00 /* default FCoE MAP (MAC OUI) value */ #define FIP_DEF_FKA 8000 /* default FCF keep-alive/advert period (mS) */ -- cgit v1.2.3-70-g09d2 From c6f4a42de60b981dd210de01cd3e575835e3158e Mon Sep 17 00:00:00 2001 From: Minkyu Kang Date: Fri, 5 Jun 2009 15:33:04 +0900 Subject: Add MAX17040 Fuel Gauge driver The MAX17040 is a I2C interfaced Fuel Gauge systems for lithium-ion batteries This patch adds support the MAX17040 Fuel Gauge Signed-off-by: Minkyu Kang Signed-off-by: Anton Vorontsov --- drivers/power/Kconfig | 8 + drivers/power/Makefile | 3 +- drivers/power/max17040_battery.c | 309 +++++++++++++++++++++++++++++++++++++++ include/linux/max17040_battery.h | 19 +++ 4 files changed, 338 insertions(+), 1 deletion(-) create mode 100644 drivers/power/max17040_battery.c create mode 100644 include/linux/max17040_battery.h (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 33da1127992..7eda34838bf 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -82,6 +82,14 @@ config BATTERY_DA9030 Say Y here to enable support for batteries charger integrated into DA9030 PMIC. +config BATTERY_MAX17040 + tristate "Maxim MAX17040 Fuel Gauge" + depends on I2C + help + MAX17040 is fuel-gauge systems for lithium-ion (Li+) batteries + in handheld and portable equipment. 
The MAX17040 is configured + to operate with a single lithium cell + config CHARGER_PCF50633 tristate "NXP PCF50633 MBC" depends on MFD_PCF50633 diff --git a/drivers/power/Makefile b/drivers/power/Makefile index 2fcf41d13e5..daf3179689a 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -25,4 +25,5 @@ obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o obj-$(CONFIG_BATTERY_WM97XX) += wm97xx_battery.o obj-$(CONFIG_BATTERY_BQ27x00) += bq27x00_battery.o obj-$(CONFIG_BATTERY_DA9030) += da9030_battery.o -obj-$(CONFIG_CHARGER_PCF50633) += pcf50633-charger.o \ No newline at end of file +obj-$(CONFIG_BATTERY_MAX17040) += max17040_battery.o +obj-$(CONFIG_CHARGER_PCF50633) += pcf50633-charger.o diff --git a/drivers/power/max17040_battery.c b/drivers/power/max17040_battery.c new file mode 100644 index 00000000000..87b98bf27ae --- /dev/null +++ b/drivers/power/max17040_battery.c @@ -0,0 +1,309 @@ +/* + * max17040_battery.c + * fuel-gauge systems for lithium-ion (Li+) batteries + * + * Copyright (C) 2009 Samsung Electronics + * Minkyu Kang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX17040_VCELL_MSB 0x02 +#define MAX17040_VCELL_LSB 0x03 +#define MAX17040_SOC_MSB 0x04 +#define MAX17040_SOC_LSB 0x05 +#define MAX17040_MODE_MSB 0x06 +#define MAX17040_MODE_LSB 0x07 +#define MAX17040_VER_MSB 0x08 +#define MAX17040_VER_LSB 0x09 +#define MAX17040_RCOMP_MSB 0x0C +#define MAX17040_RCOMP_LSB 0x0D +#define MAX17040_CMD_MSB 0xFE +#define MAX17040_CMD_LSB 0xFF + +#define MAX17040_DELAY 1000 +#define MAX17040_BATTERY_FULL 95 + +struct max17040_chip { + struct i2c_client *client; + struct delayed_work work; + struct power_supply battery; + struct max17040_platform_data *pdata; + + /* State Of Connect */ + int online; + /* battery voltage */ + int vcell; + /* battery capacity */ + int soc; + /* State Of Charge */ + int status; +}; + +static int max17040_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct max17040_chip *chip = container_of(psy, + struct max17040_chip, battery); + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + val->intval = chip->status; + break; + case POWER_SUPPLY_PROP_ONLINE: + val->intval = chip->online; + break; + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + val->intval = chip->vcell; + break; + case POWER_SUPPLY_PROP_CAPACITY: + val->intval = chip->soc; + break; + default: + return -EINVAL; + } + return 0; +} + +static int max17040_write_reg(struct i2c_client *client, int reg, u8 value) +{ + int ret; + + ret = i2c_smbus_write_byte_data(client, reg, value); + + if (ret < 0) + dev_err(&client->dev, "%s: err %d\n", __func__, ret); + + return ret; +} + +static int max17040_read_reg(struct i2c_client *client, int reg) +{ + int ret; + + ret = i2c_smbus_read_byte_data(client, reg); + + if (ret < 0) + dev_err(&client->dev, "%s: err %d\n", __func__, ret); + + return ret; +} + +static void max17040_reset(struct i2c_client *client) +{ + max17040_write_reg(client, MAX17040_CMD_MSB, 0x54); + max17040_write_reg(client, MAX17040_CMD_LSB, 0x00); +} + +static void max17040_get_vcell(struct i2c_client *client) +{ + struct max17040_chip *chip = i2c_get_clientdata(client); + u8 msb; + u8 lsb; + + msb = max17040_read_reg(client, MAX17040_VCELL_MSB); + lsb = max17040_read_reg(client, 
MAX17040_VCELL_LSB); + + chip->vcell = (msb << 4) + (lsb >> 4); +} + +static void max17040_get_soc(struct i2c_client *client) +{ + struct max17040_chip *chip = i2c_get_clientdata(client); + u8 msb; + u8 lsb; + + msb = max17040_read_reg(client, MAX17040_SOC_MSB); + lsb = max17040_read_reg(client, MAX17040_SOC_LSB); + + chip->soc = msb; +} + +static void max17040_get_version(struct i2c_client *client) +{ + u8 msb; + u8 lsb; + + msb = max17040_read_reg(client, MAX17040_VER_MSB); + lsb = max17040_read_reg(client, MAX17040_VER_LSB); + + dev_info(&client->dev, "MAX17040 Fuel-Gauge Ver %d%d\n", msb, lsb); +} + +static void max17040_get_online(struct i2c_client *client) +{ + struct max17040_chip *chip = i2c_get_clientdata(client); + + if (chip->pdata->battery_online) + chip->online = chip->pdata->battery_online(); + else + chip->online = 1; +} + +static void max17040_get_status(struct i2c_client *client) +{ + struct max17040_chip *chip = i2c_get_clientdata(client); + + if (!chip->pdata->charger_online || !chip->pdata->charger_enable) { + chip->status = POWER_SUPPLY_STATUS_UNKNOWN; + return; + } + + if (chip->pdata->charger_online()) { + if (chip->pdata->charger_enable()) + chip->status = POWER_SUPPLY_STATUS_CHARGING; + else + chip->status = POWER_SUPPLY_STATUS_NOT_CHARGING; + } else { + chip->status = POWER_SUPPLY_STATUS_DISCHARGING; + } + + if (chip->soc > MAX17040_BATTERY_FULL) + chip->status = POWER_SUPPLY_STATUS_FULL; +} + +static void max17040_work(struct work_struct *work) +{ + struct max17040_chip *chip; + + chip = container_of(work, struct max17040_chip, work.work); + + max17040_get_vcell(chip->client); + max17040_get_soc(chip->client); + max17040_get_online(chip->client); + max17040_get_status(chip->client); + + schedule_delayed_work(&chip->work, MAX17040_DELAY); +} + +static enum power_supply_property max17040_battery_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_ONLINE, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_CAPACITY, +}; + +static int __devinit max17040_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent); + struct max17040_chip *chip; + int ret; + + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE)) + return -EIO; + + chip = kzalloc(sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + chip->client = client; + chip->pdata = client->dev.platform_data; + + i2c_set_clientdata(client, chip); + + chip->battery.name = "battery"; + chip->battery.type = POWER_SUPPLY_TYPE_BATTERY; + chip->battery.get_property = max17040_get_property; + chip->battery.properties = max17040_battery_props; + chip->battery.num_properties = ARRAY_SIZE(max17040_battery_props); + + ret = power_supply_register(&client->dev, &chip->battery); + if (ret) { + dev_err(&client->dev, "failed: power supply register\n"); + i2c_set_clientdata(client, NULL); + kfree(chip); + return ret; + } + + max17040_reset(client); + max17040_get_version(client); + + INIT_DELAYED_WORK_DEFERRABLE(&chip->work, max17040_work); + schedule_delayed_work(&chip->work, MAX17040_DELAY); + + return 0; +} + +static int __devexit max17040_remove(struct i2c_client *client) +{ + struct max17040_chip *chip = i2c_get_clientdata(client); + + power_supply_unregister(&chip->battery); + cancel_delayed_work(&chip->work); + i2c_set_clientdata(client, NULL); + kfree(chip); + return 0; +} + +#ifdef CONFIG_PM + +static int max17040_suspend(struct i2c_client *client, + pm_message_t state) +{ + struct max17040_chip *chip = 
i2c_get_clientdata(client); + + cancel_delayed_work(&chip->work); + return 0; +} + +static int max17040_resume(struct i2c_client *client) +{ + struct max17040_chip *chip = i2c_get_clientdata(client); + + schedule_delayed_work(&chip->work, MAX17040_DELAY); + return 0; +} + +#else + +#define max17040_suspend NULL +#define max17040_resume NULL + +#endif /* CONFIG_PM */ + +static const struct i2c_device_id max17040_id[] = { + { "max17040", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, max17040_id); + +static struct i2c_driver max17040_i2c_driver = { + .driver = { + .name = "max17040", + }, + .probe = max17040_probe, + .remove = __devexit_p(max17040_remove), + .suspend = max17040_suspend, + .resume = max17040_resume, + .id_table = max17040_id, +}; + +static int __init max17040_init(void) +{ + return i2c_add_driver(&max17040_i2c_driver); +} +module_init(max17040_init); + +static void __exit max17040_exit(void) +{ + i2c_del_driver(&max17040_i2c_driver); +} +module_exit(max17040_exit); + +MODULE_AUTHOR("Minkyu Kang "); +MODULE_DESCRIPTION("MAX17040 Fuel Gauge"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/max17040_battery.h b/include/linux/max17040_battery.h new file mode 100644 index 00000000000..ad97b06cf93 --- /dev/null +++ b/include/linux/max17040_battery.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2009 Samsung Electronics + * Minkyu Kang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MAX17040_BATTERY_H_ +#define __MAX17040_BATTERY_H_ + +struct max17040_platform_data { + int (*battery_online)(void); + int (*charger_online)(void); + int (*charger_enable)(void); +}; + +#endif -- cgit v1.2.3-70-g09d2 From 1f8a6a10fb9437eac3f516ea4324a19087872f30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Jun 2009 18:18:39 +0200 Subject: ring-buffer: pass in lockdep class key for reader_lock On Sun, 7 Jun 2009, Ingo Molnar wrote: > Testing tracer sched_switch: <6>Starting ring buffer hammer > PASSED > Testing tracer sysprof: PASSED > Testing tracer function: PASSED > Testing tracer irqsoff: > ============================================= > PASSED > Testing tracer preemptoff: PASSED > Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ] > PASSED > Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760 > --------------------------------------------- > rb_consumer/431 is trying to acquire lock: > (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_reset_cpu+0x37/0x70 > > but task is already holding lock: > (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_consume+0x7e/0xc0 > > other info that might help us debug this: > 1 lock held by rb_consumer/431: > #0: (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_consume+0x7e/0xc0 The ring buffer is a generic structure, and can be used outside of ftrace. If ftrace traces within the use of the ring buffer, it can produce false positives with lockdep. This patch passes in a static lock key into the allocation of the ring buffer, so that different ring buffers will have their own lock class. 
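The trick generalizes to any generic structure whose lock would
otherwise share one lockdep class across all users; a minimal sketch of
the pattern on a made-up structure (my_buf and my_buf_init are
hypothetical, the real macro is in the ring_buffer.h hunk below):

	struct my_buf {
		spinlock_t lock;
	};

	void __my_buf_init(struct my_buf *b, struct lock_class_key *key)
	{
		spin_lock_init(&b->lock);
		/* give each init call site its own lockdep class */
		lockdep_set_class(&b->lock, key);
	}

	#define my_buf_init(b)					\
	({							\
		static struct lock_class_key __key;		\
		__my_buf_init((b), &__key);			\
	})

Because __key is static, each textual expansion of my_buf_init() yields
a distinct key, so lockdep no longer conflates locks belonging to
unrelated users of the same generic code.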
Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra LKML-Reference: <1244477919.13761.9042.camel@twins> [ store key in ring buffer descriptor ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 14 +++++++++++++- kernel/trace/ring_buffer.c | 9 +++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f1345828c7c..8670f1575fe 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -105,7 +105,19 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * size is in bytes for each per CPU buffer. */ struct ring_buffer * -ring_buffer_alloc(unsigned long size, unsigned flags); +__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key); + +/* + * Because the ring buffer is generic, if other users of the ring buffer get + * traced by ftrace, it can produce lockdep warnings. We need to keep each + * ring buffer's lock class separate. + */ +#define ring_buffer_alloc(size, flags) \ +({ \ + static struct lock_class_key __key; \ + __ring_buffer_alloc((size), (flags), &__key); \ +}) + void ring_buffer_free(struct ring_buffer *buffer); int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7102d7a2fad..22878b0d370 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -426,6 +426,8 @@ struct ring_buffer { atomic_t record_disabled; cpumask_var_t cpumask; + struct lock_class_key *reader_lock_key; + struct mutex mutex; struct ring_buffer_per_cpu **buffers; @@ -565,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); + lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); @@ -635,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self, * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ -struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) { struct ring_buffer *buffer; int bsize; @@ -658,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; buffer->clock = trace_clock_local; + buffer->reader_lock_key = key; /* need at least two pages */ if (buffer->pages == 1) @@ -715,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) kfree(buffer); return NULL; } -EXPORT_SYMBOL_GPL(ring_buffer_alloc); +EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** * ring_buffer_free - free a ring buffer. -- cgit v1.2.3-70-g09d2 From 9df1bb9b516daeece159ab7fb262d01a0359247c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jun 2009 06:22:57 +0200 Subject: Revert "block: Fix bounce limit setting in DM" This reverts commit a05c0205ba031c01bba33a21bf0a35920eb64833. DM doesn't need to access the bounce_pfn directly. 
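With the helper gone, stacking drivers go back to
blk_queue_bounce_limit(), as the dm-table hunk below shows; for a
hardware driver the call typically passes the device's DMA addressing
limit, e.g. (illustrative only):

	/* device can only DMA below 4 GiB */
	blk_queue_bounce_limit(q, DMA_BIT_MASK(32));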
Signed-off-by: Jens Axboe --- block/blk-settings.c | 17 ----------------- drivers/md/dm-table.c | 2 +- include/linux/blkdev.h | 1 - 3 files changed, 1 insertion(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 9acd0b7e802..8d339349289 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -193,23 +193,6 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) } EXPORT_SYMBOL(blk_queue_bounce_limit); -/** - * blk_queue_bounce_pfn - set the bounce buffer limit for queue - * @q: the request queue for the device - * @pfn: max address - * - * Description: - * This function is similar to blk_queue_bounce_limit except it - * neither changes allocation flags, nor does it set up the ISA DMA - * pool. This function should only be used by stacking drivers. - * Hardware drivers should use blk_queue_bounce_limit instead. - */ -void blk_queue_bounce_pfn(struct request_queue *q, u64 pfn) -{ - q->limits.bounce_pfn = pfn; -} -EXPORT_SYMBOL(blk_queue_bounce_pfn); - /** * blk_queue_max_sectors - set max sectors for a request for this queue * @q: the request queue for the device diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3ca1604ddd5..e9a73bb242b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) blk_queue_max_segment_size(q, t->limits.max_segment_size); blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors); blk_queue_segment_boundary(q, t->limits.seg_boundary_mask); - blk_queue_bounce_pfn(q, t->limits.bounce_pfn); + blk_queue_bounce_limit(q, t->limits.bounce_pfn); if (t->limits.no_cluster) queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 989aa1790f4..5e740a135e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -910,7 +910,6 @@ extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); -extern void blk_queue_bounce_pfn(struct request_queue *, u64); extern void blk_queue_max_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); -- cgit v1.2.3-70-g09d2 From ee0398717078260ee4ffa97d407071bc774e2dac Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 00:17:13 -0700 Subject: skbuff: Add frag list abstraction interfaces. With the hope that these can be used to eliminate direct references to the frag list implementation. Signed-off-by: David S. 
Signed-off-by: David S. Miller --- include/linux/skbuff.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index aad484cd586..f1c93b878b3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1077,7 +1077,7 @@ extern void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size); #define SKB_PAGE_ASSERT(skb) BUG_ON(skb_shinfo(skb)->nr_frags) -#define SKB_FRAG_ASSERT(skb) BUG_ON(skb_shinfo(skb)->frag_list) +#define SKB_FRAG_ASSERT(skb) BUG_ON(skb_has_frags(skb)) #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) #ifdef NET_SKBUFF_DATA_USES_OFFSET @@ -1716,6 +1716,25 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) skb = skb->prev) +static inline bool skb_has_frags(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->frag_list != NULL; +} + +static inline void skb_frag_list_init(struct sk_buff *skb) +{ + skb_shinfo(skb)->frag_list = NULL; +} + +static inline void skb_frag_add_head(struct sk_buff *skb, struct sk_buff *frag) +{ + frag->next = skb_shinfo(skb)->frag_list; + skb_shinfo(skb)->frag_list = frag; +} + +#define skb_walk_frags(skb, iter) \ + for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) + extern struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *err); extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, -- cgit v1.2.3-70-g09d2 From a5bd8a13e9e0322c7f76b34790ba34e2e0ce2ac5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 00:17:27 -0700 Subject: netdevice.h: Use frag list abstraction interfaces. Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2a801380b50..9ea8d6dfe54 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1904,7 +1904,7 @@ static inline int net_gso_ok(int features, int gso_type) static inline int skb_gso_ok(struct sk_buff *skb, int features) { return net_gso_ok(features, skb_shinfo(skb)->gso_type) && - (!skb_shinfo(skb)->frag_list || (features & NETIF_F_FRAGLIST)); + (!skb_has_frags(skb) || (features & NETIF_F_FRAGLIST)); } static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) -- cgit v1.2.3-70-g09d2 From 151060ac13144208bd7601d17e4c92c59b98072f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Apr 2009 10:54:54 +0900 Subject: CUSE: implement CUSE - Character device in Userspace CUSE enables implementing character devices in userspace. With recent additions of ioctl and poll support, FUSE already has most of what's necessary to implement character devices. All CUSE has to do is bond those components - FUSE, the chardev layer and the driver model - together nicely. When a client opens /dev/cuse, the kernel starts the conversation with CUSE_INIT. The client tells CUSE which device it wants to create. As the previous patch made fuse_file usable without an associated fuse_inode, CUSE doesn't create a super block or inodes. It attaches the fuse_file to the cdev's file->private_data during open and sets ff->fi to NULL. The rest of the operation is almost identical to the FUSE direct-IO case. Each CUSE device has a corresponding directory /sys/class/cuse/DEVNAME (which is a symlink to /sys/devices/virtual/class/DEVNAME if SYSFS_DEPRECATED is turned off) which hosts "waiting" and "abort" among other things. Those two files have the same meaning as the FUSE control files. The only notable feature lacking compared to an in-kernel implementation is mmap support.
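To make the handshake concrete, below is a minimal sketch of what a userspace server might place in its CUSE_INIT reply (the structures are those added to include/linux/fuse.h by this patch; the device name is a made-up example):

	struct cuse_init_out out = {
		.major     = FUSE_KERNEL_VERSION,
		.minor     = FUSE_KERNEL_MINOR_VERSION,
		.dev_major = 0,		/* 0 lets the kernel allocate a chrdev region */
		.dev_minor = 0,
	};
	/* second reply argument: packed "key=value\0" device info */
	static const char info[] = "DEVNAME=mydev";

The kernel parses the info string, looks for DEVNAME and registers the character device under that name.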
Signed-off-by: Tejun Heo Signed-off-by: Miklos Szeredi --- fs/Kconfig | 10 + fs/fuse/Makefile | 1 + fs/fuse/cuse.c | 610 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fuse.h | 31 +++ 4 files changed, 652 insertions(+) create mode 100644 fs/fuse/cuse.c (limited to 'include/linux') diff --git a/fs/Kconfig b/fs/Kconfig index 9f7270f36b2..525da2e8f73 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -62,6 +62,16 @@ source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. + config GENERIC_ACL bool select FS_POSIX_ACL diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 72437065f6a..e95eeb445e5 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_FUSE_FS) += fuse.o +obj-$(CONFIG_CUSE) += cuse.o fuse-objs := dev.o dir.o file.o inode.o control.o diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c new file mode 100644 index 00000000000..de792dcf327 --- /dev/null +++ b/fs/fuse/cuse.c @@ -0,0 +1,610 @@ +/* + * CUSE: Character device in Userspace + * + * Copyright (C) 2008-2009 SUSE Linux Products GmbH + * Copyright (C) 2008-2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * CUSE enables character devices to be implemented from userland much + * like FUSE allows filesystems. On initialization /dev/cuse is + * created. By opening the file and replying to the CUSE_INIT request + * the userland CUSE server can create a character device. After that the + * operation is very similar to FUSE. + * + * A CUSE instance involves the following objects. + * + * cuse_conn : contains fuse_conn and serves as bonding structure + * channel : file handle connected to the userland CUSE server + * cdev : the implemented character device + * dev : generic device for cdev + * + * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with + * devices, it's called 'channel' to reduce confusion. + * + * channel determines when the character device dies. When channel is + * closed, everything begins to destruct. The cuse_conn is taken off + * the lookup table preventing further access from cdev, cdev and + * generic device are removed and the base reference of cuse_conn is + * put. + * + * On each open, the matching cuse_conn is looked up and if found an + * additional reference is taken which is released when the file is + * closed.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fuse_i.h" + +#define CUSE_CONNTBL_LEN 64 + +struct cuse_conn { + struct list_head list; /* linked on cuse_conntbl */ + struct fuse_conn fc; /* fuse connection */ + struct cdev *cdev; /* associated character device */ + struct device *dev; /* device representing @cdev */ + + /* init parameters, set once during initialization */ + bool unrestricted_ioctl; +}; + +static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ +static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; +static struct class *cuse_class; + +static struct cuse_conn *fc_to_cc(struct fuse_conn *fc) +{ + return container_of(fc, struct cuse_conn, fc); +} + +static struct list_head *cuse_conntbl_head(dev_t devt) +{ + return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN]; +} + + +/************************************************************************** + * CUSE frontend operations + * + * These are file operations for the character device. + * + * On open, CUSE opens a file from the FUSE mnt and stores it to + * private_data of the open file. All other ops call FUSE ops on the + * FUSE file. + */ + +static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + loff_t pos = 0; + + return fuse_direct_io(file, buf, count, &pos, 0); +} + +static ssize_t cuse_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos = 0; + /* + * No locking or generic_write_checks(), the server is + * responsible for locking and sanity checks. + */ + return fuse_direct_io(file, buf, count, &pos, 1); +} + +static int cuse_open(struct inode *inode, struct file *file) +{ + dev_t devt = inode->i_cdev->dev; + struct cuse_conn *cc = NULL, *pos; + int rc; + + /* look up and get the connection */ + spin_lock(&cuse_lock); + list_for_each_entry(pos, cuse_conntbl_head(devt), list) + if (pos->dev->devt == devt) { + fuse_conn_get(&pos->fc); + cc = pos; + break; + } + spin_unlock(&cuse_lock); + + /* dead? */ + if (!cc) + return -ENODEV; + + /* + * Generic permission check is already done against the chrdev + * file, proceed to open. 
+ */ + rc = fuse_do_open(&cc->fc, 0, file, 0); + if (rc) + fuse_conn_put(&cc->fc); + return rc; +} + +static int cuse_release(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + + fuse_sync_release(ff, file->f_flags); + fuse_conn_put(fc); + + return 0; +} + +static long cuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = 0; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = FUSE_IOCTL_COMPAT; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static const struct file_operations cuse_frontend_fops = { + .owner = THIS_MODULE, + .read = cuse_read, + .write = cuse_write, + .open = cuse_open, + .release = cuse_release, + .unlocked_ioctl = cuse_file_ioctl, + .compat_ioctl = cuse_file_compat_ioctl, + .poll = fuse_file_poll, +}; + + +/************************************************************************** + * CUSE channel initialization and destruction + */ + +struct cuse_devinfo { + const char *name; +}; + +/** + * cuse_parse_one - parse one key=value pair + * @pp: i/o parameter for the current position + * @end: points to one past the end of the packed string + * @keyp: out parameter for key + * @valp: out parameter for value + * + * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends + * at @end - 1. This function parses one pair and set *@keyp to the + * start of the key and *@valp to the start of the value. Note that + * the original string is modified such that the key string is + * terminated with '\0'. *@pp is updated to point to the next string. + * + * RETURNS: + * 1 on successful parse, 0 on EOF, -errno on failure. + */ +static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) +{ + char *p = *pp; + char *key, *val; + + while (p < end && *p == '\0') + p++; + if (p == end) + return 0; + + if (end[-1] != '\0') { + printk(KERN_ERR "CUSE: info not properly terminated\n"); + return -EINVAL; + } + + key = val = p; + p += strlen(p); + + if (valp) { + strsep(&val, "="); + if (!val) + val = key + strlen(key); + key = strstrip(key); + val = strstrip(val); + } else + key = strstrip(key); + + if (!strlen(key)) { + printk(KERN_ERR "CUSE: zero length info key specified\n"); + return -EINVAL; + } + + *pp = p; + *keyp = key; + if (valp) + *valp = val; + + return 1; +} + +/** + * cuse_parse_dev_info - parse device info + * @p: device info string + * @len: length of device info string + * @devinfo: out parameter for parsed device info + * + * Parse @p to extract device info and store it into @devinfo. String + * pointed to by @p is modified by parsing and @devinfo points into + * them, so @p shouldn't be freed while @devinfo is in use. + * + * RETURNS: + * 0 on success, -errno on failure. 
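+ * + * Example (added for illustration; the names are made up): a buffer + * containing "DEVNAME=foo\0EXTRA=bar\0" yields devinfo->name == "foo", + * while unknown keys such as EXTRA are skipped with a warning.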
+ */ +static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) +{ + char *end = p + len; + char *key, *val; + int rc; + + while (true) { + rc = cuse_parse_one(&p, end, &key, &val); + if (rc < 0) + return rc; + if (!rc) + break; + if (strcmp(key, "DEVNAME") == 0) + devinfo->name = val; + else + printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n", + key); + } + + if (!devinfo->name || !strlen(devinfo->name)) { + printk(KERN_ERR "CUSE: DEVNAME unspecified\n"); + return -EINVAL; + } + + return 0; +} + +static void cuse_gendev_release(struct device *dev) +{ + kfree(dev); +} + +/** + * cuse_process_init_reply - finish initializing CUSE channel + * + * This function creates the character device and sets up all the + * required data structures for it. Please read the comment at the + * top of this file for high level overview. + */ +static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + struct cuse_conn *cc = fc_to_cc(fc); + struct cuse_init_out *arg = &req->misc.cuse_init_out; + struct page *page = req->pages[0]; + struct cuse_devinfo devinfo = { }; + struct device *dev; + struct cdev *cdev; + dev_t devt; + int rc; + + if (req->out.h.error || + arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { + goto err; + } + + fc->minor = arg->minor; + fc->max_read = max_t(unsigned, arg->max_read, 4096); + fc->max_write = max_t(unsigned, arg->max_write, 4096); + + /* parse init reply */ + cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; + + rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, + &devinfo); + if (rc) + goto err; + + /* determine and reserve devt */ + devt = MKDEV(arg->dev_major, arg->dev_minor); + if (!MAJOR(devt)) + rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name); + else + rc = register_chrdev_region(devt, 1, devinfo.name); + if (rc) { + printk(KERN_ERR "CUSE: failed to register chrdev region\n"); + goto err; + } + + /* devt determined, create device */ + rc = -ENOMEM; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + goto err_region; + + device_initialize(dev); + dev_set_uevent_suppress(dev, 1); + dev->class = cuse_class; + dev->devt = devt; + dev->release = cuse_gendev_release; + dev_set_drvdata(dev, cc); + dev_set_name(dev, "%s", devinfo.name); + + rc = device_add(dev); + if (rc) + goto err_device; + + /* register cdev */ + rc = -ENOMEM; + cdev = cdev_alloc(); + if (!cdev) + goto err_device; + + cdev->owner = THIS_MODULE; + cdev->ops = &cuse_frontend_fops; + + rc = cdev_add(cdev, devt, 1); + if (rc) + goto err_cdev; + + cc->dev = dev; + cc->cdev = cdev; + + /* make the device available */ + spin_lock(&cuse_lock); + list_add(&cc->list, cuse_conntbl_head(devt)); + spin_unlock(&cuse_lock); + + /* announce device availability */ + dev_set_uevent_suppress(dev, 0); + kobject_uevent(&dev->kobj, KOBJ_ADD); +out: + __free_page(page); + return; + +err_cdev: + cdev_del(cdev); +err_device: + put_device(dev); +err_region: + unregister_chrdev_region(devt, 1); +err: + fc->conn_error = 1; + goto out; +} + +static int cuse_send_init(struct cuse_conn *cc) +{ + int rc; + struct fuse_req *req; + struct page *page; + struct fuse_conn *fc = &cc->fc; + struct cuse_init_in *arg; + + BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto err; + } + + rc = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto err_put_req; + + arg = &req->misc.cuse_init_in; + arg->major = FUSE_KERNEL_VERSION; + arg->minor = 
FUSE_KERNEL_MINOR_VERSION; + arg->flags |= CUSE_UNRESTRICTED_IOCTL; + req->in.h.opcode = CUSE_INIT; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct cuse_init_in); + req->in.args[0].value = arg; + req->out.numargs = 2; + req->out.args[0].size = sizeof(struct cuse_init_out); + req->out.args[0].value = &req->misc.cuse_init_out; + req->out.args[1].size = CUSE_INIT_INFO_MAX; + req->out.argvar = 1; + req->out.argpages = 1; + req->pages[0] = page; + req->num_pages = 1; + req->end = cuse_process_init_reply; + fuse_request_send_background(fc, req); + + return 0; + +err_put_req: + fuse_put_request(fc, req); +err: + return rc; +} + +static void cuse_fc_release(struct fuse_conn *fc) +{ + struct cuse_conn *cc = fc_to_cc(fc); + kfree(cc); +} + +/** + * cuse_channel_open - open method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being opened + * + * Userland CUSE server can create a CUSE device by opening /dev/cuse + * and replying to the initialization request the kernel sends. This + * function is responsible for handling CUSE device initialization. + * Because the fd opened by this function is used during + * initialization, this function only creates cuse_conn and sends + * init. The rest is delegated to a kthread. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_open(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc; + int rc; + + /* set up cuse_conn */ + cc = kzalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + + fuse_conn_init(&cc->fc); + + INIT_LIST_HEAD(&cc->list); + cc->fc.release = cuse_fc_release; + + cc->fc.connected = 1; + cc->fc.blocked = 0; + rc = cuse_send_init(cc); + if (rc) { + fuse_conn_put(&cc->fc); + return rc; + } + file->private_data = &cc->fc; /* channel owns base reference to cc */ + + return 0; +} + +/** + * cuse_channel_release - release method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being closed + * + * Disconnect the channel, deregister CUSE device and initiate + * destruction by putting the default reference. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_release(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc = fc_to_cc(file->private_data); + int rc; + + /* remove from the conntbl, no more access from this point on */ + spin_lock(&cuse_lock); + list_del_init(&cc->list); + spin_unlock(&cuse_lock); + + /* remove device */ + if (cc->dev) + device_unregister(cc->dev); + if (cc->cdev) { + unregister_chrdev_region(cc->cdev->dev, 1); + cdev_del(cc->cdev); + } + + /* kill connection and shutdown channel */ + fuse_conn_kill(&cc->fc); + rc = fuse_dev_release(inode, file); /* puts the base reference */ + + return rc; +} + +static struct file_operations cuse_channel_fops; /* initialized during init */ + + +/************************************************************************** + * Misc stuff and module initialization + * + * CUSE exports the same set of attributes to sysfs as fusectl.
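+ * + * Illustrative layout for a device named "mydev" (name made up): + * /sys/class/cuse/mydev/waiting - number of requests waiting for a reply, + * /sys/class/cuse/mydev/abort - write anything to abort the connection.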
+ */ + +static ssize_t cuse_class_waiting_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); +} + +static ssize_t cuse_class_abort_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + fuse_abort_conn(&cc->fc); + return count; +} + +static struct device_attribute cuse_class_dev_attrs[] = { + __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), + __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), + { } +}; + +static struct miscdevice cuse_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cuse", + .fops = &cuse_channel_fops, +}; + +static int __init cuse_init(void) +{ + int i, rc; + + /* init conntbl */ + for (i = 0; i < CUSE_CONNTBL_LEN; i++) + INIT_LIST_HEAD(&cuse_conntbl[i]); + + /* inherit and extend fuse_dev_operations */ + cuse_channel_fops = fuse_dev_operations; + cuse_channel_fops.owner = THIS_MODULE; + cuse_channel_fops.open = cuse_channel_open; + cuse_channel_fops.release = cuse_channel_release; + + cuse_class = class_create(THIS_MODULE, "cuse"); + if (IS_ERR(cuse_class)) + return PTR_ERR(cuse_class); + + cuse_class->dev_attrs = cuse_class_dev_attrs; + + rc = misc_register(&cuse_miscdev); + if (rc) { + class_destroy(cuse_class); + return rc; + } + + return 0; +} + +static void __exit cuse_exit(void) +{ + misc_deregister(&cuse_miscdev); + class_destroy(cuse_class); +} + +module_init(cuse_init); +module_exit(cuse_exit); + +MODULE_AUTHOR("Tejun Heo "); +MODULE_DESCRIPTION("Character device in Userspace"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 162e5defe68..d41ed593f79 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -120,6 +120,13 @@ struct fuse_file_lock { #define FUSE_EXPORT_SUPPORT (1 << 4) #define FUSE_BIG_WRITES (1 << 5) +/** + * CUSE INIT request/reply flags + * + * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl + */ +#define CUSE_UNRESTRICTED_IOCTL (1 << 0) + /** * Release flags */ @@ -210,6 +217,9 @@ enum fuse_opcode { FUSE_DESTROY = 38, FUSE_IOCTL = 39, FUSE_POLL = 40, + + /* CUSE specific operations */ + CUSE_INIT = 4096, }; enum fuse_notify_code { @@ -401,6 +411,27 @@ struct fuse_init_out { __u32 max_write; }; +#define CUSE_INIT_INFO_MAX 4096 + +struct cuse_init_in { + __u32 major; + __u32 minor; + __u32 unused; + __u32 flags; +}; + +struct cuse_init_out { + __u32 major; + __u32 minor; + __u32 unused; + __u32 flags; + __u32 max_read; + __u32 max_write; + __u32 dev_major; /* chardev major */ + __u32 dev_minor; /* chardev minor */ + __u32 spare[10]; +}; + struct fuse_interrupt_in { __u64 unique; }; -- cgit v1.2.3-70-g09d2 From fcb94e422479da52ed90bab230c59617a0462416 Mon Sep 17 00:00:00 2001 From: Sergey Lapin Date: Mon, 8 Jun 2009 12:18:47 +0000 Subject: Add constants for the ieee 802.15.4 stack IEEE 802.15.4 stack requires several constants to be defined/adjusted. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Sergey Lapin Signed-off-by: David S. 
Miller --- include/linux/if_arp.h | 2 ++ include/linux/if_ether.h | 1 + include/linux/socket.h | 4 +++- net/core/dev.c | 6 ++++-- net/core/sock.c | 3 +++ 5 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index 5ff89809a58..b554300ef8b 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -86,6 +86,8 @@ #define ARPHRD_IEEE80211 801 /* IEEE 802.11 */ #define ARPHRD_IEEE80211_PRISM 802 /* IEEE 802.11 + Prism2 header */ #define ARPHRD_IEEE80211_RADIOTAP 803 /* IEEE 802.11 + radiotap header */ +#define ARPHRD_IEEE802154 804 +#define ARPHRD_IEEE802154_PHY 805 #define ARPHRD_PHONET 820 /* PhoNet media type */ #define ARPHRD_PHONET_PIPE 821 /* PhoNet pipe header */ diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index cfe4fe1b713..11a60e4f0a6 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -106,6 +106,7 @@ #define ETH_P_DSA 0x001B /* Distributed Switch Arch. */ #define ETH_P_TRAILER 0x001C /* Trailer switch tagging */ #define ETH_P_PHONET 0x00F5 /* Nokia Phonet frames */ +#define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ /* * This is an Ethernet frame header. diff --git a/include/linux/socket.h b/include/linux/socket.h index d2310cb45d2..3b461dffe24 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -194,7 +194,8 @@ struct ucred { #define AF_RXRPC 33 /* RxRPC sockets */ #define AF_ISDN 34 /* mISDN sockets */ #define AF_PHONET 35 /* Phonet sockets */ -#define AF_MAX 36 /* For now.. */ +#define AF_IEEE802154 36 /* IEEE802154 sockets */ +#define AF_MAX 37 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC @@ -233,6 +234,7 @@ struct ucred { #define PF_RXRPC AF_RXRPC #define PF_ISDN AF_ISDN #define PF_PHONET AF_PHONET +#define PF_IEEE802154 AF_IEEE802154 #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. 
*/ diff --git a/net/core/dev.c b/net/core/dev.c index 81b392ef511..11560e3258b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -269,7 +269,8 @@ static const unsigned short netdev_lock_type[] = ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, - ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE}; + ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_IEEE802154_PHY, + ARPHRD_VOID, ARPHRD_NONE}; static const char *netdev_lock_name[] = {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", @@ -286,7 +287,8 @@ static const char *netdev_lock_name[] = "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", - "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"}; + "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_IEEE802154_PHY", + "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; diff --git a/net/core/sock.c b/net/core/sock.c index 58dec9dff99..04e35eb2e73 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -155,6 +155,7 @@ static const char *af_family_key_strings[AF_MAX+1] = { "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , + "sk_lock-AF_IEEE802154", "sk_lock-AF_MAX" }; static const char *af_family_slock_key_strings[AF_MAX+1] = { @@ -170,6 +171,7 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = { "slock-27" , "slock-28" , "slock-AF_CAN" , "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , + "slock-AF_IEEE802154", "slock-AF_MAX" }; static const char *af_family_clock_key_strings[AF_MAX+1] = { @@ -185,6 +187,7 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = { "clock-27" , "clock-28" , "clock-AF_CAN" , "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , + "clock-AF_IEEE802154", "clock-AF_MAX" }; -- cgit v1.2.3-70-g09d2 From 2c21d11518b688cd4c8e7ddfcd4ba41482ad075b Mon Sep 17 00:00:00 2001 From: Sergey Lapin Date: Mon, 8 Jun 2009 12:18:49 +0000 Subject: net: add NL802154 interface for configuration of 802.15.4 devices Add a netlink interface for configuration of IEEE 802.15.4 devices. This interface also specifies the event notifications sent by devices towards higher layers. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Sergey Lapin Signed-off-by: David S.
Miller --- include/linux/nl802154.h | 119 +++++++++ include/net/ieee802154/nl802154.h | 41 +++ net/ieee802154/Makefile | 3 +- net/ieee802154/netlink.c | 523 ++++++++++++++++++++++++++++++++++++++ net/ieee802154/nl_policy.c | 52 ++++ 5 files changed, 737 insertions(+), 1 deletion(-) create mode 100644 include/linux/nl802154.h create mode 100644 include/net/ieee802154/nl802154.h create mode 100644 net/ieee802154/netlink.c create mode 100644 net/ieee802154/nl_policy.c (limited to 'include/linux') diff --git a/include/linux/nl802154.h b/include/linux/nl802154.h new file mode 100644 index 00000000000..2cda00ccfcc --- /dev/null +++ b/include/linux/nl802154.h @@ -0,0 +1,119 @@ +/* + * nl802154.h + * + * Copyright (C) 2007, 2008, 2009 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef NL802154_H +#define NL802154_H + +#define IEEE802154_NL_NAME "802.15.4 MAC" +#define IEEE802154_MCAST_COORD_NAME "coordinator" +#define IEEE802154_MCAST_BEACON_NAME "beacon" + +enum { + __IEEE802154_ATTR_INVALID, + + IEEE802154_ATTR_DEV_NAME, + IEEE802154_ATTR_DEV_INDEX, + + IEEE802154_ATTR_STATUS, + + IEEE802154_ATTR_SHORT_ADDR, + IEEE802154_ATTR_HW_ADDR, + IEEE802154_ATTR_PAN_ID, + + IEEE802154_ATTR_CHANNEL, + + IEEE802154_ATTR_COORD_SHORT_ADDR, + IEEE802154_ATTR_COORD_HW_ADDR, + IEEE802154_ATTR_COORD_PAN_ID, + + IEEE802154_ATTR_SRC_SHORT_ADDR, + IEEE802154_ATTR_SRC_HW_ADDR, + IEEE802154_ATTR_SRC_PAN_ID, + + IEEE802154_ATTR_DEST_SHORT_ADDR, + IEEE802154_ATTR_DEST_HW_ADDR, + IEEE802154_ATTR_DEST_PAN_ID, + + IEEE802154_ATTR_CAPABILITY, + IEEE802154_ATTR_REASON, + IEEE802154_ATTR_SCAN_TYPE, + IEEE802154_ATTR_CHANNELS, + IEEE802154_ATTR_DURATION, + IEEE802154_ATTR_ED_LIST, + IEEE802154_ATTR_BCN_ORD, + IEEE802154_ATTR_SF_ORD, + IEEE802154_ATTR_PAN_COORD, + IEEE802154_ATTR_BAT_EXT, + IEEE802154_ATTR_COORD_REALIGN, + IEEE802154_ATTR_SEC, + + __IEEE802154_ATTR_MAX, +}; + +#define IEEE802154_ATTR_MAX (__IEEE802154_ATTR_MAX - 1) + +extern struct nla_policy ieee802154_policy[]; + +/* commands */ +/* REQ should be responded with CONF + * and INDIC with RESP + */ +enum { + __IEEE802154_COMMAND_INVALID, + + IEEE802154_ASSOCIATE_REQ, + IEEE802154_ASSOCIATE_CONF, + IEEE802154_DISASSOCIATE_REQ, + IEEE802154_DISASSOCIATE_CONF, + IEEE802154_GET_REQ, + IEEE802154_GET_CONF, + IEEE802154_RESET_REQ, + IEEE802154_RESET_CONF, + IEEE802154_SCAN_REQ, + IEEE802154_SCAN_CONF, + IEEE802154_SET_REQ, + IEEE802154_SET_CONF, + IEEE802154_START_REQ, + IEEE802154_START_CONF, + IEEE802154_SYNC_REQ, + IEEE802154_POLL_REQ, + IEEE802154_POLL_CONF, + + IEEE802154_ASSOCIATE_INDIC, + IEEE802154_ASSOCIATE_RESP, + IEEE802154_DISASSOCIATE_INDIC, + IEEE802154_BEACON_NOTIFY_INDIC, + IEEE802154_ORPHAN_INDIC, + IEEE802154_ORPHAN_RESP, + IEEE802154_COMM_STATUS_INDIC, + IEEE802154_SYNC_LOSS_INDIC, + + IEEE802154_GTS_REQ, /* Not supported yet */ + IEEE802154_GTS_INDIC, /* Not supported yet */ + IEEE802154_GTS_CONF, /* Not supported 
yet */ + IEEE802154_RX_ENABLE_REQ, /* Not supported yet */ + IEEE802154_RX_ENABLE_CONF, /* Not supported yet */ + + __IEEE802154_CMD_MAX, +}; + +#define IEEE802154_CMD_MAX (__IEEE802154_CMD_MAX - 1) + +#endif diff --git a/include/net/ieee802154/nl802154.h b/include/net/ieee802154/nl802154.h new file mode 100644 index 00000000000..78efcdf52b5 --- /dev/null +++ b/include/net/ieee802154/nl802154.h @@ -0,0 +1,41 @@ +/* + * nl802154.h + * + * Copyright (C) 2007, 2008, 2009 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef IEEE802154_NL_H +#define IEEE802154_NL_H + +struct net_device; +struct ieee802154_addr; + +int ieee802154_nl_assoc_indic(struct net_device *dev, + struct ieee802154_addr *addr, u8 cap); +int ieee802154_nl_assoc_confirm(struct net_device *dev, + u16 short_addr, u8 status); +int ieee802154_nl_disassoc_indic(struct net_device *dev, + struct ieee802154_addr *addr, u8 reason); +int ieee802154_nl_disassoc_confirm(struct net_device *dev, + u8 status); +int ieee802154_nl_scan_confirm(struct net_device *dev, + u8 status, u8 scan_type, u32 unscanned, + u8 *edl/*, struct list_head *pan_desc_list */); +int ieee802154_nl_beacon_indic(struct net_device *dev, u16 panid, + u16 coord_addr); + +#endif diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile index 7451c7cc8c8..f99338a2610 100644 --- a/net/ieee802154/Makefile +++ b/net/ieee802154/Makefile @@ -1,4 +1,5 @@ -obj-$(CONFIG_IEEE802154) += af_802154.o +obj-$(CONFIG_IEEE802154) += nl802154.o af_802154.o +nl802154-y := netlink.o nl_policy.o af_802154-y := af_ieee802154.o raw.o dgram.o ccflags-y += -Wall -DDEBUG diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c new file mode 100644 index 00000000000..105ad10876a --- /dev/null +++ b/net/ieee802154/netlink.c @@ -0,0 +1,523 @@ +/* + * Netlink interface for IEEE 802.15.4 stack + * + * Copyright 2007, 2008 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * + * Written by: + * Sergey Lapin + * Dmitry Eremin-Solenikov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int ieee802154_seq_num; + +static struct genl_family ieee802154_coordinator_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IEEE802154_NL_NAME, + .version = 1, + .maxattr = IEEE802154_ATTR_MAX, +}; + +static struct genl_multicast_group ieee802154_coord_mcgrp = { + .name = IEEE802154_MCAST_COORD_NAME, +}; + +static struct genl_multicast_group ieee802154_beacon_mcgrp = { + .name = IEEE802154_MCAST_BEACON_NAME, +}; + +/* Requests to userspace */ +static struct sk_buff *ieee802154_nl_create(int flags, u8 req) +{ + void *hdr; + struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); + + if (!msg) + return NULL; + + hdr = genlmsg_put(msg, 0, ieee802154_seq_num++, + &ieee802154_coordinator_family, flags, req); + if (!hdr) { + nlmsg_free(msg); + return NULL; + } + + return msg; +} + +static int ieee802154_nl_finish(struct sk_buff *msg) +{ + /* XXX: nlh is right at the start of msg */ + void *hdr = genlmsg_data(NLMSG_DATA(msg->data)); + + if (!genlmsg_end(msg, hdr)) + goto out; + + return genlmsg_multicast(msg, 0, ieee802154_coord_mcgrp.id, + GFP_ATOMIC); +out: + nlmsg_free(msg); + return -ENOBUFS; +} + +int ieee802154_nl_assoc_indic(struct net_device *dev, + struct ieee802154_addr *addr, u8 cap) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + if (addr->addr_type != IEEE802154_ADDR_LONG) { + pr_err("%s: received non-long source address!\n", __func__); + return -EINVAL; + } + + msg = ieee802154_nl_create(0, IEEE802154_ASSOCIATE_INDIC); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + NLA_PUT(msg, IEEE802154_ATTR_SRC_HW_ADDR, IEEE802154_ADDR_LEN, + addr->hwaddr); + + NLA_PUT_U8(msg, IEEE802154_ATTR_CAPABILITY, cap); + + return ieee802154_nl_finish(msg); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_assoc_indic); + +int ieee802154_nl_assoc_confirm(struct net_device *dev, u16 short_addr, + u8 status) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_ASSOCIATE_CONF); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + NLA_PUT_U16(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr); + NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status); + + return ieee802154_nl_finish(msg); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_assoc_confirm); + +int ieee802154_nl_disassoc_indic(struct net_device *dev, + struct ieee802154_addr *addr, u8 reason) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_DISASSOCIATE_INDIC); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + if (addr->addr_type == IEEE802154_ADDR_LONG) + NLA_PUT(msg, IEEE802154_ATTR_SRC_HW_ADDR, IEEE802154_ADDR_LEN, + addr->hwaddr); + else + NLA_PUT_U16(msg, IEEE802154_ATTR_SRC_SHORT_ADDR, + addr->short_addr); + + 
NLA_PUT_U8(msg, IEEE802154_ATTR_REASON, reason); + + return ieee802154_nl_finish(msg); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_disassoc_indic); + +int ieee802154_nl_disassoc_confirm(struct net_device *dev, u8 status) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_DISASSOCIATE_CONF); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status); + + return ieee802154_nl_finish(msg); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_disassoc_confirm); + +int ieee802154_nl_beacon_indic(struct net_device *dev, + u16 panid, u16 coord_addr) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_BEACON_NOTIFY_INDIC); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + NLA_PUT_U16(msg, IEEE802154_ATTR_COORD_SHORT_ADDR, coord_addr); + NLA_PUT_U16(msg, IEEE802154_ATTR_COORD_PAN_ID, panid); + + return ieee802154_nl_finish(msg); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_beacon_indic); + +int ieee802154_nl_scan_confirm(struct net_device *dev, + u8 status, u8 scan_type, u32 unscanned, + u8 *edl/* , struct list_head *pan_desc_list */) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_SCAN_CONF); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status); + NLA_PUT_U8(msg, IEEE802154_ATTR_SCAN_TYPE, scan_type); + NLA_PUT_U32(msg, IEEE802154_ATTR_CHANNELS, unscanned); + + if (edl) + NLA_PUT(msg, IEEE802154_ATTR_ED_LIST, 27, edl); + + return ieee802154_nl_finish(msg); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_scan_confirm); + +/* Requests from userspace */ +static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) +{ + struct net_device *dev; + + if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { + char name[IFNAMSIZ + 1]; + nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], + sizeof(name)); + dev = dev_get_by_name(&init_net, name); + } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) + dev = dev_get_by_index(&init_net, + nla_get_u32(info->attrs[IEEE802154_ATTR_DEV_INDEX])); + else + return NULL; + + if (dev->type != ARPHRD_IEEE802154) { + dev_put(dev); + return NULL; + } + + return dev; +} + +static int ieee802154_associate_req(struct sk_buff *skb, + struct genl_info *info) +{ + struct net_device *dev; + struct ieee802154_addr addr; + int ret = -EINVAL; + + if (!info->attrs[IEEE802154_ATTR_CHANNEL] || + !info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || + (!info->attrs[IEEE802154_ATTR_COORD_HW_ADDR] && + !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]) || + !info->attrs[IEEE802154_ATTR_CAPABILITY]) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) { + 
addr.addr_type = IEEE802154_ADDR_LONG; + nla_memcpy(addr.hwaddr, + info->attrs[IEEE802154_ATTR_COORD_HW_ADDR], + IEEE802154_ADDR_LEN); + } else { + addr.addr_type = IEEE802154_ADDR_SHORT; + addr.short_addr = nla_get_u16( + info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); + } + addr.pan_id = nla_get_u16(info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); + + ret = ieee802154_mlme_ops(dev)->assoc_req(dev, &addr, + nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]), + nla_get_u8(info->attrs[IEEE802154_ATTR_CAPABILITY])); + + dev_put(dev); + return ret; +} + +static int ieee802154_associate_resp(struct sk_buff *skb, + struct genl_info *info) +{ + struct net_device *dev; + struct ieee802154_addr addr; + int ret = -EINVAL; + + if (!info->attrs[IEEE802154_ATTR_STATUS] || + !info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] || + !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + addr.addr_type = IEEE802154_ADDR_LONG; + nla_memcpy(addr.hwaddr, info->attrs[IEEE802154_ATTR_DEST_HW_ADDR], + IEEE802154_ADDR_LEN); + addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + + + ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr, + nla_get_u16(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), + nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS])); + + dev_put(dev); + return ret; +} + +static int ieee802154_disassociate_req(struct sk_buff *skb, + struct genl_info *info) +{ + struct net_device *dev; + struct ieee802154_addr addr; + int ret = -EINVAL; + + if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] && + !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) || + !info->attrs[IEEE802154_ATTR_REASON]) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) { + addr.addr_type = IEEE802154_ADDR_LONG; + nla_memcpy(addr.hwaddr, + info->attrs[IEEE802154_ATTR_DEST_HW_ADDR], + IEEE802154_ADDR_LEN); + } else { + addr.addr_type = IEEE802154_ADDR_SHORT; + addr.short_addr = nla_get_u16( + info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]); + } + addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + + ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr, + nla_get_u8(info->attrs[IEEE802154_ATTR_REASON])); + + dev_put(dev); + return ret; +} + +/* + * PANid, channel, beacon_order = 15, superframe_order = 15, + * PAN_coordinator, battery_life_extension = 0, + * coord_realignment = 0, security_enable = 0 +*/ +static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *dev; + struct ieee802154_addr addr; + + u8 channel, bcn_ord, sf_ord; + int pan_coord, blx, coord_realign; + int ret; + + if (!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || + !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR] || + !info->attrs[IEEE802154_ATTR_CHANNEL] || + !info->attrs[IEEE802154_ATTR_BCN_ORD] || + !info->attrs[IEEE802154_ATTR_SF_ORD] || + !info->attrs[IEEE802154_ATTR_PAN_COORD] || + !info->attrs[IEEE802154_ATTR_BAT_EXT] || + !info->attrs[IEEE802154_ATTR_COORD_REALIGN] + ) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + addr.addr_type = IEEE802154_ADDR_SHORT; + addr.short_addr = nla_get_u16( + info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); + addr.pan_id = nla_get_u16(info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); + + channel = nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]); + bcn_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_BCN_ORD]); + sf_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_SF_ORD]); + pan_coord = 
nla_get_u8(info->attrs[IEEE802154_ATTR_PAN_COORD]); + blx = nla_get_u8(info->attrs[IEEE802154_ATTR_BAT_EXT]); + coord_realign = nla_get_u8(info->attrs[IEEE802154_ATTR_COORD_REALIGN]); + + ret = ieee802154_mlme_ops(dev)->start_req(dev, &addr, channel, + bcn_ord, sf_ord, pan_coord, blx, coord_realign); + + dev_put(dev); + return ret; +} + +static int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *dev; + int ret; + u8 type; + u32 channels; + u8 duration; + + if (!info->attrs[IEEE802154_ATTR_SCAN_TYPE] || + !info->attrs[IEEE802154_ATTR_CHANNELS] || + !info->attrs[IEEE802154_ATTR_DURATION]) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + type = nla_get_u8(info->attrs[IEEE802154_ATTR_SCAN_TYPE]); + channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]); + duration = nla_get_u8(info->attrs[IEEE802154_ATTR_DURATION]); + + ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, + duration); + + dev_put(dev); + return ret; +} + +#define IEEE802154_OP(_cmd, _func) \ + { \ + .cmd = _cmd, \ + .policy = ieee802154_policy, \ + .doit = _func, \ + .dumpit = NULL, \ + .flags = GENL_ADMIN_PERM, \ + } + +static struct genl_ops ieee802154_coordinator_ops[] = { + IEEE802154_OP(IEEE802154_ASSOCIATE_REQ, ieee802154_associate_req), + IEEE802154_OP(IEEE802154_ASSOCIATE_RESP, ieee802154_associate_resp), + IEEE802154_OP(IEEE802154_DISASSOCIATE_REQ, ieee802154_disassociate_req), + IEEE802154_OP(IEEE802154_SCAN_REQ, ieee802154_scan_req), + IEEE802154_OP(IEEE802154_START_REQ, ieee802154_start_req), +}; + +static int __init ieee802154_nl_init(void) +{ + int rc; + int i; + + rc = genl_register_family(&ieee802154_coordinator_family); + if (rc) + goto fail; + + rc = genl_register_mc_group(&ieee802154_coordinator_family, + &ieee802154_coord_mcgrp); + if (rc) + goto fail; + + rc = genl_register_mc_group(&ieee802154_coordinator_family, + &ieee802154_beacon_mcgrp); + if (rc) + goto fail; + + + for (i = 0; i < ARRAY_SIZE(ieee802154_coordinator_ops); i++) { + rc = genl_register_ops(&ieee802154_coordinator_family, + &ieee802154_coordinator_ops[i]); + if (rc) + goto fail; + } + + return 0; + +fail: + genl_unregister_family(&ieee802154_coordinator_family); + return rc; +} +module_init(ieee802154_nl_init); + +static void __exit ieee802154_nl_exit(void) +{ + genl_unregister_family(&ieee802154_coordinator_family); +} +module_exit(ieee802154_nl_exit); + diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c new file mode 100644 index 00000000000..c7d71d1adca --- /dev/null +++ b/net/ieee802154/nl_policy.c @@ -0,0 +1,52 @@ +/* + * nl802154.h + * + * Copyright (C) 2007, 2008 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ + +#include +#include +#include + +#define NLA_HW_ADDR NLA_U64 + +struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { + [IEEE802154_ATTR_DEV_NAME] = { .type = NLA_STRING, }, + [IEEE802154_ATTR_DEV_INDEX] = { .type = NLA_U32, }, + + [IEEE802154_ATTR_STATUS] = { .type = NLA_U8, }, + [IEEE802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, }, + [IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, }, + [IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, }, + [IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, }, + [IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, }, + [IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, }, + [IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, }, + [IEEE802154_ATTR_SRC_SHORT_ADDR] = { .type = NLA_U16, }, + [IEEE802154_ATTR_SRC_HW_ADDR] = { .type = NLA_HW_ADDR, }, + [IEEE802154_ATTR_SRC_PAN_ID] = { .type = NLA_U16, }, + [IEEE802154_ATTR_DEST_SHORT_ADDR] = { .type = NLA_U16, }, + [IEEE802154_ATTR_DEST_HW_ADDR] = { .type = NLA_HW_ADDR, }, + [IEEE802154_ATTR_DEST_PAN_ID] = { .type = NLA_U16, }, + + [IEEE802154_ATTR_CAPABILITY] = { .type = NLA_U8, }, + [IEEE802154_ATTR_REASON] = { .type = NLA_U8, }, + [IEEE802154_ATTR_SCAN_TYPE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_CHANNELS] = { .type = NLA_U32, }, + [IEEE802154_ATTR_DURATION] = { .type = NLA_U8, }, + [IEEE802154_ATTR_ED_LIST] = { .len = 27 }, +}; -- cgit v1.2.3-70-g09d2 From 18760f1e74e8dfe8f30d4891e66163d1e6feb893 Mon Sep 17 00:00:00 2001 From: Chaitanya Lala Date: Mon, 8 Jun 2009 14:28:54 +0000 Subject: e1000e: Expose MDI-X status via ethtool change Ethtool is a standard way of getting information about Ethernet interfaces. We enhance the ethtool kernel interface and e1000e to make the MDI-X status readable via ethtool in userspace.
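For orientation, once this lands userspace can read the new field through the existing ETHTOOL_GSET command; a minimal sketch (the interface name is an example, error handling omitted):

	struct ethtool_cmd ecmd;
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&ifr, 0, sizeof(ifr));
	memset(&ecmd, 0, sizeof(ecmd));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	ecmd.cmd = ETHTOOL_GSET;
	ifr.ifr_data = (caddr_t)&ecmd;
	if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
		printf("eth_tp_mdix: %u\n", ecmd.eth_tp_mdix);	/* 0 invalid, 1 MDI, 2 MDI-X */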
Signed-off-by: Chaitanya Lala Signed-off-by: Arthur Jones Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/e1000e/ethtool.c | 9 +++++++++ include/linux/ethtool.h | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/e1000e/ethtool.c b/drivers/net/e1000e/ethtool.c index 3d643561752..1bf4d2a5d34 100644 --- a/drivers/net/e1000e/ethtool.c +++ b/drivers/net/e1000e/ethtool.c @@ -167,6 +167,15 @@ static int e1000_get_settings(struct net_device *netdev, ecmd->autoneg = ((hw->phy.media_type == e1000_media_type_fiber) || hw->mac.autoneg) ? AUTONEG_ENABLE : AUTONEG_DISABLE; + + /* MDI-X => 2; MDI =>1; Invalid =>0 */ + if ((hw->phy.media_type == e1000_media_type_copper) && + !hw->mac.get_link_status) + ecmd->eth_tp_mdix = hw->phy.is_mdix ? ETH_TP_MDI_X : + ETH_TP_MDI; + else + ecmd->eth_tp_mdix = ETH_TP_MDI_INVALID; + return 0; } diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 380b04272bf..9b660bd2e2b 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -30,7 +30,8 @@ struct ethtool_cmd { __u32 maxtxpkt; /* Tx pkts before generating tx int */ __u32 maxrxpkt; /* Rx pkts before generating rx int */ __u16 speed_hi; - __u16 reserved2; + __u8 eth_tp_mdix; + __u8 reserved2; __u32 lp_advertising; /* Features the link partner advertises */ __u32 reserved[2]; }; @@ -632,6 +633,11 @@ struct ethtool_ops { #define AUTONEG_DISABLE 0x00 #define AUTONEG_ENABLE 0x01 +/* Mode MDI or MDI-X */ +#define ETH_TP_MDI_INVALID 0x00 +#define ETH_TP_MDI 0x01 +#define ETH_TP_MDI_X 0x02 + /* Wake-On-Lan options. */ #define WAKE_PHY (1 << 0) #define WAKE_UCAST (1 << 1) -- cgit v1.2.3-70-g09d2 From 1d589bb16b825b3a7b4edd34d997f1f1f953033d Mon Sep 17 00:00:00 2001 From: john cooper Date: Tue, 9 Jun 2009 14:41:40 +0200 Subject: Add serial number support for virtio_blk, V4a This patch extracts the opaque data from PCI I/O region 0 via the added VIRTIO_BLK_F_IDENTIFY field. By convention this data takes the form of that returned by an ATA IDENTIFY DEVICE command; however, the driver (except for structure size) makes no interpretation of the data. The structure data is copied wholesale to userspace via a HDIO_GET_IDENTITY ioctl command (eg: hdparm -i ).
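On the guest side the data is then reachable through the regular HDIO_GET_IDENTITY path; a minimal sketch (the device node is an example):

	unsigned short id[256];		/* VIRTIO_BLK_ID_BYTES == sizeof(__u16[256]) */
	int fd = open("/dev/vda", O_RDONLY);

	if (fd >= 0 && ioctl(fd, HDIO_GET_IDENTITY, id) == 0) {
		/* id[] now holds the raw ATA IDENTIFY page supplied by the
		 * host - the same data hdparm -i decodes. */
	}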
Signed-off-by: john cooper Signed-off-by: Rusty Russell Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 37 ++++++++++++++++++++++++++++++++++--- include/linux/virtio_blk.h | 4 ++++ 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c4845b16946..c0facaa55cf 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -171,11 +171,43 @@ static void do_virtblk_request(struct request_queue *q) vblk->vq->vq_ops->kick(vblk->vq); } +/* return ATA identify data + */ +static int virtblk_identify(struct gendisk *disk, void *argp) +{ + struct virtio_blk *vblk = disk->private_data; + void *opaque; + int err = -ENOMEM; + + opaque = kmalloc(VIRTIO_BLK_ID_BYTES, GFP_KERNEL); + if (!opaque) + goto out; + + err = virtio_config_buf(vblk->vdev, VIRTIO_BLK_F_IDENTIFY, + offsetof(struct virtio_blk_config, identify), opaque, + VIRTIO_BLK_ID_BYTES); + + if (err) + goto out_kfree; + + if (copy_to_user(argp, opaque, VIRTIO_BLK_ID_BYTES)) + err = -EFAULT; + +out_kfree: + kfree(opaque); +out: + return err; +} + static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long data) { struct gendisk *disk = bdev->bd_disk; struct virtio_blk *vblk = disk->private_data; + void __user *argp = (void __user *)data; + + if (cmd == HDIO_GET_IDENTITY) + return virtblk_identify(disk, argp); /* * Only allow the generic SCSI ioctls if the host can support it. @@ -183,8 +215,7 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI)) return -ENOIOCTLCMD; - return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, - (void __user *)data); + return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp); } /* We provide getgeo only to please some old bootloader/partitioning tools */ @@ -390,7 +421,7 @@ static struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, - VIRTIO_BLK_F_SCSI, + VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_IDENTIFY }; static struct virtio_driver virtio_blk = { diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index 4dbcbc1c348..be7d255fc7c 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -16,6 +16,9 @@ #define VIRTIO_BLK_F_RO 5 /* Disk is read-only */ #define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is available*/ #define VIRTIO_BLK_F_SCSI 7 /* Supports scsi command passthru */ +#define VIRTIO_BLK_F_IDENTIFY 8 /* ATA IDENTIFY supported */ + +#define VIRTIO_BLK_ID_BYTES (sizeof(__u16[256])) /* IDENTIFY DATA */ struct virtio_blk_config { @@ -33,6 +36,7 @@ struct virtio_blk_config } geometry; /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */ __u32 blk_size; + __u8 identify[VIRTIO_BLK_ID_BYTES]; } __attribute__((packed)); /* These two define direction. */ -- cgit v1.2.3-70-g09d2 From a31e1ffd2231b8fdf7eda9ed750a4a0df9bcad4e Mon Sep 17 00:00:00 2001 From: Laszlo Attila Toth Date: Tue, 9 Jun 2009 15:16:34 +0200 Subject: netfilter: xt_socket: added new revision of the 'socket' match supporting flags If the XT_SOCKET_TRANSPARENT flag is set, the 'transparent' socket option must be enabled on the socket for it to be matched.
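A hypothetical ruleset using the new revision could look like the following (assuming the userspace iptables socket match exposes the flag as --transparent):

	iptables -t mangle -A PREROUTING -p tcp --dport 80 \
		-m socket --transparent -j MARK --set-mark 1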
Signed-off-by: Laszlo Attila Toth Signed-off-by: Patrick McHardy --- include/linux/netfilter/xt_socket.h | 12 +++++++ net/netfilter/xt_socket.c | 63 ++++++++++++++++++++++++++++++------- 2 files changed, 64 insertions(+), 11 deletions(-) create mode 100644 include/linux/netfilter/xt_socket.h (limited to 'include/linux') diff --git a/include/linux/netfilter/xt_socket.h b/include/linux/netfilter/xt_socket.h new file mode 100644 index 00000000000..6f475b8ff34 --- /dev/null +++ b/include/linux/netfilter/xt_socket.h @@ -0,0 +1,12 @@ +#ifndef _XT_SOCKET_H +#define _XT_SOCKET_H + +enum { + XT_SOCKET_TRANSPARENT = 1 << 0, +}; + +struct xt_socket_mtinfo1 { + __u8 flags; +}; + +#endif /* _XT_SOCKET_H */ diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 1acc089be7e..ebf00ad5b19 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -22,6 +22,8 @@ #include #include +#include + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #define XT_SOCKET_HAVE_CONNTRACK 1 #include @@ -86,7 +88,8 @@ extract_icmp_fields(const struct sk_buff *skb, static bool -socket_mt(const struct sk_buff *skb, const struct xt_match_param *par) +socket_match(const struct sk_buff *skb, const struct xt_match_param *par, + const struct xt_socket_mtinfo1 *info) { const struct iphdr *iph = ip_hdr(skb); struct udphdr _hdr, *hp = NULL; @@ -141,10 +144,24 @@ socket_mt(const struct sk_buff *skb, const struct xt_match_param *par) sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr, sport, dport, par->in, false); if (sk != NULL) { - bool wildcard = (sk->sk_state != TCP_TIME_WAIT && inet_sk(sk)->rcv_saddr == 0); + bool wildcard; + bool transparent = true; + + /* Ignore sockets listening on INADDR_ANY */ + wildcard = (sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->rcv_saddr == 0); + + /* Ignore non-transparent sockets, + if XT_SOCKET_TRANSPARENT is used */ + if (info && info->flags & XT_SOCKET_TRANSPARENT) + transparent = ((sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->transparent) || + (sk->sk_state == TCP_TIME_WAIT && + inet_twsk(sk)->tw_transparent)); nf_tproxy_put_sock(sk); - if (wildcard) + + if (wildcard || !transparent) sk = NULL; } @@ -157,23 +174,47 @@ socket_mt(const struct sk_buff *skb, const struct xt_match_param *par) return (sk != NULL); } -static struct xt_match socket_mt_reg __read_mostly = { - .name = "socket", - .family = AF_INET, - .match = socket_mt, - .hooks = 1 << NF_INET_PRE_ROUTING, - .me = THIS_MODULE, +static bool +socket_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) +{ + return socket_match(skb, par, NULL); +} + +static bool +socket_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) +{ + return socket_match(skb, par, par->matchinfo); +} + +static struct xt_match socket_mt_reg[] __read_mostly = { + { + .name = "socket", + .revision = 0, + .family = NFPROTO_IPV4, + .match = socket_mt_v0, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, + { + .name = "socket", + .revision = 1, + .family = NFPROTO_IPV4, + .match = socket_mt_v1, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, }; static int __init socket_mt_init(void) { nf_defrag_ipv4_enable(); - return xt_register_match(&socket_mt_reg); + return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); } static void __exit socket_mt_exit(void) { - xt_unregister_match(&socket_mt_reg); + xt_unregister_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); } 
module_init(socket_mt_init); -- cgit v1.2.3-70-g09d2 From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Jun 2009 13:43:05 +0800 Subject: tracing/events: convert block trace points to TRACE_EVENT() TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to these tracepoints: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions ... Cons: - no dev_t info for the output of plug, unplug_timer and unplug_io events. no dev_t info for getrq and sleeprq events if bio == NULL. no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL. This is mainly because we can't get the device from a request queue. But this may change in the future. - A packet command is converted to a string in TP_assign, not TP_print, while blktrace does the conversion just before output. Since pc requests should be rather rare, this is not a big issue. - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT has a unique format, which means we have some unused data in a trace entry. The overhead is minimized by using __dynamic_array() instead of __array(). I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing: dd dd + ioctl blktrace dd + TRACE_EVENT (splice) 1 7.36s, 42.7 MB/s 7.50s, 42.0 MB/s 7.41s, 42.5 MB/s 2 7.43s, 42.3 MB/s 7.48s, 42.1 MB/s 7.43s, 42.4 MB/s 3 7.38s, 42.6 MB/s 7.45s, 42.2 MB/s 7.41s, 42.5 MB/s So the overhead of tracing is very small, and there is no regression when using these trace events vs blktrace. And the binary output of TRACE_EVENT is much smaller than blktrace: # ls -l -h -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out Following are some comparisons between TRACE_EVENT and blktrace: plug: kjournald-480 [000] 303.084981: block_plug: [kjournald] kjournald-480 [000] 303.084981: 8,0 P N [kjournald] unplug_io: kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1 kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1 remap: kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8 <- (8,8) 33384 bio_backmerge: kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald] kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald] getrq: kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald] kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald] bash-2066 [001] 1072.953770: 8,0 G N [bash] bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash] rq_complete: konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0] konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0] ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0] ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0] rq_insert: kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald] kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 [kjournald] Changelog from v2 -> v3: - use the newly introduced __dynamic_array().
Changelog from v1 -> v2: - use __string() instead of __array() to minimize the memory required to store hex dump of rq->cmd(). - support large pc requests. - add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT. - some cleanups. Signed-off-by: Li Zefan LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- block/blk-core.c | 16 +- block/elevator.c | 8 +- drivers/md/dm.c | 5 +- fs/bio.c | 3 +- include/linux/blktrace_api.h | 13 ++ include/trace/block.h | 76 ------- include/trace/events/block.h | 483 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/Makefile | 5 +- kernel/trace/blktrace.c | 78 ++++++- mm/bounce.c | 5 +- 10 files changed, 588 insertions(+), 104 deletions(-) delete mode 100644 include/trace/block.h create mode 100644 include/trace/events/block.h (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 1306de9cce0..9475bf99b89 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -28,22 +28,14 @@ #include #include #include -#include + +#define CREATE_TRACE_POINTS +#include #include "blk.h" -DEFINE_TRACE(block_plug); -DEFINE_TRACE(block_unplug_io); -DEFINE_TRACE(block_unplug_timer); -DEFINE_TRACE(block_getrq); -DEFINE_TRACE(block_sleeprq); -DEFINE_TRACE(block_rq_requeue); -DEFINE_TRACE(block_bio_backmerge); -DEFINE_TRACE(block_bio_frontmerge); -DEFINE_TRACE(block_bio_queue); -DEFINE_TRACE(block_rq_complete); -DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); static int __make_request(struct request_queue *q, struct bio *bio); diff --git a/block/elevator.c b/block/elevator.c index 7073a907257..e220f0c543e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -33,17 +33,16 @@ #include #include #include -#include #include #include +#include + #include "blk.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); -DEFINE_TRACE(block_rq_abort); - /* * Merge hash stuff. */ @@ -55,9 +54,6 @@ static const int elv_hash_shift = 6; #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) -DEFINE_TRACE(block_rq_insert); -DEFINE_TRACE(block_rq_issue); - /* * Query io scheduler to see if the current process issuing bio may be * merged with rq. diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e2ee4a79ea2..3fd8b1e6548 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -20,7 +20,8 @@ #include #include #include -#include + +#include #define DM_MSG_PREFIX "core" @@ -53,8 +54,6 @@ struct dm_target_io { union map_info info; }; -DEFINE_TRACE(block_bio_complete); - /* * For request-based dm. * One of these is allocated per request. diff --git a/fs/bio.c b/fs/bio.c index 98711647ece..740699c4f90 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -26,10 +26,9 @@ #include #include #include -#include #include /* for struct sg_iovec */ -DEFINE_TRACE(block_split); +#include /* * Test patch to inline a certain number of bi_io_vec's inside the bio diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 82b4636030e..c7ec31dd04c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -218,5 +218,18 @@ static inline int blk_trace_init_sysfs(struct device *dev) #endif /* CONFIG_BLK_DEV_IO_TRACE */ +#ifdef CONFIG_EVENT_TRACING + +static inline int blk_cmd_buf_len(struct request *rq) +{ + return blk_pc_request(rq) ? 
rq->cmd_len * 3 : 1; +} + +extern void blk_dump_cmd(char *buf, struct request *rq); +extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes); +extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq); + +#endif /* CONFIG_EVENT_TRACING */ + #endif /* __KERNEL__ */ #endif diff --git a/include/trace/block.h b/include/trace/block.h deleted file mode 100644 index 5b12efa096b..00000000000 --- a/include/trace/block.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef _TRACE_BLOCK_H -#define _TRACE_BLOCK_H - -#include -#include - -DECLARE_TRACE(block_rq_abort, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_insert, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_issue, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_requeue, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_complete, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_bio_bounce, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_complete, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_backmerge, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_frontmerge, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_queue, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_getrq, - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - TP_ARGS(q, bio, rw)); - -DECLARE_TRACE(block_sleeprq, - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - TP_ARGS(q, bio, rw)); - -DECLARE_TRACE(block_plug, - TP_PROTO(struct request_queue *q), - TP_ARGS(q)); - -DECLARE_TRACE(block_unplug_timer, - TP_PROTO(struct request_queue *q), - TP_ARGS(q)); - -DECLARE_TRACE(block_unplug_io, - TP_PROTO(struct request_queue *q), - TP_ARGS(q)); - -DECLARE_TRACE(block_split, - TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), - TP_ARGS(q, bio, pdu)); - -DECLARE_TRACE(block_remap, - TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from), - TP_ARGS(q, bio, dev, from)); - -#endif diff --git a/include/trace/events/block.h b/include/trace/events/block.h new file mode 100644 index 00000000000..a99d1e565bb --- /dev/null +++ b/include/trace/events/block.h @@ -0,0 +1,483 @@ +#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BLOCK_H + +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM block + +TRACE_EVENT(block_rq_abort, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, 6 ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? 
+ 0 : rq->hard_nr_sectors; + __entry->errors = rq->errors; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->errors) +); + +TRACE_EVENT(block_rq_insert, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( unsigned int, bytes ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __entry->bytes, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_rq_issue, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( unsigned int, bytes ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __entry->bytes, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_rq_requeue, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, 6 ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->errors = rq->errors; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->errors) +); + +TRACE_EVENT(block_rq_complete, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, 6 ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? 
disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->errors = rq->errors; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->errors) +); +TRACE_EVENT(block_bio_bounce, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_bio_complete, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned, nr_sector ) + __field( int, error ) + __array( char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + ), + + TP_printk("%d,%d %s %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->error) +); + +TRACE_EVENT(block_bio_backmerge, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_bio_frontmerge, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_bio_queue, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = 
bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_getrq, + + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + + TP_ARGS(q, bio, rw), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; + __entry->sector = bio ? bio->bi_sector : 0; + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; + blk_fill_rwbs(__entry->rwbs, + bio ? bio->bi_rw : 0, __entry->nr_sector); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_sleeprq, + + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + + TP_ARGS(q, bio, rw), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; + __entry->sector = bio ? bio->bi_sector : 0; + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; + blk_fill_rwbs(__entry->rwbs, + bio ? bio->bi_rw : 0, __entry->nr_sector); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_plug, + + TP_PROTO(struct request_queue *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("[%s]", __entry->comm) +); + +TRACE_EVENT(block_unplug_timer, + + TP_PROTO(struct request_queue *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __field( int, nr_rq ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE]; + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) +); + +TRACE_EVENT(block_unplug_io, + + TP_PROTO(struct request_queue *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __field( int, nr_rq ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE]; + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) +); + +TRACE_EVENT(block_split, + + TP_PROTO(struct request_queue *q, struct bio *bio, + unsigned int new_sector), + + TP_ARGS(q, bio, new_sector), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( sector_t, new_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->new_sector = new_sector; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu / %llu [%s]", + 
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->new_sector, __entry->comm) +); + +TRACE_EVENT(block_remap, + + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from), + + TP_ARGS(q, bio, dev, from), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( dev_t, old_dev ) + __field( sector_t, old_sector ) + __array( char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + __entry->old_dev = dev; + __entry->old_sector = from; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + ), + + TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, + MAJOR(__entry->old_dev), MINOR(__entry->old_dev), + __entry->old_sector) +); + +#endif /* _TRACE_BLOCK_H */ + +/* This part must be outside protection */ +#include + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 06b85850fab..844164dca90 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o -obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +ifeq ($(CONFIG_BLOCK),y) +obj-$(CONFIG_EVENT_TRACING) += blktrace.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e3abf55bc8e..7bd6a9893c2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,10 +23,14 @@ #include #include #include -#include #include + +#include + #include "trace_output.h" +#ifdef CONFIG_BLK_DEV_IO_TRACE + static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; @@ -1658,3 +1662,75 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#ifdef CONFIG_EVENT_TRACING + +void blk_dump_cmd(char *buf, struct request *rq) +{ + int i, end; + int len = rq->cmd_len; + unsigned char *cmd = rq->cmd; + + if (!blk_pc_request(rq)) { + buf[0] = '\0'; + return; + } + + for (end = len - 1; end >= 0; end--) + if (cmd[end]) + break; + end++; + + for (i = 0; i < len; i++) { + buf += sprintf(buf, "%s%02x", i == 0 ? 
"" : " ", cmd[i]); + if (i == end && end != len - 1) { + sprintf(buf, " .."); + break; + } + } +} + +void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +{ + int i = 0; + + if (rw & WRITE) + rwbs[i++] = 'W'; + else if (rw & 1 << BIO_RW_DISCARD) + rwbs[i++] = 'D'; + else if (bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (rw & 1 << BIO_RW_AHEAD) + rwbs[i++] = 'A'; + if (rw & 1 << BIO_RW_BARRIER) + rwbs[i++] = 'B'; + if (rw & 1 << BIO_RW_SYNCIO) + rwbs[i++] = 'S'; + if (rw & 1 << BIO_RW_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +void blk_fill_rwbs_rq(char *rwbs, struct request *rq) +{ + int rw = rq->cmd_flags & 0x03; + int bytes; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + if (blk_pc_request(rq)) + bytes = rq->data_len; + else + bytes = rq->hard_nr_sectors << 9; + + blk_fill_rwbs(rwbs, rw, bytes); +} + +#endif /* CONFIG_EVENT_TRACING */ + diff --git a/mm/bounce.c b/mm/bounce.c index e590272fe7a..65f5e17e411 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -14,16 +14,15 @@ #include #include #include -#include #include +#include + #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 static mempool_t *page_pool, *isa_page_pool; -DEFINE_TRACE(block_bio_bounce); - #ifdef CONFIG_HIGHMEM static __init int init_emergency_pool(void) { -- cgit v1.2.3-70-g09d2 From 725c624a58a10ef90a2ff889e122158fabf36147 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 8 Jun 2009 19:09:45 -0400 Subject: tracing: add trace_seq_vprint interface The code to update the print formats for events requires a vprintf format in the trace_seq. This patch adds that interface. Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 2 ++ kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index ba9627f00d3..c68bccba207 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -27,6 +27,8 @@ trace_seq_init(struct trace_seq *s) #ifdef CONFIG_TRACING extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) + __attribute__ ((format (printf, 2, 0))); extern int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 425725c1622..c05aff465dc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -100,6 +100,38 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_seq_printf); +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. 
+ */ +int +trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = vsnprintf(s->buffer + s->len, len, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { int len = (PAGE_SIZE - 1) - s->len; -- cgit v1.2.3-70-g09d2 From fd6c3a8dc44329d3aff9a578b5120982f63711ee Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 10:58:33 +0000 Subject: initconst adjustments - add .init.rodata to INIT_DATA, and group all initconst flavors together - move strings generated from __setup_param() into .init.rodata - add .*init.rodata to modpost's sets of init sections - make modpost warn about references between meminit and cpuinit as well as memexit and cpuexit sections (as CPU and memory hotplug are independently selectable features) Signed-off-by: Jan Beulich Signed-off-by: Sam Ravnborg --- include/asm-generic/vmlinux.lds.h | 5 ++-- include/linux/init.h | 3 ++- scripts/mod/modpost.c | 48 +++++++++++++++++++++++++++++++-------- 3 files changed, 43 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 89853bcd27a..3edb1149974 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -336,10 +336,11 @@ #define INIT_DATA \ *(.init.data) \ DEV_DISCARD(init.data) \ - DEV_DISCARD(init.rodata) \ CPU_DISCARD(init.data) \ - CPU_DISCARD(init.rodata) \ MEM_DISCARD(init.data) \ + *(.init.rodata) \ + DEV_DISCARD(init.rodata) \ + CPU_DISCARD(init.rodata) \ MEM_DISCARD(init.rodata) #define INIT_TEXT \ diff --git a/include/linux/init.h b/include/linux/init.h index 0e06c176f18..9f70c9f25d4 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -225,7 +225,8 @@ struct obs_kernel_param { * obs_kernel_param "array" too far apart in .init.setup. 
*/ #define __setup_param(str, unique_id, fn, early) \ - static char __setup_str_##unique_id[] __initdata __aligned(1) = str; \ + static const char __setup_str_##unique_id[] __initconst \ + __aligned(1) = str; \ static struct obs_kernel_param __setup_##unique_id \ __used __section(.init.setup) \ __attribute__((aligned((sizeof(long))))) \ diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 161b7846733..94e71efb2c1 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -763,6 +763,8 @@ static void check_section(const char *modname, struct elf_info *elf, #define ALL_INIT_DATA_SECTIONS \ + ".init.setup$", ".init.rodata$", \ + ".devinit.rodata$", ".cpuinit.rodata$", ".meminit.rodata$" \ ".init.data$", ".devinit.data$", ".cpuinit.data$", ".meminit.data$" #define ALL_EXIT_DATA_SECTIONS \ ".exit.data$", ".devexit.data$", ".cpuexit.data$", ".memexit.data$" @@ -772,21 +774,23 @@ static void check_section(const char *modname, struct elf_info *elf, #define ALL_EXIT_TEXT_SECTIONS \ ".exit.text$", ".devexit.text$", ".cpuexit.text$", ".memexit.text$" -#define ALL_INIT_SECTIONS ALL_INIT_DATA_SECTIONS, ALL_INIT_TEXT_SECTIONS -#define ALL_EXIT_SECTIONS ALL_EXIT_DATA_SECTIONS, ALL_EXIT_TEXT_SECTIONS +#define ALL_INIT_SECTIONS INIT_SECTIONS, DEV_INIT_SECTIONS, \ + CPU_INIT_SECTIONS, MEM_INIT_SECTIONS +#define ALL_EXIT_SECTIONS EXIT_SECTIONS, DEV_EXIT_SECTIONS, \ + CPU_EXIT_SECTIONS, MEM_EXIT_SECTIONS #define DATA_SECTIONS ".data$", ".data.rel$" #define TEXT_SECTIONS ".text$" -#define INIT_SECTIONS ".init.data$", ".init.text$" -#define DEV_INIT_SECTIONS ".devinit.data$", ".devinit.text$" -#define CPU_INIT_SECTIONS ".cpuinit.data$", ".cpuinit.text$" -#define MEM_INIT_SECTIONS ".meminit.data$", ".meminit.text$" +#define INIT_SECTIONS ".init.*" +#define DEV_INIT_SECTIONS ".devinit.*" +#define CPU_INIT_SECTIONS ".cpuinit.*" +#define MEM_INIT_SECTIONS ".meminit.*" -#define EXIT_SECTIONS ".exit.data$", ".exit.text$" -#define DEV_EXIT_SECTIONS ".devexit.data$", ".devexit.text$" -#define CPU_EXIT_SECTIONS ".cpuexit.data$", ".cpuexit.text$" -#define MEM_EXIT_SECTIONS ".memexit.data$", ".memexit.text$" +#define EXIT_SECTIONS ".exit.*" +#define DEV_EXIT_SECTIONS ".devexit.*" +#define CPU_EXIT_SECTIONS ".cpuexit.*" +#define MEM_EXIT_SECTIONS ".memexit.*" /* init data sections */ static const char *init_data_sections[] = { ALL_INIT_DATA_SECTIONS, NULL }; @@ -869,12 +873,36 @@ const struct sectioncheck sectioncheck[] = { .tosec = { INIT_SECTIONS, NULL }, .mismatch = XXXINIT_TO_INIT, }, +/* Do not reference cpuinit code/data from meminit code/data */ +{ + .fromsec = { MEM_INIT_SECTIONS, NULL }, + .tosec = { CPU_INIT_SECTIONS, NULL }, + .mismatch = XXXINIT_TO_INIT, +}, +/* Do not reference meminit code/data from cpuinit code/data */ +{ + .fromsec = { CPU_INIT_SECTIONS, NULL }, + .tosec = { MEM_INIT_SECTIONS, NULL }, + .mismatch = XXXINIT_TO_INIT, +}, /* Do not reference exit code/data from devexit/cpuexit/memexit code/data */ { .fromsec = { DEV_EXIT_SECTIONS, CPU_EXIT_SECTIONS, MEM_EXIT_SECTIONS, NULL }, .tosec = { EXIT_SECTIONS, NULL }, .mismatch = XXXEXIT_TO_EXIT, }, +/* Do not reference cpuexit code/data from memexit code/data */ +{ + .fromsec = { MEM_EXIT_SECTIONS, NULL }, + .tosec = { CPU_EXIT_SECTIONS, NULL }, + .mismatch = XXXEXIT_TO_EXIT, +}, +/* Do not reference memexit code/data from cpuexit code/data */ +{ + .fromsec = { CPU_EXIT_SECTIONS, NULL }, + .tosec = { MEM_EXIT_SECTIONS, NULL }, + .mismatch = XXXEXIT_TO_EXIT, +}, /* Do not use exit code/data from init code */ { .fromsec = { 
ALL_INIT_SECTIONS, NULL }, -- cgit v1.2.3-70-g09d2 From 1506e30b5f25f6c3357167a18f0e4ae6f5662a28 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Jun 2009 17:49:06 -0700 Subject: rfkill: include err.h Since we use ERR_PTR and similar macros, we need to include linux/err.h. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/rfkill.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index ee3eddea856..d7e818ad0bc 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -105,6 +105,7 @@ enum rfkill_user_states { #include #include #include +#include /* this is opaque */ struct rfkill; -- cgit v1.2.3-70-g09d2 From cf9e4e15e8f6306b2559979269ead7c02e6b2b95 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 11 Feb 2009 16:03:36 +0800 Subject: KVM: Split IOAPIC structure Prepared for reuse ioapic_redir_entry for MSI. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm_types.h | 17 +++++++++++++++++ virt/kvm/ioapic.c | 6 +++--- virt/kvm/ioapic.h | 17 +---------------- 3 files changed, 21 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 2b8318c83e5..b84aca3c4ad 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -40,4 +40,21 @@ typedef unsigned long hfn_t; typedef hfn_t pfn_t; +union kvm_ioapic_redirect_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; +}; + #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index c3b99def9cb..812801317e3 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -85,7 +85,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) { - union ioapic_redir_entry *pent; + union kvm_ioapic_redirect_entry *pent; int injected = -1; pent = &ioapic->redirtbl[idx]; @@ -284,7 +284,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) { u32 old_irr = ioapic->irr; u32 mask = 1 << irq; - union ioapic_redir_entry entry; + union kvm_ioapic_redirect_entry entry; int ret = 1; if (irq >= 0 && irq < IOAPIC_NUM_PINS) { @@ -305,7 +305,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin, int trigger_mode) { - union ioapic_redir_entry *ent; + union kvm_ioapic_redirect_entry *ent; ent = &ioapic->redirtbl[pin]; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index a34bd5e6436..008ec873d01 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -40,22 +40,7 @@ struct kvm_ioapic { u32 id; u32 irr; u32 pad; - union ioapic_redir_entry { - u64 bits; - struct { - u8 vector; - u8 delivery_mode:3; - u8 dest_mode:1; - u8 delivery_status:1; - u8 polarity:1; - u8 remote_irr:1; - u8 trig_mode:1; - u8 mask:1; - u8 reserve:7; - u8 reserved[4]; - u8 dest_id; - } fields; - } redirtbl[IOAPIC_NUM_PINS]; + union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); -- cgit v1.2.3-70-g09d2 From 116191b69b608d0f1513e3abe71d6a46800f2bd6 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 11 Feb 2009 16:03:37 +0800 Subject: KVM: Unify the delivery of IOAPIC and MSI 
interrupts Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 3 ++ virt/kvm/ioapic.c | 91 +++++++++++++++++----------------------------- virt/kvm/irq_comm.c | 95 +++++++++++++++++++++++++++++------------------- 3 files changed, 95 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 894a56e365e..1a2f98fbece 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -352,6 +352,9 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn); void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); +void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, + union kvm_ioapic_redirect_entry *entry, + unsigned long *deliver_bitmask); int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 812801317e3..883fd0dc9b7 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -203,79 +203,56 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { - u8 dest = ioapic->redirtbl[irq].fields.dest_id; - u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode; - u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode; - u8 vector = ioapic->redirtbl[irq].fields.vector; - u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; - u32 deliver_bitmask; + union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq]; + unsigned long deliver_bitmask; struct kvm_vcpu *vcpu; int vcpu_id, r = -1; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", - dest, dest_mode, delivery_mode, vector, trig_mode); + entry.fields.dest, entry.fields.dest_mode, + entry.fields.delivery_mode, entry.fields.vector, + entry.fields.trig_mode); - deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest, - dest_mode); + kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask); if (!deliver_bitmask) { ioapic_debug("no target on destination\n"); return 0; } - switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, - deliver_bitmask); + /* Always delivery PIT interrupt to vcpu 0 */ #ifdef CONFIG_X86 - if (irq == 0) - vcpu = ioapic->kvm->vcpus[0]; + if (irq == 0) + deliver_bitmask = 1; #endif - if (vcpu != NULL) - r = ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - else - ioapic_debug("null lowest prio vcpu: " - "mask=%x vector=%x delivery_mode=%x\n", - deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); - break; - case IOAPIC_FIXED: -#ifdef CONFIG_X86 - if (irq == 0) - deliver_bitmask = 1; -#endif - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask & (1 << vcpu_id))) - continue; - deliver_bitmask &= ~(1 << vcpu_id); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) { + + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask & (1 << vcpu_id))) + continue; + deliver_bitmask &= ~(1 << vcpu_id); + vcpu = ioapic->kvm->vcpus[vcpu_id]; + if (vcpu) { + if (entry.fields.delivery_mode == + IOAPIC_LOWEST_PRIORITY || + entry.fields.delivery_mode == IOAPIC_FIXED) { if (r < 0) r = 0; - r += ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - } - } - break; - case IOAPIC_NMI: - for (vcpu_id = 0; deliver_bitmask != 0; 
vcpu_id++) { - if (!(deliver_bitmask & (1 << vcpu_id))) - continue; - deliver_bitmask &= ~(1 << vcpu_id); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) { - ioapic_inj_nmi(vcpu); + r += ioapic_inj_irq(ioapic, vcpu, + entry.fields.vector, + entry.fields.trig_mode, + entry.fields.delivery_mode); + } else if (entry.fields.delivery_mode == IOAPIC_NMI) { r = 1; - } - else - ioapic_debug("NMI to vcpu %d failed\n", - vcpu->vcpu_id); - } - break; - default: - printk(KERN_WARNING "Unsupported delivery mode %d\n", - delivery_mode); - break; + ioapic_inj_nmi(vcpu); + } else + ioapic_debug("unsupported delivery mode %x!\n", + entry.fields.delivery_mode); + } else + ioapic_debug("null destination vcpu: " + "mask=%x vector=%x delivery_mode=%x\n", + entry.fields.deliver_bitmask, + entry.fields.vector, + entry.fields.delivery_mode); } return r; } diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 864ac5483ba..aec7a0d93a3 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -43,53 +43,74 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); } +void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, + union kvm_ioapic_redirect_entry *entry, + unsigned long *deliver_bitmask) +{ + struct kvm_vcpu *vcpu; + + *deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, + entry->fields.dest_id, entry->fields.dest_mode); + switch (entry->fields.delivery_mode) { + case IOAPIC_LOWEST_PRIORITY: + vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, + entry->fields.vector, *deliver_bitmask); + *deliver_bitmask = 1 << vcpu->vcpu_id; + break; + case IOAPIC_FIXED: + case IOAPIC_NMI: + break; + default: + if (printk_ratelimit()) + printk(KERN_INFO "kvm: unsupported delivery mode %d\n", + entry->fields.delivery_mode); + *deliver_bitmask = 0; + } +} + static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level) { int vcpu_id, r = -1; struct kvm_vcpu *vcpu; struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - int dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) - >> MSI_ADDR_DEST_ID_SHIFT; - int vector = (e->msi.data & MSI_DATA_VECTOR_MASK) - >> MSI_DATA_VECTOR_SHIFT; - int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, - (unsigned long *)&e->msi.address_lo); - int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, - (unsigned long *)&e->msi.data); - int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, - (unsigned long *)&e->msi.data); - u32 deliver_bitmask; + union kvm_ioapic_redirect_entry entry; + unsigned long deliver_bitmask; BUG_ON(!ioapic); - deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, - dest_id, dest_mode); - /* IOAPIC delivery mode value is the same as MSI here */ - switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, - deliver_bitmask); - if (vcpu != NULL) - r = kvm_apic_set_irq(vcpu, vector, trig_mode); - else - printk(KERN_INFO "kvm: null lowest priority vcpu!\n"); - break; - case IOAPIC_FIXED: - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask & (1 << vcpu_id))) - continue; - deliver_bitmask &= ~(1 << vcpu_id); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) { - if (r < 0) - r = 0; - r += kvm_apic_set_irq(vcpu, vector, trig_mode); - } + entry.bits = 0; + entry.fields.dest_id = (e->msi.address_lo & + MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + entry.fields.vector = (e->msi.data & + MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; + entry.fields.dest_mode = 
test_bit(MSI_ADDR_DEST_MODE_SHIFT, + (unsigned long *)&e->msi.address_lo); + entry.fields.trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, + (unsigned long *)&e->msi.data); + entry.fields.delivery_mode = test_bit( + MSI_DATA_DELIVERY_MODE_SHIFT, + (unsigned long *)&e->msi.data); + + /* TODO Deal with RH bit of MSI message address */ + + kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask); + + if (!deliver_bitmask) { + printk(KERN_WARNING "kvm: no destination for MSI delivery!"); + return -1; + } + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask & (1 << vcpu_id))) + continue; + deliver_bitmask &= ~(1 << vcpu_id); + vcpu = ioapic->kvm->vcpus[vcpu_id]; + if (vcpu) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, entry.fields.vector, + entry.fields.trig_mode); } - break; - default: - break; } return r; } -- cgit v1.2.3-70-g09d2 From c1e01514296e8a4a43ff0c88dcff635cb90feb5f Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 25 Feb 2009 17:22:26 +0800 Subject: KVM: Ioctls for init MSI-X entry Introduce two new ioctls, KVM_SET_MSIX_NR and KVM_SET_MSIX_ENTRY. These two ioctls are used by userspace to specify the guest device's MSI-X entry number and to correlate each MSI-X entry with a GSI during the initialization stage. MSI-X should be fully initialized before it is enabled. Changing the MSI-X entry number is not supported for now. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm.h | 18 ++++++++ include/linux/kvm_host.h | 10 +++++ virt/kvm/kvm_main.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 8cc137911b3..78cdee8c635 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -487,6 +487,10 @@ struct kvm_irq_routing { #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) #define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ struct kvm_assigned_pci_dev) +#define KVM_ASSIGN_SET_MSIX_NR \ + _IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr) +#define KVM_ASSIGN_SET_MSIX_ENTRY \ + _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry) /* * ioctls for vcpu fds @@ -607,4 +611,18 @@ struct kvm_assigned_irq { #define KVM_DEV_IRQ_ASSIGN_MSI_ACTION KVM_DEV_IRQ_ASSIGN_ENABLE_MSI #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 << 0) +struct kvm_assigned_msix_nr { + __u32 assigned_dev_id; + __u16 entry_nr; + __u16 padding; +}; + +#define KVM_MAX_MSIX_PER_DEV 512 +struct kvm_assigned_msix_entry { + __u32 assigned_dev_id; + __u32 gsi; + __u16 entry; /* The index of entry in the MSI-X table */ + __u16 padding[3]; +}; + #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1a2f98fbece..432edc27e82 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -319,6 +319,12 @@ struct kvm_irq_ack_notifier { void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; +struct kvm_guest_msix_entry { + u32 vector; + u16 entry; + u16 flags; +}; + struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; struct work_struct interrupt_work; @@ -326,13 +332,17 @@ struct kvm_assigned_dev_kernel { int assigned_dev_id; int host_busnr; int host_devfn; + unsigned int entries_nr; int host_irq; bool host_irq_disabled; + struct msix_entry *host_msix_entries; int guest_irq; + struct kvm_guest_msix_entry *guest_msix_entries; #define KVM_ASSIGNED_DEV_GUEST_INTX (1 << 0) #define KVM_ASSIGNED_DEV_GUEST_MSI (1 << 1) #define KVM_ASSIGNED_DEV_HOST_INTX (1 << 8) #define KVM_ASSIGNED_DEV_HOST_MSI (1 << 9) +#define KVM_ASSIGNED_DEV_MSIX ((1 << 2) | (1 <<
10)) unsigned long irq_requested_type; int irq_source_id; int flags; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4d0dd390aa5..1ceb96901f3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1593,6 +1593,88 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) return 0; } +#ifdef __KVM_HAVE_MSIX +static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, + struct kvm_assigned_msix_nr *entry_nr) +{ + int r = 0; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry_nr->assigned_dev_id); + if (!adev) { + r = -EINVAL; + goto msix_nr_out; + } + + if (adev->entries_nr == 0) { + adev->entries_nr = entry_nr->entry_nr; + if (adev->entries_nr == 0 || + adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { + r = -EINVAL; + goto msix_nr_out; + } + + adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * + entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->host_msix_entries) { + r = -ENOMEM; + goto msix_nr_out; + } + adev->guest_msix_entries = kzalloc( + sizeof(struct kvm_guest_msix_entry) * + entry_nr->entry_nr, GFP_KERNEL); + if (!adev->guest_msix_entries) { + kfree(adev->host_msix_entries); + r = -ENOMEM; + goto msix_nr_out; + } + } else /* Not allowed set MSI-X number twice */ + r = -EINVAL; +msix_nr_out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, + struct kvm_assigned_msix_entry *entry) +{ + int r = 0, i; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry->assigned_dev_id); + + if (!adev) { + r = -EINVAL; + goto msix_entry_out; + } + + for (i = 0; i < adev->entries_nr; i++) + if (adev->guest_msix_entries[i].vector == 0 || + adev->guest_msix_entries[i].entry == entry->entry) { + adev->guest_msix_entries[i].entry = entry->entry; + adev->guest_msix_entries[i].vector = entry->gsi; + adev->host_msix_entries[i].entry = entry->entry; + break; + } + if (i == adev->entries_nr) { + r = -ENOSPC; + goto msix_entry_out; + } + +msix_entry_out: + mutex_unlock(&kvm->lock); + + return r; +} +#endif + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1917,7 +1999,29 @@ static long kvm_vm_ioctl(struct file *filp, vfree(entries); break; } +#ifdef __KVM_HAVE_MSIX + case KVM_ASSIGN_SET_MSIX_NR: { + struct kvm_assigned_msix_nr entry_nr; + r = -EFAULT; + if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) + goto out; + r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); + if (r) + goto out; + break; + } + case KVM_ASSIGN_SET_MSIX_ENTRY: { + struct kvm_assigned_msix_entry entry; + r = -EFAULT; + if (copy_from_user(&entry, argp, sizeof entry)) + goto out; + r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); + if (r) + goto out; + break; + } #endif +#endif /* KVM_CAP_IRQ_ROUTING */ default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } -- cgit v1.2.3-70-g09d2 From 2350bd1f62c8706c22b8e58c3bfff10806c0a31b Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 25 Feb 2009 17:22:27 +0800 Subject: KVM: Add MSI-X interrupt injection logic We have to handle more than one interrupt with one handler for MSI-X. Avi suggested using a flag to indicate the pending interrupt, so here it is.
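As a compressed illustration (not the kernel code; guest_msix, msix_hard_irq, msix_work and inject are simplified stand-ins), the pattern is: the hard-IRQ handler only records which entry fired, and the deferred work injects everything that is pending and re-enables the corresponding host vectors:

#define MSIX_PENDING 0x1	/* mirrors KVM_ASSIGNED_MSIX_PENDING below */

struct guest_msix {
	unsigned int gsi;	/* guest interrupt to inject */
	unsigned int host_vec;	/* host MSI-X vector of this entry */
	unsigned short flags;	/* MSIX_PENDING is set in hard-IRQ context */
};

/* Hard-IRQ side: mark the entry whose host vector fired, then defer. */
static void msix_hard_irq(struct guest_msix *e, int nr, unsigned int vec)
{
	for (int i = 0; i < nr; i++)
		if (e[i].host_vec == vec)
			e[i].flags |= MSIX_PENDING;
	/* the real handler then disables the IRQ and schedules the work */
}

/* Work side: inject every pending entry (under the VM lock in KVM). */
static void msix_work(struct guest_msix *e, int nr, void (*inject)(unsigned int gsi))
{
	for (int i = 0; i < nr; i++) {
		if (!(e[i].flags & MSIX_PENDING))
			continue;
		e[i].flags &= ~MSIX_PENDING;
		inject(e[i].gsi);
		/* the real code re-enables e[i].host_vec here */
	}
}

One queued work item can thus drain any number of vectors that fired in the meantime, which is why a per-entry flag suffices and no event queue is needed.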
Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 66 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 432edc27e82..3832243625d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -319,6 +319,7 @@ struct kvm_irq_ack_notifier { void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; +#define KVM_ASSIGNED_MSIX_PENDING 0x1 struct kvm_guest_msix_entry { u32 vector; u16 entry; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1ceb96901f3..8bd44d6985c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -95,25 +95,69 @@ static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *h return NULL; } +static int find_index_from_host_irq(struct kvm_assigned_dev_kernel + *assigned_dev, int irq) +{ + int i, index; + struct msix_entry *host_msix_entries; + + host_msix_entries = assigned_dev->host_msix_entries; + + index = -1; + for (i = 0; i < assigned_dev->entries_nr; i++) + if (irq == host_msix_entries[i].vector) { + index = i; + break; + } + if (index < 0) { + printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); + return 0; + } + + return index; +} + static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) { struct kvm_assigned_dev_kernel *assigned_dev; + struct kvm *kvm; + int irq, i; assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, interrupt_work); + kvm = assigned_dev->kvm; /* This is taken to safely inject irq inside the guest. When * the interrupt injection (or the ioapic code) uses a * finer-grained lock, update this */ - mutex_lock(&assigned_dev->kvm->lock); - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - - if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) { - enable_irq(assigned_dev->host_irq); - assigned_dev->host_irq_disabled = false; + mutex_lock(&kvm->lock); + if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) { + struct kvm_guest_msix_entry *guest_entries = + assigned_dev->guest_msix_entries; + for (i = 0; i < assigned_dev->entries_nr; i++) { + if (!(guest_entries[i].flags & + KVM_ASSIGNED_MSIX_PENDING)) + continue; + guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, + guest_entries[i].vector, 1); + irq = assigned_dev->host_msix_entries[i].vector; + if (irq != 0) + enable_irq(irq); + assigned_dev->host_irq_disabled = false; + } + } else { + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + if (assigned_dev->irq_requested_type & + KVM_ASSIGNED_DEV_GUEST_MSI) { + enable_irq(assigned_dev->host_irq); + assigned_dev->host_irq_disabled = false; + } } + mutex_unlock(&assigned_dev->kvm->lock); } @@ -122,6 +166,14 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) struct kvm_assigned_dev_kernel *assigned_dev = (struct kvm_assigned_dev_kernel *) dev_id; + if (assigned_dev->irq_requested_type == KVM_ASSIGNED_DEV_MSIX) { + int index = find_index_from_host_irq(assigned_dev, irq); + if (index < 0) + return IRQ_HANDLED; + assigned_dev->guest_msix_entries[index].flags |= + KVM_ASSIGNED_MSIX_PENDING; + } + schedule_work(&assigned_dev->interrupt_work); disable_irq_nosync(irq); -- cgit v1.2.3-70-g09d2 From d510d6cc653bc4b3094ea73afe12600d0ab445b3 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 
25 Feb 2009 17:22:28 +0800 Subject: KVM: Enable MSI-X for KVM assigned device This patch finally enables MSI-X. What we need for MSI-X: 1. Intercept one page in the MMIO region of the device, so that we can get the guest's desired MSI-X table and set up the real one. This is now done by the guest and transferred to the kernel using the KVM_SET_MSIX_NR and KVM_SET_MSIX_ENTRY ioctls. 2. Information for the incoming interrupt. One device can now have more than one interrupt, and they are all handled by one workqueue structure, so we need to identify them. The previous patch's gsi_msg_pending_bitmap gets this done. 3. Mapping from host IRQ to guest gsi as well as guest gsi to the real MSI/MSI-X message address/data. We use the same entry number for the host and the guest here, so that it's easy to find the correlated guest gsi. What we lack for now: 1. The PCI spec says nothing else may exist in the same page of the MMIO region as the MSI-X table, except pending bits. The patch ignores pending bits as a first step (so they are always 0 - no pending). 2. The PCI spec allows the MSI-X table to be changed dynamically. That means the OS can enable MSI-X, then mask one MSI-X entry, modify it, and unmask it. The patch doesn't support this, and Linux doesn't work this way either. 3. The patch doesn't implement MSI-X mask-all or masking of a single entry. I would implement the former in drivers/pci/msi.c later; for a single entry, userspace has the responsibility to handle it. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + include/linux/kvm.h | 8 ++++ virt/kvm/kvm_main.c | 98 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 101 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index dc3f6cf1170..125be8b1956 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -16,6 +16,7 @@ #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI #define __KVM_HAVE_GUEST_DEBUG +#define __KVM_HAVE_MSIX /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 78cdee8c635..640835ed270 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -409,6 +409,9 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_DEASSIGNMENT 27 #endif +#ifdef __KVM_HAVE_MSIX +#define KVM_CAP_DEVICE_MSIX 28 +#endif /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 @@ -611,6 +614,11 @@ struct kvm_assigned_irq { #define KVM_DEV_IRQ_ASSIGN_MSI_ACTION KVM_DEV_IRQ_ASSIGN_ENABLE_MSI #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 << 0) +#define KVM_DEV_IRQ_ASSIGN_MSIX_ACTION (KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX |\ + KVM_DEV_IRQ_ASSIGN_MASK_MSIX) +#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX (1 << 1) +#define KVM_DEV_IRQ_ASSIGN_MASK_MSIX (1 << 2) + struct kvm_assigned_msix_nr { __u32 assigned_dev_id; __u16 entry_nr; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8bd44d6985c..3bed82754a5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -236,13 +236,33 @@ static void kvm_free_assigned_irq(struct kvm *kvm, * now, the kvm state is still legal for probably we also have to wait * interrupt_work done.
*/ - disable_irq_nosync(assigned_dev->host_irq); - cancel_work_sync(&assigned_dev->interrupt_work); + if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) { + int i; + for (i = 0; i < assigned_dev->entries_nr; i++) + disable_irq_nosync(assigned_dev-> + host_msix_entries[i].vector); + + cancel_work_sync(&assigned_dev->interrupt_work); - free_irq(assigned_dev->host_irq, (void *)assigned_dev); + for (i = 0; i < assigned_dev->entries_nr; i++) + free_irq(assigned_dev->host_msix_entries[i].vector, + (void *)assigned_dev); - if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) - pci_disable_msi(assigned_dev->dev); + assigned_dev->entries_nr = 0; + kfree(assigned_dev->host_msix_entries); + kfree(assigned_dev->guest_msix_entries); + pci_disable_msix(assigned_dev->dev); + } else { + /* Deal with MSI and INTx */ + disable_irq_nosync(assigned_dev->host_irq); + cancel_work_sync(&assigned_dev->interrupt_work); + + free_irq(assigned_dev->host_irq, (void *)assigned_dev); + + if (assigned_dev->irq_requested_type & + KVM_ASSIGNED_DEV_HOST_MSI) + pci_disable_msi(assigned_dev->dev); + } assigned_dev->irq_requested_type = 0; } @@ -373,6 +393,60 @@ static int assigned_device_update_msi(struct kvm *kvm, } #endif +#ifdef __KVM_HAVE_MSIX +static int assigned_device_update_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *adev, + struct kvm_assigned_irq *airq) +{ + /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ + int i, r; + + adev->ack_notifier.gsi = -1; + + if (irqchip_in_kernel(kvm)) { + if (airq->flags & KVM_DEV_IRQ_ASSIGN_MASK_MSIX) + return -ENOTTY; + + if (!(airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX)) { + /* Guest disable MSI-X */ + kvm_free_assigned_irq(kvm, adev); + if (msi2intx) { + pci_enable_msi(adev->dev); + if (adev->dev->msi_enabled) + return assigned_device_update_msi(kvm, + adev, airq); + } + return assigned_device_update_intx(kvm, adev, airq); + } + + /* host_msix_entries and guest_msix_entries should have been + * initialized */ + if (adev->entries_nr == 0) + return -EINVAL; + + kvm_free_assigned_irq(kvm, adev); + + r = pci_enable_msix(adev->dev, adev->host_msix_entries, + adev->entries_nr); + if (r) + return r; + + for (i = 0; i < adev->entries_nr; i++) { + r = request_irq((adev->host_msix_entries + i)->vector, + kvm_assigned_dev_intr, 0, + "kvm_assigned_msix_device", + (void *)adev); + if (r) + return r; + } + } + + adev->irq_requested_type |= KVM_ASSIGNED_DEV_MSIX; + + return 0; +} +#endif + static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, struct kvm_assigned_irq *assigned_irq) @@ -417,12 +491,24 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, } } - if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) && + if (match->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) + current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX; + else if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) && (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI)) current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI; changed_flags = assigned_irq->flags ^ current_flags; +#ifdef __KVM_HAVE_MSIX + if (changed_flags & KVM_DEV_IRQ_ASSIGN_MSIX_ACTION) { + r = assigned_device_update_msix(kvm, match, assigned_irq); + if (r) { + printk(KERN_WARNING "kvm: failed to execute " + "MSI-X action!\n"); + goto out_release; + } + } else +#endif if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) || (msi2intx && match->dev->msi_enabled)) { #ifdef CONFIG_X86 -- cgit v1.2.3-70-g09d2 From b95b51d580bff9376850eef29d34c3aa08c26db7 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 26 
Feb 2009 13:55:33 +0100 Subject: KVM: declare ioapic functions only on affected hardware Since "KVM: Unify the delivery of IOAPIC and MSI interrupts" I get the following warnings: CC [M] arch/s390/kvm/kvm-s390.o In file included from arch/s390/kvm/kvm-s390.c:22: include/linux/kvm_host.h:357: warning: 'struct kvm_ioapic' declared inside parameter list include/linux/kvm_host.h:357: warning: its scope is only this definition or declaration, which is probably not what you want This patch limits the IOAPIC function declarations to architectures that actually have an IOAPIC. Signed-off-by: Christian Borntraeger Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3832243625d..3b91ec9982c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -363,9 +363,11 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn); void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); +#ifdef __KVM_HAVE_IOAPIC void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, union kvm_ioapic_redirect_entry *entry, unsigned long *deliver_bitmask); +#endif int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, -- cgit v1.2.3-70-g09d2 From a53c17d21c46a752f5ac6695376481bc27865b04 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 5 Mar 2009 16:34:49 +0200 Subject: KVM: ioapic/msi interrupt delivery consolidation ioapic_deliver() and kvm_set_msi() contain duplicated code. Move the common code into a new ioapic_deliver_entry() function and call it from both places. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 2 +- virt/kvm/ioapic.c | 61 +++++++++++++++++++++++++----------------------- virt/kvm/ioapic.h | 4 ++-- virt/kvm/irq_comm.c | 32 +++---------------------- 4 files changed, 38 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b91ec9982c..ec9d078b1e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -364,7 +364,7 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); #ifdef __KVM_HAVE_IOAPIC -void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, +void kvm_get_intr_delivery_bitmask(struct kvm *kvm, union kvm_ioapic_redirect_entry *entry, unsigned long *deliver_bitmask); #endif diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index d4a7948b010..b71c0442cec 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -142,54 +142,57 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } } -static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) +int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e) { - union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq]; DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); - struct kvm_vcpu *vcpu; - int vcpu_id, r = -1; + int i, r = -1; - ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " - "vector=%x trig_mode=%x\n", - entry.fields.dest, entry.fields.dest_mode, - entry.fields.delivery_mode, entry.fields.vector, - entry.fields.trig_mode); - - /* Always delivery PIT interrupt to vcpu 0 */ -#ifdef CONFIG_X86 - if (irq == 0) { - bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); - __set_bit(0,
deliver_bitmask); - } else -#endif - kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask); + kvm_get_intr_delivery_bitmask(kvm, e, deliver_bitmask); if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) { ioapic_debug("no target on destination\n"); - return 0; + return r; } - while ((vcpu_id = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS)) + while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS)) < KVM_MAX_VCPUS) { - __clear_bit(vcpu_id, deliver_bitmask); - vcpu = ioapic->kvm->vcpus[vcpu_id]; + struct kvm_vcpu *vcpu = kvm->vcpus[i]; + __clear_bit(i, deliver_bitmask); if (vcpu) { if (r < 0) r = 0; - r += kvm_apic_set_irq(vcpu, - entry.fields.vector, - entry.fields.trig_mode, - entry.fields.delivery_mode); + r += kvm_apic_set_irq(vcpu, e->fields.vector, + e->fields.delivery_mode, + e->fields.trig_mode); } else ioapic_debug("null destination vcpu: " "mask=%x vector=%x delivery_mode=%x\n", - entry.fields.deliver_bitmask, - entry.fields.vector, - entry.fields.delivery_mode); + e->fields.deliver_bitmask, + e->fields.vector, e->fields.delivery_mode); } return r; } +static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) +{ + union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq]; + + ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " + "vector=%x trig_mode=%x\n", + entry.fields.dest, entry.fields.dest_mode, + entry.fields.delivery_mode, entry.fields.vector, + entry.fields.trig_mode); + +#ifdef CONFIG_X86 + /* Always delivery PIT interrupt to vcpu 0 */ + if (irq == 0) { + entry.fields.dest_mode = 0; /* Physical mode. */ + entry.fields.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; + } +#endif + return ioapic_deliver_entry(ioapic->kvm, &entry); +} + int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) { u32 old_irr = ioapic->irr; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index c8032ab2a4e..bedeea59cc1 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -70,8 +70,8 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); -void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, +void kvm_get_intr_delivery_bitmask(struct kvm *kvm, union kvm_ioapic_redirect_entry *entry, unsigned long *deliver_bitmask); - +int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e); #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 325c6685f20..35397a569b2 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -43,12 +43,11 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); } -void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, +void kvm_get_intr_delivery_bitmask(struct kvm *kvm, union kvm_ioapic_redirect_entry *entry, unsigned long *deliver_bitmask) { int i; - struct kvm *kvm = ioapic->kvm; struct kvm_vcpu *vcpu; bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); @@ -90,7 +89,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, switch (entry->fields.delivery_mode) { case IOAPIC_LOWEST_PRIORITY: /* Select one in deliver_bitmask */ - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, + vcpu = kvm_get_lowest_prio_vcpu(kvm, entry->fields.vector, deliver_bitmask); bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); if (!vcpu) @@ -111,13 +110,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, static int 
kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level) { - int vcpu_id, r = -1; - struct kvm_vcpu *vcpu; - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); union kvm_ioapic_redirect_entry entry; - DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); - - BUG_ON(!ioapic); entry.bits = 0; entry.fields.dest_id = (e->msi.address_lo & @@ -133,26 +126,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, (unsigned long *)&e->msi.data); /* TODO Deal with RH bit of MSI message address */ - - kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask); - - if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) { - printk(KERN_WARNING "kvm: no destination for MSI delivery!"); - return -1; - } - while ((vcpu_id = find_first_bit(deliver_bitmask, - KVM_MAX_VCPUS)) < KVM_MAX_VCPUS) { - __clear_bit(vcpu_id, deliver_bitmask); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) { - if (r < 0) - r = 0; - r += kvm_apic_set_irq(vcpu, entry.fields.vector, - entry.fields.dest_mode, - entry.fields.trig_mode); - } - } - return r; + return ioapic_deliver_entry(kvm, &entry); } /* This should be called with the kvm->lock mutex held -- cgit v1.2.3-70-g09d2 From 343f94fe4d16ec898da77720c03da9e09f8523d2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 5 Mar 2009 16:34:54 +0200 Subject: KVM: consolidate ioapic/ipi interrupt delivery logic Use kvm_apic_match_dest() in kvm_get_intr_delivery_bitmask() instead of duplicating the same code. Use kvm_get_intr_delivery_bitmask() in apic_send_ipi() to figure out ipi destination instead of reimplementing the logic. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/ia64/kvm/kvm-ia64.c | 8 ++++++ arch/ia64/kvm/lapic.h | 3 ++ arch/x86/kvm/lapic.c | 69 +++++++++++++++++--------------------------- arch/x86/kvm/lapic.h | 2 ++ include/linux/kvm_host.h | 5 ---- virt/kvm/ioapic.c | 5 +++- virt/kvm/ioapic.h | 10 ++++--- virt/kvm/irq_comm.c | 74 ++++++++++++++---------------------------------- 8 files changed, 70 insertions(+), 106 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 99d6d174d93..8eea9cba7b7 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1852,6 +1852,14 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, return lvcpu; } +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode) +{ + return (dest_mode == 0) ? 
+ kvm_apic_match_physical_addr(target, dest) : + kvm_apic_match_logical_addr(target, dest); +} + static int find_highest_bits(int *dat) { u32 bits, bitnum; diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h index cbcfaa6195c..31602e7338d 100644 --- a/arch/ia64/kvm/lapic.h +++ b/arch/ia64/kvm/lapic.h @@ -20,6 +20,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode); +bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig); #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a42f968a23e..998862a3c26 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -260,7 +260,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) { - return kvm_apic_id(apic) == dest; + return dest == 0xff || kvm_apic_id(apic) == dest; } int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) @@ -289,37 +289,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) return result; } -static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode) { int result = 0; struct kvm_lapic *target = vcpu->arch.apic; apic_debug("target %p, source %p, dest 0x%x, " - "dest_mode 0x%x, short_hand 0x%x", + "dest_mode 0x%x, short_hand 0x%x\n", target, source, dest, dest_mode, short_hand); ASSERT(!target); switch (short_hand) { case APIC_DEST_NOSHORT: - if (dest_mode == 0) { + if (dest_mode == 0) /* Physical mode. */ - if ((dest == 0xFF) || (dest == kvm_apic_id(target))) - result = 1; - } else + result = kvm_apic_match_physical_addr(target, dest); + else /* Logical mode. 
*/ result = kvm_apic_match_logical_addr(target, dest); break; case APIC_DEST_SELF: - if (target == source) - result = 1; + result = (target == source); break; case APIC_DEST_ALLINC: result = 1; break; case APIC_DEST_ALLBUT: - if (target != source) - result = 1; + result = (target != source); break; default: printk(KERN_WARNING "Bad dest shorthand value %x\n", @@ -492,38 +489,26 @@ static void apic_send_ipi(struct kvm_lapic *apic) unsigned int delivery_mode = icr_low & APIC_MODE_MASK; unsigned int vector = icr_low & APIC_VECTOR_MASK; - struct kvm_vcpu *target; - struct kvm_vcpu *vcpu; - DECLARE_BITMAP(lpr_map, KVM_MAX_VCPUS); + DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); int i; - bitmap_zero(lpr_map, KVM_MAX_VCPUS); apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", icr_high, icr_low, short_hand, dest, trig_mode, level, dest_mode, delivery_mode, vector); - for (i = 0; i < KVM_MAX_VCPUS; i++) { - vcpu = apic->vcpu->kvm->vcpus[i]; - if (!vcpu) - continue; - - if (vcpu->arch.apic && - apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { - if (delivery_mode == APIC_DM_LOWEST) - __set_bit(vcpu->vcpu_id, lpr_map); - else - __apic_accept_irq(vcpu->arch.apic, delivery_mode, - vector, level, trig_mode); - } - } - - if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); - if (target != NULL) - __apic_accept_irq(target->arch.apic, delivery_mode, - vector, level, trig_mode); + kvm_get_intr_delivery_bitmask(apic->vcpu->kvm, apic, dest, dest_mode, + delivery_mode == APIC_DM_LOWEST, short_hand, + deliver_bitmask); + + while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS)) + < KVM_MAX_VCPUS) { + struct kvm_vcpu *vcpu = apic->vcpu->kvm->vcpus[i]; + __clear_bit(i, deliver_bitmask); + if (vcpu) + __apic_accept_irq(vcpu->arch.apic, delivery_mode, + vector, level, trig_mode); } } @@ -930,16 +915,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_reset); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +bool kvm_apic_present(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - int ret = 0; - - if (!apic) - return 0; - ret = apic_enabled(apic); + return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); +} - return ret; +int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_enabled); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 1b0e3c03cb3..b66dc14a969 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -37,6 +37,8 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +bool kvm_apic_present(struct kvm_vcpu *vcpu); +bool kvm_lapic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ec9d078b1e8..fb60f31c4fb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -363,11 +363,6 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn); void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); -#ifdef __KVM_HAVE_IOAPIC -void kvm_get_intr_delivery_bitmask(struct kvm *kvm, - union 
kvm_ioapic_redirect_entry *entry, - unsigned long *deliver_bitmask); -#endif int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index b71c0442cec..43969bbf127 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -147,7 +147,10 @@ int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e) DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); int i, r = -1; - kvm_get_intr_delivery_bitmask(kvm, e, deliver_bitmask); + kvm_get_intr_delivery_bitmask(kvm, NULL, e->fields.dest_id, + e->fields.dest_mode, + e->fields.delivery_mode == IOAPIC_LOWEST_PRIORITY, + 0, deliver_bitmask); if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) { ioapic_debug("no target on destination\n"); diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index bedeea59cc1..d996c7abc46 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -65,13 +65,15 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) } struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long *bitmap); + unsigned long *bitmap); +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); -void kvm_get_intr_delivery_bitmask(struct kvm *kvm, - union kvm_ioapic_redirect_entry *entry, - unsigned long *deliver_bitmask); +void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src, + int dest_id, int dest_mode, bool low_prio, int short_hand, + unsigned long *deliver_bitmask); int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e); #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 35397a569b2..e43701c0a5c 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -43,67 +43,35 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); } -void kvm_get_intr_delivery_bitmask(struct kvm *kvm, - union kvm_ioapic_redirect_entry *entry, - unsigned long *deliver_bitmask) +void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src, + int dest_id, int dest_mode, bool low_prio, int short_hand, + unsigned long *deliver_bitmask) { int i; struct kvm_vcpu *vcpu; + if (dest_mode == 0 && dest_id == 0xff && low_prio) + printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); + for (i = 0; i < KVM_MAX_VCPUS; i++) { + vcpu = kvm->vcpus[i]; - if (entry->fields.dest_mode == 0) { /* Physical mode. */ - if (entry->fields.dest_id == 0xFF) { /* Broadcast. 
*/ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) - __set_bit(i, deliver_bitmask); - /* Lowest priority shouldn't combine with broadcast */ - if (entry->fields.delivery_mode == - IOAPIC_LOWEST_PRIORITY && printk_ratelimit()) - printk(KERN_INFO "kvm: apic: phys broadcast " - "and lowest prio\n"); - return; - } - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; - if (kvm_apic_match_physical_addr(vcpu->arch.apic, - entry->fields.dest_id)) { - if (vcpu->arch.apic) - __set_bit(i, deliver_bitmask); - break; - } - } - } else if (entry->fields.dest_id != 0) /* Logical mode, MDA non-zero. */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; - if (vcpu->arch.apic && - kvm_apic_match_logical_addr(vcpu->arch.apic, - entry->fields.dest_id)) - __set_bit(i, deliver_bitmask); - } + if (!vcpu || !kvm_apic_present(vcpu)) + continue; - switch (entry->fields.delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - /* Select one in deliver_bitmask */ - vcpu = kvm_get_lowest_prio_vcpu(kvm, - entry->fields.vector, deliver_bitmask); - bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); - if (!vcpu) - return; - __set_bit(vcpu->vcpu_id, deliver_bitmask); - break; - case IOAPIC_FIXED: - case IOAPIC_NMI: - break; - default: - if (printk_ratelimit()) - printk(KERN_INFO "kvm: unsupported delivery mode %d\n", - entry->fields.delivery_mode); + if (!kvm_apic_match_dest(vcpu, src, short_hand, dest_id, + dest_mode)) + continue; + + __set_bit(i, deliver_bitmask); + } + + if (low_prio) { + vcpu = kvm_get_lowest_prio_vcpu(kvm, 0, deliver_bitmask); bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); + if (vcpu) + __set_bit(vcpu->vcpu_id, deliver_bitmask); } } -- cgit v1.2.3-70-g09d2 From 58c2dde17d6eb6c8c0566e52d184aa16755d890f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 5 Mar 2009 16:35:04 +0200 Subject: KVM: APIC: get rid of deliver_bitmask Deliver interrupt during destination matching loop. Signed-off-by: Gleb Natapov Acked-by: Xiantao Zhang Signed-off-by: Marcelo Tosatti --- arch/ia64/kvm/kvm-ia64.c | 33 ++++++++++++---------- arch/ia64/kvm/lapic.h | 4 +-- arch/x86/kvm/lapic.c | 59 +++++++++++---------------------------- arch/x86/kvm/lapic.h | 3 +- include/linux/kvm_types.h | 10 +++++++ virt/kvm/ioapic.c | 57 +++++++++++-------------------------- virt/kvm/ioapic.h | 6 ++-- virt/kvm/irq_comm.c | 71 ++++++++++++++++++++++++++++------------------- 8 files changed, 108 insertions(+), 135 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 1887a93a2bd..acf43ec4270 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -283,6 +283,18 @@ static int handle_sal_call(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } +static int __apic_accept_irq(struct kvm_vcpu *vcpu, uint64_t vector) +{ + struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd); + + if (!test_and_set_bit(vector, &vpd->irr[0])) { + vcpu->arch.irq_new_pending = 1; + kvm_vcpu_kick(vcpu); + return 1; + } + return 0; +} + /* * offset: address offset to IPI space. * value: deliver value. 
@@ -292,20 +304,20 @@ static void vcpu_deliver_ipi(struct kvm_vcpu *vcpu, uint64_t dm, { switch (dm) { case SAPIC_FIXED: - kvm_apic_set_irq(vcpu, vector, dm, 0); break; case SAPIC_NMI: - kvm_apic_set_irq(vcpu, 2, dm, 0); + vector = 2; break; case SAPIC_EXTINT: - kvm_apic_set_irq(vcpu, 0, dm, 0); + vector = 0; break; case SAPIC_INIT: case SAPIC_PMI: default: printk(KERN_ERR"kvm: Unimplemented Deliver reserved IPI!\n"); - break; + return; } + __apic_accept_irq(vcpu, vector); } static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id, @@ -1813,17 +1825,9 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) put_cpu(); } -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { - - struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd); - - if (!test_and_set_bit(vec, &vpd->irr[0])) { - vcpu->arch.irq_new_pending = 1; - kvm_vcpu_kick(vcpu); - return 1; - } - return 0; + return __apic_accept_irq(vcpu, irq->vector); } int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) @@ -1844,6 +1848,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode) { + struct kvm_lapic *target = vcpu->arch.apic; return (dest_mode == 0) ? kvm_apic_match_physical_addr(target, dest) : kvm_apic_match_logical_addr(target, dest); diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h index e42109e6ca4..ee541cebcd7 100644 --- a/arch/ia64/kvm/lapic.h +++ b/arch/ia64/kvm/lapic.h @@ -23,7 +23,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); -bool kvm_apic_present(struct kvm_vcpu *vcpu); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); +#define kvm_apic_present(x) (true) #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 814466f455d..dd934d27040 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -199,27 +199,12 @@ EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { struct kvm_lapic *apic = vcpu->arch.apic; - int lapic_dmode; - switch (dmode) { - case IOAPIC_LOWEST_PRIORITY: - lapic_dmode = APIC_DM_LOWEST; - break; - case IOAPIC_FIXED: - lapic_dmode = APIC_DM_FIXED; - break; - case IOAPIC_NMI: - lapic_dmode = APIC_DM_NMI; - break; - default: - printk(KERN_DEBUG"Ignoring delivery mode %d\n", dmode); - return 0; - break; - } - return __apic_accept_irq(apic, lapic_dmode, vec, 1, trig); + return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, + irq->level, irq->trig_mode); } static inline int apic_find_highest_isr(struct kvm_lapic *apic) @@ -447,36 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic) { u32 icr_low = apic_get_reg(apic, APIC_ICR); u32 icr_high = apic_get_reg(apic, APIC_ICR2); + struct kvm_lapic_irq irq; - unsigned int dest = GET_APIC_DEST_FIELD(icr_high); - unsigned int short_hand = icr_low & APIC_SHORT_MASK; - unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; - unsigned int 
level = icr_low & APIC_INT_ASSERT; - unsigned int dest_mode = icr_low & APIC_DEST_MASK; - unsigned int delivery_mode = icr_low & APIC_MODE_MASK; - unsigned int vector = icr_low & APIC_VECTOR_MASK; - - DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); - int i; + irq.vector = icr_low & APIC_VECTOR_MASK; + irq.delivery_mode = icr_low & APIC_MODE_MASK; + irq.dest_mode = icr_low & APIC_DEST_MASK; + irq.level = icr_low & APIC_INT_ASSERT; + irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; + irq.shorthand = icr_low & APIC_SHORT_MASK; + irq.dest_id = GET_APIC_DEST_FIELD(icr_high); apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", - icr_high, icr_low, short_hand, dest, - trig_mode, level, dest_mode, delivery_mode, vector); - - kvm_get_intr_delivery_bitmask(apic->vcpu->kvm, apic, dest, dest_mode, - delivery_mode == APIC_DM_LOWEST, short_hand, - deliver_bitmask); - - while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS)) - < KVM_MAX_VCPUS) { - struct kvm_vcpu *vcpu = apic->vcpu->kvm->vcpus[i]; - __clear_bit(i, deliver_bitmask); - if (vcpu) - __apic_accept_irq(vcpu->arch.apic, delivery_mode, - vector, level, trig_mode); - } + icr_high, icr_low, irq.shorthand, irq.dest, + irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, + irq.vector); + + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); } static u32 apic_get_tmcct(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index b66dc14a969..a587f8349c4 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -31,14 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); int kvm_lapic_enabled(struct kvm_vcpu *vcpu); bool kvm_apic_present(struct kvm_vcpu *vcpu); -bool kvm_lapic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index b84aca3c4ad..fb46efbeabe 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -57,4 +57,14 @@ union kvm_ioapic_redirect_entry { } fields; }; +struct kvm_lapic_irq { + u32 vector; + u32 delivery_mode; + u32 dest_mode; + u32 level; + u32 trig_mode; + u32 shorthand; + u32 dest_id; +}; + #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 43969bbf127..1eddae94bab 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -142,58 +142,33 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } } -int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e) -{ - DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); - int i, r = -1; - - kvm_get_intr_delivery_bitmask(kvm, NULL, e->fields.dest_id, - e->fields.dest_mode, - e->fields.delivery_mode == IOAPIC_LOWEST_PRIORITY, - 0, deliver_bitmask); - - if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) { - ioapic_debug("no target on destination\n"); - return r; - } - - while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS)) - < 
KVM_MAX_VCPUS) { - struct kvm_vcpu *vcpu = kvm->vcpus[i]; - __clear_bit(i, deliver_bitmask); - if (vcpu) { - if (r < 0) - r = 0; - r += kvm_apic_set_irq(vcpu, e->fields.vector, - e->fields.delivery_mode, - e->fields.trig_mode); - } else - ioapic_debug("null destination vcpu: " - "mask=%x vector=%x delivery_mode=%x\n", - e->fields.deliver_bitmask, - e->fields.vector, e->fields.delivery_mode); - } - return r; -} - static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { - union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq]; + union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; + struct kvm_lapic_irq irqe; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", - entry.fields.dest, entry.fields.dest_mode, - entry.fields.delivery_mode, entry.fields.vector, - entry.fields.trig_mode); + entry->fields.dest, entry->fields.dest_mode, + entry->fields.delivery_mode, entry->fields.vector, + entry->fields.trig_mode); + + irqe.dest_id = entry->fields.dest_id; + irqe.vector = entry->fields.vector; + irqe.dest_mode = entry->fields.dest_mode; + irqe.trig_mode = entry->fields.trig_mode; + irqe.delivery_mode = entry->fields.delivery_mode << 8; + irqe.level = 1; + irqe.shorthand = 0; #ifdef CONFIG_X86 /* Always delivery PIT interrupt to vcpu 0 */ if (irq == 0) { - entry.fields.dest_mode = 0; /* Physical mode. */ - entry.fields.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; + irqe.dest_mode = 0; /* Physical mode. */ + irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; } #endif - return ioapic_deliver_entry(ioapic->kvm, &entry); + return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); } int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index e7bc92d895f..7080b713c16 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -71,8 +71,6 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); -void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src, - int dest_id, int dest_mode, bool low_prio, int short_hand, - unsigned long *deliver_bitmask); -int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq); #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index f5e059b67cd..4fa1f604b42 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -22,6 +22,9 @@ #include #include +#ifdef CONFIG_IA64 +#include +#endif #include "irq.h" @@ -43,61 +46,71 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); } -void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src, - int dest_id, int dest_mode, bool low_prio, int short_hand, - unsigned long *deliver_bitmask) +inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) { - int i, lowest = -1; - struct kvm_vcpu *vcpu; +#ifdef CONFIG_IA64 + return irq->delivery_mode == + (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); +#else + return irq->delivery_mode == APIC_DM_LOWEST; +#endif +} - if (dest_mode == 0 && dest_id == 0xff && low_prio) +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq) +{ + int i, r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + + if (irq->dest_mode 
== 0 && irq->dest_id == 0xff && + kvm_is_dm_lowest_prio(irq)) printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); - bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); for (i = 0; i < KVM_MAX_VCPUS; i++) { vcpu = kvm->vcpus[i]; if (!vcpu || !kvm_apic_present(vcpu)) continue; - if (!kvm_apic_match_dest(vcpu, src, short_hand, dest_id, - dest_mode)) + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) continue; - if (!low_prio) { - __set_bit(i, deliver_bitmask); + if (!kvm_is_dm_lowest_prio(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq); } else { - if (lowest < 0) - lowest = i; - if (kvm_apic_compare_prio(vcpu, kvm->vcpus[lowest]) < 0) - lowest = i; + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; } } - if (lowest != -1) - __set_bit(lowest, deliver_bitmask); + if (lowest) + r = kvm_apic_set_irq(lowest, irq); + + return r; } static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level) { - union kvm_ioapic_redirect_entry entry; + struct kvm_lapic_irq irq; - entry.bits = 0; - entry.fields.dest_id = (e->msi.address_lo & + irq.dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; - entry.fields.vector = (e->msi.data & + irq.vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; - entry.fields.dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, - (unsigned long *)&e->msi.address_lo); - entry.fields.trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, - (unsigned long *)&e->msi.data); - entry.fields.delivery_mode = test_bit( - MSI_DATA_DELIVERY_MODE_SHIFT, - (unsigned long *)&e->msi.data); + irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; + irq.delivery_mode = e->msi.data & 0x700; + irq.level = 1; + irq.shorthand = 0; /* TODO Deal with RH bit of MSI message address */ - return ioapic_deliver_entry(kvm, &entry); + return kvm_irq_delivery_to_apic(kvm, NULL, &irq); } /* This should be called with the kvm->lock mutex held -- cgit v1.2.3-70-g09d2 From e56d532f20c890a06bbe7cd479f4201e3a03cd73 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 12 Mar 2009 21:45:39 +0800 Subject: KVM: Device assignment framework rework After discussion with Marcelo, we decided to rework the device assignment framework together. The old problem was that the kernel logic was unnecessarily complex, so Marcelo suggested splitting it up in a more elegant way: 1. Split host IRQ assignment from guest IRQ assignment, and let userspace determine the combination. Also discard the msi2intx parameter; userspace can specify KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_INTX in assigned_irq->flags to enable MSI-to-INTx conversion. 2. Split IRQ assignment from IRQ deassignment, and introduce two new ioctls: KVM_ASSIGN_DEV_IRQ and KVM_DEASSIGN_DEV_IRQ. This patch also fixes the reversed _IOR vs _IOW in the ioctl definitions (by deprecating the old interface). A minimal userspace sketch of the resulting flow follows below.
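For illustration, here is a minimal userspace sketch of the resulting flow. It is not part of the patch: it assumes vm_fd is an open KVM VM file descriptor, dev_id is the assigned_dev_id of a device already attached with KVM_ASSIGN_PCI_DEVICE, and the caller has CAP_SYS_RAWIO; error handling is elided.

/*
 * Minimal sketch, not from the patch: drive the new split interface
 * from userspace. vm_fd and dev_id are assumptions (an open KVM VM fd
 * and an already-assigned device id).
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int assign_msi_to_intx(int vm_fd, __u32 dev_id, __u32 guest_gsi)
{
	struct kvm_assigned_irq irq;

	memset(&irq, 0, sizeof(irq));
	irq.assigned_dev_id = dev_id;
	irq.guest_irq = guest_gsi;
	/* Exactly one host type and one guest type per call; this
	 * combination replaces the old msi2intx conversion. */
	irq.flags = KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_INTX;
	return ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
}

static int deassign_dev_irq(int vm_fd, __u32 dev_id)
{
	struct kvm_assigned_irq irq;

	memset(&irq, 0, sizeof(irq));
	irq.assigned_dev_id = dev_id;
	/* The flags select what to tear down; the host and guest halves
	 * can also be deassigned independently. */
	irq.flags = KVM_DEV_IRQ_HOST_MASK | KVM_DEV_IRQ_GUEST_MASK;
	return ioctl(vm_fd, KVM_DEASSIGN_DEV_IRQ, &irq);
}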
[avi: replace homemade bitcount() by hweight_long()] Signed-off-by: Marcelo Tosatti Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + include/linux/kvm.h | 26 ++- include/linux/kvm_host.h | 5 - virt/kvm/kvm_main.c | 486 +++++++++++++++++++++++++---------------------- 4 files changed, 276 insertions(+), 242 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43e049a2ccf..41123fc8613 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1026,6 +1026,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_SYNC_MMU: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: + case KVM_CAP_ASSIGN_DEV_IRQ: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 640835ed270..644e3a9f47d 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -412,6 +412,7 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_MSIX #define KVM_CAP_DEVICE_MSIX 28 #endif +#define KVM_CAP_ASSIGN_DEV_IRQ 29 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 @@ -485,8 +486,10 @@ struct kvm_irq_routing { #define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ struct kvm_assigned_pci_dev) #define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) +/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */ #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) +#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) #define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ struct kvm_assigned_pci_dev) @@ -494,6 +497,7 @@ struct kvm_irq_routing { _IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr) #define KVM_ASSIGN_SET_MSIX_ENTRY \ _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry) +#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) /* * ioctls for vcpu fds @@ -584,6 +588,8 @@ struct kvm_debug_guest { #define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18) #define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19) +#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + struct kvm_assigned_pci_dev { __u32 assigned_dev_id; __u32 busnr; @@ -594,6 +600,17 @@ struct kvm_assigned_pci_dev { }; }; +#define KVM_DEV_IRQ_HOST_INTX (1 << 0) +#define KVM_DEV_IRQ_HOST_MSI (1 << 1) +#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) + +#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) +#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) +#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) + +#define KVM_DEV_IRQ_HOST_MASK 0x00ff +#define KVM_DEV_IRQ_GUEST_MASK 0xff00 + struct kvm_assigned_irq { __u32 assigned_dev_id; __u32 host_irq; @@ -609,15 +626,6 @@ struct kvm_assigned_irq { }; }; -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) - -#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION KVM_DEV_IRQ_ASSIGN_ENABLE_MSI -#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 << 0) - -#define KVM_DEV_IRQ_ASSIGN_MSIX_ACTION (KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX |\ - KVM_DEV_IRQ_ASSIGN_MASK_MSIX) -#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX (1 << 1) -#define KVM_DEV_IRQ_ASSIGN_MASK_MSIX (1 << 2) struct kvm_assigned_msix_nr { __u32 assigned_dev_id; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fb60f31c4fb..40e49ede8f9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -339,11 +339,6 @@ struct kvm_assigned_dev_kernel { struct msix_entry *host_msix_entries; int guest_irq; struct kvm_guest_msix_entry *guest_msix_entries; -#define KVM_ASSIGNED_DEV_GUEST_INTX (1 << 0) -#define KVM_ASSIGNED_DEV_GUEST_MSI (1 << 1) -#define KVM_ASSIGNED_DEV_HOST_INTX (1 
<< 8) -#define KVM_ASSIGNED_DEV_HOST_MSI (1 << 9) -#define KVM_ASSIGNED_DEV_MSIX ((1 << 2) | (1 << 10)) unsigned long irq_requested_type; int irq_source_id; int flags; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3bed82754a5..792fb7fae0a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -60,9 +61,6 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int msi2intx = 1; -module_param(msi2intx, bool, 0); - DEFINE_SPINLOCK(kvm_lock); LIST_HEAD(vm_list); @@ -132,7 +130,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) * finer-grained lock, update this */ mutex_lock(&kvm->lock); - if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) { + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { struct kvm_guest_msix_entry *guest_entries = assigned_dev->guest_msix_entries; for (i = 0; i < assigned_dev->entries_nr; i++) { @@ -152,7 +150,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, assigned_dev->guest_irq, 1); if (assigned_dev->irq_requested_type & - KVM_ASSIGNED_DEV_GUEST_MSI) { + KVM_DEV_IRQ_GUEST_MSI) { enable_irq(assigned_dev->host_irq); assigned_dev->host_irq_disabled = false; } @@ -166,7 +164,7 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) struct kvm_assigned_dev_kernel *assigned_dev = (struct kvm_assigned_dev_kernel *) dev_id; - if (assigned_dev->irq_requested_type == KVM_ASSIGNED_DEV_MSIX) { + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { int index = find_index_from_host_irq(assigned_dev, irq); if (index < 0) return IRQ_HANDLED; @@ -204,22 +202,22 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) } } -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) +static void deassign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) { - if (!irqchip_in_kernel(kvm)) - return; - kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); + assigned_dev->ack_notifier.gsi = -1; if (assigned_dev->irq_source_id != -1) kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); assigned_dev->irq_source_id = -1; + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); +} - if (!assigned_dev->irq_requested_type) - return; - +/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ +static void deassign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ /* * In kvm_free_device_irq, cancel_work_sync return true if: * 1. work is scheduled, and then cancelled. @@ -236,7 +234,7 @@ static void kvm_free_assigned_irq(struct kvm *kvm, * now, the kvm state is still legal for probably we also have to wait * interrupt_work done. 
*/ - if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) { + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { int i; for (i = 0; i < assigned_dev->entries_nr; i++) disable_irq_nosync(assigned_dev-> @@ -259,14 +257,41 @@ static void kvm_free_assigned_irq(struct kvm *kvm, free_irq(assigned_dev->host_irq, (void *)assigned_dev); - if (assigned_dev->irq_requested_type & - KVM_ASSIGNED_DEV_HOST_MSI) + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) pci_disable_msi(assigned_dev->dev); } - assigned_dev->irq_requested_type = 0; + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); +} + +static int kvm_deassign_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev, + unsigned long irq_requested_type) +{ + unsigned long guest_irq_type, host_irq_type; + + if (!irqchip_in_kernel(kvm)) + return -EINVAL; + /* no irq assignment to deassign */ + if (!assigned_dev->irq_requested_type) + return -ENXIO; + + host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; + guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; + + if (host_irq_type) + deassign_host_irq(kvm, assigned_dev); + if (guest_irq_type) + deassign_guest_irq(kvm, assigned_dev); + + return 0; } +static void kvm_free_assigned_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); +} static void kvm_free_assigned_device(struct kvm *kvm, struct kvm_assigned_dev_kernel @@ -298,256 +323,244 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) } } -static int assigned_device_update_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *adev, - struct kvm_assigned_irq *airq) +static int assigned_device_enable_host_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) { - adev->guest_irq = airq->guest_irq; - adev->ack_notifier.gsi = airq->guest_irq; - - if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX) - return 0; - - if (irqchip_in_kernel(kvm)) { - if (!msi2intx && - (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)) { - free_irq(adev->host_irq, (void *)adev); - pci_disable_msi(adev->dev); - } + dev->host_irq = dev->dev->irq; + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. + */ + if (request_irq(dev->host_irq, kvm_assigned_dev_intr, + 0, "kvm_assigned_intx_device", (void *)dev)) + return -EIO; + return 0; +} - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_host_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int r; - if (airq->host_irq) - adev->host_irq = airq->host_irq; - else - adev->host_irq = adev->dev->irq; + if (!dev->dev->msi_enabled) { + r = pci_enable_msi(dev->dev); + if (r) + return r; + } - /* Even though this is PCI, we don't want to use shared - * interrupts. Sharing host devices with guest-assigned devices - * on the same interrupt line is not a happy situation: there - * are going to be long delays in accepting, acking, etc. 
- */ - if (request_irq(adev->host_irq, kvm_assigned_dev_intr, - 0, "kvm_assigned_intx_device", (void *)adev)) - return -EIO; + dev->host_irq = dev->dev->irq; + if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, + "kvm_assigned_msi_device", (void *)dev)) { + pci_disable_msi(dev->dev); + return -EIO; } - adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX | - KVM_ASSIGNED_DEV_HOST_INTX; return 0; } +#endif -#ifdef CONFIG_X86 -static int assigned_device_update_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *adev, - struct kvm_assigned_irq *airq) +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_host_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) { - int r; + int i, r = -EINVAL; - adev->guest_irq = airq->guest_irq; - if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) { - /* x86 don't care upper address of guest msi message addr */ - adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI; - adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX; - adev->ack_notifier.gsi = -1; - } else if (msi2intx) { - adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX; - adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI; - adev->ack_notifier.gsi = airq->guest_irq; - } else { - /* - * Guest require to disable device MSI, we disable MSI and - * re-enable INTx by default again. Notice it's only for - * non-msi2intx. - */ - assigned_device_update_intx(kvm, adev, airq); - return 0; - } + /* host_msix_entries and guest_msix_entries should have been + * initialized */ + if (dev->entries_nr == 0) + return r; - if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) - return 0; + r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); + if (r) + return r; - if (irqchip_in_kernel(kvm)) { - if (!msi2intx) { - if (adev->irq_requested_type & - KVM_ASSIGNED_DEV_HOST_INTX) - free_irq(adev->host_irq, (void *)adev); + for (i = 0; i < dev->entries_nr; i++) { + r = request_irq(dev->host_msix_entries[i].vector, + kvm_assigned_dev_intr, 0, + "kvm_assigned_msix_device", + (void *)dev); + /* FIXME: free requested_irq's on failure */ + if (r) + return r; + } - r = pci_enable_msi(adev->dev); - if (r) - return r; - } + return 0; +} - adev->host_irq = adev->dev->irq; - if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_msi_device", (void *)adev)) - return -EIO; - } +#endif - if (!msi2intx) - adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI; +static int assigned_device_enable_guest_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = irq->guest_irq; + return 0; +} - adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI; +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_guest_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; return 0; } #endif +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_guest_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + return 0; +} +#endif + +static int assign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + __u32 host_irq_type) +{ + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) + return r; + switch (host_irq_type) { + case KVM_DEV_IRQ_HOST_INTX: + r = assigned_device_enable_host_intx(kvm, dev); + break; +#ifdef __KVM_HAVE_MSI + case 
KVM_DEV_IRQ_HOST_MSI: + r = assigned_device_enable_host_msi(kvm, dev); + break; +#endif #ifdef __KVM_HAVE_MSIX -static int assigned_device_update_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *adev, - struct kvm_assigned_irq *airq) -{ - /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ - int i, r; - - adev->ack_notifier.gsi = -1; - - if (irqchip_in_kernel(kvm)) { - if (airq->flags & KVM_DEV_IRQ_ASSIGN_MASK_MSIX) - return -ENOTTY; - - if (!(airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX)) { - /* Guest disable MSI-X */ - kvm_free_assigned_irq(kvm, adev); - if (msi2intx) { - pci_enable_msi(adev->dev); - if (adev->dev->msi_enabled) - return assigned_device_update_msi(kvm, - adev, airq); - } - return assigned_device_update_intx(kvm, adev, airq); - } + case KVM_DEV_IRQ_HOST_MSIX: + r = assigned_device_enable_host_msix(kvm, dev); + break; +#endif + default: + r = -EINVAL; + } - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (adev->entries_nr == 0) - return -EINVAL; + if (!r) + dev->irq_requested_type |= host_irq_type; - kvm_free_assigned_irq(kvm, adev); + return r; +} - r = pci_enable_msix(adev->dev, adev->host_msix_entries, - adev->entries_nr); - if (r) - return r; +static int assign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq, + unsigned long guest_irq_type) +{ + int id; + int r = -EEXIST; - for (i = 0; i < adev->entries_nr; i++) { - r = request_irq((adev->host_msix_entries + i)->vector, - kvm_assigned_dev_intr, 0, - "kvm_assigned_msix_device", - (void *)adev); - if (r) - return r; - } + if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) + return r; + + id = kvm_request_irq_source_id(kvm); + if (id < 0) + return id; + + dev->irq_source_id = id; + + switch (guest_irq_type) { + case KVM_DEV_IRQ_GUEST_INTX: + r = assigned_device_enable_guest_intx(kvm, dev, irq); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_GUEST_MSI: + r = assigned_device_enable_guest_msi(kvm, dev, irq); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_GUEST_MSIX: + r = assigned_device_enable_guest_msix(kvm, dev, irq); + break; +#endif + default: + r = -EINVAL; } - adev->irq_requested_type |= KVM_ASSIGNED_DEV_MSIX; + if (!r) { + dev->irq_requested_type |= guest_irq_type; + kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); + } else + kvm_free_irq_source_id(kvm, dev->irq_source_id); - return 0; + return r; } -#endif +/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) + struct kvm_assigned_irq *assigned_irq) { - int r = 0; + int r = -EINVAL; struct kvm_assigned_dev_kernel *match; - u32 current_flags = 0, changed_flags; + unsigned long host_irq_type, guest_irq_type; - mutex_lock(&kvm->lock); + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(&kvm->lock); + r = -ENODEV; match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, assigned_irq->assigned_dev_id); - if (!match) { - mutex_unlock(&kvm->lock); - return -EINVAL; - } - - if (!match->irq_requested_type) { - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); - if (irqchip_in_kernel(kvm)) { - /* Register ack nofitier */ - match->ack_notifier.gsi = -1; - match->ack_notifier.irq_acked = - kvm_assigned_dev_ack_irq; - kvm_register_irq_ack_notifier(kvm, - &match->ack_notifier); - - /* Request IRQ source ID */ - r = kvm_request_irq_source_id(kvm); - if (r < 0) - goto out_release; - else - 
match->irq_source_id = r; - -#ifdef CONFIG_X86 - /* Determine host device irq type, we can know the - * result from dev->msi_enabled */ - if (msi2intx) - pci_enable_msi(match->dev); -#endif - } - } + if (!match) + goto out; - if (match->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) - current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX; - else if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) && - (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI)) - current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI; + host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); + guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - changed_flags = assigned_irq->flags ^ current_flags; + r = -EINVAL; + /* can only assign one type at a time */ + if (hweight_long(host_irq_type) > 1) + goto out; + if (hweight_long(guest_irq_type) > 1) + goto out; + if (host_irq_type == 0 && guest_irq_type == 0) + goto out; -#ifdef __KVM_HAVE_MSIX - if (changed_flags & KVM_DEV_IRQ_ASSIGN_MSIX_ACTION) { - r = assigned_device_update_msix(kvm, match, assigned_irq); - if (r) { - printk(KERN_WARNING "kvm: failed to execute " - "MSI-X action!\n"); - goto out_release; - } - } else -#endif - if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) || - (msi2intx && match->dev->msi_enabled)) { -#ifdef CONFIG_X86 - r = assigned_device_update_msi(kvm, match, assigned_irq); - if (r) { - printk(KERN_WARNING "kvm: failed to enable " - "MSI device!\n"); - goto out_release; - } -#else - r = -ENOTTY; -#endif - } else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) { - /* Host device IRQ 0 means don't support INTx */ - if (!msi2intx) { - printk(KERN_WARNING - "kvm: wait device to enable MSI!\n"); - r = 0; - } else { - printk(KERN_WARNING - "kvm: failed to enable MSI device!\n"); - r = -ENOTTY; - goto out_release; - } - } else { - /* Non-sharing INTx mode */ - r = assigned_device_update_intx(kvm, match, assigned_irq); - if (r) { - printk(KERN_WARNING "kvm: failed to enable " - "INTx device!\n"); - goto out_release; - } - } + r = 0; + if (host_irq_type) + r = assign_host_irq(kvm, match, host_irq_type); + if (r) + goto out; + if (guest_irq_type) + r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); +out: mutex_unlock(&kvm->lock); return r; -out_release: +} + +static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = -ENODEV; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + r = kvm_deassign_irq(kvm, match, assigned_irq->flags); +out: mutex_unlock(&kvm->lock); - kvm_free_assigned_device(kvm, match); return r; } @@ -565,7 +578,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, assigned_dev->assigned_dev_id); if (match) { /* device already assigned */ - r = -EINVAL; + r = -EEXIST; goto out; } @@ -604,6 +617,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->dev = dev; match->irq_source_id = -1; match->kvm = kvm; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + INIT_WORK(&match->interrupt_work, + kvm_assigned_dev_interrupt_work_handler); list_add(&match->list, &kvm->arch.assigned_dev_head); @@ -2084,6 +2100,11 @@ static long kvm_vm_ioctl(struct file *filp, break; } case KVM_ASSIGN_IRQ: { + r = -EOPNOTSUPP; + break; + } +#ifdef KVM_CAP_ASSIGN_DEV_IRQ + case KVM_ASSIGN_DEV_IRQ: { struct kvm_assigned_irq assigned_irq; r = -EFAULT; @@ -2094,6 +2115,18 @@ static long 
kvm_vm_ioctl(struct file *filp, goto out; break; } + case KVM_DEASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } +#endif #endif #ifdef KVM_CAP_DEVICE_DEASSIGNMENT case KVM_DEASSIGN_PCI_DEVICE: { @@ -2596,9 +2629,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size, kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; -#ifndef CONFIG_X86 - msi2intx = 0; -#endif return 0; -- cgit v1.2.3-70-g09d2 From 78646121e9a2fcf7977cc15966420e572a450bc3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 23 Mar 2009 12:12:11 +0200 Subject: KVM: Fix interrupt unhalting a vcpu when it shouldn't kvm_vcpu_block() unhalts a vcpu on an interrupt/timer without checking if the interrupt window is actually open. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 6 ++++++ arch/powerpc/kvm/powerpc.c | 6 ++++++ arch/s390/kvm/interrupt.c | 6 ++++++ arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 10 ++++++++++ arch/x86/kvm/vmx.c | 8 +++++++- arch/x86/kvm/x86.c | 5 +++++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 3 ++- 9 files changed, 44 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index d2a90fd505b..3bf0a345224 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1963,6 +1963,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) return 0; } +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* do real check here */ + return 1; +} + int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return vcpu->arch.timer_fired; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9057335fdc6..2cf915e51e7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -41,6 +41,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) return !!(v->arch.pending_exceptions); } +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* do real check here */ + return 1; +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return !(v->arch.msr & MSR_WE); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 0189356fe20..4ed4c3a1148 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -318,6 +318,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) return rc; } +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* do real check here */ + return 1; +} + int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return 0; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 46276273a1a..8351c4d00ac 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -521,7 +521,7 @@ struct kvm_x86_ops { void (*inject_pending_irq)(struct kvm_vcpu *vcpu); void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, struct kvm_run *run); - + int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); int (*get_mt_mask_shift)(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index aa528dbad07..de741043c5b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2270,6 +2270,15 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; } +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) +{ +
struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + return (vmcb->save.rflags & X86_EFLAGS_IF) && + !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + (svm->vcpu.arch.hflags & HF_GIF_MASK); +} + static void svm_intr_assist(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2649,6 +2658,7 @@ static struct kvm_x86_ops svm_x86_ops = { .exception_injected = svm_exception_injected, .inject_pending_irq = svm_intr_assist, .inject_pending_vectors = do_interrupt_requests, + .interrupt_allowed = svm_interrupt_allowed, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index da6461d5dc8..b9e06b07aca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2490,6 +2490,12 @@ static void vmx_update_window_states(struct kvm_vcpu *vcpu) GUEST_INTR_STATE_MOV_SS))); } +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + vmx_update_window_states(vcpu); + return vcpu->arch.interrupt_window_open; +} + static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { @@ -3691,7 +3697,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .exception_injected = vmx_exception_injected, .inject_pending_irq = vmx_intr_assist, .inject_pending_vectors = do_interrupt_requests, - + .interrupt_allowed = vmx_interrupt_allowed, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, .get_mt_mask_shift = vmx_get_mt_mask_shift, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8fca7a4e95a..5bbcad34537 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4475,3 +4475,8 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); put_cpu(); } + +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->interrupt_allowed(vcpu); +} diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 40e49ede8f9..72d56844f38 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -298,6 +298,7 @@ int kvm_arch_hardware_setup(void); void kvm_arch_hardware_unsetup(void); void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a1a4272fa57..63d5fa2bc84 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1610,7 +1610,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if (kvm_cpu_has_interrupt(vcpu) || + if ((kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)) || kvm_arch_vcpu_runnable(vcpu)) { set_bit(KVM_REQ_UNHALT, &vcpu->requests); break; -- cgit v1.2.3-70-g09d2 From 2f8b9ee14eb439008e0c5131116ea6baa40dba50 Mon Sep 17 00:00:00 2001 From: nathan binkert Date: Fri, 27 Mar 2009 21:53:05 -0700 Subject: KVM: Make kvm header C++ friendly Two things needed fixing: 1) g++ does not allow a named structure type within an anonymous union and 2) Avoid name clash between two padding fields within the same struct by giving them different names as is done elsewhere in the header. 
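For illustration, the two constructs can be reduced to a few lines; the type
and member names below are invented for the sketch rather than copied from
kvm.h:

    /* 1) a named struct type defined inside an anonymous union:
     *    accepted by gcc when compiled as C, rejected by g++,
     *    which forbids type definitions in anonymous unions */
    struct exit_info {
            union {
                    struct io_info {
                            int direction;
                    } io;
                    long raw;
            };
    };

    /* 2) the same member name twice: because the union is anonymous,
     *    C++ injects its members into the enclosing struct's scope,
     *    so the two 'padding' fields collide for g++ */
    struct log_like {
            unsigned int padding;
            union {
                    void *bitmap;
                    unsigned long long padding;
            };
    };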
Signed-off-by: Nathan Binkert Signed-off-by: Avi Kivity --- include/linux/kvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 644e3a9f47d..3db5d8d3748 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -119,7 +119,7 @@ struct kvm_run { __u32 error_code; } ex; /* KVM_EXIT_IO */ - struct kvm_io { + struct { #define KVM_EXIT_IO_IN 0 #define KVM_EXIT_IO_OUT 1 __u8 direction; @@ -224,10 +224,10 @@ struct kvm_interrupt { /* for KVM_GET_DIRTY_LOG */ struct kvm_dirty_log { __u32 slot; - __u32 padding; + __u32 padding1; union { void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; + __u64 padding2; }; }; -- cgit v1.2.3-70-g09d2 From 522c68c4416de3cd3e11a9ff10d58e776a69ae1e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 27 Apr 2009 20:35:43 +0800 Subject: KVM: Enable snooping control for supported hardware Memory aliases with different memory type is a problem for guest. For the guest without assigned device, the memory type of guest memory would always been the same as host(WB); but for the assigned device, some part of memory may be used as DMA and then set to uncacheable memory type(UC/WC), which would be a conflict of host memory type then be a potential issue. Snooping control can guarantee the cache correctness of memory go through the DMA engine of VT-d. [avi: fix build on ia64] Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 19 +++++++++++++++++-- include/linux/kvm_host.h | 3 +++ virt/kvm/iommu.c | 27 ++++++++++++++++++++++++--- 5 files changed, 46 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 589536fa799..5f43697aed3 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -474,6 +474,7 @@ struct kvm_arch { struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; + int iommu_flags; struct hlist_head irq_ack_notifier_list; unsigned long irq_sources_bitmap; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8a6f6b643df..253d8f669cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -393,6 +393,7 @@ struct kvm_arch{ struct list_head active_mmu_pages; struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; + int iommu_flags; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 59b080c262e..e8a5649f9c1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3581,11 +3581,26 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u64 ret; + /* For VT-d and EPT combination + * 1. MMIO: always map as UC + * 2. EPT with VT-d: + * a. VT-d without snooping control feature: can't guarantee the + * result, try to trust guest. + * b. VT-d with snooping control feature: snooping control feature of + * VT-d engine can guarantee the cache correctness. Just set it + * to WB to keep consistent with host. So the same as item 3. + * 3. 
EPT without VT-d: always map as WB and set IGMT=1 to keep + * consistent with host MTRR + */ if (is_mmio) ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; + else if (vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + ret = kvm_get_guest_memory_type(vcpu, gfn) << + VMX_EPT_MT_EPTE_SHIFT; else - ret = (kvm_get_guest_memory_type(vcpu, gfn) << - VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IGMT_BIT; + ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) + | VMX_EPT_IGMT_BIT; return ret; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 72d56844f38..bdce8e1303c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -367,6 +367,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); +/* For vcpu->arch.iommu_flags */ +#define KVM_IOMMU_CACHE_COHERENCY 0x1 + #ifdef CONFIG_IOMMU_API int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 4c403750360..15147583abd 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -39,11 +39,16 @@ int kvm_iommu_map_pages(struct kvm *kvm, pfn_t pfn; int i, r = 0; struct iommu_domain *domain = kvm->arch.iommu_domain; + int flags; /* check if iommu exists and in use */ if (!domain) return 0; + flags = IOMMU_READ | IOMMU_WRITE; + if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) + flags |= IOMMU_CACHE; + for (i = 0; i < npages; i++) { /* check if already mapped */ if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) @@ -53,8 +58,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, r = iommu_map_range(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - PAGE_SIZE, - IOMMU_READ | IOMMU_WRITE); + PAGE_SIZE, flags); if (r) { printk(KERN_ERR "kvm_iommu_map_address:" "iommu failed to map pfn=%lx\n", pfn); @@ -88,7 +92,7 @@ int kvm_assign_device(struct kvm *kvm, { struct pci_dev *pdev = NULL; struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; + int r, last_flags; /* check if iommu exists and in use */ if (!domain) @@ -107,12 +111,29 @@ int kvm_assign_device(struct kvm *kvm, return r; } + last_flags = kvm->arch.iommu_flags; + if (iommu_domain_has_cap(kvm->arch.iommu_domain, + IOMMU_CAP_CACHE_COHERENCY)) + kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY; + + /* Check if need to update IOMMU page table for guest memory */ + if ((last_flags ^ kvm->arch.iommu_flags) == + KVM_IOMMU_CACHE_COHERENCY) { + kvm_iommu_unmap_memslots(kvm); + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + } + printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n", assigned_dev->host_busnr, PCI_SLOT(assigned_dev->host_devfn), PCI_FUNC(assigned_dev->host_devfn)); return 0; +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; } int kvm_deassign_device(struct kvm *kvm, -- cgit v1.2.3-70-g09d2 From 32f8840064d88cc3f6e85203aec7b6b57bebcb97 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 7 May 2009 17:55:12 -0300 Subject: KVM: use smp_send_reschedule in kvm_vcpu_kick KVM uses a function call IPI to cause the exit of a guest running on a physical cpu. For virtual interrupt notification there is no need to wait on IPI receival, or to execute any function. This is exactly what the reschedule IPI does, without the overhead of function IPI. So use it instead of smp_call_function_single in kvm_vcpu_kick. 
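In code, the kick path this results in is roughly the sketch below -- a
condensed restatement of the x86 hunk further down, with comments added:

    void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
    {
            int cpu = vcpu->cpu;
            int me = get_cpu();     /* disables preemption, returns our cpu */

            if (waitqueue_active(&vcpu->wq))
                    wake_up_interruptible(&vcpu->wq);

            /* KVM_REQ_KICK set means the vcpu is already out of guest
             * mode; test_and_set_bit() lets exactly one of any number
             * of racing kickers send the (cheap) reschedule IPI */
            if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
                    if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
                            smp_send_reschedule(cpu);
            put_cpu();
    }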
Also change the "guest_mode" variable to a bit in vcpu->requests, and use that to collapse multiple IPI's that would be issued between the first one and zeroing of guest mode. This allows kvm_vcpu_kick to called with interrupts disabled. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/ia64/kernel/irq_ia64.c | 3 +++ arch/ia64/kvm/kvm-ia64.c | 22 ++++++++-------------- arch/x86/kernel/smp.c | 3 +++ arch/x86/kvm/x86.c | 36 +++++++++++------------------------- include/linux/kvm_host.h | 2 +- 5 files changed, 26 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index acc4d19ae62..b448197728b 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -610,6 +610,9 @@ static struct irqaction ipi_irqaction = { .name = "IPI" }; +/* + * KVM uses this interrupt to force a cpu out of guest mode + */ static struct irqaction resched_irqaction = { .handler = dummy_handler, .flags = IRQF_DISABLED, diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index be4413e1f43..80c57b0a21c 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -668,7 +668,7 @@ again: host_ctx = kvm_get_host_context(vcpu); guest_ctx = kvm_get_guest_context(vcpu); - vcpu->guest_mode = 1; + clear_bit(KVM_REQ_KICK, &vcpu->requests); r = kvm_vcpu_pre_transition(vcpu); if (r < 0) @@ -685,7 +685,7 @@ again: kvm_vcpu_post_transition(vcpu); vcpu->arch.launched = 1; - vcpu->guest_mode = 0; + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); /* @@ -1879,24 +1879,18 @@ void kvm_arch_hardware_unsetup(void) { } -static void vcpu_kick_intr(void *info) -{ -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG"vcpu_kick_intr %p \n", vcpu); -#endif -} - void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { - int ipi_pcpu = vcpu->cpu; - int cpu = get_cpu(); + int me; + int cpu = vcpu->cpu; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); - if (vcpu->guest_mode && cpu != ipi_pcpu) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); + me = get_cpu(); + if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu)) + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + smp_send_reschedule(cpu); put_cpu(); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea8cca..3b2e55e8ad2 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -172,6 +172,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ } void smp_call_function_interrupt(struct pt_regs *regs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 75927700a26..3c4c327490a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3230,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) local_irq_disable(); + clear_bit(KVM_REQ_KICK, &vcpu->requests); + smp_mb__after_clear_bit(); + if (vcpu->requests || need_resched() || signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -3237,13 +3240,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - vcpu->guest_mode = 1; - /* - * Make sure that guest_mode assignment won't happen after - * testing the pending IRQ vector bitmap. 
- */ - smp_wmb(); - if (vcpu->arch.exception.pending) __queue_exception(vcpu); else @@ -3288,7 +3284,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_debugreg(vcpu->arch.host_dr6, 6); set_debugreg(vcpu->arch.host_dr7, 7); - vcpu->guest_mode = 0; + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); ++vcpu->stat.exits; @@ -4571,30 +4567,20 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) || vcpu->arch.nmi_pending; } -static void vcpu_kick_intr(void *info) -{ -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); -#endif -} - void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { - int ipi_pcpu = vcpu->cpu; - int cpu; + int me; + int cpu = vcpu->cpu; if (waitqueue_active(&vcpu->wq)) { wake_up_interruptible(&vcpu->wq); ++vcpu->stat.halt_wakeup; } - /* - * We may be called synchronously with irqs disabled in guest mode, - * So need not to call smp_call_function_single() in that case. - */ - cpu = get_cpu(); - if (vcpu->guest_mode && vcpu->cpu != cpu) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); + + me = get_cpu(); + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + smp_send_reschedule(cpu); put_cpu(); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bdce8e1303c..16181628419 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -38,6 +38,7 @@ #define KVM_REQ_UNHALT 6 #define KVM_REQ_MMU_SYNC 7 #define KVM_REQ_KVMCLOCK_UPDATE 8 +#define KVM_REQ_KICK 9 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 @@ -72,7 +73,6 @@ struct kvm_vcpu { struct mutex mutex; int cpu; struct kvm_run *run; - int guest_mode; unsigned long requests; unsigned long guest_debug; int fpu_active; -- cgit v1.2.3-70-g09d2 From 547de29e5b1662deb05b5f90917902dc0e9ac182 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 7 May 2009 17:55:13 -0300 Subject: KVM: protect assigned dev workqueue, int handler and irq acker kvm_assigned_dev_ack_irq is vulnerable to a race condition with the interrupt handler function. It does: if (dev->host_irq_disabled) { enable_irq(dev->host_irq); dev->host_irq_disabled = false; } If an interrupt triggers before the host->dev_irq_disabled assignment, it will disable the interrupt and set dev->host_irq_disabled to true. On return to kvm_assigned_dev_ack_irq, dev->host_irq_disabled is set to false, and the next kvm_assigned_dev_ack_irq call will fail to reenable it. Other than that, having the interrupt handler and work handlers run in parallel sounds like asking for trouble (could not spot any obvious problem, but better not have to, its fragile). 
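Spelled out on the pre-patch ack path, the window looks like this (a sketch,
with the racing interrupt shown as comments):

    if (dev->host_irq_disabled) {
            enable_irq(dev->host_irq);
            /* the just-enabled interrupt can fire right here: the
             * handler runs disable_irq_nosync() and sets
             * host_irq_disabled = true ... */
            dev->host_irq_disabled = false;
            /* ... which this store clobbers: the line is now disabled
             * while the flag claims it is not, so no later ack will
             * ever call enable_irq() again */
    }

The patch closes the window by taking a per-device spinlock, with interrupts
disabled, in the interrupt handler, the ack notifier and the work handler
alike, making the check-and-flip above atomic with respect to the interrupt.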
CC: sheng.yang@intel.com Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 16181628419..aacc5449f58 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -345,6 +345,7 @@ struct kvm_assigned_dev_kernel { int flags; struct pci_dev *dev; struct kvm *kvm; + spinlock_t assigned_dev_lock; }; struct kvm_irq_mask_notifier { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 29c0afb064d..687d113a3e5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) * finer-grained lock, update this */ mutex_lock(&kvm->lock); + spin_lock_irq(&assigned_dev->assigned_dev_lock); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { struct kvm_guest_msix_entry *guest_entries = assigned_dev->guest_msix_entries; @@ -156,18 +158,21 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) } } + spin_unlock_irq(&assigned_dev->assigned_dev_lock); mutex_unlock(&assigned_dev->kvm->lock); } static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) { + unsigned long flags; struct kvm_assigned_dev_kernel *assigned_dev = (struct kvm_assigned_dev_kernel *) dev_id; + spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { int index = find_index_from_host_irq(assigned_dev, irq); if (index < 0) - return IRQ_HANDLED; + goto out; assigned_dev->guest_msix_entries[index].flags |= KVM_ASSIGNED_MSIX_PENDING; } @@ -177,6 +182,8 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) disable_irq_nosync(irq); assigned_dev->host_irq_disabled = true; +out: + spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); return IRQ_HANDLED; } @@ -184,6 +191,7 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_assigned_dev_kernel *dev; + unsigned long flags; if (kian->gsi == -1) return; @@ -196,10 +204,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) /* The guest irq may be shared so this ack may be * from another device. */ + spin_lock_irqsave(&dev->assigned_dev_lock, flags); if (dev->host_irq_disabled) { enable_irq(dev->host_irq); dev->host_irq_disabled = false; } + spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); } static void deassign_guest_irq(struct kvm *kvm, @@ -615,6 +625,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->host_devfn = assigned_dev->devfn; match->flags = assigned_dev->flags; match->dev = dev; + spin_lock_init(&match->assigned_dev_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; -- cgit v1.2.3-70-g09d2 From 04dce7d9d429ea5ea04e9432d1726c930f4d67da Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 10 Jun 2009 16:59:46 +1000 Subject: spinlock: Add missing __raw_spin_lock_flags() stub for UP This was only defined with CONFIG_DEBUG_SPINLOCK set, but some obscure arch/powerpc code wants it always. 
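The UP stubs are deliberate no-ops; as a sketch, a hypothetical caller (not
from the patch) compiles down to nothing:

    /* with CONFIG_SMP=n and CONFIG_DEBUG_SPINLOCK=n: */
    __raw_spin_lock_flags(&lock->raw_lock, flags);
    /* expands to:  do { (void)(&lock->raw_lock); } while (0)
     * -- no code is generated; the (void) cast merely keeps the lock
     * expression evaluated so unused-variable warnings stay quiet */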
Signed-off-by: Benjamin Herrenschmidt Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/spinlock_up.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 938234c4a99..d4841ed8215 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -60,6 +60,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) #define __raw_spin_is_locked(lock) ((void)(lock), 0) /* for sched.c and kernel_lock.c: */ # define __raw_spin_lock(lock) do { (void)(lock); } while (0) +# define __raw_spin_lock_flags(lock, flags) do { (void)(lock); } while (0) # define __raw_spin_unlock(lock) do { (void)(lock); } while (0) # define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) #endif /* DEBUG_SPINLOCK */ -- cgit v1.2.3-70-g09d2 From f1db457ce6e2f63cb01022f58c0c023838958bd1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 10 Jun 2009 10:06:24 +0800 Subject: tracing/events: convert block trace points to TRACE_EVENT(), fix !CONFIG_BLOCK Fix building failures when CONFIG_BLOCK == n. Signed-off-by: Li Zefan LKML-Reference: <4A2F1520.8020003@cn.fujitsu.com> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/blktrace_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index c7ec31dd04c..7e4350ece0f 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -218,7 +218,7 @@ static inline int blk_trace_init_sysfs(struct device *dev) #endif /* CONFIG_BLK_DEV_IO_TRACE */ -#ifdef CONFIG_EVENT_TRACING +#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK) static inline int blk_cmd_buf_len(struct request *rq) { @@ -229,7 +229,7 @@ extern void blk_dump_cmd(char *buf, struct request *rq); extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes); extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq); -#endif /* CONFIG_EVENT_TRACING */ +#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ #endif /* __KERNEL__ */ #endif -- cgit v1.2.3-70-g09d2 From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 13:40:57 +0200 Subject: perf_counter: More aggressive frequency adjustment Also employ the overflow handler to adjust the frequency, this results in a stable frequency in about 40~50 samples, instead of that many ticks. This also means we can start sampling at a sample period of 1 without running head-first into the throttle. It relies on sched_clock() to accurately measure the time difference between the overflow NMIs. 
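The adjustment itself is a proportional step through a 1/8 low-pass filter;
stripped of locking and logging it amounts to the sketch below (mirroring
perf_adjust_period() in the patch, with abbreviated names):

    /* 'events' is the observed event rate: freq * interrupts on the
     * tick path, or NSEC_PER_SEC / delta on the overflow path, where
     * delta is the sched_clock() gap between consecutive overflows */
    static void adjust_period(struct hw_perf_counter *hwc,
                              u64 events, u64 sample_freq)
    {
            u64 period = div64_u64(events * hwc->sample_period,
                                   sample_freq);
            s64 delta = (s64)(period - hwc->sample_period);

            delta = (delta + 7) / 8;        /* step 1/8th of the error */

            hwc->sample_period += delta;
            if (!hwc->sample_period)
                    hwc->sample_period = 1; /* a zero period would wedge */
    }

Stepping 1/8th of the remaining error per overflow shrinks it by (7/8)^n,
and (7/8)^40 is already under 1% -- which is where the 40~50 samples to a
stable frequency come from.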
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 130 +++++++++++++++++++++++++------------ 3 files changed, 92 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 49f258537cb..240ca563063 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -696,10 +696,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (!hwc->sample_period) + if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } - atomic64_set(&hwc->period_left, hwc->sample_period); counter->destroy = hw_perf_counter_destroy; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3586df840f6..282d8cc4898 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -371,6 +371,7 @@ struct hw_perf_counter { u64 freq_count; u64 freq_interrupts; + u64 freq_stamp; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5eacaaf3f9c..51c571ee4d0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1184,13 +1184,33 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); -static void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_adjust_period(struct perf_counter *counter, u64 events) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, counter->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + perf_log_period(counter, sample_period); + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; struct hw_perf_counter *hwc; - u64 interrupts, sample_period; - u64 events, period, freq; - s64 delta; + u64 interrupts, freq; spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -1202,6 +1222,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = hwc->interrupts; hwc->interrupts = 0; + /* + * unthrottle counters on the tick + */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); counter->pmu->unthrottle(counter); @@ -1211,6 +1234,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (!counter->attr.freq || !counter->attr.sample_freq) continue; + /* + * if the specified freq < HZ then we need to skip ticks + */ if (counter->attr.sample_freq < HZ) { freq = counter->attr.sample_freq; @@ -1226,20 +1252,20 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) } else freq = HZ; - events = freq * interrupts * hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(1 + period - hwc->sample_period); - delta >>= 1; - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; + 
perf_adjust_period(counter, freq * interrupts); - perf_log_period(counter, sample_period); - - hwc->sample_period = sample_period; + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + counter->pmu->disable(counter); + atomic_set(&hwc->period_left, 0); + counter->pmu->enable(counter); + perf_enable(); + } } spin_unlock(&ctx->lock); } @@ -1279,9 +1305,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = curr->perf_counter_ctxp; - perf_adjust_freq(&cpuctx->ctx); + perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) - perf_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); if (ctx) @@ -1647,10 +1673,10 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) counter->attr.sample_freq = value; } else { + perf_log_period(counter, value); + counter->attr.sample_period = value; counter->hw.sample_period = value; - - perf_log_period(counter, value); } unlock: spin_unlock_irq(&ctx->lock); @@ -2853,35 +2879,41 @@ void __perf_counter_mmap(struct vm_area_struct *vma) * event flow. */ +struct freq_event { + struct perf_event_header header; + u64 time; + u64 id; + u64 period; +}; + static void perf_log_period(struct perf_counter *counter, u64 period) { struct perf_output_handle handle; + struct freq_event event; int ret; - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 period; - } freq_event = { + if (counter->hw.sample_period == period) + return; + + if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) + return; + + event = (struct freq_event) { .header = { .type = PERF_EVENT_PERIOD, .misc = 0, - .size = sizeof(freq_event), + .size = sizeof(event), }, .time = sched_clock(), .id = counter->id, .period = period, }; - if (counter->hw.sample_period == period) - return; - - ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); + ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); if (ret) return; - perf_output_put(&handle, freq_event); + perf_output_put(&handle, event); perf_output_end(&handle); } @@ -2923,15 +2955,16 @@ int perf_counter_overflow(struct perf_counter *counter, { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; + struct hw_perf_counter *hwc = &counter->hw; int ret = 0; if (!throttle) { - counter->hw.interrupts++; + hwc->interrupts++; } else { - if (counter->hw.interrupts != MAX_INTERRUPTS) { - counter->hw.interrupts++; - if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { - counter->hw.interrupts = MAX_INTERRUPTS; + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) { + hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(counter, 0); ret = 1; } @@ -2945,6 +2978,16 @@ int perf_counter_overflow(struct perf_counter *counter, } } + if (counter->attr.freq) { + u64 now = sched_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); + } + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -3379,7 +3422,6 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.sample_period = counter->attr.sample_period; return 
&perf_ops_generic; } @@ -3483,10 +3525,11 @@ perf_counter_alloc(struct perf_counter_attr *attr, pmu = NULL; hwc = &counter->hw; + hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) - hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq); - else - hwc->sample_period = attr->sample_period; + hwc->sample_period = 1; + + atomic64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_SAMPLE_GROUP on inherited counters @@ -3687,6 +3730,9 @@ inherit_counter(struct perf_counter *parent_counter, else child_counter->state = PERF_COUNTER_STATE_OFF; + if (parent_counter->attr.freq) + child_counter->hw.sample_period = parent_counter->hw.sample_period; + /* * Link it up in the child's context: */ -- cgit v1.2.3-70-g09d2 From 4d1d49858c0a5a4fb1be4bc7972754cd640245ba Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Jun 2009 21:57:03 +0200 Subject: net/libertas: remove GPIO-CS handling in SPI interface code This removes the dependency on GPIO framework and lets the SPI host driver handle the chip select. The SPI host driver is required to keep the CS active for the entire message unless cs_change says otherwise. This patch collects the two/three single SPI transfers into a message. Also the delay in read path in case use_dummy_writes are not used is moved into the SPI host driver. Tested-by: Mike Rapoport Tested-by: Andrey Yurovsky Signed-off-by: Sebastian Andrzej Siewior Acked-by: Dan Williams Signed-off-by: John W. Linville --- drivers/net/wireless/Kconfig | 2 +- drivers/net/wireless/libertas/if_spi.c | 92 ++++++++++++++++------------------ include/linux/spi/libertas_spi.h | 3 -- 3 files changed, 45 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig index daf4c805be5..fb7541c28e5 100644 --- a/drivers/net/wireless/Kconfig +++ b/drivers/net/wireless/Kconfig @@ -153,7 +153,7 @@ config LIBERTAS_SDIO config LIBERTAS_SPI tristate "Marvell Libertas 8686 SPI 802.11b/g cards" - depends on LIBERTAS && SPI && GENERIC_GPIO + depends on LIBERTAS && SPI ---help--- A driver for Marvell Libertas 8686 SPI devices. diff --git a/drivers/net/wireless/libertas/if_spi.c b/drivers/net/wireless/libertas/if_spi.c index ea23c5de142..f8c2898d82b 100644 --- a/drivers/net/wireless/libertas/if_spi.c +++ b/drivers/net/wireless/libertas/if_spi.c @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -51,13 +50,6 @@ struct if_spi_card { u16 card_id; u8 card_rev; - /* Pin number for our GPIO chip-select. */ - /* TODO: Once the generic SPI layer has some additional features, we - * should take this out and use the normal chip select here. - * We need support for chip select delays, and not dropping chipselect - * after each word. */ - int gpio_cs; - /* The last time that we initiated an SPU operation */ unsigned long prev_xfer_time; @@ -130,12 +122,10 @@ static void spu_transaction_init(struct if_spi_card *card) * If not, we have to busy-wait to be on the safe side. 
*/ ndelay(400); } - gpio_set_value(card->gpio_cs, 0); /* assert CS */ } static void spu_transaction_finish(struct if_spi_card *card) { - gpio_set_value(card->gpio_cs, 1); /* drop CS */ card->prev_xfer_time = jiffies; } @@ -145,6 +135,13 @@ static int spu_write(struct if_spi_card *card, u16 reg, const u8 *buf, int len) { int err = 0; u16 reg_out = cpu_to_le16(reg | IF_SPI_WRITE_OPERATION_MASK); + struct spi_message m; + struct spi_transfer reg_trans; + struct spi_transfer data_trans; + + spi_message_init(&m); + memset(®_trans, 0, sizeof(reg_trans)); + memset(&data_trans, 0, sizeof(data_trans)); /* You must give an even number of bytes to the SPU, even if it * doesn't care about the last one. */ @@ -153,13 +150,16 @@ static int spu_write(struct if_spi_card *card, u16 reg, const u8 *buf, int len) spu_transaction_init(card); /* write SPU register index */ - err = spi_write(card->spi, (u8 *)®_out, sizeof(u16)); - if (err) - goto out; + reg_trans.tx_buf = ®_out; + reg_trans.len = sizeof(reg_out); - err = spi_write(card->spi, buf, len); + data_trans.tx_buf = buf; + data_trans.len = len; -out: + spi_message_add_tail(®_trans, &m); + spi_message_add_tail(&data_trans, &m); + + err = spi_sync(card->spi, &m); spu_transaction_finish(card); return err; } @@ -186,10 +186,13 @@ static inline int spu_reg_is_port_reg(u16 reg) static int spu_read(struct if_spi_card *card, u16 reg, u8 *buf, int len) { - unsigned int i, delay; + unsigned int delay; int err = 0; - u16 zero = 0; u16 reg_out = cpu_to_le16(reg | IF_SPI_READ_OPERATION_MASK); + struct spi_message m; + struct spi_transfer reg_trans; + struct spi_transfer dummy_trans; + struct spi_transfer data_trans; /* You must take an even number of bytes from the SPU, even if you * don't care about the last one. */ @@ -197,29 +200,34 @@ static int spu_read(struct if_spi_card *card, u16 reg, u8 *buf, int len) spu_transaction_init(card); + spi_message_init(&m); + memset(®_trans, 0, sizeof(reg_trans)); + memset(&dummy_trans, 0, sizeof(dummy_trans)); + memset(&data_trans, 0, sizeof(data_trans)); + /* write SPU register index */ - err = spi_write(card->spi, (u8 *)®_out, sizeof(u16)); - if (err) - goto out; + reg_trans.tx_buf = ®_out; + reg_trans.len = sizeof(reg_out); + spi_message_add_tail(®_trans, &m); delay = spu_reg_is_port_reg(reg) ? card->spu_port_delay : card->spu_reg_delay; if (card->use_dummy_writes) { /* Clock in dummy cycles while the SPU fills the FIFO */ - for (i = 0; i < delay / 16; ++i) { - err = spi_write(card->spi, (u8 *)&zero, sizeof(u16)); - if (err) - return err; - } + dummy_trans.len = delay / 8; + spi_message_add_tail(&dummy_trans, &m); } else { /* Busy-wait while the SPU fills the FIFO */ - ndelay(100 + (delay * 10)); + reg_trans.delay_usecs = + DIV_ROUND_UP((100 + (delay * 10)), 1000); } /* read in data */ - err = spi_read(card->spi, buf, len); + data_trans.rx_buf = buf; + data_trans.len = len; + spi_message_add_tail(&data_trans, &m); -out: + err = spi_sync(card->spi, &m); spu_transaction_finish(card); return err; } @@ -1049,7 +1057,6 @@ static int __devinit if_spi_probe(struct spi_device *spi) spi_set_drvdata(spi, card); card->pdata = pdata; card->spi = spi; - card->gpio_cs = pdata->gpio_cs; card->prev_xfer_time = jiffies; sema_init(&card->spi_ready, 0); @@ -1058,26 +1065,18 @@ static int __devinit if_spi_probe(struct spi_device *spi) INIT_LIST_HEAD(&card->data_packet_list); spin_lock_init(&card->buffer_lock); - /* set up GPIO CS line. 
TODO: use regular CS line */ - err = gpio_request(card->gpio_cs, "if_spi_gpio_chip_select"); - if (err) - goto free_card; - err = gpio_direction_output(card->gpio_cs, 1); - if (err) - goto free_gpio; - /* Initialize the SPI Interface Unit */ err = spu_init(card, pdata->use_dummy_writes); if (err) - goto free_gpio; + goto free_card; err = spu_get_chip_revision(card, &card->card_id, &card->card_rev); if (err) - goto free_gpio; + goto free_card; /* Firmware load */ err = spu_read_u32(card, IF_SPI_SCRATCH_4_REG, &scratch); if (err) - goto free_gpio; + goto free_card; if (scratch == SUCCESSFUL_FW_DOWNLOAD_MAGIC) lbs_deb_spi("Firmware is already loaded for " "Marvell WLAN 802.11 adapter\n"); @@ -1085,7 +1084,7 @@ static int __devinit if_spi_probe(struct spi_device *spi) err = if_spi_calculate_fw_names(card->card_id, card->helper_fw_name, card->main_fw_name); if (err) - goto free_gpio; + goto free_card; lbs_deb_spi("Initializing FW for Marvell WLAN 802.11 adapter " "(chip_id = 0x%04x, chip_rev = 0x%02x) " @@ -1096,23 +1095,23 @@ static int __devinit if_spi_probe(struct spi_device *spi) spi->max_speed_hz); err = if_spi_prog_helper_firmware(card); if (err) - goto free_gpio; + goto free_card; err = if_spi_prog_main_firmware(card); if (err) - goto free_gpio; + goto free_card; lbs_deb_spi("loaded FW for Marvell WLAN 802.11 adapter\n"); } err = spu_set_interrupt_mode(card, 0, 1); if (err) - goto free_gpio; + goto free_card; /* Register our card with libertas. * This will call alloc_etherdev */ priv = lbs_add_card(card, &spi->dev); if (!priv) { err = -ENOMEM; - goto free_gpio; + goto free_card; } card->priv = priv; priv->card = card; @@ -1157,8 +1156,6 @@ terminate_thread: if_spi_terminate_spi_thread(card); remove_card: lbs_remove_card(priv); /* will call free_netdev */ -free_gpio: - gpio_free(card->gpio_cs); free_card: free_if_spi_card(card); out: @@ -1179,7 +1176,6 @@ static int __devexit libertas_spi_remove(struct spi_device *spi) free_irq(spi->irq, card); if_spi_terminate_spi_thread(card); lbs_remove_card(priv); /* will call free_netdev */ - gpio_free(card->gpio_cs); if (card->pdata->teardown) card->pdata->teardown(spi); free_if_spi_card(card); diff --git a/include/linux/spi/libertas_spi.h b/include/linux/spi/libertas_spi.h index 79506f5f9e6..1b5d5384fcd 100644 --- a/include/linux/spi/libertas_spi.h +++ b/include/linux/spi/libertas_spi.h @@ -22,9 +22,6 @@ struct libertas_spi_platform_data { * speed, you may want to use 0 here. */ u16 use_dummy_writes; - /* GPIO number to use as chip select */ - u16 gpio_cs; - /* Board specific setup/teardown */ int (*setup)(struct spi_device *spi); int (*teardown)(struct spi_device *spi); -- cgit v1.2.3-70-g09d2 From 8f77f3849cc3ae2d6df9301785a3d316ea7d7ee1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 7 Jun 2009 21:58:37 +0200 Subject: mac80211: do not pass PS frames out of mac80211 again In order to handle powersave frames properly we had needed to pass these out to the device queues again, and introduce the skb->requeue bit. This, however, also has unnecessary overhead by needing to 'clean up' already tried frames, and this clean-up code is also buggy when software encryption is used. Instead of sending the frames via the master netdev queue again, simply put them into the pending queue. This also fixes a problem where frames for that particular station could be reordered when some were still on the software queues and older ones are re-injected into the software queue after them. Signed-off-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/skbuff.h | 4 --- include/net/mac80211.h | 3 +++ net/core/skbuff.c | 1 - net/mac80211/ieee80211_i.h | 5 ++++ net/mac80211/main.c | 61 +++++----------------------------------------- net/mac80211/rx.c | 25 +++++++------------ net/mac80211/tx.c | 3 ++- net/mac80211/util.c | 46 ++++++++++++++++++++++++++++++++++ net/mac80211/wme.c | 2 +- 9 files changed, 72 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f1c93b878b3..fa51293f270 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -304,9 +304,6 @@ typedef unsigned char *sk_buff_data_t; * @tc_verd: traffic control verdict * @ndisc_nodetype: router type (from link layer) * @do_not_encrypt: set to prevent encryption of this frame - * @requeue: set to indicate that the wireless core should attempt - * a software retry on this frame if we failed to - * receive an ACK for it * @dma_cookie: a cookie to one of several possible DMA operations * done by skb DMA functions * @secmark: security marking @@ -380,7 +377,6 @@ struct sk_buff { #endif #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) __u8 do_not_encrypt:1; - __u8 requeue:1; #endif /* 0/13/14 bit hole */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 17d61d19d91..c0610447697 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -239,6 +239,8 @@ struct ieee80211_bss_conf { * @IEEE80211_TX_INTFL_NEED_TXPROCESSING: completely internal to mac80211, * used to indicate that a pending frame requires TX processing before * it can be sent out. + * @IEEE80211_TX_INTFL_RETRIED: completely internal to mac80211, + * used to indicate that a frame was already retried due to PS */ enum mac80211_tx_control_flags { IEEE80211_TX_CTL_REQ_TX_STATUS = BIT(0), @@ -256,6 +258,7 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTL_RATE_CTRL_PROBE = BIT(12), IEEE80211_TX_INTFL_RCALGO = BIT(13), IEEE80211_TX_INTFL_NEED_TXPROCESSING = BIT(14), + IEEE80211_TX_INTFL_RETRIED = BIT(15), }; /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 49961ba3c0f..b94d777e3eb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -552,7 +552,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->vlan_tci = old->vlan_tci; #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) new->do_not_encrypt = old->do_not_encrypt; - new->requeue = old->requeue; #endif skb_copy_secmark(new, old); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c088c46704a..4dbc2896419 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -589,6 +589,7 @@ enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_AGGREGATION, IEEE80211_QUEUE_STOP_REASON_SUSPEND, IEEE80211_QUEUE_STOP_REASON_PENDING, + IEEE80211_QUEUE_STOP_REASON_SKB_ADD, }; struct ieee80211_master_priv { @@ -1121,6 +1122,10 @@ void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason); void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason); +void ieee80211_add_pending_skb(struct ieee80211_local *local, + struct sk_buff *skb); +int ieee80211_add_pending_skbs(struct ieee80211_local *local, + struct sk_buff_head *skbs); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 2683df91807..092a017b237 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ 
-369,60 +369,12 @@ static void ieee80211_tasklet_handler(unsigned long data) } } -/* Remove added headers (e.g., QoS control), encryption header/MIC, etc. to - * make a prepared TX frame (one that has been given to hw) to look like brand - * new IEEE 802.11 frame that is ready to go through TX processing again. - */ -static void ieee80211_remove_tx_extra(struct ieee80211_local *local, - struct ieee80211_key *key, - struct sk_buff *skb) -{ - unsigned int hdrlen, iv_len, mic_len; - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - - hdrlen = ieee80211_hdrlen(hdr->frame_control); - - if (!key) - goto no_key; - - switch (key->conf.alg) { - case ALG_WEP: - iv_len = WEP_IV_LEN; - mic_len = WEP_ICV_LEN; - break; - case ALG_TKIP: - iv_len = TKIP_IV_LEN; - mic_len = TKIP_ICV_LEN; - break; - case ALG_CCMP: - iv_len = CCMP_HDR_LEN; - mic_len = CCMP_MIC_LEN; - break; - default: - goto no_key; - } - - if (skb->len >= hdrlen + mic_len && - !(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) - skb_trim(skb, skb->len - mic_len); - if (skb->len >= hdrlen + iv_len) { - memmove(skb->data + iv_len, skb->data, hdrlen); - hdr = (struct ieee80211_hdr *)skb_pull(skb, iv_len); - } - -no_key: - if (ieee80211_is_data_qos(hdr->frame_control)) { - hdr->frame_control &= ~cpu_to_le16(IEEE80211_STYPE_QOS_DATA); - memmove(skb->data + IEEE80211_QOS_CTL_LEN, skb->data, - hdrlen - IEEE80211_QOS_CTL_LEN); - skb_pull(skb, IEEE80211_QOS_CTL_LEN); - } -} - static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + sta->tx_filtered_count++; /* @@ -464,16 +416,15 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, */ if (test_sta_flags(sta, WLAN_STA_PS) && skb_queue_len(&sta->tx_filtered) < STA_MAX_TX_BUFFER) { - ieee80211_remove_tx_extra(local, sta->key, skb); skb_queue_tail(&sta->tx_filtered, skb); return; } - if (!test_sta_flags(sta, WLAN_STA_PS) && !skb->requeue) { + if (!test_sta_flags(sta, WLAN_STA_PS) && + !(info->flags & IEEE80211_TX_INTFL_RETRIED)) { /* Software retry the packet once */ - skb->requeue = 1; - ieee80211_remove_tx_extra(local, sta->key, skb); - dev_queue_xmit(skb); + info->flags |= IEEE80211_TX_INTFL_RETRIED; + ieee80211_add_pending_skb(local, skb); return; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 75412518510..de5bba7f910 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -797,8 +797,7 @@ static int ap_sta_ps_end(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; - struct sk_buff *skb; - int sent = 0; + int sent, buffered; atomic_dec(&sdata->bss->num_sta_ps); @@ -814,22 +813,16 @@ static int ap_sta_ps_end(struct sta_info *sta) #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ /* Send all buffered frames to the station */ - while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL) { - sent++; - skb->requeue = 1; - dev_queue_xmit(skb); - } - while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) { - local->total_ps_buffered--; - sent++; + sent = ieee80211_add_pending_skbs(local, &sta->tx_filtered); + buffered = ieee80211_add_pending_skbs(local, &sta->ps_tx_buf); + sent += buffered; + local->total_ps_buffered -= buffered; + #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG - printk(KERN_DEBUG "%s: STA %pM aid %d send PS frame " - "since STA not sleeping anymore\n", sdata->dev->name, - sta->sta.addr, sta->sta.aid); + printk(KERN_DEBUG "%s: STA %pM aid %d sending %d filtered/%d PS 
frames " + "since STA not sleeping anymore\n", sdata->dev->name, + sta->sta.addr, sta->sta.aid, sent - buffered, buffered); #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ - skb->requeue = 1; - dev_queue_xmit(skb); - } return sent; } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1436f747531..bfaa9ce3314 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -400,6 +400,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) sta_info_set_tim_bit(sta); info->control.jiffies = jiffies; + info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; skb_queue_tail(&sta->ps_tx_buf, tx->skb); return TX_QUEUED; } @@ -420,7 +421,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) * frame filtering and keeps a station blacklist on its own * (e.g: p54), so that frames can be delivered unimpeded. * - * Note: It should be save to disable the filter now. + * Note: It should be safe to disable the filter now. * As, it is really unlikely that we still have any pending * frame for this station in the hw's buffers/fifos left, * that is not rejected with a unsuccessful tx_status yet. diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 22f63815fb3..66ce96a69f3 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -341,6 +341,52 @@ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue) } EXPORT_SYMBOL(ieee80211_stop_queue); +void ieee80211_add_pending_skb(struct ieee80211_local *local, + struct sk_buff *skb) +{ + struct ieee80211_hw *hw = &local->hw; + unsigned long flags; + int queue = skb_get_queue_mapping(skb); + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_PENDING); + skb_queue_tail(&local->pending[queue], skb); + __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + +int ieee80211_add_pending_skbs(struct ieee80211_local *local, + struct sk_buff_head *skbs) +{ + struct ieee80211_hw *hw = &local->hw; + struct sk_buff *skb; + unsigned long flags; + int queue, ret = 0, i; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + for (i = 0; i < hw->queues; i++) + __ieee80211_stop_queue(hw, i, + IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + + while ((skb = skb_dequeue(skbs))) { + ret++; + queue = skb_get_queue_mapping(skb); + skb_queue_tail(&local->pending[queue], skb); + } + + for (i = 0; i < hw->queues; i++) { + if (ret) + __ieee80211_stop_queue(hw, i, + IEEE80211_QUEUE_STOP_REASON_PENDING); + __ieee80211_wake_queue(hw, i, + IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + } + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + + return ret; +} + void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, enum queue_stop_reason reason) { diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 694343b9102..116a923b14d 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -101,7 +101,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) * Now we know the 1d priority, fill in the QoS header if * there is one (and we haven't done this before). 
*/ - if (!skb->requeue && ieee80211_is_data_qos(hdr->frame_control)) { + if (ieee80211_is_data_qos(hdr->frame_control)) { u8 *p = ieee80211_get_qos_ctl(hdr); u8 ack_policy = 0; tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; -- cgit v1.2.3-70-g09d2 From b3fa1329eaf2a7b97124dacf5b663fd51346ac19 Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Mon, 8 Jun 2009 13:27:27 +0100 Subject: rfkill: remove set_global_sw_state rfkill_set_global_sw_state() (previously rfkill_set_default()) will no longer be exported by the rewritten rfkill core. Instead, platform drivers which can provide persistent soft-rfkill state across power-down/reboot should indicate their initial state by calling rfkill_set_sw_state() before registration. Otherwise, they will be initialized to a default value during registration by a set_block call. We remove existing calls to rfkill_set_sw_state() which happen before registration, since these had no effect in the old model. If these drivers do have persistent state, the calls can be put back (subject to testing :-). This affects hp-wmi and acer-wmi. Drivers with persistent state will affect the global state only if rfkill-input is enabled. This is required, otherwise booting with wireless soft-blocked and pressing the wireless-toggle key once would have no apparent effect. This special case will be removed in future along with rfkill-input, in favour of a more flexible userspace daemon (see Documentation/feature-removal-schedule.txt). Now rfkill_global_states[n].def is only used to preserve global states over EPO, it is renamed to ".sav". Signed-off-by: Alan Jenkins Acked-by: Henrique de Moraes Holschuh Signed-off-by: John W. Linville --- drivers/platform/x86/acer-wmi.c | 3 -- drivers/platform/x86/eeepc-laptop.c | 8 ++-- drivers/platform/x86/hp-wmi.c | 4 -- drivers/platform/x86/thinkpad_acpi.c | 31 +++++++------- include/linux/rfkill.h | 28 ++++--------- net/rfkill/core.c | 81 +++++++++++++----------------------- 6 files changed, 56 insertions(+), 99 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index b618fa51db2..09a503e5da6 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -988,7 +988,6 @@ static struct rfkill *acer_rfkill_register(struct device *dev, char *name, u32 cap) { int err; - u32 state; struct rfkill *rfkill_dev; rfkill_dev = rfkill_alloc(name, dev, type, @@ -996,8 +995,6 @@ static struct rfkill *acer_rfkill_register(struct device *dev, (void *)(unsigned long)cap); if (!rfkill_dev) return ERR_PTR(-ENOMEM); - get_u32(&state, cap); - rfkill_set_sw_state(rfkill_dev, !state); err = rfkill_register(rfkill_dev); if (err) { diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c index 1208d0cedd1..03bf522bd7a 100644 --- a/drivers/platform/x86/eeepc-laptop.c +++ b/drivers/platform/x86/eeepc-laptop.c @@ -675,8 +675,8 @@ static int eeepc_hotk_add(struct acpi_device *device) if (!ehotk->eeepc_wlan_rfkill) goto wlan_fail; - rfkill_set_global_sw_state(RFKILL_TYPE_WLAN, - get_acpi(CM_ASL_WLAN) != 1); + rfkill_set_sw_state(ehotk->eeepc_wlan_rfkill, + get_acpi(CM_ASL_WLAN) != 1); result = rfkill_register(ehotk->eeepc_wlan_rfkill); if (result) goto wlan_fail; @@ -693,8 +693,8 @@ static int eeepc_hotk_add(struct acpi_device *device) if (!ehotk->eeepc_bluetooth_rfkill) goto bluetooth_fail; - rfkill_set_global_sw_state(RFKILL_TYPE_BLUETOOTH, - get_acpi(CM_ASL_BLUETOOTH) != 1); + rfkill_set_sw_state(ehotk->eeepc_bluetooth_rfkill, + 
get_acpi(CM_ASL_BLUETOOTH) != 1); result = rfkill_register(ehotk->eeepc_bluetooth_rfkill); if (result) goto bluetooth_fail; diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index 8d931145cbf..16fffe44e33 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -422,7 +422,6 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) RFKILL_TYPE_WLAN, &hp_wmi_rfkill_ops, (void *) 0); - rfkill_set_sw_state(wifi_rfkill, hp_wmi_wifi_state()); err = rfkill_register(wifi_rfkill); if (err) goto register_wifi_error; @@ -433,8 +432,6 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) RFKILL_TYPE_BLUETOOTH, &hp_wmi_rfkill_ops, (void *) 1); - rfkill_set_sw_state(bluetooth_rfkill, - hp_wmi_bluetooth_state()); err = rfkill_register(bluetooth_rfkill); if (err) goto register_bluetooth_error; @@ -445,7 +442,6 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) RFKILL_TYPE_WWAN, &hp_wmi_rfkill_ops, (void *) 2); - rfkill_set_sw_state(wwan_rfkill, hp_wmi_wwan_state()); err = rfkill_register(wwan_rfkill); if (err) goto register_wwan_err; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index cfcafa4e947..86e958539f4 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -1168,21 +1168,6 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id, BUG_ON(id >= TPACPI_RFK_SW_MAX || tpacpi_rfkill_switches[id]); - initial_sw_status = (tp_rfkops->get_status)(); - if (initial_sw_status < 0) { - printk(TPACPI_ERR - "failed to read initial state for %s, error %d; " - "will turn radio off\n", name, initial_sw_status); - } else { - initial_sw_state = (initial_sw_status == TPACPI_RFK_RADIO_OFF); - if (set_default) { - /* try to set the initial state as the default for the - * rfkill type, since we ask the firmware to preserve - * it across S5 in NVRAM */ - rfkill_set_global_sw_state(rfktype, initial_sw_state); - } - } - atp_rfk = kzalloc(sizeof(struct tpacpi_rfk), GFP_KERNEL); if (atp_rfk) atp_rfk->rfkill = rfkill_alloc(name, @@ -1200,8 +1185,20 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id, atp_rfk->id = id; atp_rfk->ops = tp_rfkops; - rfkill_set_states(atp_rfk->rfkill, initial_sw_state, - tpacpi_rfk_check_hwblock_state()); + initial_sw_status = (tp_rfkops->get_status)(); + if (initial_sw_status < 0) { + printk(TPACPI_ERR + "failed to read initial state for %s, error %d\n", + name, initial_sw_status); + } else { + initial_sw_state = (initial_sw_status == TPACPI_RFK_RADIO_OFF); + if (set_default) { + /* try to keep the initial state, since we ask the + * firmware to preserve it across S5 in NVRAM */ + rfkill_set_sw_state(atp_rfk->rfkill, initial_sw_state); + } + } + rfkill_set_hw_state(atp_rfk->rfkill, tpacpi_rfk_check_hwblock_state()); res = rfkill_register(atp_rfk->rfkill); if (res < 0) { diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index d7e818ad0bc..c1dca0b8138 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -157,8 +157,14 @@ struct rfkill * __must_check rfkill_alloc(const char *name, * @rfkill: rfkill structure to be registered * * This function should be called by the transmitter driver to register - * the rfkill structure needs to be registered. Before calling this function - * the driver needs to be ready to service method calls from rfkill. + * the rfkill structure. Before calling this function the driver needs + * to be ready to service method calls from rfkill. 
+ * + * If the software blocked state is not set before registration, + * set_block will be called to initialize it to a default value. + * + * If the hardware blocked state is not set before registration, + * it is assumed to be unblocked. */ int __must_check rfkill_register(struct rfkill *rfkill); @@ -250,19 +256,6 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked); */ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw); -/** - * rfkill_set_global_sw_state - set global sw block default - * @type: rfkill type to set default for - * @blocked: default to set - * - * This function sets the global default -- use at boot if your platform has - * an rfkill switch. If not early enough this call may be ignored. - * - * XXX: instead of ignoring -- how about just updating all currently - * registered drivers? - */ -void rfkill_set_global_sw_state(const enum rfkill_type type, bool blocked); - /** * rfkill_blocked - query rfkill block * @@ -317,11 +310,6 @@ static inline void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) { } -static inline void rfkill_set_global_sw_state(const enum rfkill_type type, - bool blocked) -{ -} - static inline bool rfkill_blocked(struct rfkill *rfkill) { return false; diff --git a/net/rfkill/core.c b/net/rfkill/core.c index e161ebc40a3..fa430bd03f1 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -57,6 +57,7 @@ struct rfkill { bool registered; bool suspended; + bool persistent; const struct rfkill_ops *ops; void *data; @@ -116,11 +117,9 @@ MODULE_PARM_DESC(default_state, "Default initial state for all radio types, 0 = radio off"); static struct { - bool cur, def; + bool cur, sav; } rfkill_global_states[NUM_RFKILL_TYPES]; -static unsigned long rfkill_states_default_locked; - static bool rfkill_epo_lock_active; @@ -392,7 +391,7 @@ void rfkill_epo(void) rfkill_set_block(rfkill, true); for (i = 0; i < NUM_RFKILL_TYPES; i++) { - rfkill_global_states[i].def = rfkill_global_states[i].cur; + rfkill_global_states[i].sav = rfkill_global_states[i].cur; rfkill_global_states[i].cur = true; } @@ -417,7 +416,7 @@ void rfkill_restore_states(void) rfkill_epo_lock_active = false; for (i = 0; i < NUM_RFKILL_TYPES; i++) - __rfkill_switch_all(i, rfkill_global_states[i].def); + __rfkill_switch_all(i, rfkill_global_states[i].sav); mutex_unlock(&rfkill_global_mutex); } @@ -464,29 +463,6 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type) } #endif -void rfkill_set_global_sw_state(const enum rfkill_type type, bool blocked) -{ - BUG_ON(type == RFKILL_TYPE_ALL); - - mutex_lock(&rfkill_global_mutex); - - /* don't allow unblock when epo */ - if (rfkill_epo_lock_active && !blocked) - goto out; - - /* too late */ - if (rfkill_states_default_locked & BIT(type)) - goto out; - - rfkill_states_default_locked |= BIT(type); - - rfkill_global_states[type].cur = blocked; - rfkill_global_states[type].def = blocked; - out: - mutex_unlock(&rfkill_global_mutex); -} -EXPORT_SYMBOL(rfkill_set_global_sw_state); - bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) { @@ -532,13 +508,14 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) blocked = blocked || hwblock; spin_unlock_irqrestore(&rfkill->lock, flags); - if (!rfkill->registered) - return blocked; + if (!rfkill->registered) { + rfkill->persistent = true; + } else { + if (prev != blocked && !hwblock) + schedule_work(&rfkill->uevent_work); - if (prev != blocked && !hwblock) - schedule_work(&rfkill->uevent_work); - - rfkill_led_trigger_event(rfkill); + 
rfkill_led_trigger_event(rfkill); + } return blocked; } @@ -563,13 +540,14 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) spin_unlock_irqrestore(&rfkill->lock, flags); - if (!rfkill->registered) - return; - - if (swprev != sw || hwprev != hw) - schedule_work(&rfkill->uevent_work); + if (!rfkill->registered) { + rfkill->persistent = true; + } else { + if (swprev != sw || hwprev != hw) + schedule_work(&rfkill->uevent_work); - rfkill_led_trigger_event(rfkill); + rfkill_led_trigger_event(rfkill); + } } EXPORT_SYMBOL(rfkill_set_states); @@ -888,15 +866,6 @@ int __must_check rfkill_register(struct rfkill *rfkill) dev_set_name(dev, "rfkill%lu", rfkill_no); rfkill_no++; - if (!(rfkill_states_default_locked & BIT(rfkill->type))) { - /* first of its kind */ - BUILD_BUG_ON(NUM_RFKILL_TYPES > - sizeof(rfkill_states_default_locked) * 8); - rfkill_states_default_locked |= BIT(rfkill->type); - rfkill_global_states[rfkill->type].cur = - rfkill_global_states[rfkill->type].def; - } - list_add_tail(&rfkill->node, &rfkill_list); error = device_add(dev); @@ -916,7 +885,17 @@ int __must_check rfkill_register(struct rfkill *rfkill) if (rfkill->ops->poll) schedule_delayed_work(&rfkill->poll_work, round_jiffies_relative(POLL_INTERVAL)); - schedule_work(&rfkill->sync_work); + + if (!rfkill->persistent || rfkill_epo_lock_active) { + schedule_work(&rfkill->sync_work); + } else { +#ifdef CONFIG_RFKILL_INPUT + bool soft_blocked = !!(rfkill->state & RFKILL_BLOCK_SW); + + if (!atomic_read(&rfkill_input_disabled)) + __rfkill_switch_all(rfkill->type, soft_blocked); +#endif + } rfkill_send_events(rfkill, RFKILL_OP_ADD); @@ -1193,7 +1172,7 @@ static int __init rfkill_init(void) int i; for (i = 0; i < NUM_RFKILL_TYPES; i++) - rfkill_global_states[i].def = !rfkill_default_state; + rfkill_global_states[i].cur = !rfkill_default_state; error = class_register(&rfkill_class); if (error) -- cgit v1.2.3-70-g09d2 From 908209c160da8ecb68052111972b7a21310eac3f Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Mon, 8 Jun 2009 13:12:23 +0100 Subject: rfkill: don't impose global states on resume (just restore the previous states) Once rfkill-input is disabled, the "global" states will only be used as default initial states. Since the states will always be the same after resume, we shouldn't generate events on resume. Signed-off-by: Alan Jenkins Signed-off-by: John W. Linville --- include/linux/rfkill.h | 7 ++++--- net/rfkill/core.c | 6 +----- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index c1dca0b8138..16e39c7a67f 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -212,7 +212,7 @@ void rfkill_destroy(struct rfkill *rfkill); * * rfkill drivers that get events when the hard-blocked state changes * use this function to notify the rfkill core (and through that also - * userspace) of the current state -- they should also use this after + * userspace) of the current state. They should also use this after * resume if the state could have changed. * * You need not (but may) call this function if poll_state is assigned. 
@@ -234,8 +234,9 @@ bool __must_check rfkill_set_hw_state(struct rfkill *rfkill, bool blocked); * rfkill drivers that get events when the soft-blocked state changes * (yes, some platforms directly act on input but allow changing again) * use this function to notify the rfkill core (and through that also - * userspace) of the current state -- they should also use this after - * resume if the state could have changed. + * userspace) of the current state. It is not necessary to notify on + * resume; since hibernation can always change the soft-blocked state, + * the rfkill core will unconditionally restore the previous state. * * This function can be called in any context, even from within rfkill * callbacks. diff --git a/net/rfkill/core.c b/net/rfkill/core.c index fa430bd03f1..4e68ab439d5 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -728,15 +728,11 @@ static int rfkill_resume(struct device *dev) struct rfkill *rfkill = to_rfkill(dev); bool cur; - mutex_lock(&rfkill_global_mutex); - cur = rfkill_global_states[rfkill->type].cur; + cur = !!(rfkill->state & RFKILL_BLOCK_SW); rfkill_set_block(rfkill, cur); - mutex_unlock(&rfkill_global_mutex); rfkill->suspended = false; - schedule_work(&rfkill->uevent_work); - rfkill_resume_polling(rfkill); return 0; -- cgit v1.2.3-70-g09d2 From d9c7d394a8ebacb60097b192939ae9f15235225e Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 10 Jun 2009 12:57:06 -0700 Subject: block: prevent possible io_context->refcount overflow Currently io_context has an atomic_t (32-bit) as refcount. In the case of cfq, for each device against which a task does I/O, a reference to the io_context would be taken. And when multiple processes share an io_context (CLONE_IO), each of them holds a reference to the same io_context. Theoretically, the maximum number of processes sharing the same io_context plus the number of disks/cfq_data referring to the same io_context can overflow the 32-bit counter on a very high-end machine. Even though it is an improbable case, let us make it atomic_long_t.
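As a rough sketch of the resulting pattern (hypothetical, cut-down type and function names -- the real conversion is in the diff below), the change simply swaps the atomic_t API for its atomic_long_t counterpart, which is 64 bits wide on 64-bit kernels:

	/* illustrative only, not part of this patch */
	struct ioc_like {
		atomic_long_t refcount;	/* was: atomic_t refcount */
	};

	static void ioc_like_get(struct ioc_like *ioc)
	{
		/* effectively cannot wrap on 64-bit machines */
		atomic_long_inc(&ioc->refcount);
	}

	static void ioc_like_put(struct ioc_like *ioc)
	{
		if (atomic_long_dec_and_test(&ioc->refcount))
			kfree(ioc);	/* last reference dropped */
	}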
Signed-off-by: Nikanth Karthikesan Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-ioc.c | 12 ++++++------ block/cfq-iosched.c | 2 +- include/linux/iocontext.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 012f065ac8e..d4ed6000147 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -35,9 +35,9 @@ int put_io_context(struct io_context *ioc) if (ioc == NULL) return 1; - BUG_ON(atomic_read(&ioc->refcount) == 0); + BUG_ON(atomic_long_read(&ioc->refcount) == 0); - if (atomic_dec_and_test(&ioc->refcount)) { + if (atomic_long_dec_and_test(&ioc->refcount)) { rcu_read_lock(); if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); @@ -90,7 +90,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); if (ret) { - atomic_set(&ret->refcount, 1); + atomic_long_set(&ret->refcount, 1); atomic_set(&ret->nr_tasks, 1); spin_lock_init(&ret->lock); ret->ioprio_changed = 0; @@ -151,7 +151,7 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node) ret = current_io_context(gfp_flags, node); if (unlikely(!ret)) break; - } while (!atomic_inc_not_zero(&ret->refcount)); + } while (!atomic_long_inc_not_zero(&ret->refcount)); return ret; } @@ -163,8 +163,8 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc) struct io_context *dst = *pdst; if (src) { - BUG_ON(atomic_read(&src->refcount) == 0); - atomic_inc(&src->refcount); + BUG_ON(atomic_long_read(&src->refcount) == 0); + atomic_long_inc(&src->refcount); put_io_context(dst); *pdst = src; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 99ac4304d71..ef2f72d4243 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1282,7 +1282,7 @@ static void cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) if (!cfqd->active_cic) { struct cfq_io_context *cic = RQ_CIC(rq); - atomic_inc(&cic->ioc->refcount); + atomic_long_inc(&cic->ioc->refcount); cfqd->active_cic = cic; } } diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 08b987bccf8..dd05434fa45 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -64,7 +64,7 @@ struct cfq_io_context { * and kmalloc'ed. These could be shared between processes. */ struct io_context { - atomic_t refcount; + atomic_long_t refcount; atomic_t nr_tasks; /* all the fields below are protected by this lock */ @@ -91,8 +91,8 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) * if ref count is zero, don't allow sharing (ioc is going away, it's * a race). */ - if (ioc && atomic_inc_not_zero(&ioc->refcount)) { - atomic_inc(&ioc->nr_tasks); + if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) { + atomic_inc(&ioc->nr_tasks); return ioc; } -- cgit v1.2.3-70-g09d2 From b43d65f7e818485664037a46367cfb15af05bd8c Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 9 Jun 2009 08:11:42 +0100 Subject: [ARM] 5546/1: ARM PL022 SSP/SPI driver v3 This adds a driver for the ARM PL022 PrimeCell SSP/SPI controller found in the U300 platforms as well as in some ARM reference hardware, and in a modified version on the Nomadik board.
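As a hypothetical usage sketch (board name, bus number, speed and modalias are invented; the structures, enums and chip select callback convention are the ones added by this patch), a machine would hand the controller a struct pl022_ssp_controller via platform data and each client device a struct pl022_config_chip:

	/* illustrative board code, not part of this patch */
	static void board_cs_control(u32 command)
	{
		/* drive a GPIO here on SSP_CHIP_SELECT/SSP_CHIP_DESELECT */
	}

	static struct pl022_ssp_controller ssp_plat_data = {
		.bus_id = 0,		/* becomes master->bus_num */
		.num_chipselect = 1,
		.enable_dma = 0,	/* DMA is not implemented yet */
	};

	static struct pl022_config_chip dummy_chip_info = {
		.com_mode = POLLING_TRANSFER,
		.iface = SSP_INTERFACE_MOTOROLA_SPI,
		.hierarchy = SSP_MASTER,
		.data_size = SSP_DATA_BITS_8,
		.clk_pol = SSP_CLK_POL_IDLE_LOW,
		.clk_phase = SSP_CLK_FALLING_EDGE,
		.cs_control = board_cs_control,
		/* remaining fields (.dev, .clk_freq, ...) omitted here;
		 * pl022_setup() fills in defaults when no controller_data
		 * is supplied at all */
	};

	static struct spi_board_info board_spi_devices[] __initdata = {
		{
			.modalias = "dummy-spidev",
			.max_speed_hz = 1000000,
			.bus_num = 0,	/* matches ssp_plat_data.bus_id */
			.chip_select = 0,
			.controller_data = &dummy_chip_info,
		},
	};

The pl022_ssp_controller data travels to the driver as amba_device platform_data, while spi_register_board_info() makes the client device appear once the master with that bus number registers.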
Reviewed-by: Alessandro Rubini Reviewed-by: Russell King Reviewed-by: Baruch Siach Signed-off-by: Linus Walleij Signed-off-by: Russell King --- drivers/spi/Kconfig | 9 + drivers/spi/Makefile | 1 + drivers/spi/amba-pl022.c | 1866 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/amba/pl022.h | 264 +++++++ 4 files changed, 2140 insertions(+) create mode 100644 drivers/spi/amba-pl022.c create mode 100644 include/linux/amba/pl022.h (limited to 'include/linux') diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 7c61251bea6..8e7c17e4461 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -171,6 +171,15 @@ config SPI_ORION help This enables using the SPI master controller on the Orion chips. +config SPI_PL022 + tristate "ARM AMBA PL022 SSP controller (EXPERIMENTAL)" + depends on ARM_AMBA && EXPERIMENTAL + default y if MACH_U300 + help + This selects the ARM(R) AMBA(R) PrimeCell PL022 SSP + controller. If you have an embedded system with an AMBA(R) + bus and a PL022 controller, say Y or M here. + config SPI_PXA2XX tristate "PXA2xx SSP SPI master" depends on ARCH_PXA && EXPERIMENTAL diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile index 5d0451936d8..ecfadb18048 100644 --- a/drivers/spi/Makefile +++ b/drivers/spi/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_SPI_PXA2XX) += pxa2xx_spi.o obj-$(CONFIG_SPI_OMAP_UWIRE) += omap_uwire.o obj-$(CONFIG_SPI_OMAP24XX) += omap2_mcspi.o obj-$(CONFIG_SPI_ORION) += orion_spi.o +obj-$(CONFIG_SPI_PL022) += amba-pl022.o obj-$(CONFIG_SPI_MPC52xx_PSC) += mpc52xx_psc_spi.o obj-$(CONFIG_SPI_MPC83xx) += spi_mpc83xx.o obj-$(CONFIG_SPI_S3C24XX_GPIO) += spi_s3c24xx_gpio.o diff --git a/drivers/spi/amba-pl022.c b/drivers/spi/amba-pl022.c new file mode 100644 index 00000000000..da76797ce8b --- /dev/null +++ b/drivers/spi/amba-pl022.c @@ -0,0 +1,1866 @@ +/* + * drivers/spi/amba-pl022.c + * + * A driver for the ARM PL022 PrimeCell SSP/SPI bus master. + * + * Copyright (C) 2008-2009 ST-Ericsson AB + * Copyright (C) 2006 STMicroelectronics Pvt. Ltd. + * + * Author: Linus Walleij + * + * Initial version inspired by: + * linux-2.6.17-rc3-mm1/drivers/spi/pxa2xx_spi.c + * Initial adaptation to PL022 by: + * Sachin Verma + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* + * TODO: + * - add timeout on polled transfers + * - add generic DMA framework support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This macro is used to define some register default values. + * reg is masked with mask, then OR:ed with an (again masked) + * val shifted sb steps to the left. + */ +#define SSP_WRITE_BITS(reg, val, mask, sb) \ + ((reg) = (((reg) & ~(mask)) | (((val)<<(sb)) & (mask)))) + +/* + * This macro is also used to define some default values. + * It will just shift val by sb steps to the left and mask + * the result with mask.
+ */ +#define GEN_MASK_BITS(val, mask, sb) \ + (((val)<<(sb)) & (mask)) + +#define DRIVE_TX 0 +#define DO_NOT_DRIVE_TX 1 + +#define DO_NOT_QUEUE_DMA 0 +#define QUEUE_DMA 1 + +#define RX_TRANSFER 1 +#define TX_TRANSFER 2 + +/* + * Macros to access SSP Registers with their offsets + */ +#define SSP_CR0(r) (r + 0x000) +#define SSP_CR1(r) (r + 0x004) +#define SSP_DR(r) (r + 0x008) +#define SSP_SR(r) (r + 0x00C) +#define SSP_CPSR(r) (r + 0x010) +#define SSP_IMSC(r) (r + 0x014) +#define SSP_RIS(r) (r + 0x018) +#define SSP_MIS(r) (r + 0x01C) +#define SSP_ICR(r) (r + 0x020) +#define SSP_DMACR(r) (r + 0x024) +#define SSP_ITCR(r) (r + 0x080) +#define SSP_ITIP(r) (r + 0x084) +#define SSP_ITOP(r) (r + 0x088) +#define SSP_TDR(r) (r + 0x08C) + +#define SSP_PID0(r) (r + 0xFE0) +#define SSP_PID1(r) (r + 0xFE4) +#define SSP_PID2(r) (r + 0xFE8) +#define SSP_PID3(r) (r + 0xFEC) + +#define SSP_CID0(r) (r + 0xFF0) +#define SSP_CID1(r) (r + 0xFF4) +#define SSP_CID2(r) (r + 0xFF8) +#define SSP_CID3(r) (r + 0xFFC) + +/* + * SSP Control Register 0 - SSP_CR0 + */ +#define SSP_CR0_MASK_DSS (0x1FUL << 0) +#define SSP_CR0_MASK_HALFDUP (0x1UL << 5) +#define SSP_CR0_MASK_SPO (0x1UL << 6) +#define SSP_CR0_MASK_SPH (0x1UL << 7) +#define SSP_CR0_MASK_SCR (0xFFUL << 8) +#define SSP_CR0_MASK_CSS (0x1FUL << 16) +#define SSP_CR0_MASK_FRF (0x3UL << 21) + +/* + * SSP Control Register 1 - SSP_CR1 + */ +#define SSP_CR1_MASK_LBM (0x1UL << 0) +#define SSP_CR1_MASK_SSE (0x1UL << 1) +#define SSP_CR1_MASK_MS (0x1UL << 2) +#define SSP_CR1_MASK_SOD (0x1UL << 3) +#define SSP_CR1_MASK_RENDN (0x1UL << 4) +#define SSP_CR1_MASK_TENDN (0x1UL << 5) +#define SSP_CR1_MASK_MWAIT (0x1UL << 6) +#define SSP_CR1_MASK_RXIFLSEL (0x7UL << 7) +#define SSP_CR1_MASK_TXIFLSEL (0x7UL << 10) + +/* + * SSP Data Register - SSP_DR + */ +#define SSP_DR_MASK_DATA 0xFFFFFFFF + +/* + * SSP Status Register - SSP_SR + */ +#define SSP_SR_MASK_TFE (0x1UL << 0) /* Transmit FIFO empty */ +#define SSP_SR_MASK_TNF (0x1UL << 1) /* Transmit FIFO not full */ +#define SSP_SR_MASK_RNE (0x1UL << 2) /* Receive FIFO not empty */ +#define SSP_SR_MASK_RFF (0x1UL << 3) /* Receive FIFO full */ +#define SSP_SR_MASK_BSY (0x1UL << 4) /* Busy Flag */ + +/* + * SSP Clock Prescale Register - SSP_CPSR + */ +#define SSP_CPSR_MASK_CPSDVSR (0xFFUL << 0) + +/* + * SSP Interrupt Mask Set/Clear Register - SSP_IMSC + */ +#define SSP_IMSC_MASK_RORIM (0x1UL << 0) /* Receive Overrun Interrupt mask */ +#define SSP_IMSC_MASK_RTIM (0x1UL << 1) /* Receive timeout Interrupt mask */ +#define SSP_IMSC_MASK_RXIM (0x1UL << 2) /* Receive FIFO Interrupt mask */ +#define SSP_IMSC_MASK_TXIM (0x1UL << 3) /* Transmit FIFO Interrupt mask */ + +/* + * SSP Raw Interrupt Status Register - SSP_RIS + */ +/* Receive Overrun Raw Interrupt status */ +#define SSP_RIS_MASK_RORRIS (0x1UL << 0) +/* Receive Timeout Raw Interrupt status */ +#define SSP_RIS_MASK_RTRIS (0x1UL << 1) +/* Receive FIFO Raw Interrupt status */ +#define SSP_RIS_MASK_RXRIS (0x1UL << 2) +/* Transmit FIFO Raw Interrupt status */ +#define SSP_RIS_MASK_TXRIS (0x1UL << 3) + +/* + * SSP Masked Interrupt Status Register - SSP_MIS + */ +/* Receive Overrun Masked Interrupt status */ +#define SSP_MIS_MASK_RORMIS (0x1UL << 0) +/* Receive Timeout Masked Interrupt status */ +#define SSP_MIS_MASK_RTMIS (0x1UL << 1) +/* Receive FIFO Masked Interrupt status */ +#define SSP_MIS_MASK_RXMIS (0x1UL << 2) +/* Transmit FIFO Masked Interrupt status */ +#define SSP_MIS_MASK_TXMIS (0x1UL << 3) + +/* + * SSP Interrupt Clear Register - SSP_ICR + */ +/* Receive Overrun Raw Clear
Interrupt bit */ +#define SSP_ICR_MASK_RORIC (0x1UL << 0) +/* Receive Timeout Clear Interrupt bit */ +#define SSP_ICR_MASK_RTIC (0x1UL << 1) + +/* + * SSP DMA Control Register - SSP_DMACR + */ +/* Receive DMA Enable bit */ +#define SSP_DMACR_MASK_RXDMAE (0x1UL << 0) +/* Transmit DMA Enable bit */ +#define SSP_DMACR_MASK_TXDMAE (0x1UL << 1) + +/* + * SSP Integration Test control Register - SSP_ITCR + */ +#define SSP_ITCR_MASK_ITEN (0x1UL << 0) +#define SSP_ITCR_MASK_TESTFIFO (0x1UL << 1) + +/* + * SSP Integration Test Input Register - SSP_ITIP + */ +#define ITIP_MASK_SSPRXD (0x1UL << 0) +#define ITIP_MASK_SSPFSSIN (0x1UL << 1) +#define ITIP_MASK_SSPCLKIN (0x1UL << 2) +#define ITIP_MASK_RXDMAC (0x1UL << 3) +#define ITIP_MASK_TXDMAC (0x1UL << 4) +#define ITIP_MASK_SSPTXDIN (0x1UL << 5) + +/* + * SSP Integration Test output Register - SSP_ITOP + */ +#define ITOP_MASK_SSPTXD (0x1UL << 0) +#define ITOP_MASK_SSPFSSOUT (0x1UL << 1) +#define ITOP_MASK_SSPCLKOUT (0x1UL << 2) +#define ITOP_MASK_SSPOEn (0x1UL << 3) +#define ITOP_MASK_SSPCTLOEn (0x1UL << 4) +#define ITOP_MASK_RORINTR (0x1UL << 5) +#define ITOP_MASK_RTINTR (0x1UL << 6) +#define ITOP_MASK_RXINTR (0x1UL << 7) +#define ITOP_MASK_TXINTR (0x1UL << 8) +#define ITOP_MASK_INTR (0x1UL << 9) +#define ITOP_MASK_RXDMABREQ (0x1UL << 10) +#define ITOP_MASK_RXDMASREQ (0x1UL << 11) +#define ITOP_MASK_TXDMABREQ (0x1UL << 12) +#define ITOP_MASK_TXDMASREQ (0x1UL << 13) + +/* + * SSP Test Data Register - SSP_TDR + */ +#define TDR_MASK_TESTDATA (0xFFFFFFFF) + +/* + * Message State + * we use the spi_message.state (void *) pointer to + * hold a single state value, that's why all this + * (void *) casting is done here. + */ +#define STATE_START ((void *) 0) +#define STATE_RUNNING ((void *) 1) +#define STATE_DONE ((void *) 2) +#define STATE_ERROR ((void *) -1) + +/* + * Queue State + */ +#define QUEUE_RUNNING (0) +#define QUEUE_STOPPED (1) +/* + * SSP State - Whether Enabled or Disabled + */ +#define SSP_DISABLED (0) +#define SSP_ENABLED (1) + +/* + * SSP DMA State - Whether DMA Enabled or Disabled + */ +#define SSP_DMA_DISABLED (0) +#define SSP_DMA_ENABLED (1) + +/* + * SSP Clock Defaults + */ +#define NMDK_SSP_DEFAULT_CLKRATE 0x2 +#define NMDK_SSP_DEFAULT_PRESCALE 0x40 + +/* + * SSP Clock Parameter ranges + */ +#define CPSDVR_MIN 0x02 +#define CPSDVR_MAX 0xFE +#define SCR_MIN 0x00 +#define SCR_MAX 0xFF + +/* + * SSP Interrupt related Macros + */ +#define DEFAULT_SSP_REG_IMSC 0x0UL +#define DISABLE_ALL_INTERRUPTS DEFAULT_SSP_REG_IMSC +#define ENABLE_ALL_INTERRUPTS (~DEFAULT_SSP_REG_IMSC) + +#define CLEAR_ALL_INTERRUPTS 0x3 + + +/* + * The type of reading going on on this chip + */ +enum ssp_reading { + READING_NULL, + READING_U8, + READING_U16, + READING_U32 +}; + +/** + * The type of writing going on on this chip + */ +enum ssp_writing { + WRITING_NULL, + WRITING_U8, + WRITING_U16, + WRITING_U32 +}; + +/** + * struct vendor_data - vendor-specific config parameters + * for PL022 derivatives + * @fifodepth: depth of FIFOs (both) + * @max_bpw: maximum number of bits per word + * @unidir: supports unidirectional transfers + */ +struct vendor_data { + int fifodepth; + int max_bpw; + bool unidir; +}; + +/** + * struct pl022 - This is the private SSP driver data structure + * @adev: AMBA device model hookup + * @phybase: The physical memory where the SSP device resides + * @virtbase: The virtual memory where the SSP is mapped + * @master: SPI framework hookup + * @master_info: controller-specific data from machine setup + * @regs: SSP controller registers' virtual
address + * @pump_messages: Work struct for scheduling work to the workqueue + * @lock: spinlock to synchronise access to driver data + * @workqueue: a workqueue on which any spi_message request is queued + * @busy: workqueue is busy + * @run: workqueue is running + * @pump_transfers: Tasklet used in Interrupt Transfer mode + * @cur_msg: Pointer to current spi_message being processed + * @cur_transfer: Pointer to current spi_transfer + * @cur_chip: pointer to the current client's chip (assigned from controller_state) + * @tx: current position in TX buffer to be read + * @tx_end: end position in TX buffer to be read + * @rx: current position in RX buffer to be written + * @rx_end: end position in RX buffer to be written + * @readingtype: the type of read currently going on + * @writingtype: the type of write currently going on + */ +struct pl022 { + struct amba_device *adev; + struct vendor_data *vendor; + resource_size_t phybase; + void __iomem *virtbase; + struct clk *clk; + struct spi_master *master; + struct pl022_ssp_controller *master_info; + /* Driver message queue */ + struct workqueue_struct *workqueue; + struct work_struct pump_messages; + spinlock_t queue_lock; + struct list_head queue; + int busy; + int run; + /* Message transfer pump */ + struct tasklet_struct pump_transfers; + struct spi_message *cur_msg; + struct spi_transfer *cur_transfer; + struct chip_data *cur_chip; + void *tx; + void *tx_end; + void *rx; + void *rx_end; + enum ssp_reading read; + enum ssp_writing write; +}; + +/** + * struct chip_data - To maintain runtime state of SSP for each client chip + * @cr0: Value of control register CR0 of SSP + * @cr1: Value of control register CR1 of SSP + * @dmacr: Value of DMA control Register of SSP + * @cpsr: Value of Clock prescale register + * @n_bytes: how many bytes (a power of 2) are required for a given data width of the client + * @enable_dma: Whether to enable DMA or not + * @write: function ptr to be used to write when doing xfer for this chip + * @read: function ptr to be used to read when doing xfer for this chip + * @cs_control: chip select callback provided by chip + * @xfer_type: polling/interrupt/DMA + * + * Runtime state of the SSP controller, maintained per chip. + * This would be set according to the current message that would be served + */ +struct chip_data { + u16 cr0; + u16 cr1; + u16 dmacr; + u16 cpsr; + u8 n_bytes; + u8 enable_dma:1; + enum ssp_reading read; + enum ssp_writing write; + void (*cs_control) (u32 command); + int xfer_type; +}; + +/** + * null_cs_control - Dummy chip select function + * @command: select/deselect the chip + * + * If no chip select function is provided by client this is used as dummy + * chip select + */ +static void null_cs_control(u32 command) +{ + pr_debug("pl022: dummy chip select control, CS=0x%x\n", command); +} + +/** + * giveback - current spi_message is over, schedule next message and call + * callback of this message. Assumes that caller already + * set message->status; dma and pio irqs are blocked + * @pl022: SSP driver private data structure + */ +static void giveback(struct pl022 *pl022) +{ + struct spi_transfer *last_transfer; + unsigned long flags; + struct spi_message *msg; + void (*curr_cs_control) (u32 command); + + /* + * This local reference to the chip select function + * is needed because we set curr_chip to NULL + * as a step toward terminating the message.
+ */ + curr_cs_control = pl022->cur_chip->cs_control; + spin_lock_irqsave(&pl022->queue_lock, flags); + msg = pl022->cur_msg; + pl022->cur_msg = NULL; + pl022->cur_transfer = NULL; + pl022->cur_chip = NULL; + queue_work(pl022->workqueue, &pl022->pump_messages); + spin_unlock_irqrestore(&pl022->queue_lock, flags); + + last_transfer = list_entry(msg->transfers.prev, + struct spi_transfer, + transfer_list); + + /* Delay if requested before any change in chip select */ + if (last_transfer->delay_usecs) + /* + * FIXME: This runs in interrupt context. + * Is this really smart? + */ + udelay(last_transfer->delay_usecs); + + /* + * Drop chip select UNLESS cs_change is true or we are returning + * a message with an error, or next message is for another chip + */ + if (!last_transfer->cs_change) + curr_cs_control(SSP_CHIP_DESELECT); + else { + struct spi_message *next_msg; + + /* Holding of cs was hinted, but we need to make sure + * the next message is for the same chip. Don't waste + * time with the following tests unless this was hinted. + * + * We cannot postpone this until pump_messages, because + * after calling msg->complete (below) the driver that + * sent the current message could be unloaded, which + * could invalidate the cs_control() callback... + */ + + /* get a pointer to the next message, if any */ + spin_lock_irqsave(&pl022->queue_lock, flags); + if (list_empty(&pl022->queue)) + next_msg = NULL; + else + next_msg = list_entry(pl022->queue.next, + struct spi_message, queue); + spin_unlock_irqrestore(&pl022->queue_lock, flags); + + /* see if the next and current messages point + * to the same chip + */ + if (next_msg && next_msg->spi != msg->spi) + next_msg = NULL; + if (!next_msg || msg->state == STATE_ERROR) + curr_cs_control(SSP_CHIP_DESELECT); + } + msg->state = NULL; + if (msg->complete) + msg->complete(msg->context); + /* This message is completed, so let's turn off the clock! 
*/ + clk_disable(pl022->clk); +} + +/** + * flush - flush the FIFO to reach a clean state + * @pl022: SSP driver private data structure + */ +static int flush(struct pl022 *pl022) +{ + unsigned long limit = loops_per_jiffy << 1; + + dev_dbg(&pl022->adev->dev, "flush\n"); + do { + while (readw(SSP_SR(pl022->virtbase)) & SSP_SR_MASK_RNE) + readw(SSP_DR(pl022->virtbase)); + } while ((readw(SSP_SR(pl022->virtbase)) & SSP_SR_MASK_BSY) && limit--); + return limit; +} + +/** + * restore_state - Load configuration of current chip + * @pl022: SSP driver private data structure + */ +static void restore_state(struct pl022 *pl022) +{ + struct chip_data *chip = pl022->cur_chip; + + writew(chip->cr0, SSP_CR0(pl022->virtbase)); + writew(chip->cr1, SSP_CR1(pl022->virtbase)); + writew(chip->dmacr, SSP_DMACR(pl022->virtbase)); + writew(chip->cpsr, SSP_CPSR(pl022->virtbase)); + writew(DISABLE_ALL_INTERRUPTS, SSP_IMSC(pl022->virtbase)); + writew(CLEAR_ALL_INTERRUPTS, SSP_ICR(pl022->virtbase)); +} + +/** + * load_ssp_default_config - Load default configuration for SSP + * @pl022: SSP driver private data structure + */ + +/* + * Default SSP Register Values + */ +#define DEFAULT_SSP_REG_CR0 ( \ + GEN_MASK_BITS(SSP_DATA_BITS_12, SSP_CR0_MASK_DSS, 0) | \ + GEN_MASK_BITS(SSP_MICROWIRE_CHANNEL_FULL_DUPLEX, SSP_CR0_MASK_HALFDUP, 5) | \ + GEN_MASK_BITS(SSP_CLK_POL_IDLE_LOW, SSP_CR0_MASK_SPO, 6) | \ + GEN_MASK_BITS(SSP_CLK_FALLING_EDGE, SSP_CR0_MASK_SPH, 7) | \ + GEN_MASK_BITS(NMDK_SSP_DEFAULT_CLKRATE, SSP_CR0_MASK_SCR, 8) | \ + GEN_MASK_BITS(SSP_BITS_8, SSP_CR0_MASK_CSS, 16) | \ + GEN_MASK_BITS(SSP_INTERFACE_MOTOROLA_SPI, SSP_CR0_MASK_FRF, 21) \ +) + +#define DEFAULT_SSP_REG_CR1 ( \ + GEN_MASK_BITS(LOOPBACK_DISABLED, SSP_CR1_MASK_LBM, 0) | \ + GEN_MASK_BITS(SSP_DISABLED, SSP_CR1_MASK_SSE, 1) | \ + GEN_MASK_BITS(SSP_MASTER, SSP_CR1_MASK_MS, 2) | \ + GEN_MASK_BITS(DO_NOT_DRIVE_TX, SSP_CR1_MASK_SOD, 3) | \ + GEN_MASK_BITS(SSP_RX_MSB, SSP_CR1_MASK_RENDN, 4) | \ + GEN_MASK_BITS(SSP_TX_MSB, SSP_CR1_MASK_TENDN, 5) | \ + GEN_MASK_BITS(SSP_MWIRE_WAIT_ZERO, SSP_CR1_MASK_MWAIT, 6) |\ + GEN_MASK_BITS(SSP_RX_1_OR_MORE_ELEM, SSP_CR1_MASK_RXIFLSEL, 7) | \ + GEN_MASK_BITS(SSP_TX_1_OR_MORE_EMPTY_LOC, SSP_CR1_MASK_TXIFLSEL, 10) \ +) + +#define DEFAULT_SSP_REG_CPSR ( \ + GEN_MASK_BITS(NMDK_SSP_DEFAULT_PRESCALE, SSP_CPSR_MASK_CPSDVSR, 0) \ +) + +#define DEFAULT_SSP_REG_DMACR (\ + GEN_MASK_BITS(SSP_DMA_DISABLED, SSP_DMACR_MASK_RXDMAE, 0) | \ + GEN_MASK_BITS(SSP_DMA_DISABLED, SSP_DMACR_MASK_TXDMAE, 1) \ +) + + +static void load_ssp_default_config(struct pl022 *pl022) +{ + writew(DEFAULT_SSP_REG_CR0, SSP_CR0(pl022->virtbase)); + writew(DEFAULT_SSP_REG_CR1, SSP_CR1(pl022->virtbase)); + writew(DEFAULT_SSP_REG_DMACR, SSP_DMACR(pl022->virtbase)); + writew(DEFAULT_SSP_REG_CPSR, SSP_CPSR(pl022->virtbase)); + writew(DISABLE_ALL_INTERRUPTS, SSP_IMSC(pl022->virtbase)); + writew(CLEAR_ALL_INTERRUPTS, SSP_ICR(pl022->virtbase)); +} + +/** + * This will write to TX and read from RX according to the parameters + * set in pl022. + */ +static void readwriter(struct pl022 *pl022) +{ + + /* + * The FIFO depth is different between primecell variants. + * I believe filling in too much in the FIFO might cause + * errors in 8bit wide transfers on ARM variants (just 8 words + * FIFO, means only 8x8 = 64 bits in FIFO) at least. + * + * FIXME: currently we have no logic to account for this. + * perhaps there is even something broken in HW regarding + * 8bit transfers (it doesn't fail on 16bit) so this needs + * more investigation...
+ */ + dev_dbg(&pl022->adev->dev, + "%s, rx: %p, rxend: %p, tx: %p, txend: %p\n", + __func__, pl022->rx, pl022->rx_end, pl022->tx, pl022->tx_end); + + /* Read as much as you can */ + while ((readw(SSP_SR(pl022->virtbase)) & SSP_SR_MASK_RNE) + && (pl022->rx < pl022->rx_end)) { + switch (pl022->read) { + case READING_NULL: + readw(SSP_DR(pl022->virtbase)); + break; + case READING_U8: + *(u8 *) (pl022->rx) = + readw(SSP_DR(pl022->virtbase)) & 0xFFU; + break; + case READING_U16: + *(u16 *) (pl022->rx) = + (u16) readw(SSP_DR(pl022->virtbase)); + break; + case READING_U32: + *(u32 *) (pl022->rx) = + readl(SSP_DR(pl022->virtbase)); + break; + } + pl022->rx += (pl022->cur_chip->n_bytes); + } + /* + * Write as much as you can, while keeping an eye on the RX FIFO! + */ + while ((readw(SSP_SR(pl022->virtbase)) & SSP_SR_MASK_TNF) + && (pl022->tx < pl022->tx_end)) { + switch (pl022->write) { + case WRITING_NULL: + writew(0x0, SSP_DR(pl022->virtbase)); + break; + case WRITING_U8: + writew(*(u8 *) (pl022->tx), SSP_DR(pl022->virtbase)); + break; + case WRITING_U16: + writew((*(u16 *) (pl022->tx)), SSP_DR(pl022->virtbase)); + break; + case WRITING_U32: + writel(*(u32 *) (pl022->tx), SSP_DR(pl022->virtbase)); + break; + } + pl022->tx += (pl022->cur_chip->n_bytes); + /* + * This inner reader takes care of things appearing in the RX + * FIFO as we're transmitting. This will happen a lot since the + * clock starts running when you put things into the TX FIFO, + * and then things are continuously clocked into the RX FIFO. + */ + while ((readw(SSP_SR(pl022->virtbase)) & SSP_SR_MASK_RNE) + && (pl022->rx < pl022->rx_end)) { + switch (pl022->read) { + case READING_NULL: + readw(SSP_DR(pl022->virtbase)); + break; + case READING_U8: + *(u8 *) (pl022->rx) = + readw(SSP_DR(pl022->virtbase)) & 0xFFU; + break; + case READING_U16: + *(u16 *) (pl022->rx) = + (u16) readw(SSP_DR(pl022->virtbase)); + break; + case READING_U32: + *(u32 *) (pl022->rx) = + readl(SSP_DR(pl022->virtbase)); + break; + } + pl022->rx += (pl022->cur_chip->n_bytes); + } + } + /* + * When we exit here the TX FIFO should be full and the RX FIFO + * should be empty + */ +} + + +/** + * next_transfer - Move to the Next transfer in the current spi message + * @pl022: SSP driver private data structure + * + * This function moves through the linked list of spi transfers in the + * current spi message and returns with the state of current spi + * message i.e. whether its last transfer is done (STATE_DONE) or + * Next transfer is ready (STATE_RUNNING) + */ +static void *next_transfer(struct pl022 *pl022) +{ + struct spi_message *msg = pl022->cur_msg; + struct spi_transfer *trans = pl022->cur_transfer; + + /* Move to next transfer */ + if (trans->transfer_list.next != &msg->transfers) { + pl022->cur_transfer = + list_entry(trans->transfer_list.next, + struct spi_transfer, transfer_list); + return STATE_RUNNING; + } + return STATE_DONE; +} +/** + * pl022_interrupt_handler - Interrupt handler for SSP controller + * + * This function handles interrupts generated for an interrupt based transfer. + * If a receive overrun (ROR) interrupt is there then we disable SSP, flag the + * current message's state as STATE_ERROR and schedule the tasklet + * pump_transfers which will do the postprocessing of the current message by + * calling giveback(). Otherwise it reads data from RX FIFO till there is no + * more data, and writes data in TX FIFO till it is not full. If we complete + * the transfer we move to the next transfer and schedule the tasklet.
+ */ +static irqreturn_t pl022_interrupt_handler(int irq, void *dev_id) +{ + struct pl022 *pl022 = dev_id; + struct spi_message *msg = pl022->cur_msg; + u16 irq_status = 0; + u16 flag = 0; + + if (unlikely(!msg)) { + dev_err(&pl022->adev->dev, + "bad message state in interrupt handler"); + /* Never fail */ + return IRQ_HANDLED; + } + + /* Read the Interrupt Status Register */ + irq_status = readw(SSP_MIS(pl022->virtbase)); + + if (unlikely(!irq_status)) + return IRQ_NONE; + + /* This handles the error code interrupts */ + if (unlikely(irq_status & SSP_MIS_MASK_RORMIS)) { + /* + * Overrun interrupt - bail out since our Data has been + * corrupted + */ + dev_err(&pl022->adev->dev, + "FIFO overrun\n"); + if (readw(SSP_SR(pl022->virtbase)) & SSP_SR_MASK_RFF) + dev_err(&pl022->adev->dev, + "RXFIFO is full\n"); + if (readw(SSP_SR(pl022->virtbase)) & SSP_SR_MASK_TNF) + dev_err(&pl022->adev->dev, + "TXFIFO is full\n"); + + /* + * Disable and clear interrupts, disable SSP, + * mark message with bad status so it can be + * retried. + */ + writew(DISABLE_ALL_INTERRUPTS, + SSP_IMSC(pl022->virtbase)); + writew(CLEAR_ALL_INTERRUPTS, SSP_ICR(pl022->virtbase)); + writew((readw(SSP_CR1(pl022->virtbase)) & + (~SSP_CR1_MASK_SSE)), SSP_CR1(pl022->virtbase)); + msg->state = STATE_ERROR; + + /* Schedule message queue handler */ + tasklet_schedule(&pl022->pump_transfers); + return IRQ_HANDLED; + } + + readwriter(pl022); + + if ((pl022->tx == pl022->tx_end) && (flag == 0)) { + flag = 1; + /* Disable Transmit interrupt */ + writew(readw(SSP_IMSC(pl022->virtbase)) & + (~SSP_IMSC_MASK_TXIM), + SSP_IMSC(pl022->virtbase)); + } + + /* + * Since all transactions must write as much as shall be read, + * we can conclude the entire transaction once RX is complete. + * At this point, all TX will always be finished. + */ + if (pl022->rx >= pl022->rx_end) { + writew(DISABLE_ALL_INTERRUPTS, + SSP_IMSC(pl022->virtbase)); + writew(CLEAR_ALL_INTERRUPTS, SSP_ICR(pl022->virtbase)); + if (unlikely(pl022->rx > pl022->rx_end)) { + dev_warn(&pl022->adev->dev, "read %u surplus " + "bytes (did you request an odd " + "number of bytes on a 16bit bus?)\n", + (u32) (pl022->rx - pl022->rx_end)); + } + /* Update total bytes transferred */ + msg->actual_length += pl022->cur_transfer->len; + if (pl022->cur_transfer->cs_change) + pl022->cur_chip-> + cs_control(SSP_CHIP_DESELECT); + /* Move to next transfer */ + msg->state = next_transfer(pl022); + tasklet_schedule(&pl022->pump_transfers); + return IRQ_HANDLED; + } + + return IRQ_HANDLED; +} + +/** + * This sets up the pointers to memory for the next message to + * send out on the SPI bus. + */ +static int set_up_next_transfer(struct pl022 *pl022, + struct spi_transfer *transfer) +{ + int residue; + + /* Sanity check the message for this bus width */ + residue = pl022->cur_transfer->len % pl022->cur_chip->n_bytes; + if (unlikely(residue != 0)) { + dev_err(&pl022->adev->dev, + "message of %u bytes to transmit but the current " + "chip bus has a data width of %u bytes!\n", + pl022->cur_transfer->len, + pl022->cur_chip->n_bytes); + dev_err(&pl022->adev->dev, "skipping this message\n"); + return -EIO; + } + pl022->tx = (void *)transfer->tx_buf; + pl022->tx_end = pl022->tx + pl022->cur_transfer->len; + pl022->rx = (void *)transfer->rx_buf; + pl022->rx_end = pl022->rx + pl022->cur_transfer->len; + pl022->write = + pl022->tx ? pl022->cur_chip->write : WRITING_NULL; + pl022->read = pl022->rx ?
pl022->cur_chip->read : READING_NULL; + return 0; +} + +/** + * pump_transfers - Tasklet function which schedules next interrupt transfer + * when running in interrupt transfer mode. + * @data: SSP driver private data structure + * + */ +static void pump_transfers(unsigned long data) +{ + struct pl022 *pl022 = (struct pl022 *) data; + struct spi_message *message = NULL; + struct spi_transfer *transfer = NULL; + struct spi_transfer *previous = NULL; + + /* Get current state information */ + message = pl022->cur_msg; + transfer = pl022->cur_transfer; + + /* Handle for abort */ + if (message->state == STATE_ERROR) { + message->status = -EIO; + giveback(pl022); + return; + } + + /* Handle end of message */ + if (message->state == STATE_DONE) { + message->status = 0; + giveback(pl022); + return; + } + + /* Delay if requested at end of transfer before CS change */ + if (message->state == STATE_RUNNING) { + previous = list_entry(transfer->transfer_list.prev, + struct spi_transfer, + transfer_list); + if (previous->delay_usecs) + /* + * FIXME: This runs in interrupt context. + * Is this really smart? + */ + udelay(previous->delay_usecs); + + /* Reselect chip select only if cs_change was requested */ + if (previous->cs_change) + pl022->cur_chip->cs_control(SSP_CHIP_SELECT); + } else { + /* STATE_START */ + message->state = STATE_RUNNING; + } + + if (set_up_next_transfer(pl022, transfer)) { + message->state = STATE_ERROR; + message->status = -EIO; + giveback(pl022); + return; + } + /* Flush the FIFOs and let's go! */ + flush(pl022); + writew(ENABLE_ALL_INTERRUPTS, SSP_IMSC(pl022->virtbase)); +} + +/** + * NOT IMPLEMENTED + * configure_dma - It configures the DMA pipes for DMA transfers + * @data: SSP driver's private data structure + * + */ +static int configure_dma(void *data) +{ + struct pl022 *pl022 = data; + dev_dbg(&pl022->adev->dev, "configure DMA\n"); + return -ENOTSUPP; +} + +/** + * do_dma_transfer - It handles transfers of the current message + * if it is DMA xfer.
+ * NOT FULLY IMPLEMENTED + * @data: SSP driver's private data structure + */ +static void do_dma_transfer(void *data) +{ + struct pl022 *pl022 = data; + + if (configure_dma(data)) { + dev_dbg(&pl022->adev->dev, "configuration of DMA Failed!\n"); + goto err_config_dma; + } + + /* TODO: Implement DMA setup of pipes here */ + + /* Enable target chip, set up transfer */ + pl022->cur_chip->cs_control(SSP_CHIP_SELECT); + if (set_up_next_transfer(pl022, pl022->cur_transfer)) { + /* Error path */ + pl022->cur_msg->state = STATE_ERROR; + pl022->cur_msg->status = -EIO; + giveback(pl022); + return; + } + /* Enable SSP */ + writew((readw(SSP_CR1(pl022->virtbase)) | SSP_CR1_MASK_SSE), + SSP_CR1(pl022->virtbase)); + + /* TODO: Enable the DMA transfer here */ + return; + + err_config_dma: + pl022->cur_msg->state = STATE_ERROR; + pl022->cur_msg->status = -EIO; + giveback(pl022); + return; +} + +static void do_interrupt_transfer(void *data) +{ + struct pl022 *pl022 = data; + + /* Enable target chip */ + pl022->cur_chip->cs_control(SSP_CHIP_SELECT); + if (set_up_next_transfer(pl022, pl022->cur_transfer)) { + /* Error path */ + pl022->cur_msg->state = STATE_ERROR; + pl022->cur_msg->status = -EIO; + giveback(pl022); + return; + } + /* Enable SSP, turn on interrupts */ + writew((readw(SSP_CR1(pl022->virtbase)) | SSP_CR1_MASK_SSE), + SSP_CR1(pl022->virtbase)); + writew(ENABLE_ALL_INTERRUPTS, SSP_IMSC(pl022->virtbase)); +} + +static void do_polling_transfer(void *data) +{ + struct pl022 *pl022 = data; + struct spi_message *message = NULL; + struct spi_transfer *transfer = NULL; + struct spi_transfer *previous = NULL; + struct chip_data *chip; + + chip = pl022->cur_chip; + message = pl022->cur_msg; + + while (message->state != STATE_DONE) { + /* Handle for abort */ + if (message->state == STATE_ERROR) + break; + transfer = pl022->cur_transfer; + + /* Delay if requested at end of transfer */ + if (message->state == STATE_RUNNING) { + previous = + list_entry(transfer->transfer_list.prev, + struct spi_transfer, transfer_list); + if (previous->delay_usecs) + udelay(previous->delay_usecs); + if (previous->cs_change) + pl022->cur_chip->cs_control(SSP_CHIP_SELECT); + } else { + /* STATE_START */ + message->state = STATE_RUNNING; + pl022->cur_chip->cs_control(SSP_CHIP_SELECT); + } + + /* Configuration Changing Per Transfer */ + if (set_up_next_transfer(pl022, transfer)) { + /* Error path */ + message->state = STATE_ERROR; + break; + } + /* Flush FIFOs and enable SSP */ + flush(pl022); + writew((readw(SSP_CR1(pl022->virtbase)) | SSP_CR1_MASK_SSE), + SSP_CR1(pl022->virtbase)); + + dev_dbg(&pl022->adev->dev, "POLLING TRANSFER ONGOING ... 
\n"); + /* FIXME: insert a timeout so we don't hang here indefinately */ + while (pl022->tx < pl022->tx_end || pl022->rx < pl022->rx_end) + readwriter(pl022); + + /* Update total byte transfered */ + message->actual_length += pl022->cur_transfer->len; + if (pl022->cur_transfer->cs_change) + pl022->cur_chip->cs_control(SSP_CHIP_DESELECT); + /* Move to next transfer */ + message->state = next_transfer(pl022); + } + + /* Handle end of message */ + if (message->state == STATE_DONE) + message->status = 0; + else + message->status = -EIO; + + giveback(pl022); + return; +} + +/** + * pump_messages - Workqueue function which processes spi message queue + * @data: pointer to private data of SSP driver + * + * This function checks if there is any spi message in the queue that + * needs processing and delegate control to appropriate function + * do_polling_transfer()/do_interrupt_transfer()/do_dma_transfer() + * based on the kind of the transfer + * + */ +static void pump_messages(struct work_struct *work) +{ + struct pl022 *pl022 = + container_of(work, struct pl022, pump_messages); + unsigned long flags; + + /* Lock queue and check for queue work */ + spin_lock_irqsave(&pl022->queue_lock, flags); + if (list_empty(&pl022->queue) || pl022->run == QUEUE_STOPPED) { + pl022->busy = 0; + spin_unlock_irqrestore(&pl022->queue_lock, flags); + return; + } + /* Make sure we are not already running a message */ + if (pl022->cur_msg) { + spin_unlock_irqrestore(&pl022->queue_lock, flags); + return; + } + /* Extract head of queue */ + pl022->cur_msg = + list_entry(pl022->queue.next, struct spi_message, queue); + + list_del_init(&pl022->cur_msg->queue); + pl022->busy = 1; + spin_unlock_irqrestore(&pl022->queue_lock, flags); + + /* Initial message state */ + pl022->cur_msg->state = STATE_START; + pl022->cur_transfer = list_entry(pl022->cur_msg->transfers.next, + struct spi_transfer, + transfer_list); + + /* Setup the SPI using the per chip configuration */ + pl022->cur_chip = spi_get_ctldata(pl022->cur_msg->spi); + /* + * We enable the clock here, then the clock will be disabled when + * giveback() is called in each method (poll/interrupt/DMA) + */ + clk_enable(pl022->clk); + restore_state(pl022); + flush(pl022); + + if (pl022->cur_chip->xfer_type == POLLING_TRANSFER) + do_polling_transfer(pl022); + else if (pl022->cur_chip->xfer_type == INTERRUPT_TRANSFER) + do_interrupt_transfer(pl022); + else + do_dma_transfer(pl022); +} + + +static int __init init_queue(struct pl022 *pl022) +{ + INIT_LIST_HEAD(&pl022->queue); + spin_lock_init(&pl022->queue_lock); + + pl022->run = QUEUE_STOPPED; + pl022->busy = 0; + + tasklet_init(&pl022->pump_transfers, + pump_transfers, (unsigned long)pl022); + + INIT_WORK(&pl022->pump_messages, pump_messages); + pl022->workqueue = create_singlethread_workqueue( + dev_name(pl022->master->dev.parent)); + if (pl022->workqueue == NULL) + return -EBUSY; + + return 0; +} + + +static int start_queue(struct pl022 *pl022) +{ + unsigned long flags; + + spin_lock_irqsave(&pl022->queue_lock, flags); + + if (pl022->run == QUEUE_RUNNING || pl022->busy) { + spin_unlock_irqrestore(&pl022->queue_lock, flags); + return -EBUSY; + } + + pl022->run = QUEUE_RUNNING; + pl022->cur_msg = NULL; + pl022->cur_transfer = NULL; + pl022->cur_chip = NULL; + spin_unlock_irqrestore(&pl022->queue_lock, flags); + + queue_work(pl022->workqueue, &pl022->pump_messages); + + return 0; +} + + +static int stop_queue(struct pl022 *pl022) +{ + unsigned long flags; + unsigned limit = 500; + int status = 0; + + 
spin_lock_irqsave(&pl022->queue_lock, flags); + + /* This is a bit lame, but is optimized for the common execution path. + * A wait_queue on the pl022->busy could be used, but then the common + * execution path (pump_messages) would be required to call wake_up or + * friends on every SPI message. Do this instead */ + pl022->run = QUEUE_STOPPED; + while (!list_empty(&pl022->queue) && pl022->busy && limit--) { + spin_unlock_irqrestore(&pl022->queue_lock, flags); + msleep(10); + spin_lock_irqsave(&pl022->queue_lock, flags); + } + + if (!list_empty(&pl022->queue) || pl022->busy) + status = -EBUSY; + + spin_unlock_irqrestore(&pl022->queue_lock, flags); + + return status; +} + +static int destroy_queue(struct pl022 *pl022) +{ + int status; + + status = stop_queue(pl022); + /* we are unloading the module or failing to load (only two calls + * to this routine), and neither call can handle a return value. + * However, destroy_workqueue calls flush_workqueue, and that will + * block until all work is done. If the reason that stop_queue + * timed out is that the work will never finish, then it does no + * good to call destroy_workqueue, so return anyway. */ + if (status != 0) + return status; + + destroy_workqueue(pl022->workqueue); + + return 0; +} + +static int verify_controller_parameters(struct pl022 *pl022, + struct pl022_config_chip *chip_info) +{ + if ((chip_info->lbm != LOOPBACK_ENABLED) + && (chip_info->lbm != LOOPBACK_DISABLED)) { + dev_err(chip_info->dev, + "loopback mode is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->iface < SSP_INTERFACE_MOTOROLA_SPI) + || (chip_info->iface > SSP_INTERFACE_UNIDIRECTIONAL)) { + dev_err(chip_info->dev, + "interface is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->iface == SSP_INTERFACE_UNIDIRECTIONAL) && + (!pl022->vendor->unidir)) { + dev_err(chip_info->dev, + "unidirectional mode not supported in this " + "hardware version\n"); + return -EINVAL; + } + if ((chip_info->hierarchy != SSP_MASTER) + && (chip_info->hierarchy != SSP_SLAVE)) { + dev_err(chip_info->dev, + "hierarchy is configured incorrectly\n"); + return -EINVAL; + } + if (((chip_info->clk_freq).cpsdvsr < CPSDVR_MIN) + || ((chip_info->clk_freq).cpsdvsr > CPSDVR_MAX)) { + dev_err(chip_info->dev, + "cpsdvsr is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->endian_rx != SSP_RX_MSB) + && (chip_info->endian_rx != SSP_RX_LSB)) { + dev_err(chip_info->dev, + "RX FIFO endianness is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->endian_tx != SSP_TX_MSB) + && (chip_info->endian_tx != SSP_TX_LSB)) { + dev_err(chip_info->dev, + "TX FIFO endianness is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->data_size < SSP_DATA_BITS_4) + || (chip_info->data_size > SSP_DATA_BITS_32)) { + dev_err(chip_info->dev, + "DATA Size is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->com_mode != INTERRUPT_TRANSFER) + && (chip_info->com_mode != DMA_TRANSFER) + && (chip_info->com_mode != POLLING_TRANSFER)) { + dev_err(chip_info->dev, + "Communication mode is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->rx_lev_trig < SSP_RX_1_OR_MORE_ELEM) + || (chip_info->rx_lev_trig > SSP_RX_32_OR_MORE_ELEM)) { + dev_err(chip_info->dev, + "RX FIFO Trigger Level is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->tx_lev_trig < SSP_TX_1_OR_MORE_EMPTY_LOC) + || (chip_info->tx_lev_trig > SSP_TX_32_OR_MORE_EMPTY_LOC)) { + dev_err(chip_info->dev, + "TX FIFO Trigger Level is configured
incorrectly\n"); + return -EINVAL; + } + if (chip_info->iface == SSP_INTERFACE_MOTOROLA_SPI) { + if ((chip_info->clk_phase != SSP_CLK_RISING_EDGE) + && (chip_info->clk_phase != SSP_CLK_FALLING_EDGE)) { + dev_err(chip_info->dev, + "Clock Phase is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->clk_pol != SSP_CLK_POL_IDLE_LOW) + && (chip_info->clk_pol != SSP_CLK_POL_IDLE_HIGH)) { + dev_err(chip_info->dev, + "Clock Polarity is configured incorrectly\n"); + return -EINVAL; + } + } + if (chip_info->iface == SSP_INTERFACE_NATIONAL_MICROWIRE) { + if ((chip_info->ctrl_len < SSP_BITS_4) + || (chip_info->ctrl_len > SSP_BITS_32)) { + dev_err(chip_info->dev, + "CTRL LEN is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->wait_state != SSP_MWIRE_WAIT_ZERO) + && (chip_info->wait_state != SSP_MWIRE_WAIT_ONE)) { + dev_err(chip_info->dev, + "Wait State is configured incorrectly\n"); + return -EINVAL; + } + if ((chip_info->duplex != SSP_MICROWIRE_CHANNEL_FULL_DUPLEX) + && (chip_info->duplex != + SSP_MICROWIRE_CHANNEL_HALF_DUPLEX)) { + dev_err(chip_info->dev, + "DUPLEX is configured incorrectly\n"); + return -EINVAL; + } + } + if (chip_info->cs_control == NULL) { + dev_warn(chip_info->dev, + "Chip Select Function is NULL for this chip\n"); + chip_info->cs_control = null_cs_control; + } + return 0; +} + +/** + * pl022_transfer - transfer function registered to SPI master framework + * @spi: spi device which is requesting transfer + * @msg: spi message which is to handled is queued to driver queue + * + * This function is registered to the SPI framework for this SPI master + * controller. It will queue the spi_message in the queue of driver if + * the queue is not stopped and return. + */ +static int pl022_transfer(struct spi_device *spi, struct spi_message *msg) +{ + struct pl022 *pl022 = spi_master_get_devdata(spi->master); + unsigned long flags; + + spin_lock_irqsave(&pl022->queue_lock, flags); + + if (pl022->run == QUEUE_STOPPED) { + spin_unlock_irqrestore(&pl022->queue_lock, flags); + return -ESHUTDOWN; + } + msg->actual_length = 0; + msg->status = -EINPROGRESS; + msg->state = STATE_START; + + list_add_tail(&msg->queue, &pl022->queue); + if (pl022->run == QUEUE_RUNNING && !pl022->busy) + queue_work(pl022->workqueue, &pl022->pump_messages); + + spin_unlock_irqrestore(&pl022->queue_lock, flags); + return 0; +} + +static int calculate_effective_freq(struct pl022 *pl022, + int freq, + struct ssp_clock_params *clk_freq) +{ + /* Lets calculate the frequency parameters */ + u16 cpsdvsr = 2; + u16 scr = 0; + bool freq_found = false; + u32 rate; + u32 max_tclk; + u32 min_tclk; + + rate = clk_get_rate(pl022->clk); + /* cpsdvscr = 2 & scr 0 */ + max_tclk = (rate / (CPSDVR_MIN * (1 + SCR_MIN))); + /* cpsdvsr = 254 & scr = 255 */ + min_tclk = (rate / (CPSDVR_MAX * (1 + SCR_MAX))); + + if ((freq <= max_tclk) && (freq >= min_tclk)) { + while (cpsdvsr <= CPSDVR_MAX && !freq_found) { + while (scr <= SCR_MAX && !freq_found) { + if ((rate / + (cpsdvsr * (1 + scr))) > freq) + scr += 1; + else { + /* + * This bool is made true when + * effective frequency >= + * target frequency is found + */ + freq_found = true; + if ((rate / + (cpsdvsr * (1 + scr))) != freq) { + if (scr == SCR_MIN) { + cpsdvsr -= 2; + scr = SCR_MAX; + } else + scr -= 1; + } + } + } + if (!freq_found) { + cpsdvsr += 2; + scr = SCR_MIN; + } + } + if (cpsdvsr != 0) { + dev_dbg(&pl022->adev->dev, + "SSP Effective Frequency is %u\n", + (rate / (cpsdvsr * (1 + scr)))); + clk_freq->cpsdvsr = (u8) (cpsdvsr & 0xFF); + 
clk_freq->scr = (u8) (scr & 0xFF); + dev_dbg(&pl022->adev->dev, + "SSP cpsdvsr = %d, scr = %d\n", + clk_freq->cpsdvsr, clk_freq->scr); + } + } else { + dev_err(&pl022->adev->dev, + "controller data is incorrect: out of range frequency"); + return -EINVAL; + } + return 0; +} + +/** + * NOT IMPLEMENTED + * process_dma_info - Processes the DMA info provided by client drivers + * @chip_info: chip info provided by client device + * @chip: Runtime state maintained by the SSP controller for each spi device + * + * This function processes and stores DMA config provided by client driver + * into the runtime state maintained by the SSP controller driver + */ +static int process_dma_info(struct pl022_config_chip *chip_info, + struct chip_data *chip) +{ + dev_err(chip_info->dev, + "cannot process DMA info, DMA not implemented!\n"); + return -ENOTSUPP; +} + +/** + * pl022_setup - setup function registered to SPI master framework + * @spi: spi device which is requesting setup + * + * This function is registered to the SPI framework for this SPI master + * controller. If this is the first time setup is called for this device, + * this function will initialize the runtime state for this chip and save + * the same in the device structure. Else it will update the runtime info + * with the updated chip info. Nothing is really being written to the + * controller hardware here, that is not done until the actual transfers + * commence. + */ + +/* FIXME: JUST GUESSING the spi->mode bits understood by this driver */ +#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH \ + | SPI_LSB_FIRST | SPI_LOOP) + +static int pl022_setup(struct spi_device *spi) +{ + struct pl022_config_chip *chip_info; + struct chip_data *chip; + int status = 0; + struct pl022 *pl022 = spi_master_get_devdata(spi->master); + + if (spi->mode & ~MODEBITS) { + dev_dbg(&spi->dev, "unsupported mode bits %x\n", + spi->mode & ~MODEBITS); + return -EINVAL; + } + + if (!spi->max_speed_hz) + return -EINVAL; + + /* Get controller_state if one is supplied */ + chip = spi_get_ctldata(spi); + + if (chip == NULL) { + chip = kzalloc(sizeof(struct chip_data), GFP_KERNEL); + if (!chip) { + dev_err(&spi->dev, + "cannot allocate controller state\n"); + return -ENOMEM; + } + dev_dbg(&spi->dev, + "allocated memory for controller's runtime state\n"); + } + + /* Get controller data if one is supplied */ + chip_info = spi->controller_data; + + if (chip_info == NULL) { + /* spi_board_info.controller_data is not supplied */ + dev_dbg(&spi->dev, + "using default controller_data settings\n"); + + chip_info = + kzalloc(sizeof(struct pl022_config_chip), GFP_KERNEL); + + if (!chip_info) { + dev_err(&spi->dev, + "cannot allocate controller data\n"); + status = -ENOMEM; + goto err_first_setup; + } + + dev_dbg(&spi->dev, "allocated memory for controller data\n"); + + /* Pointer back to the SPI device */ + chip_info->dev = &spi->dev; + /* + * Set controller data default values: + * Polling is supported by default + */ + chip_info->lbm = LOOPBACK_DISABLED; + chip_info->com_mode = POLLING_TRANSFER; + chip_info->iface = SSP_INTERFACE_MOTOROLA_SPI; + chip_info->hierarchy = SSP_SLAVE; + chip_info->slave_tx_disable = DO_NOT_DRIVE_TX; + chip_info->endian_tx = SSP_TX_LSB; + chip_info->endian_rx = SSP_RX_LSB; + chip_info->data_size = SSP_DATA_BITS_12; + chip_info->rx_lev_trig = SSP_RX_1_OR_MORE_ELEM; + chip_info->tx_lev_trig = SSP_TX_1_OR_MORE_EMPTY_LOC; + chip_info->clk_phase = SSP_CLK_FALLING_EDGE; + chip_info->clk_pol = SSP_CLK_POL_IDLE_LOW; + chip_info->ctrl_len = SSP_BITS_8; + 
chip_info->wait_state = SSP_MWIRE_WAIT_ZERO; + chip_info->duplex = SSP_MICROWIRE_CHANNEL_FULL_DUPLEX; + chip_info->cs_control = null_cs_control; + } else { + dev_dbg(&spi->dev, + "using user supplied controller_data settings\n"); + } + + /* + * We can override with custom divisors, else we use the board + * frequency setting + */ + if ((0 == chip_info->clk_freq.cpsdvsr) + && (0 == chip_info->clk_freq.scr)) { + status = calculate_effective_freq(pl022, + spi->max_speed_hz, + &chip_info->clk_freq); + if (status < 0) + goto err_config_params; + } else { + if ((chip_info->clk_freq.cpsdvsr % 2) != 0) + chip_info->clk_freq.cpsdvsr = + chip_info->clk_freq.cpsdvsr - 1; + } + status = verify_controller_parameters(pl022, chip_info); + if (status) { + dev_err(&spi->dev, "controller data is incorrect"); + goto err_config_params; + } + /* Now set controller state based on controller data */ + chip->xfer_type = chip_info->com_mode; + chip->cs_control = chip_info->cs_control; + + if (chip_info->data_size <= 8) { + dev_dbg(&spi->dev, "1 <= n <= 8 bits per word\n"); + chip->n_bytes = 1; + chip->read = READING_U8; + chip->write = WRITING_U8; + } else if (chip_info->data_size <= 16) { + dev_dbg(&spi->dev, "9 <= n <= 16 bits per word\n"); + chip->n_bytes = 2; + chip->read = READING_U16; + chip->write = WRITING_U16; + } else { + if (pl022->vendor->max_bpw >= 32) { + dev_dbg(&spi->dev, "17 <= n <= 32 bits per word\n"); + chip->n_bytes = 4; + chip->read = READING_U32; + chip->write = WRITING_U32; + } else { + dev_err(&spi->dev, + "illegal data size for this controller!\n"); + dev_err(&spi->dev, + "a standard pl022 can only handle " + "1 <= n <= 16 bit words\n"); + goto err_config_params; + } + } + + /* Now initialize all register settings required for this chip */ + chip->cr0 = 0; + chip->cr1 = 0; + chip->dmacr = 0; + chip->cpsr = 0; + if ((chip_info->com_mode == DMA_TRANSFER) + && ((pl022->master_info)->enable_dma)) { + chip->enable_dma = 1; + dev_dbg(&spi->dev, "DMA mode set in controller state\n"); + status = process_dma_info(chip_info, chip); + if (status < 0) + goto err_config_params; + SSP_WRITE_BITS(chip->dmacr, SSP_DMA_ENABLED, + SSP_DMACR_MASK_RXDMAE, 0); + SSP_WRITE_BITS(chip->dmacr, SSP_DMA_ENABLED, + SSP_DMACR_MASK_TXDMAE, 1); + } else { + chip->enable_dma = 0; + dev_dbg(&spi->dev, "DMA mode NOT set in controller state\n"); + SSP_WRITE_BITS(chip->dmacr, SSP_DMA_DISABLED, + SSP_DMACR_MASK_RXDMAE, 0); + SSP_WRITE_BITS(chip->dmacr, SSP_DMA_DISABLED, + SSP_DMACR_MASK_TXDMAE, 1); + } + + chip->cpsr = chip_info->clk_freq.cpsdvsr; + + SSP_WRITE_BITS(chip->cr0, chip_info->data_size, SSP_CR0_MASK_DSS, 0); + SSP_WRITE_BITS(chip->cr0, chip_info->duplex, SSP_CR0_MASK_HALFDUP, 5); + SSP_WRITE_BITS(chip->cr0, chip_info->clk_pol, SSP_CR0_MASK_SPO, 6); + SSP_WRITE_BITS(chip->cr0, chip_info->clk_phase, SSP_CR0_MASK_SPH, 7); + SSP_WRITE_BITS(chip->cr0, chip_info->clk_freq.scr, SSP_CR0_MASK_SCR, 8); + SSP_WRITE_BITS(chip->cr0, chip_info->ctrl_len, SSP_CR0_MASK_CSS, 16); + SSP_WRITE_BITS(chip->cr0, chip_info->iface, SSP_CR0_MASK_FRF, 21); + SSP_WRITE_BITS(chip->cr1, chip_info->lbm, SSP_CR1_MASK_LBM, 0); + SSP_WRITE_BITS(chip->cr1, SSP_DISABLED, SSP_CR1_MASK_SSE, 1); + SSP_WRITE_BITS(chip->cr1, chip_info->hierarchy, SSP_CR1_MASK_MS, 2); + SSP_WRITE_BITS(chip->cr1, chip_info->slave_tx_disable, SSP_CR1_MASK_SOD, 3); + SSP_WRITE_BITS(chip->cr1, chip_info->endian_rx, SSP_CR1_MASK_RENDN, 4); + SSP_WRITE_BITS(chip->cr1, chip_info->endian_tx, SSP_CR1_MASK_TENDN, 5); + SSP_WRITE_BITS(chip->cr1, chip_info->wait_state,
SSP_CR1_MASK_MWAIT, 6); + SSP_WRITE_BITS(chip->cr1, chip_info->rx_lev_trig, SSP_CR1_MASK_RXIFLSEL, 7); + SSP_WRITE_BITS(chip->cr1, chip_info->tx_lev_trig, SSP_CR1_MASK_TXIFLSEL, 10); + + /* Save controller_state */ + spi_set_ctldata(spi, chip); + return status; + err_config_params: + err_first_setup: + kfree(chip); + return status; +} + +/** + * pl022_cleanup - cleanup function registered to SPI master framework + * @spi: spi device which is requesting cleanup + * + * This function is registered to the SPI framework for this SPI master + * controller. It will free the runtime state of the chip. + */ +static void pl022_cleanup(struct spi_device *spi) +{ + struct chip_data *chip = spi_get_ctldata(spi); + + spi_set_ctldata(spi, NULL); + kfree(chip); +} + + +static int __init +pl022_probe(struct amba_device *adev, struct amba_id *id) +{ + struct device *dev = &adev->dev; + struct pl022_ssp_controller *platform_info = adev->dev.platform_data; + struct spi_master *master; + struct pl022 *pl022 = NULL; /* Data for this driver */ + int status = 0; + + dev_info(&adev->dev, + "ARM PL022 driver, device ID: 0x%08x\n", adev->periphid); + if (platform_info == NULL) { + dev_err(&adev->dev, "probe - no platform data supplied\n"); + status = -ENODEV; + goto err_no_pdata; + } + + /* Allocate master with space for data */ + master = spi_alloc_master(dev, sizeof(struct pl022)); + if (master == NULL) { + dev_err(&adev->dev, "probe - cannot alloc SPI master\n"); + status = -ENOMEM; + goto err_no_master; + } + + pl022 = spi_master_get_devdata(master); + pl022->master = master; + pl022->master_info = platform_info; + pl022->adev = adev; + pl022->vendor = id->data; + + /* + * Bus number which has been assigned to this SSP controller + * on this board + */ + master->bus_num = platform_info->bus_id; + master->num_chipselect = platform_info->num_chipselect; + master->cleanup = pl022_cleanup; + master->setup = pl022_setup; + master->transfer = pl022_transfer; + + dev_dbg(&adev->dev, "BUSNO: %d\n", master->bus_num); + + status = amba_request_regions(adev, NULL); + if (status) + goto err_no_ioregion; + + pl022->virtbase = ioremap(adev->res.start, resource_size(&adev->res)); + if (pl022->virtbase == NULL) { + status = -ENOMEM; + goto err_no_ioremap; + } + printk(KERN_INFO "pl022: mapped registers from 0x%08x to %p\n", + adev->res.start, pl022->virtbase); + + pl022->clk = clk_get(&adev->dev, NULL); + if (IS_ERR(pl022->clk)) { + status = PTR_ERR(pl022->clk); + dev_err(&adev->dev, "could not retrieve SSP/SPI bus clock\n"); + goto err_no_clk; + } + + /* Disable SSP */ + clk_enable(pl022->clk); + writew((readw(SSP_CR1(pl022->virtbase)) & (~SSP_CR1_MASK_SSE)), + SSP_CR1(pl022->virtbase)); + load_ssp_default_config(pl022); + clk_disable(pl022->clk); + + status = request_irq(adev->irq[0], pl022_interrupt_handler, 0, "pl022", + pl022); + if (status < 0) { + dev_err(&adev->dev, "probe - cannot get IRQ (%d)\n", status); + goto err_no_irq; + } + /* Initialize and start queue */ + status = init_queue(pl022); + if (status != 0) { + dev_err(&adev->dev, "probe - problem initializing queue\n"); + goto err_init_queue; + } + status = start_queue(pl022); + if (status != 0) { + dev_err(&adev->dev, "probe - problem starting queue\n"); + goto err_start_queue; + } + /* Register with the SPI framework */ + amba_set_drvdata(adev, pl022); + status = spi_register_master(master); + if (status != 0) { + dev_err(&adev->dev, + "probe - problem registering spi master\n"); + goto err_spi_register; + } + dev_dbg(dev, "probe succeeded\n"); + return 0; + + 
err_spi_register: + err_start_queue: + err_init_queue: + destroy_queue(pl022); + free_irq(adev->irq[0], pl022); + err_no_irq: + clk_put(pl022->clk); + err_no_clk: + iounmap(pl022->virtbase); + err_no_ioremap: + amba_release_regions(adev); + err_no_ioregion: + spi_master_put(master); + err_no_master: + err_no_pdata: + return status; +} + +static int __exit +pl022_remove(struct amba_device *adev) +{ + struct pl022 *pl022 = amba_get_drvdata(adev); + int status = 0; + if (!pl022) + return 0; + + /* Remove the queue */ + status = destroy_queue(pl022); + if (status != 0) { + dev_err(&adev->dev, + "queue remove failed (%d)\n", status); + return status; + } + load_ssp_default_config(pl022); + free_irq(adev->irq[0], pl022); + clk_disable(pl022->clk); + clk_put(pl022->clk); + iounmap(pl022->virtbase); + amba_release_regions(adev); + tasklet_disable(&pl022->pump_transfers); + spi_unregister_master(pl022->master); + spi_master_put(pl022->master); + amba_set_drvdata(adev, NULL); + dev_dbg(&adev->dev, "remove succeeded\n"); + return 0; +} + +#ifdef CONFIG_PM +static int pl022_suspend(struct amba_device *adev, pm_message_t state) +{ + struct pl022 *pl022 = amba_get_drvdata(adev); + int status = 0; + + status = stop_queue(pl022); + if (status) { + dev_warn(&adev->dev, "suspend cannot stop queue\n"); + return status; + } + + clk_enable(pl022->clk); + load_ssp_default_config(pl022); + clk_disable(pl022->clk); + dev_dbg(&adev->dev, "suspended\n"); + return 0; +} + +static int pl022_resume(struct amba_device *adev) +{ + struct pl022 *pl022 = amba_get_drvdata(adev); + int status = 0; + + /* Start the queue running */ + status = start_queue(pl022); + if (status) + dev_err(&adev->dev, "problem starting queue (%d)\n", status); + else + dev_dbg(&adev->dev, "resumed\n"); + + return status; +} +#else +#define pl022_suspend NULL +#define pl022_resume NULL +#endif /* CONFIG_PM */ + +static struct vendor_data vendor_arm = { + .fifodepth = 8, + .max_bpw = 16, + .unidir = false, +}; + + +static struct vendor_data vendor_st = { + .fifodepth = 32, + .max_bpw = 32, + .unidir = false, +}; + +static struct amba_id pl022_ids[] = { + { + /* + * ARM PL022 variant, this has a 16bit wide + * and 8 locations deep TX/RX FIFO + */ + .id = 0x00041022, + .mask = 0x000fffff, + .data = &vendor_arm, + }, + { + /* + * ST Micro derivative, this has 32bit wide + * and 32 locations deep TX/RX FIFO + */ + .id = 0x00108022, + .mask = 0xffffffff, + .data = &vendor_st, + }, + { 0, 0 }, +}; + +static struct amba_driver pl022_driver = { + .drv = { + .name = "ssp-pl022", + }, + .id_table = pl022_ids, + .probe = pl022_probe, + .remove = __exit_p(pl022_remove), + .suspend = pl022_suspend, + .resume = pl022_resume, +}; + + +static int __init pl022_init(void) +{ + return amba_driver_register(&pl022_driver); +} + +module_init(pl022_init); + +static void __exit pl022_exit(void) +{ + amba_driver_unregister(&pl022_driver); +} + +module_exit(pl022_exit); + +MODULE_AUTHOR("Linus Walleij "); +MODULE_DESCRIPTION("PL022 SSP Controller Driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/amba/pl022.h b/include/linux/amba/pl022.h new file mode 100644 index 00000000000..dcad0ffd175 --- /dev/null +++ b/include/linux/amba/pl022.h @@ -0,0 +1,264 @@ +/* + * include/linux/amba/pl022.h + * + * Copyright (C) 2008-2009 ST-Ericsson AB + * Copyright (C) 2006 STMicroelectronics Pvt. Ltd. 
+ * + * Author: Linus Walleij + * + * Initial version inspired by: + * linux-2.6.17-rc3-mm1/drivers/spi/pxa2xx_spi.c + * Initial adaptation to PL022 by: + * Sachin Verma + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _SSP_PL022_H +#define _SSP_PL022_H + +#include + +/** + * enum ssp_loopback - whether SSP is in loopback mode or not + */ +enum ssp_loopback { + LOOPBACK_DISABLED, + LOOPBACK_ENABLED +}; + +/** + * enum ssp_interface - interfaces allowed for this SSP Controller + * @SSP_INTERFACE_MOTOROLA_SPI: Motorola Interface + * @SSP_INTERFACE_TI_SYNC_SERIAL: Texas Instrument Synchronous Serial + * interface + * @SSP_INTERFACE_NATIONAL_MICROWIRE: National Semiconductor Microwire + * interface + * @SSP_INTERFACE_UNIDIRECTIONAL: Unidirectional interface (STn8810 + * & STn8815 only) + */ +enum ssp_interface { + SSP_INTERFACE_MOTOROLA_SPI, + SSP_INTERFACE_TI_SYNC_SERIAL, + SSP_INTERFACE_NATIONAL_MICROWIRE, + SSP_INTERFACE_UNIDIRECTIONAL +}; + +/** + * enum ssp_hierarchy - whether SSP is configured as Master or Slave + */ +enum ssp_hierarchy { + SSP_MASTER, + SSP_SLAVE +}; + +/** + * struct ssp_clock_params - clock parameters, to set SSP clock at a + * desired freq + */ +struct ssp_clock_params { + u8 cpsdvsr; /* value from 2 to 254 (even only!) */ + u8 scr; /* value from 0 to 255 */ +}; + +/** + * enum ssp_rx_endian - endianness of Rx FIFO data + */ +enum ssp_rx_endian { + SSP_RX_MSB, + SSP_RX_LSB +}; + +/** + * enum ssp_tx_endian - endianness of Tx FIFO data + */ +enum ssp_tx_endian { + SSP_TX_MSB, + SSP_TX_LSB +}; + +/** + * enum ssp_data_size - number of bits in one data element + */ +enum ssp_data_size { + SSP_DATA_BITS_4 = 0x03, SSP_DATA_BITS_5, SSP_DATA_BITS_6, + SSP_DATA_BITS_7, SSP_DATA_BITS_8, SSP_DATA_BITS_9, + SSP_DATA_BITS_10, SSP_DATA_BITS_11, SSP_DATA_BITS_12, + SSP_DATA_BITS_13, SSP_DATA_BITS_14, SSP_DATA_BITS_15, + SSP_DATA_BITS_16, SSP_DATA_BITS_17, SSP_DATA_BITS_18, + SSP_DATA_BITS_19, SSP_DATA_BITS_20, SSP_DATA_BITS_21, + SSP_DATA_BITS_22, SSP_DATA_BITS_23, SSP_DATA_BITS_24, + SSP_DATA_BITS_25, SSP_DATA_BITS_26, SSP_DATA_BITS_27, + SSP_DATA_BITS_28, SSP_DATA_BITS_29, SSP_DATA_BITS_30, + SSP_DATA_BITS_31, SSP_DATA_BITS_32 +}; + +/** + * enum ssp_mode - SSP mode of operation (Communication modes) + */ +enum ssp_mode { + INTERRUPT_TRANSFER, + POLLING_TRANSFER, + DMA_TRANSFER +}; + +/** + * enum ssp_rx_level_trig - receive FIFO watermark level which triggers + * IT: Interrupt fires when _N_ or more elements in RX FIFO. 
+ */ +enum ssp_rx_level_trig { + SSP_RX_1_OR_MORE_ELEM, + SSP_RX_4_OR_MORE_ELEM, + SSP_RX_8_OR_MORE_ELEM, + SSP_RX_16_OR_MORE_ELEM, + SSP_RX_32_OR_MORE_ELEM +}; + +/** + * enum ssp_tx_level_trig - transmit FIFO watermark level which triggers + * IT: Interrupt fires when _N_ or more empty locations in TX FIFO. + */ +enum ssp_tx_level_trig { + SSP_TX_1_OR_MORE_EMPTY_LOC, + SSP_TX_4_OR_MORE_EMPTY_LOC, + SSP_TX_8_OR_MORE_EMPTY_LOC, + SSP_TX_16_OR_MORE_EMPTY_LOC, + SSP_TX_32_OR_MORE_EMPTY_LOC +}; + +/** + * enum ssp_spi_clk_phase - clock phase (Motorola SPI interface only) + * @SSP_CLK_RISING_EDGE: Receive data on rising edge + * @SSP_CLK_FALLING_EDGE: Receive data on falling edge + */ +enum ssp_spi_clk_phase { + SSP_CLK_RISING_EDGE, + SSP_CLK_FALLING_EDGE +}; + +/** + * enum ssp_spi_clk_pol - clock polarity (Motorola SPI interface only) + * @SSP_CLK_POL_IDLE_LOW: Low inactive level + * @SSP_CLK_POL_IDLE_HIGH: High inactive level + */ +enum ssp_spi_clk_pol { + SSP_CLK_POL_IDLE_LOW, + SSP_CLK_POL_IDLE_HIGH +}; + +/** + * enum ssp_microwire_ctrl_len - Microwire control lengths: command size + * in Microwire format + */ +enum ssp_microwire_ctrl_len { + SSP_BITS_4 = 0x03, SSP_BITS_5, SSP_BITS_6, + SSP_BITS_7, SSP_BITS_8, SSP_BITS_9, + SSP_BITS_10, SSP_BITS_11, SSP_BITS_12, + SSP_BITS_13, SSP_BITS_14, SSP_BITS_15, + SSP_BITS_16, SSP_BITS_17, SSP_BITS_18, + SSP_BITS_19, SSP_BITS_20, SSP_BITS_21, + SSP_BITS_22, SSP_BITS_23, SSP_BITS_24, + SSP_BITS_25, SSP_BITS_26, SSP_BITS_27, + SSP_BITS_28, SSP_BITS_29, SSP_BITS_30, + SSP_BITS_31, SSP_BITS_32 +}; + +/** + * enum ssp_microwire_wait_state - Microwire wait state + * @SSP_MWIRE_WAIT_ZERO: No wait state inserted after last command bit + * @SSP_MWIRE_WAIT_ONE: One wait state inserted after last command bit + */ +enum ssp_microwire_wait_state { + SSP_MWIRE_WAIT_ZERO, + SSP_MWIRE_WAIT_ONE +}; + +/** + * enum ssp_duplex - Microwire interface: whether Full/Half Duplex + * @SSP_MICROWIRE_CHANNEL_FULL_DUPLEX: SSPTXD becomes bi-directional, + * SSPRXD not used + * @SSP_MICROWIRE_CHANNEL_HALF_DUPLEX: SSPTXD is an output, SSPRXD is + * an input. + */ +enum ssp_duplex { + SSP_MICROWIRE_CHANNEL_FULL_DUPLEX, + SSP_MICROWIRE_CHANNEL_HALF_DUPLEX +}; + +/** + * enum ssp_chip_select - chip select/deselect commands + */ +enum ssp_chip_select { + SSP_CHIP_SELECT, + SSP_CHIP_DESELECT +}; + + +/** + * struct pl022_ssp_controller - device.platform_data for SPI controller devices. + * @num_chipselect: chipselects are used to distinguish individual + * SPI slaves, and are numbered from zero to num_chipselects - 1. + * each slave has a chipselect signal, but it's common that not + * every chipselect is connected to a slave. + * @enable_dma: if true enables DMA driven transfers. + */ +struct pl022_ssp_controller { + u16 bus_id; + u8 num_chipselect; + u8 enable_dma:1; +}; + +/** + * struct pl022_config_chip - spi_board_info.controller_data for SPI + * slave devices, copied to spi_device.controller_data. 
+ * + * @lbm: used for test purposes to internally connect RX and TX + * @iface: Interface type (Motorola, TI, Microwire, Universal) + * @hierarchy: sets whether interface is master or slave + * @slave_tx_disable: SSPTXD is disconnected (in slave mode only) + * @clk_freq: Tune freq parameters of SSP (when in master mode) + * @endian_rx: Endianness of data in Rx FIFO + * @endian_tx: Endianness of data in Tx FIFO + * @data_size: Width of data element (4 to 32 bits) + * @com_mode: communication mode: polling, interrupt or DMA + * @rx_lev_trig: Rx FIFO watermark level (for IT & DMA mode) + * @tx_lev_trig: Tx FIFO watermark level (for IT & DMA mode) + * @clk_phase: Motorola SPI interface Clock phase + * @clk_pol: Motorola SPI interface Clock polarity + * @ctrl_len: Microwire interface: Control length + * @wait_state: Microwire interface: Wait state + * @duplex: Microwire interface: Full/Half duplex + * @cs_control: function pointer to board-specific function to + * assert/deassert I/O port to control HW generation of the device's + * chip-select. + */ +struct pl022_config_chip { + struct device *dev; + enum ssp_loopback lbm; + enum ssp_interface iface; + enum ssp_hierarchy hierarchy; + bool slave_tx_disable; + struct ssp_clock_params clk_freq; + enum ssp_rx_endian endian_rx; + enum ssp_tx_endian endian_tx; + enum ssp_data_size data_size; + enum ssp_mode com_mode; + enum ssp_rx_level_trig rx_lev_trig; + enum ssp_tx_level_trig tx_lev_trig; + enum ssp_spi_clk_phase clk_phase; + enum ssp_spi_clk_pol clk_pol; + enum ssp_microwire_ctrl_len ctrl_len; + enum ssp_microwire_wait_state wait_state; + enum ssp_duplex duplex; + void (*cs_control) (u32 control); +}; + +#endif /* _SSP_PL022_H */ -- cgit v1.2.3-70-g09d2 From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:02:22 +0200 Subject: perf_counter: Introduce struct for sample data For easy extension of the sample data, put it in a structure. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 10 +++++++--- arch/x86/kernel/cpu/perf_counter.c | 15 +++++++++++---- include/linux/perf_counter.h | 10 ++++++++-- kernel/perf_counter.c | 38 ++++++++++++++++++++++---------------- 4 files changed, 48 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 4786ad9a288..5e0bf399c43 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -1001,7 +1001,11 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) { - addr = 0; + struct perf_sample_data data = { + .regs = regs, + .addr = 0, + }; + if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { /* * The user wants a data address recorded. @@ -1016,9 +1020,9 @@ static void record_and_restart(struct perf_counter *counter, long val, sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? 
POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) - addr = mfspr(SPRN_SDAR); + data.addr = mfspr(SPRN_SDAR); } - if (perf_counter_overflow(counter, nmi, regs, addr)) { + if (perf_counter_overflow(counter, nmi, &data)) { /* * Interrupts are coming too fast - throttle them * by setting the counter to 0, so it will be diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 240ca563063..82a23d487f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1173,11 +1173,14 @@ static void intel_pmu_reset(void) */ static int intel_pmu_handle_irq(struct pt_regs *regs) { + struct perf_sample_data data; struct cpu_hw_counters *cpuc; - struct cpu_hw_counters; int bit, cpu, loops; u64 ack, status; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1210,7 +1213,7 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -1230,12 +1233,16 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs) { - int cpu, idx, handled = 0; + struct perf_sample_data data; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; + int cpu, idx, handled = 0; u64 val; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1256,7 +1263,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) amd_pmu_disable_counter(hwc, idx); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 282d8cc4898..d8c0eb480f9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -605,8 +605,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_counter_context *ctx, int cpu); extern void perf_counter_update_userpage(struct perf_counter *counter); -extern int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr); +struct perf_sample_data { + struct pt_regs *regs; + u64 addr; +}; + +extern int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); + /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ae591a1275a..4fe85e804f4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2378,8 +2378,8 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +static void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int ret; u64 sample_type = counter->attr.sample_type; @@ -2404,10 +2404,10 @@ static void perf_counter_output(struct perf_counter *counter, header.size = sizeof(header); header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= perf_misc_flags(regs); + header.misc |= perf_misc_flags(data->regs); if (sample_type & PERF_SAMPLE_IP) { - ip = perf_instruction_pointer(regs); + ip = perf_instruction_pointer(data->regs); header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } @@ -2460,7 +2460,7 @@ static void 
perf_counter_output(struct perf_counter *counter, } if (sample_type & PERF_SAMPLE_CALLCHAIN) { - callchain = perf_callchain(regs); + callchain = perf_callchain(data->regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); @@ -2486,7 +2486,7 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, time); if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(&handle, addr); + perf_output_put(&handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(&handle, counter->id); @@ -2950,8 +2950,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) * Generic counter overflow handling. */ -int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; @@ -3005,7 +3005,7 @@ int perf_counter_overflow(struct perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs, addr); + perf_counter_output(counter, nmi, data); return ret; } @@ -3054,24 +3054,25 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; struct perf_counter *counter; - struct pt_regs *regs; u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); - regs = get_irq_regs(); + data.addr = 0; + data.regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. */ - if ((counter->attr.exclude_kernel || !regs) && + if ((counter->attr.exclude_kernel || !data.regs) && !counter->attr.exclude_user) - regs = task_pt_regs(current); + data.regs = task_pt_regs(current); - if (regs) { - if (perf_counter_overflow(counter, 0, regs, 0)) + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) ret = HRTIMER_NORESTART; } @@ -3084,9 +3085,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { + struct perf_sample_data data = { + .regs = regs, + .addr = addr, + }; + perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs, addr)) + if (perf_counter_overflow(counter, nmi, &data)) /* soft-disable the counter */ ; -- cgit v1.2.3-70-g09d2 From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:34:59 +0200 Subject: perf_counter: Accurate period data We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is incorrect. When we adjust the period, it will only take effect the next cycle but report it for the current cycle. So when we adjust the period for every cycle, we're always wrong. Solve this by keeping track of the last_period. 
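In effect, the fix separates "the period we will use next" from "the period that produced this sample". A minimal sketch of the pattern the hunks below implement (illustrative excerpt assembled from those hunks, not a complete function):

        /* when (re)programming the counter: remember the period in effect */
        hwc->last_period = period;
        atomic64_set(&hwc->period_left, left);

        /* when emitting a sample: report the period that produced it */
        struct perf_sample_data data = {
                .regs   = regs,
                .addr   = 0,
                .period = counter->hw.last_period,
        };
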
Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 9 ++++++--- arch/x86/kernel/cpu/perf_counter.c | 15 ++++++++++++--- include/linux/perf_counter.h | 6 ++++-- kernel/perf_counter.c | 9 ++++++--- 4 files changed, 28 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5e0bf399c43..4990ce2e5f0 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -767,6 +767,7 @@ static void power_pmu_unthrottle(struct perf_counter *counter) perf_disable(); power_pmu_read(counter); left = counter->hw.sample_period; + counter->hw.last_period = left; val = 0; if (left < 0x80000000L) val = 0x80000000L - left; @@ -937,7 +938,8 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw.sample_period); + counter->hw.last_period = counter->hw.sample_period; + atomic64_set(&counter->hw.period_left, counter->hw.last_period); /* * See if we need to reserve the PMU. @@ -1002,8 +1004,9 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { struct perf_sample_data data = { - .regs = regs, - .addr = 0, + .regs = regs, + .addr = 0, + .period = counter->hw.last_period, }; if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82a23d487f9..57ae1bec81b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -698,6 +698,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); } @@ -880,12 +881,14 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } /* @@ -1257,9 +1260,12 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; - /* counter overflow */ - handled = 1; - inc_irq_stat(apic_perf_irqs); + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; @@ -1267,6 +1273,9 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) amd_pmu_disable_counter(hwc, idx); } + if (handled) + inc_irq_stat(apic_perf_irqs); + return handled; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d8c0eb480f9..5b966472b45 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -366,6 +366,7 @@ struct hw_perf_counter { }; atomic64_t prev_count; u64 sample_period; + u64 last_period; atomic64_t period_left; u64 interrupts; @@ -606,8 +607,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); struct perf_sample_data { - struct pt_regs *regs; - u64 addr; + struct pt_regs *regs; + u64 addr; + u64 period; }; extern int perf_counter_overflow(struct perf_counter *counter, 
int nmi, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4fe85e804f4..8b89b40bd0f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2495,7 +2495,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, perf_output_put(&handle, cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(&handle, counter->hw.sample_period); + perf_output_put(&handle, data->period); /* * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. @@ -3040,11 +3040,13 @@ static void perf_swcounter_set_period(struct perf_counter *counter) if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; } if (unlikely(left <= 0)) { left += period; atomic64_add(period, &hwc->period_left); + hwc->last_period = period; } atomic64_set(&hwc->prev_count, -left); @@ -3086,8 +3088,9 @@ static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { - .regs = regs, - .addr = addr, + .regs = regs, + .addr = addr, + .period = counter->hw.last_period, }; perf_swcounter_update(counter); -- cgit v1.2.3-70-g09d2 From 3f68535adad8dd89499505a65fb25d0e02d118cc Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 21 Jan 2009 22:53:22 -0700 Subject: clocksource: sanity check sysfs clocksource changes Thomas, Andrew and Ingo pointed out that we don't have any safety checks in the clocksource sysfs entries to make sure sysadmins don't try to change the clocksource to a non high-res timer capable clocksource (such as jiffies) when high-res timers (HRT) is enabled. Doing so will likely hang a system. Correct this by filtering non HRT clocksources from available_clocksources and not accepting non HRT clocksources with HRT enabled. Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- kernel/hrtimer.c | 4 ++-- kernel/time/clocksource.c | 18 +++++++++++++++++- 3 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0d2f7c8a33d..58021b0c396 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -305,7 +305,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); - +extern int hrtimer_hres_active(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cb8a15c1958..1a70c18cdff 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -476,7 +476,7 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ? 
*/ -static inline int hrtimer_hres_active(void) +int hrtimer_hres_active(void) { return __get_cpu_var(hrtimer_bases).hres_active; } @@ -704,7 +704,7 @@ static int hrtimer_switch_to_hres(void) #else -static inline int hrtimer_hres_active(void) { return 0; } +int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 80189f6f1c5..18b9f5da4ee 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -30,6 +30,7 @@ #include #include /* for spin_unlock_irq() using preempt_count() m68k */ #include +#include void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -509,6 +510,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, } } + /* + * Check to make sure we don't switch to a non-HRT usable + * clocksource if HRT is enabled and running + */ + if (hrtimer_hres_active() && + !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { + printk(KERN_WARNING "%s clocksource is not HRT compatible. " + "Cannot switch while in HRT mode\n", ovr->name); + ovr = NULL; + override_name[0] = 0; + } + /* Reselect, when the override name has changed */ if (ovr != clocksource_override) { clocksource_override = ovr; @@ -537,7 +550,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, spin_lock_irq(&clocksource_lock); list_for_each_entry(src, &clocksource_list, list) { - count += snprintf(buf + count, + /* Don't show non-HRES clocksource if HRES is enabled */ + if (!hrtimer_hres_active() || + (src->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + count += snprintf(buf + count, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "%s ", src->name); } -- cgit v1.2.3-70-g09d2 From d005ba6cc82440d9ebf96f3ec8f79c54578b898f Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 10 Jun 2009 05:28:04 +0000 Subject: mdio: Expose 10GBASE-T MDI-X status via ethtool This is available in a standard MDIO register in 10GBASE-T PHYs. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/mdio.c | 17 +++++++++++++++++ include/linux/mdio.h | 9 +++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c index 66483035f68..dc45e9856c3 100644 --- a/drivers/net/mdio.c +++ b/drivers/net/mdio.c @@ -296,6 +296,23 @@ void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, ecmd->duplex = (reg & MDIO_CTRL1_FULLDPLX || ecmd->speed == SPEED_10000); } + + /* 10GBASE-T MDI/MDI-X */ + if (ecmd->port == PORT_TP && ecmd->speed == SPEED_10000) { + switch (mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD, + MDIO_PMA_10GBT_SWAPPOL)) { + case MDIO_PMA_10GBT_SWAPPOL_ABNX | MDIO_PMA_10GBT_SWAPPOL_CDNX: + ecmd->eth_tp_mdix = ETH_TP_MDI; + break; + case 0: + ecmd->eth_tp_mdix = ETH_TP_MDI_X; + break; + default: + /* It's complicated... */ + ecmd->eth_tp_mdix = ETH_TP_MDI_INVALID; + break; + } + } } EXPORT_SYMBOL(mdio45_ethtool_gset_npage); diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 56851646529..cfdf1df2875 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -45,6 +45,7 @@ #define MDIO_PHYXS_LNSTAT 24 /* PHY XGXS lane state */ /* Media-dependent registers. 
+#define MDIO_PMA_10GBT_SWAPPOL 130 /* 10GBASE-T pair swap & polarity */ +#define MDIO_PMA_10GBT_TXPWR 131 /* 10GBASE-T TX power control */ +#define MDIO_PMA_10GBT_SNR 133 /* 10GBASE-T SNR margin, lane A. * Lanes B-D are numbered 134-136. */ @@ -195,6 +196,14 @@ #define MDIO_PHYXS_LNSTAT_SYNC3 0x0008 #define MDIO_PHYXS_LNSTAT_ALIGN 0x1000 +/* PMA 10GBASE-T pair swap & polarity */ +#define MDIO_PMA_10GBT_SWAPPOL_ABNX 0x0001 /* Pair A/B uncrossed */ +#define MDIO_PMA_10GBT_SWAPPOL_CDNX 0x0002 /* Pair C/D uncrossed */ +#define MDIO_PMA_10GBT_SWAPPOL_AREV 0x0100 /* Pair A polarity reversed */ +#define MDIO_PMA_10GBT_SWAPPOL_BREV 0x0200 /* Pair B polarity reversed */ +#define MDIO_PMA_10GBT_SWAPPOL_CREV 0x0400 /* Pair C polarity reversed */ +#define MDIO_PMA_10GBT_SWAPPOL_DREV 0x0800 /* Pair D polarity reversed */ + /* PMA 10GBASE-T TX power register. */ #define MDIO_PMA_10GBT_TXPWR_SHORT 0x0001 /* Short-reach mode */ -- cgit v1.2.3-70-g09d2 From 8593a1967fb9746d318dde88a0a39a36dbfc3445 Mon Sep 17 00:00:00 2001 From: Inaky Perez-Gonzalez Date: Wed, 20 May 2009 16:53:30 -0700 Subject: wimax/i2400m: rename misleading I2400M_PL_PAD to I2400M_PL_ALIGN The constant is being used as an alignment factor, not as a padding factor; this made reading/reviewing the code quite confusing. Signed-off-by: Inaky Perez-Gonzalez --- drivers/net/wimax/i2400m/fw.c | 2 +- drivers/net/wimax/i2400m/rx.c | 4 ++-- drivers/net/wimax/i2400m/tx.c | 4 ++-- include/linux/wimax/i2400m.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wimax/i2400m/fw.c b/drivers/net/wimax/i2400m/fw.c index 7ee1b99b384..26924f17f19 100644 --- a/drivers/net/wimax/i2400m/fw.c +++ b/drivers/net/wimax/i2400m/fw.c @@ -397,7 +397,7 @@ static int i2400m_download_chunk(struct i2400m *i2400m, const void *chunk, unsigned int direct, unsigned int do_csum) { int ret; - size_t chunk_len = ALIGN(__chunk_len, I2400M_PL_PAD); + size_t chunk_len = ALIGN(__chunk_len, I2400M_PL_ALIGN); struct device *dev = i2400m_dev(i2400m); struct { struct i2400m_bootrom_header cmd; diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c index 7643850a6fb..07c32e68909 100644 --- a/drivers/net/wimax/i2400m/rx.c +++ b/drivers/net/wimax/i2400m/rx.c @@ -1148,7 +1148,7 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) num_pls = le16_to_cpu(msg_hdr->num_pls); pl_itr = sizeof(*msg_hdr) + /* Check payload descriptor(s) */ num_pls * sizeof(msg_hdr->pld[0]); - pl_itr = ALIGN(pl_itr, I2400M_PL_PAD); + pl_itr = ALIGN(pl_itr, I2400M_PL_ALIGN); if (pl_itr > skb->len) { /* got all the payload descriptors? */ dev_err(dev, "RX: HW BUG? 
message too short (%u bytes) for " "%u payload descriptors (%zu each, total %zu)\n", @@ -1166,7 +1166,7 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) single_last = num_pls == 1 || i == num_pls - 1; i2400m_rx_payload(i2400m, skb, single_last, &msg_hdr->pld[i], skb->data + pl_itr); - pl_itr += ALIGN(pl_size, I2400M_PL_PAD); + pl_itr += ALIGN(pl_size, I2400M_PL_ALIGN); cond_resched(); /* Don't monopolize */ } kfree_skb(skb); diff --git a/drivers/net/wimax/i2400m/tx.c b/drivers/net/wimax/i2400m/tx.c index 8ef724d31fb..a635fd720f3 100644 --- a/drivers/net/wimax/i2400m/tx.c +++ b/drivers/net/wimax/i2400m/tx.c @@ -491,7 +491,7 @@ void i2400m_tx_close(struct i2400m *i2400m) */ hdr_size = sizeof(*tx_msg) + le16_to_cpu(tx_msg->num_pls) * sizeof(tx_msg->pld[0]); - hdr_size = ALIGN(hdr_size, I2400M_PL_PAD); + hdr_size = ALIGN(hdr_size, I2400M_PL_ALIGN); tx_msg->offset = I2400M_TX_PLD_SIZE - hdr_size; tx_msg_moved = (void *) tx_msg + tx_msg->offset; memmove(tx_msg_moved, tx_msg, hdr_size); @@ -574,7 +574,7 @@ int i2400m_tx(struct i2400m *i2400m, const void *buf, size_t buf_len, d_fnstart(3, dev, "(i2400m %p skb %p [%zu bytes] pt %u)\n", i2400m, buf, buf_len, pl_type); - padded_len = ALIGN(buf_len, I2400M_PL_PAD); + padded_len = ALIGN(buf_len, I2400M_PL_ALIGN); d_printf(5, dev, "padded_len %zd buf_len %zd\n", padded_len, buf_len); /* If there is no current TX message, create one; if the * current one is out of payload slots or we have a singleton, diff --git a/include/linux/wimax/i2400m.h b/include/linux/wimax/i2400m.h index d5148a7889a..433693ef2bb 100644 --- a/include/linux/wimax/i2400m.h +++ b/include/linux/wimax/i2400m.h @@ -266,7 +266,7 @@ enum i2400m_ro_type { /* Misc constants */ enum { - I2400M_PL_PAD = 16, /* Payload data size alignment */ + I2400M_PL_ALIGN = 16, /* Payload data size alignment */ I2400M_PL_SIZE_MAX = 0x3EFF, I2400M_MAX_PLS_IN_MSG = 60, /* protocol barkers: sync sequences; for notifications they -- cgit v1.2.3-70-g09d2 From b0fd271d5fba0b2d00888363f3869e3f9b26caa9 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 11 Jun 2009 13:10:16 +0200 Subject: block: add request clone interface (v2) This patch adds the following 2 interfaces for request-stacking drivers: - blk_rq_prep_clone(struct request *clone, struct request *orig, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio*, void *), void *data) * Clones bios in the original request to the clone request (bio_ctr is called for each cloned bios.) * Copies attributes of the original request to the clone request. The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. - blk_rq_unprep_clone(struct request *clone) * Frees cloned bios from the clone request. Request stacking drivers (e.g. request-based dm) need to make a clone request for a submitted request and dispatch it to other devices. To allocate request for the clone, request stacking drivers may not be able to use blk_get_request() because the allocation may be done in an irq-disabled context. So blk_rq_prep_clone() takes a request allocated by the caller as an argument. For each clone bio in the clone request, request stacking drivers should be able to set up their own completion handler. So blk_rq_prep_clone() takes a callback function which is called for each clone bio, and a pointer for private data which is passed to the callback. NOTE: blk_rq_prep_clone() doesn't copy any actual data of the original request. Pages are shared between original bios and cloned bios. 
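To make the calling convention concrete, here is a hedged sketch of how a request-stacking driver might wire up the pair; all my_* names and the my_clone_info type are hypothetical, only blk_rq_prep_clone() and blk_rq_unprep_clone() come from this patch:

        /* per-clone-bio setup: install a per-clone completion handler */
        static int my_bio_ctr(struct bio *clone, struct bio *orig, void *data)
        {
                struct my_clone_info *info = data;      /* hypothetical state */

                clone->bi_end_io = my_clone_endio;      /* hypothetical endio */
                clone->bi_private = info;
                return 0;       /* non-zero aborts the cloning */
        }

        static int my_setup_clone(struct request *clone, struct request *orig,
                                  struct my_clone_info *info, gfp_t gfp_mask)
        {
                int r;

                /* on failure this returns -ENOMEM; any clone bios allocated
                 * so far have already been freed via blk_rq_unprep_clone() */
                r = blk_rq_prep_clone(clone, orig, info->bs, gfp_mask,
                                      my_bio_ctr, info);
                if (r)
                        return r;

                clone->end_io = my_request_endio;       /* hypothetical */
                clone->end_io_data = info;
                return 0;
        }

Because the pages are shared, my_request_endio must complete the original request only after the clone has fully finished.
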
So caller must not complete the original request before the clone request. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Boaz Harrosh Signed-off-by: Jens Axboe --- block/blk-core.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 5 +++ 2 files changed, 105 insertions(+) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 03c5a64b6cc..02a9252107a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2295,6 +2295,106 @@ int blk_lld_busy(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_lld_busy); +/** + * blk_rq_unprep_clone - Helper function to free all bios in a cloned request + * @rq: the clone request to be cleaned up + * + * Description: + * Free all bios in @rq for a cloned request. + */ +void blk_rq_unprep_clone(struct request *rq) +{ + struct bio *bio; + + while ((bio = rq->bio) != NULL) { + rq->bio = bio->bi_next; + + bio_put(bio); + } +} +EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); + +/* + * Copy attributes of the original request to the clone request. + * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. + */ +static void __blk_rq_prep_clone(struct request *dst, struct request *src) +{ + dst->cpu = src->cpu; + dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); + dst->cmd_type = src->cmd_type; + dst->__sector = blk_rq_pos(src); + dst->__data_len = blk_rq_bytes(src); + dst->nr_phys_segments = src->nr_phys_segments; + dst->ioprio = src->ioprio; + dst->extra_len = src->extra_len; +} + +/** + * blk_rq_prep_clone - Helper function to setup clone request + * @rq: the request to be setup + * @rq_src: original request to be cloned + * @bs: bio_set that bios for clone are allocated from + * @gfp_mask: memory allocation mask for bio + * @bio_ctr: setup function to be called for each clone bio. + * Returns %0 for success, non %0 for failure. + * @data: private data to be passed to @bio_ctr + * + * Description: + * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. + * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) + * are not copied, and copying such parts is the caller's responsibility. + * Also, pages which the original bios are pointing to are not copied + * and the cloned bios just point same pages. + * So cloned bios must be completed before original bios, which means + * the caller must complete @rq before @rq_src. 
+ */ +int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data) +{ + struct bio *bio, *bio_src; + + if (!bs) + bs = fs_bio_set; + + blk_rq_init(NULL, rq); + + __rq_for_each_bio(bio_src, rq_src) { + bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs); + if (!bio) + goto free_and_out; + + __bio_clone(bio, bio_src); + + if (bio_integrity(bio_src) && + bio_integrity_clone(bio, bio_src, gfp_mask)) + goto free_and_out; + + if (bio_ctr && bio_ctr(bio, bio_src, data)) + goto free_and_out; + + if (rq->bio) { + rq->biotail->bi_next = bio; + rq->biotail = bio; + } else + rq->bio = rq->biotail = bio; + } + + __blk_rq_prep_clone(rq, rq_src); + + return 0; + +free_and_out: + if (bio) + bio_free(bio, bs); + blk_rq_unprep_clone(rq); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_rq_prep_clone); + int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) { return queue_work(kblockd_workqueue, work); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5e740a135e7..ebdfde8fe55 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -765,6 +765,11 @@ extern void blk_insert_request(struct request_queue *, struct request *, int, vo extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); +extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data); +extern void blk_rq_unprep_clone(struct request *rq); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern void blk_plug_device(struct request_queue *); -- cgit v1.2.3-70-g09d2 From 0764771dab80d7b84b9a271bee7f1b21a04a3f0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 11:18:36 +0200 Subject: perf_counter: More paranoia settings Rename the perf_counter_priv knob to perf_counter_paranoia (because priv can be read as private, as opposed to privileged) and provide one more level: 0 - permissive 1 - restrict cpu counters to privileged contexts 2 - restrict kernel-mode code counting and profiling Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 25 +++++++++++++++++++++++-- kernel/sysctl.c | 6 +++--- 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5b966472b45..386be915baa 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -648,7 +648,7 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); -extern int sysctl_perf_counter_priv; +extern int sysctl_perf_counter_paranoid; extern int sysctl_perf_counter_mlock; extern int sysctl_perf_counter_limit; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8b89b40bd0f..63f1987c1c1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -43,7 +43,23 @@ static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; -int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ +/* + * 0 - not paranoid + * 1 - disallow 
cpu counters to unpriv + * 2 - disallow kernel profiling to unpriv + */ +int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */ + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_counter_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_counter_paranoid > 1; +} + int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ @@ -1385,7 +1401,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) @@ -3618,6 +3634,11 @@ SYSCALL_DEFINE5(perf_counter_open, if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) return -EFAULT; + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + /* * Get the target context (task or percpu): */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0c4bf863afa..344a65981de 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -916,9 +916,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PERF_COUNTERS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_privileged", - .data = &sysctl_perf_counter_priv, - .maxlen = sizeof(sysctl_perf_counter_priv), + .procname = "perf_counter_paranoid", + .data = &sysctl_perf_counter_paranoid, + .maxlen = sizeof(sysctl_perf_counter_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, -- cgit v1.2.3-70-g09d2 From df58ab24bf26b166874bfb18b3b5a2e0a8e63179 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 11:25:05 +0200 Subject: perf_counter: Rename perf_counter_limit sysctl Rename perf_counter_limit to perf_counter_max_sample_rate and prohibit creation of counters with a known higher sample frequency. 
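Taken together with the paranoia patch above, sys_perf_counter_open() now performs roughly the following up-front checks; this sketch is assembled from the hunks in the two commits:

        if (!attr.exclude_kernel) {
                if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                        return -EACCES; /* paranoia level 2 */
        }

        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_counter_sample_rate)
                        return -EINVAL; /* above the max sample rate cap */
        }

The cap itself remains tunable at runtime through the renamed sysctl, i.e. /proc/sys/kernel/perf_counter_max_sample_rate.
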
Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 27 +++++++++++++++++++-------- kernel/sysctl.c | 6 +++--- 3 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 386be915baa..95c797c480e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -650,7 +650,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_paranoid; extern int sysctl_perf_counter_mlock; -extern int sysctl_perf_counter_limit; +extern int sysctl_perf_counter_sample_rate; extern void perf_counter_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 63f1987c1c1..3b2829de559 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,11 +44,12 @@ static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; /* - * 0 - not paranoid - * 1 - disallow cpu counters to unpriv - * 2 - disallow kernel profiling to unpriv + * perf counter paranoia level: + * 0 - not paranoid + * 1 - disallow cpu counters to unpriv + * 2 - disallow kernel profiling to unpriv */ -int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */ +int sysctl_perf_counter_paranoid __read_mostly; static inline bool perf_paranoid_cpu(void) { @@ -61,7 +62,11 @@ static inline bool perf_paranoid_kernel(void) } int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ -int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ + +/* + * max perf counter sample rate + */ +int sysctl_perf_counter_sample_rate __read_mostly = 100000; static atomic64_t perf_counter_id; @@ -1244,7 +1249,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_limit/HZ; + interrupts = 2*sysctl_perf_counter_sample_rate/HZ; } if (!counter->attr.freq || !counter->attr.sample_freq) @@ -1682,7 +1687,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) spin_lock_irq(&ctx->lock); if (counter->attr.freq) { - if (value > sysctl_perf_counter_limit) { + if (value > sysctl_perf_counter_sample_rate) { ret = -EINVAL; goto unlock; } @@ -2979,7 +2984,8 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, } else { if (hwc->interrupts != MAX_INTERRUPTS) { hwc->interrupts++; - if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) { + if (HZ * hwc->interrupts > + (u64)sysctl_perf_counter_sample_rate) { hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(counter, 0); ret = 1; @@ -3639,6 +3645,11 @@ SYSCALL_DEFINE5(perf_counter_open, return -EACCES; } + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_counter_sample_rate) + return -EINVAL; + } + /* * Get the target context (task or percpu): */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 344a65981de..9fd4e436b69 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -932,9 +932,9 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_int_limit", - .data = &sysctl_perf_counter_limit, - .maxlen = sizeof(sysctl_perf_counter_limit), + .procname = "perf_counter_max_sample_rate", + .data = &sysctl_perf_counter_sample_rate, + .maxlen = 
sizeof(sysctl_perf_counter_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, -- cgit v1.2.3-70-g09d2 From fcc8ac1825d3d0fb81f73bc1a80ebc863168bb56 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Jun 2009 12:24:17 +0100 Subject: tty: Add carrier processing on close to the tty_port core Some drivers implement this internally, others miss it out. Push the behaviour into the core code as that way everyone will do it consistently. Update the dtr rts method to raise or lower depending upon flags. Having a single method in this style fits most of the implementations more cleanly than two functions. We need this in place before we tackle the USB side. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/char/epca.c | 4 ++-- drivers/char/isicom.c | 19 +++++++++++++------ drivers/char/istallion.c | 8 ++++---- drivers/char/mxser.c | 12 ++++++++---- drivers/char/pcmcia/synclink_cs.c | 11 +++++++---- drivers/char/rocket.c | 13 +++++++++---- drivers/char/stallion.c | 6 +++--- drivers/char/synclink.c | 9 ++++++--- drivers/char/synclink_gt.c | 9 ++++++--- drivers/char/synclinkmp.c | 9 ++++++--- drivers/char/tty_port.c | 27 +++++++++++++++++++++++---- include/linux/tty.h | 3 ++- 12 files changed, 89 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/epca.c b/drivers/char/epca.c index af7c13ca949..8797b774235 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -745,7 +745,7 @@ static int epca_carrier_raised(struct tty_port *port) return 0; } -static void epca_raise_dtr_rts(struct tty_port *port) +static void epca_dtr_rts(struct tty_port *port, int onoff) { } @@ -925,7 +925,7 @@ static const struct tty_operations pc_ops = { static const struct tty_port_operations epca_port_ops = { .carrier_raised = epca_carrier_raised, - .raise_dtr_rts = epca_raise_dtr_rts, + .dtr_rts = epca_dtr_rts, }; static int info_open(struct tty_struct *tty, struct file *filp) diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c index a59eac584d1..4d745a89504 100644 --- a/drivers/char/isicom.c +++ b/drivers/char/isicom.c @@ -329,7 +329,7 @@ static inline void drop_rts(struct isi_port *port) /* card->lock MUST NOT be held */ -static void isicom_raise_dtr_rts(struct tty_port *port) +static void isicom_dtr_rts(struct tty_port *port, int on) { struct isi_port *ip = container_of(port, struct isi_port, port); struct isi_board *card = ip->card; @@ -339,10 +339,17 @@ static void isicom_raise_dtr_rts(struct tty_port *port) if (!lock_card(card)) return; - outw(0x8000 | (channel << card->shift_count) | 0x02, base); - outw(0x0f04, base); - InterruptTheCard(base); - ip->status |= (ISI_DTR | ISI_RTS); + if (on) { + outw(0x8000 | (channel << card->shift_count) | 0x02, base); + outw(0x0f04, base); + InterruptTheCard(base); + ip->status |= (ISI_DTR | ISI_RTS); + } else { + outw(0x8000 | (channel << card->shift_count) | 0x02, base); + outw(0x0C04, base); + InterruptTheCard(base); + ip->status &= ~(ISI_DTR | ISI_RTS); + } unlock_card(card); } @@ -1339,7 +1346,7 @@ static const struct tty_operations isicom_ops = { static const struct tty_port_operations isicom_port_ops = { .carrier_raised = isicom_carrier_raised, - .raise_dtr_rts = isicom_raise_dtr_rts, + .dtr_rts = isicom_dtr_rts, }; static int __devinit reset_card(struct pci_dev *pdev, diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c index fff19f7e29d..e18800c400b 100644 --- a/drivers/char/istallion.c +++ b/drivers/char/istallion.c @@ -1140,14 +1140,14 @@ static int stli_carrier_raised(struct tty_port *port) 
return (portp->sigs & TIOCM_CD) ? 1 : 0; } -static void stli_raise_dtr_rts(struct tty_port *port) +static void stli_dtr_rts(struct tty_port *port, int on) { struct stliport *portp = container_of(port, struct stliport, port); struct stlibrd *brdp = stli_brds[portp->brdnr]; - stli_mkasysigs(&portp->asig, 1, 1); + stli_mkasysigs(&portp->asig, on, on); if (stli_cmdwait(brdp, portp, A_SETSIGNALS, &portp->asig, sizeof(asysigs_t), 0) < 0) - printk(KERN_WARNING "istallion: dtr raise failed.\n"); + printk(KERN_WARNING "istallion: dtr set failed.\n"); } @@ -4417,7 +4417,7 @@ static const struct tty_operations stli_ops = { static const struct tty_port_operations stli_port_ops = { .carrier_raised = stli_carrier_raised, - .raise_dtr_rts = stli_raise_dtr_rts, + .dtr_rts = stli_dtr_rts, }; /*****************************************************************************/ diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c index 13f8871e5b2..9533f43a30b 100644 --- a/drivers/char/mxser.c +++ b/drivers/char/mxser.c @@ -547,14 +547,18 @@ static int mxser_carrier_raised(struct tty_port *port) return (inb(mp->ioaddr + UART_MSR) & UART_MSR_DCD)?1:0; } -static void mxser_raise_dtr_rts(struct tty_port *port) +static void mxser_dtr_rts(struct tty_port *port, int on) { struct mxser_port *mp = container_of(port, struct mxser_port, port); unsigned long flags; spin_lock_irqsave(&mp->slock, flags); - outb(inb(mp->ioaddr + UART_MCR) | - UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR); + if (on) + outb(inb(mp->ioaddr + UART_MCR) | + UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR); + else + outb(inb(mp->ioaddr + UART_MCR)&~(UART_MCR_DTR | UART_MCR_RTS), + mp->ioaddr + UART_MCR); spin_unlock_irqrestore(&mp->slock, flags); } @@ -2356,7 +2360,7 @@ static const struct tty_operations mxser_ops = { struct tty_port_operations mxser_port_ops = { .carrier_raised = mxser_carrier_raised, - .raise_dtr_rts = mxser_raise_dtr_rts, + .dtr_rts = mxser_dtr_rts, }; /* diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index 19d79fc5446..77b36488922 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -383,7 +383,7 @@ static void async_mode(MGSLPC_INFO *info); static void tx_timeout(unsigned long context); static int carrier_raised(struct tty_port *port); -static void raise_dtr_rts(struct tty_port *port); +static void dtr_rts(struct tty_port *port, int onoff); #if SYNCLINK_GENERIC_HDLC #define dev_to_port(D) (dev_to_hdlc(D)->priv) @@ -513,7 +513,7 @@ static void ldisc_receive_buf(struct tty_struct *tty, static const struct tty_port_operations mgslpc_port_ops = { .carrier_raised = carrier_raised, - .raise_dtr_rts = raise_dtr_rts + .dtr_rts = dtr_rts }; static int mgslpc_probe(struct pcmcia_device *link) @@ -2528,13 +2528,16 @@ static int carrier_raised(struct tty_port *port) return 0; } -static void raise_dtr_rts(struct tty_port *port) +static void dtr_rts(struct tty_port *port, int onoff) { MGSLPC_INFO *info = container_of(port, MGSLPC_INFO, port); unsigned long flags; spin_lock_irqsave(&info->lock,flags); - info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR; + if (onoff) + info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR; + else + info->serial_signals &= ~SerialSignal_RTS + SerialSignal_DTR; set_signals(info); spin_unlock_irqrestore(&info->lock,flags); } diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c index f59fc5cea06..7399188049d 100644 --- a/drivers/char/rocket.c +++ b/drivers/char/rocket.c @@ -872,11 +872,16 @@ static int 
carrier_raised(struct tty_port *port) return (sGetChanStatusLo(&info->channel) & CD_ACT) ? 1 : 0; } -static void raise_dtr_rts(struct tty_port *port) +static void dtr_rts(struct tty_port *port, int on) { struct r_port *info = container_of(port, struct r_port, port); - sSetDTR(&info->channel); - sSetRTS(&info->channel); + if (on) { + sSetDTR(&info->channel); + sSetRTS(&info->channel); + } else { + sClrDTR(&info->channel); + sClrRTS(&info->channel); + } } /* @@ -2250,7 +2255,7 @@ static const struct tty_operations rocket_ops = { static const struct tty_port_operations rocket_port_ops = { .carrier_raised = carrier_raised, - .raise_dtr_rts = raise_dtr_rts, + .dtr_rts = dtr_rts, }; /* diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c index 2ad813a801d..53e504f41b2 100644 --- a/drivers/char/stallion.c +++ b/drivers/char/stallion.c @@ -772,11 +772,11 @@ static int stl_carrier_raised(struct tty_port *port) return (portp->sigs & TIOCM_CD) ? 1 : 0; } -static void stl_raise_dtr_rts(struct tty_port *port) +static void stl_dtr_rts(struct tty_port *port, int on) { struct stlport *portp = container_of(port, struct stlport, port); /* Takes brd_lock internally */ - stl_setsignals(portp, 1, 1); + stl_setsignals(portp, on, on); } /*****************************************************************************/ @@ -2547,7 +2547,7 @@ static const struct tty_operations stl_ops = { static const struct tty_port_operations stl_port_ops = { .carrier_raised = stl_carrier_raised, - .raise_dtr_rts = stl_raise_dtr_rts, + .dtr_rts = stl_dtr_rts, }; /*****************************************************************************/ diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c index afd0b26ca05..afded3a2379 100644 --- a/drivers/char/synclink.c +++ b/drivers/char/synclink.c @@ -3247,13 +3247,16 @@ static int carrier_raised(struct tty_port *port) return (info->serial_signals & SerialSignal_DCD) ? 1 : 0; } -static void raise_dtr_rts(struct tty_port *port) +static void dtr_rts(struct tty_port *port, int on) { struct mgsl_struct *info = container_of(port, struct mgsl_struct, port); unsigned long flags; spin_lock_irqsave(&info->irq_spinlock,flags); - info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR; + if (on) + info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR; + else + info->serial_signals &= ~(SerialSignal_RTS + SerialSignal_DTR); usc_set_serial_signals(info); spin_unlock_irqrestore(&info->irq_spinlock,flags); } @@ -4258,7 +4261,7 @@ static void mgsl_add_device( struct mgsl_struct *info ) static const struct tty_port_operations mgsl_port_ops = { .carrier_raised = carrier_raised, - .raise_dtr_rts = raise_dtr_rts, + .dtr_rts = dtr_rts, }; diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c index 5e256494686..67986ea0d47 100644 --- a/drivers/char/synclink_gt.c +++ b/drivers/char/synclink_gt.c @@ -3099,13 +3099,16 @@ static int carrier_raised(struct tty_port *port) return (info->signals & SerialSignal_DCD) ? 
1 : 0; } -static void raise_dtr_rts(struct tty_port *port) +static void dtr_rts(struct tty_port *port, int on) { unsigned long flags; struct slgt_info *info = container_of(port, struct slgt_info, port); spin_lock_irqsave(&info->lock,flags); - info->signals |= SerialSignal_RTS + SerialSignal_DTR; + if (on) + info->signals |= SerialSignal_RTS + SerialSignal_DTR; + else + info->signals &= ~(SerialSignal_RTS + SerialSignal_DTR); set_signals(info); spin_unlock_irqrestore(&info->lock,flags); } @@ -3419,7 +3422,7 @@ static void add_device(struct slgt_info *info) static const struct tty_port_operations slgt_port_ops = { .carrier_raised = carrier_raised, - .raise_dtr_rts = raise_dtr_rts, + .dtr_rts = dtr_rts, }; /* diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c index 26de60efe4b..6f727e3c53a 100644 --- a/drivers/char/synclinkmp.c +++ b/drivers/char/synclinkmp.c @@ -3277,13 +3277,16 @@ static int carrier_raised(struct tty_port *port) return (info->serial_signals & SerialSignal_DCD) ? 1 : 0; } -static void raise_dtr_rts(struct tty_port *port) +static void dtr_rts(struct tty_port *port, int on) { SLMP_INFO *info = container_of(port, SLMP_INFO, port); unsigned long flags; spin_lock_irqsave(&info->lock,flags); - info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR; + if (on) + info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR; + else + info->serial_signals &= ~(SerialSignal_RTS + SerialSignal_DTR); set_signals(info); spin_unlock_irqrestore(&info->lock,flags); } @@ -3746,7 +3749,7 @@ static void add_device(SLMP_INFO *info) static const struct tty_port_operations port_ops = { .carrier_raised = carrier_raised, - .raise_dtr_rts = raise_dtr_rts, + .dtr_rts = dtr_rts, }; /* Allocate and initialize a device instance structure diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c index 9b8004c7268..926d4a5593f 100644 --- a/drivers/char/tty_port.c +++ b/drivers/char/tty_port.c @@ -137,7 +137,7 @@ int tty_port_carrier_raised(struct tty_port *port) EXPORT_SYMBOL(tty_port_carrier_raised); /** - * tty_port_raise_dtr_rts - Riase DTR/RTS + * tty_port_raise_dtr_rts - Raise DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used @@ -147,11 +147,27 @@ EXPORT_SYMBOL(tty_port_carrier_raised); void tty_port_raise_dtr_rts(struct tty_port *port) { - if (port->ops->raise_dtr_rts) - port->ops->raise_dtr_rts(port); + if (port->ops->dtr_rts) + port->ops->dtr_rts(port, 1); } EXPORT_SYMBOL(tty_port_raise_dtr_rts); +/** + * tty_port_lower_dtr_rts - Lower DTR/RTS + * @port: tty port + * + * Wrapper for the DTR/RTS raise logic. For the moment this is used + * to hide some internal details. This will eventually become entirely + * internal to the tty port. + */ + +void tty_port_lower_dtr_rts(struct tty_port *port) +{ + if (port->ops->dtr_rts) + port->ops->dtr_rts(port, 0); +} +EXPORT_SYMBOL(tty_port_lower_dtr_rts); + /** * tty_port_block_til_ready - Waiting logic for tty open * @port: the tty port being opened @@ -167,7 +183,7 @@ EXPORT_SYMBOL(tty_port_raise_dtr_rts); * - port flags and counts * * The passed tty_port must implement the carrier_raised method if it can - * do carrier detect and the raise_dtr_rts method if it supports software + * do carrier detect and the dtr_rts method if it supports software * management of these lines. Note that the dtr/rts raise is done each * iteration as a hangup may have previously dropped them while we wait. 
*/ @@ -302,6 +318,9 @@ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty) tty_ldisc_flush(tty); + if (tty->termios->c_cflag & HUPCL) + tty_port_lower_dtr_rts(port); + spin_lock_irqsave(&port->lock, flags); tty->closing = 0; diff --git a/include/linux/tty.h b/include/linux/tty.h index fc39db95499..98694364c96 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -185,7 +185,7 @@ struct tty_port; struct tty_port_operations { /* Return 1 if the carrier is raised */ int (*carrier_raised)(struct tty_port *port); - void (*raise_dtr_rts)(struct tty_port *port); + void (*dtr_rts)(struct tty_port *port, int raise); }; struct tty_port { @@ -438,6 +438,7 @@ extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); extern int tty_port_carrier_raised(struct tty_port *port); extern void tty_port_raise_dtr_rts(struct tty_port *port); +extern void tty_port_lower_dtr_rts(struct tty_port *port); extern void tty_port_hangup(struct tty_port *port); extern int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp); -- cgit v1.2.3-70-g09d2 From 1ec739be75a6cb961a46ba0b1982d0edb7f27558 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Jun 2009 12:25:25 +0100 Subject: tty: Implement a drain delay in the tty port We need this for devices that cannot flush and wait, but which do not order data and modem events. Without it we will hang up before all the data clears the hardware. Needed for the USB changes. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/char/tty_port.c | 11 +++++++++++ include/linux/tty.h | 3 +++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c index 926d4a5593f..4d08b6d27c2 100644 --- a/drivers/char/tty_port.c +++ b/drivers/char/tty_port.c @@ -308,6 +308,17 @@ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct f if (port->flags & ASYNC_INITIALIZED && port->closing_wait != ASYNC_CLOSING_WAIT_NONE) tty_wait_until_sent(tty, port->closing_wait); + if (port->drain_delay) { + unsigned int bps = tty_get_baud_rate(tty); + long timeout; + + if (bps > 1200) + timeout = max_t(long, (HZ * 10 * port->drain_delay) / bps, + HZ / 10); + else + timeout = 2 * HZ; + schedule_timeout_interruptible(timeout); + } return 1; } EXPORT_SYMBOL(tty_port_close_start); diff --git a/include/linux/tty.h b/include/linux/tty.h index 98694364c96..bed5a3d4030 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -201,6 +201,9 @@ struct tty_port { unsigned char *xmit_buf; /* Optional buffer */ int close_delay; /* Close port delay */ int closing_wait; /* Delay for output */ + int drain_delay; /* Set to zero if no pure time + based drain is needed else + set to size of fifo */ }; /* -- cgit v1.2.3-70-g09d2 From 335f8514f200e63d689113d29cb7253a5c282967 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Jun 2009 12:26:29 +0100 Subject: tty: Bring the usb tty port structure into more use This allows us to clean stuff up, but is probably also going to cause some app breakage with buggy apps as we now implement proper POSIX behaviour for USB ports matching all the other ports. This also means that other apps which previously broke on USB will now work properly.
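To make the new contract concrete before the driver diffs, here is a minimal editor's sketch of the port operations a converted tty driver provides after the two tty_port patches above. The foo_* names, bit values, and register helpers are invented for illustration and come from none of these commits; only the tty_port_operations shape is taken from the series. The core calls dtr_rts(port, 1) through tty_port_raise_dtr_rts() while blocking for carrier in tty_port_block_til_ready(), and dtr_rts(port, 0) through tty_port_lower_dtr_rts() from tty_port_close_end() when HUPCL is set; carrier_raised() is polled to decide when a blocking open may complete.

/* Sketch only: the foo_* driver below is hypothetical, not from these
   patches. */
#include <linux/tty.h>

#define FOO_DTR		0x01	/* hypothetical modem control bits */
#define FOO_RTS		0x02
#define FOO_MSR_DCD	0x08

struct foo_port {
	struct tty_port port;		/* embeds the generic tty_port */
	unsigned long iobase;
};

/* hypothetical register helpers a real driver would implement */
static unsigned int foo_read_msr(struct foo_port *fp);
static void foo_modem_set(struct foo_port *fp, unsigned int bits);
static void foo_modem_clear(struct foo_port *fp, unsigned int bits);

static int foo_carrier_raised(struct tty_port *port)
{
	struct foo_port *fp = container_of(port, struct foo_port, port);

	/* Report DCD: returning 1 lets tty_port_block_til_ready()
	   complete a blocking open */
	return (foo_read_msr(fp) & FOO_MSR_DCD) ? 1 : 0;
}

static void foo_dtr_rts(struct tty_port *port, int on)
{
	struct foo_port *fp = container_of(port, struct foo_port, port);

	/* One method replaces raise_dtr_rts(): raise both lines when on
	   is non-zero (open/block_til_ready), drop them when it is zero
	   (close with HUPCL set) */
	if (on)
		foo_modem_set(fp, FOO_DTR | FOO_RTS);
	else
		foo_modem_clear(fp, FOO_DTR | FOO_RTS);
}

static const struct tty_port_operations foo_port_ops = {
	.carrier_raised	= foo_carrier_raised,
	.dtr_rts	= foo_dtr_rts,
};

A driver whose hardware cannot report transmitter-empty can also set port->drain_delay to its FIFO size in characters (the cypress_m8 and oti6858 conversions below use port->port.drain_delay = 256); tty_port_close_start() then sleeps roughly ten bit times per FIFO character at the current baud rate, with a floor of HZ/10 jiffies and a flat two seconds at 1200 baud or below, before the close proceeds. The USB patch below mirrors the same model one level up: usb_serial_driver grows dtr_rts() and carrier_raised() callbacks and the close() method loses its tty and file arguments, as each per-driver diff shows.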
Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/usb/serial/belkin_sa.c | 6 +- drivers/usb/serial/ch341.c | 46 ++++++----- drivers/usb/serial/console.c | 6 +- drivers/usb/serial/cp210x.c | 6 +- drivers/usb/serial/cyberjack.c | 6 +- drivers/usb/serial/cypress_m8.c | 81 ++++++------------- drivers/usb/serial/digi_acceleport.c | 75 +++++------------- drivers/usb/serial/empeg.c | 6 +- drivers/usb/serial/ftdi_sio.c | 50 ++++++------ drivers/usb/serial/garmin_gps.c | 3 +- drivers/usb/serial/generic.c | 3 +- drivers/usb/serial/io_edgeport.c | 10 +-- drivers/usb/serial/io_ti.c | 3 +- drivers/usb/serial/ipaq.c | 6 +- drivers/usb/serial/ipw.c | 18 ++--- drivers/usb/serial/ir-usb.c | 6 +- drivers/usb/serial/iuu_phoenix.c | 25 +----- drivers/usb/serial/keyspan.c | 13 ++- drivers/usb/serial/keyspan.h | 8 +- drivers/usb/serial/keyspan_pda.c | 48 ++++++++---- drivers/usb/serial/kl5kusb105.c | 6 +- drivers/usb/serial/kobil_sct.c | 9 +-- drivers/usb/serial/mct_u232.c | 37 ++++----- drivers/usb/serial/mos7720.c | 3 +- drivers/usb/serial/mos7840.c | 48 +----------- drivers/usb/serial/navman.c | 3 +- drivers/usb/serial/omninet.c | 6 +- drivers/usb/serial/opticon.c | 3 +- drivers/usb/serial/option.c | 68 ++++++++-------- drivers/usb/serial/oti6858.c | 57 ++------------ drivers/usb/serial/pl2303.c | 79 ++++++++----------- drivers/usb/serial/sierra.c | 97 ++++++++++------------- drivers/usb/serial/spcp8x5.c | 85 ++++++++------------ drivers/usb/serial/symbolserial.c | 3 +- drivers/usb/serial/ti_usb_3410_5052.c | 6 +- drivers/usb/serial/usb-serial.c | 144 +++++++++++++++++++++++----------- drivers/usb/serial/visor.c | 6 +- drivers/usb/serial/whiteheat.c | 33 +------- include/linux/usb/serial.h | 10 ++- 39 files changed, 467 insertions(+), 661 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/belkin_sa.c b/drivers/usb/serial/belkin_sa.c index b7eacad4d48..2bfd6dd85b5 100644 --- a/drivers/usb/serial/belkin_sa.c +++ b/drivers/usb/serial/belkin_sa.c @@ -93,8 +93,7 @@ static int belkin_sa_startup(struct usb_serial *serial); static void belkin_sa_shutdown(struct usb_serial *serial); static int belkin_sa_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void belkin_sa_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +static void belkin_sa_close(struct usb_serial_port *port); static void belkin_sa_read_int_callback(struct urb *urb); static void belkin_sa_set_termios(struct tty_struct *tty, struct usb_serial_port *port, struct ktermios * old); @@ -244,8 +243,7 @@ exit: } /* belkin_sa_open */ -static void belkin_sa_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void belkin_sa_close(struct usb_serial_port *port) { dbg("%s port %d", __func__, port->number); diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index ab4cc277aa6..2830766f5b3 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -262,13 +262,33 @@ error: kfree(priv); return r; } -static void ch341_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp) +static int ch341_carrier_raised(struct usb_serial_port *port) +{ + struct ch341_private *priv = usb_get_serial_port_data(port); + if (priv->line_status & CH341_BIT_DCD) + return 1; + return 0; +} + +static void ch341_dtr_rts(struct usb_serial_port *port, int on) { struct ch341_private *priv = usb_get_serial_port_data(port); unsigned long flags; - unsigned int c_cflag; + dbg("%s - port %d", __func__, 
port->number); + /* drop DTR and RTS */ + spin_lock_irqsave(&priv->lock, flags); + if (on) + priv->line_control |= CH341_BIT_RTS | CH341_BIT_DTR; + else + priv->line_control &= ~(CH341_BIT_RTS | CH341_BIT_DTR); + spin_unlock_irqrestore(&priv->lock, flags); + ch341_set_handshake(port->serial->dev, priv->line_control); + wake_up_interruptible(&priv->delta_msr_wait); +} + +static void ch341_close(struct usb_serial_port *port) +{ dbg("%s - port %d", __func__, port->number); /* shutdown our urbs */ @@ -276,18 +296,6 @@ static void ch341_close(struct tty_struct *tty, struct usb_serial_port *port, usb_kill_urb(port->write_urb); usb_kill_urb(port->read_urb); usb_kill_urb(port->interrupt_in_urb); - - if (tty) { - c_cflag = tty->termios->c_cflag; - if (c_cflag & HUPCL) { - /* drop DTR and RTS */ - spin_lock_irqsave(&priv->lock, flags); - priv->line_control = 0; - spin_unlock_irqrestore(&priv->lock, flags); - ch341_set_handshake(port->serial->dev, 0); - } - } - wake_up_interruptible(&priv->delta_msr_wait); } @@ -302,7 +310,6 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port, dbg("ch341_open()"); priv->baud_rate = DEFAULT_BAUD_RATE; - priv->line_control = CH341_BIT_RTS | CH341_BIT_DTR; r = ch341_configure(serial->dev, priv); if (r) @@ -322,7 +329,7 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port, if (r) { dev_err(&port->dev, "%s - failed submitting interrupt urb," " error %d\n", __func__, r); - ch341_close(tty, port, NULL); + ch341_close(port); return -EPROTO; } @@ -343,9 +350,6 @@ static void ch341_set_termios(struct tty_struct *tty, dbg("ch341_set_termios()"); - if (!tty || !tty->termios) - return; - baud_rate = tty_get_baud_rate(tty); priv->baud_rate = baud_rate; @@ -568,6 +572,8 @@ static struct usb_serial_driver ch341_device = { .usb_driver = &ch341_driver, .num_ports = 1, .open = ch341_open, + .dtr_rts = ch341_dtr_rts, + .carrier_raised = ch341_carrier_raised, .close = ch341_close, .ioctl = ch341_ioctl, .set_termios = ch341_set_termios, diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c index 19e24045b13..247b61bfb7f 100644 --- a/drivers/usb/serial/console.c +++ b/drivers/usb/serial/console.c @@ -169,7 +169,9 @@ static int usb_console_setup(struct console *co, char *options) kfree(tty); } } - + /* So we know not to kill the hardware on a hangup on this + port. 
We have also bumped the use count by one so it won't go + idle */ port->console = 1; retval = 0; @@ -182,7 +184,7 @@ free_tty: kfree(tty); reset_open_count: port->port.count = 0; -goto out; + goto out; } static void usb_console_write(struct console *co, diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index e8d5133ce9c..d9f586dc6ec 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -36,8 +36,7 @@ static int cp2101_open(struct tty_struct *, struct usb_serial_port *, struct file *); static void cp2101_cleanup(struct usb_serial_port *); -static void cp2101_close(struct tty_struct *, struct usb_serial_port *, - struct file*); +static void cp2101_close(struct usb_serial_port *); static void cp2101_get_termios(struct tty_struct *, struct usb_serial_port *port); static void cp2101_get_termios_port(struct usb_serial_port *port, @@ -398,8 +397,7 @@ static void cp2101_cleanup(struct usb_serial_port *port) } } -static void cp2101_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp) +static void cp2101_close(struct usb_serial_port *port) { dbg("%s - port %d", __func__, port->number); diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c index dd501bb63ed..933ba913e66 100644 --- a/drivers/usb/serial/cyberjack.c +++ b/drivers/usb/serial/cyberjack.c @@ -61,8 +61,7 @@ static int cyberjack_startup(struct usb_serial *serial); static void cyberjack_shutdown(struct usb_serial *serial); static int cyberjack_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void cyberjack_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +static void cyberjack_close(struct usb_serial_port *port); static int cyberjack_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static int cyberjack_write_room(struct tty_struct *tty); @@ -185,8 +184,7 @@ static int cyberjack_open(struct tty_struct *tty, return result; } -static void cyberjack_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void cyberjack_close(struct usb_serial_port *port) { dbg("%s - port %d", __func__, port->number); diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c index e568710b263..669f9384853 100644 --- a/drivers/usb/serial/cypress_m8.c +++ b/drivers/usb/serial/cypress_m8.c @@ -174,8 +174,8 @@ static int cypress_ca42v2_startup(struct usb_serial *serial); static void cypress_shutdown(struct usb_serial *serial); static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void cypress_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +static void cypress_close(struct usb_serial_port *port); +static void cypress_dtr_rts(struct usb_serial_port *port, int on); static int cypress_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static void cypress_send(struct usb_serial_port *port); @@ -218,6 +218,7 @@ static struct usb_serial_driver cypress_earthmate_device = { .shutdown = cypress_shutdown, .open = cypress_open, .close = cypress_close, + .dtr_rts = cypress_dtr_rts, .write = cypress_write, .write_room = cypress_write_room, .ioctl = cypress_ioctl, @@ -244,6 +245,7 @@ static struct usb_serial_driver cypress_hidcom_device = { .shutdown = cypress_shutdown, .open = cypress_open, .close = cypress_close, + .dtr_rts = cypress_dtr_rts, .write = cypress_write, .write_room = 
cypress_write_room, .ioctl = cypress_ioctl, @@ -270,6 +272,7 @@ static struct usb_serial_driver cypress_ca42v2_device = { .shutdown = cypress_shutdown, .open = cypress_open, .close = cypress_close, + .dtr_rts = cypress_dtr_rts, .write = cypress_write, .write_room = cypress_write_room, .ioctl = cypress_ioctl, @@ -656,11 +659,7 @@ static int cypress_open(struct tty_struct *tty, priv->rx_flags = 0; spin_unlock_irqrestore(&priv->lock, flags); - /* raise both lines and set termios */ - spin_lock_irqsave(&priv->lock, flags); - priv->line_control = CONTROL_DTR | CONTROL_RTS; - priv->cmd_ctrl = 1; - spin_unlock_irqrestore(&priv->lock, flags); + /* Set termios */ result = cypress_write(tty, port, NULL, 0); if (result) { @@ -694,76 +693,42 @@ static int cypress_open(struct tty_struct *tty, __func__, result); cypress_set_dead(port); } - + port->port.drain_delay = 256; return result; } /* cypress_open */ +static void cypress_dtr_rts(struct usb_serial_port *port, int on) +{ + struct cypress_private *priv = usb_get_serial_port_data(port); + /* drop dtr and rts */ + priv = usb_get_serial_port_data(port); + spin_lock_irq(&priv->lock); + if (on == 0) + priv->line_control = 0; + else + priv->line_control = CONTROL_DTR | CONTROL_RTS; + priv->cmd_ctrl = 1; + spin_unlock_irq(&priv->lock); + cypress_write(NULL, port, NULL, 0); +} -static void cypress_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void cypress_close(struct usb_serial_port *port) { struct cypress_private *priv = usb_get_serial_port_data(port); - unsigned int c_cflag; - int bps; - long timeout; - wait_queue_t wait; dbg("%s - port %d", __func__, port->number); - /* wait for data to drain from buffer */ - spin_lock_irq(&priv->lock); - timeout = CYPRESS_CLOSING_WAIT; - init_waitqueue_entry(&wait, current); - add_wait_queue(&tty->write_wait, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (cypress_buf_data_avail(priv->buf) == 0 - || timeout == 0 || signal_pending(current) - /* without mutex, allowed due to harmless failure mode */ - || port->serial->disconnected) - break; - spin_unlock_irq(&priv->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&priv->lock); - } - set_current_state(TASK_RUNNING); - remove_wait_queue(&tty->write_wait, &wait); - /* clear out any remaining data in the buffer */ - cypress_buf_clear(priv->buf); - spin_unlock_irq(&priv->lock); - /* writing is potentially harmful, lock must be taken */ mutex_lock(&port->serial->disc_mutex); if (port->serial->disconnected) { mutex_unlock(&port->serial->disc_mutex); return; } - /* wait for characters to drain from device */ - if (tty) { - bps = tty_get_baud_rate(tty); - if (bps > 1200) - timeout = max((HZ * 2560) / bps, HZ / 10); - else - timeout = 2 * HZ; - schedule_timeout_interruptible(timeout); - } - + cypress_buf_clear(priv->buf); dbg("%s - stopping urbs", __func__); usb_kill_urb(port->interrupt_in_urb); usb_kill_urb(port->interrupt_out_urb); - if (tty) { - c_cflag = tty->termios->c_cflag; - if (c_cflag & HUPCL) { - /* drop dtr and rts */ - priv = usb_get_serial_port_data(port); - spin_lock_irq(&priv->lock); - priv->line_control = 0; - priv->cmd_ctrl = 1; - spin_unlock_irq(&priv->lock); - cypress_write(tty, port, NULL, 0); - } - } if (stats) dev_info(&port->dev, "Statistics: %d Bytes In | %d Bytes Out | %d Commands Issued\n", diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c index 38ba4ea8b6b..30f5140eff0 100644 --- a/drivers/usb/serial/digi_acceleport.c +++ 
b/drivers/usb/serial/digi_acceleport.c @@ -422,7 +422,6 @@ struct digi_port { int dp_throttled; int dp_throttle_restart; wait_queue_head_t dp_flush_wait; - int dp_in_close; /* close in progress */ wait_queue_head_t dp_close_wait; /* wait queue for close */ struct work_struct dp_wakeup_work; struct usb_serial_port *dp_port; @@ -456,8 +455,9 @@ static int digi_write_room(struct tty_struct *tty); static int digi_chars_in_buffer(struct tty_struct *tty); static int digi_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void digi_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp); +static void digi_close(struct usb_serial_port *port); +static int digi_carrier_raised(struct usb_serial_port *port); +static void digi_dtr_rts(struct usb_serial_port *port, int on); static int digi_startup_device(struct usb_serial *serial); static int digi_startup(struct usb_serial *serial); static void digi_shutdown(struct usb_serial *serial); @@ -510,6 +510,8 @@ static struct usb_serial_driver digi_acceleport_2_device = { .num_ports = 3, .open = digi_open, .close = digi_close, + .dtr_rts = digi_dtr_rts, + .carrier_raised = digi_carrier_raised, .write = digi_write, .write_room = digi_write_room, .write_bulk_callback = digi_write_bulk_callback, @@ -1328,6 +1330,19 @@ static int digi_chars_in_buffer(struct tty_struct *tty) } +static void digi_dtr_rts(struct usb_serial_port *port, int on) +{ + /* Adjust DTR and RTS */ + digi_set_modem_signals(port, on * (TIOCM_DTR|TIOCM_RTS), 1); +} + +static int digi_carrier_raised(struct usb_serial_port *port) +{ + struct digi_port *priv = usb_get_serial_port_data(port); + if (priv->dp_modem_signals & TIOCM_CD) + return 1; + return 0; +} static int digi_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp) @@ -1336,7 +1351,6 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port, unsigned char buf[32]; struct digi_port *priv = usb_get_serial_port_data(port); struct ktermios not_termios; - unsigned long flags = 0; dbg("digi_open: TOP: port=%d, open_count=%d", priv->dp_port_num, port->port.count); @@ -1345,26 +1359,6 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port, if (digi_startup_device(port->serial) != 0) return -ENXIO; - spin_lock_irqsave(&priv->dp_port_lock, flags); - - /* don't wait on a close in progress for non-blocking opens */ - if (priv->dp_in_close && (filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0) { - spin_unlock_irqrestore(&priv->dp_port_lock, flags); - return -EAGAIN; - } - - /* wait for a close in progress to finish */ - while (priv->dp_in_close) { - cond_wait_interruptible_timeout_irqrestore( - &priv->dp_close_wait, DIGI_RETRY_TIMEOUT, - &priv->dp_port_lock, flags); - if (signal_pending(current)) - return -EINTR; - spin_lock_irqsave(&priv->dp_port_lock, flags); - } - - spin_unlock_irqrestore(&priv->dp_port_lock, flags); - /* read modem signals automatically whenever they change */ buf[0] = DIGI_CMD_READ_INPUT_SIGNALS; buf[1] = priv->dp_port_num; @@ -1387,16 +1381,11 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port, not_termios.c_iflag = ~tty->termios->c_iflag; digi_set_termios(tty, port, ¬_termios); } - - /* set DTR and RTS */ - digi_set_modem_signals(port, TIOCM_DTR|TIOCM_RTS, 1); - return 0; } -static void digi_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp) +static void digi_close(struct usb_serial_port *port) { DEFINE_WAIT(wait); int ret; @@ -1411,28 +1400,9 @@ static 
void digi_close(struct tty_struct *tty, struct usb_serial_port *port, if (port->serial->disconnected) goto exit; - /* do cleanup only after final close on this port */ - spin_lock_irq(&priv->dp_port_lock); - priv->dp_in_close = 1; - spin_unlock_irq(&priv->dp_port_lock); - - /* tell line discipline to process only XON/XOFF */ - tty->closing = 1; - - /* wait for output to drain */ - if ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0) - tty_wait_until_sent(tty, DIGI_CLOSE_TIMEOUT); - - /* flush driver and line discipline buffers */ - tty_driver_flush_buffer(tty); - tty_ldisc_flush(tty); - if (port->serial->dev) { - /* wait for transmit idle */ - if ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0) - digi_transmit_idle(port, DIGI_CLOSE_TIMEOUT); - /* drop DTR and RTS */ - digi_set_modem_signals(port, 0, 0); + /* FIXME: Transmit idle belongs in the wait_until_sent path */ + digi_transmit_idle(port, DIGI_CLOSE_TIMEOUT); /* disable input flow control */ buf[0] = DIGI_CMD_SET_INPUT_FLOW_CONTROL; @@ -1477,11 +1447,9 @@ static void digi_close(struct tty_struct *tty, struct usb_serial_port *port, /* shutdown any outstanding bulk writes */ usb_kill_urb(port->write_urb); } - tty->closing = 0; exit: spin_lock_irq(&priv->dp_port_lock); priv->dp_write_urb_in_use = 0; - priv->dp_in_close = 0; wake_up_interruptible(&priv->dp_close_wait); spin_unlock_irq(&priv->dp_port_lock); mutex_unlock(&port->serial->disc_mutex); @@ -1560,7 +1528,6 @@ static int digi_startup(struct usb_serial *serial) priv->dp_throttled = 0; priv->dp_throttle_restart = 0; init_waitqueue_head(&priv->dp_flush_wait); - priv->dp_in_close = 0; init_waitqueue_head(&priv->dp_close_wait); INIT_WORK(&priv->dp_wakeup_work, digi_wakeup_write_lock); priv->dp_port = serial->port[i]; diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c index c709ec474a8..2b141ccb0cd 100644 --- a/drivers/usb/serial/empeg.c +++ b/drivers/usb/serial/empeg.c @@ -81,8 +81,7 @@ static int debug; /* function prototypes for an empeg-car player */ static int empeg_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void empeg_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp); +static void empeg_close(struct usb_serial_port *port); static int empeg_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); @@ -181,8 +180,7 @@ static int empeg_open(struct tty_struct *tty, struct usb_serial_port *port, } -static void empeg_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp) +static void empeg_close(struct usb_serial_port *port) { dbg("%s - port %d", __func__, port->number); diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index d9fcdaedf38..d9d87111f9a 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -719,8 +719,8 @@ static int ftdi_sio_port_probe(struct usb_serial_port *port); static int ftdi_sio_port_remove(struct usb_serial_port *port); static int ftdi_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void ftdi_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +static void ftdi_close(struct usb_serial_port *port); +static void ftdi_dtr_rts(struct usb_serial_port *port, int on); static int ftdi_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static int ftdi_write_room(struct tty_struct *tty); @@ -758,6 +758,7 @@ static struct usb_serial_driver ftdi_sio_device = {
.port_remove = ftdi_sio_port_remove, .open = ftdi_open, .close = ftdi_close, + .dtr_rts = ftdi_dtr_rts, .throttle = ftdi_throttle, .unthrottle = ftdi_unthrottle, .write = ftdi_write, @@ -1558,6 +1559,30 @@ static int ftdi_open(struct tty_struct *tty, } /* ftdi_open */ +static void ftdi_dtr_rts(struct usb_serial_port *port, int on) +{ + struct ftdi_private *priv = usb_get_serial_port_data(port); + char buf[1]; + + mutex_lock(&port->serial->disc_mutex); + if (!port->serial->disconnected) { + /* Disable flow control */ + if (!on && usb_control_msg(port->serial->dev, + usb_sndctrlpipe(port->serial->dev, 0), + FTDI_SIO_SET_FLOW_CTRL_REQUEST, + FTDI_SIO_SET_FLOW_CTRL_REQUEST_TYPE, + 0, priv->interface, buf, 0, + WDR_TIMEOUT) < 0) { + dev_err(&port->dev, "error from flowcontrol urb\n"); + } + /* drop RTS and DTR */ + if (on) + set_mctrl(port, TIOCM_DTR | TIOCM_RTS); + else + clear_mctrl(port, TIOCM_DTR | TIOCM_RTS); + } + mutex_unlock(&port->serial->disc_mutex); +} /* * usbserial:__serial_close only calls ftdi_close if the point is open @@ -1567,31 +1592,12 @@ static int ftdi_open(struct tty_struct *tty, * */ -static void ftdi_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void ftdi_close(struct usb_serial_port *port) { /* ftdi_close */ - unsigned int c_cflag = tty->termios->c_cflag; struct ftdi_private *priv = usb_get_serial_port_data(port); - char buf[1]; dbg("%s", __func__); - mutex_lock(&port->serial->disc_mutex); - if (c_cflag & HUPCL && !port->serial->disconnected) { - /* Disable flow control */ - if (usb_control_msg(port->serial->dev, - usb_sndctrlpipe(port->serial->dev, 0), - FTDI_SIO_SET_FLOW_CTRL_REQUEST, - FTDI_SIO_SET_FLOW_CTRL_REQUEST_TYPE, - 0, priv->interface, buf, 0, - WDR_TIMEOUT) < 0) { - dev_err(&port->dev, "error from flowcontrol urb\n"); - } - - /* drop RTS and DTR */ - clear_mctrl(port, TIOCM_DTR | TIOCM_RTS); - } /* Note change no line if hupcl is off */ - mutex_unlock(&port->serial->disc_mutex); /* cancel any scheduled reading */ cancel_delayed_work_sync(&priv->rx_work); diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index 586d30ff450..ee25a3fe3b0 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -993,8 +993,7 @@ static int garmin_open(struct tty_struct *tty, } -static void garmin_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void garmin_close(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct garmin_data *garmin_data_p = usb_get_serial_port_data(port); diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 4cec9906ccf..be82ea95672 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -184,8 +184,7 @@ int usb_serial_generic_resume(struct usb_serial *serial) } EXPORT_SYMBOL_GPL(usb_serial_generic_resume); -void usb_serial_generic_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +void usb_serial_generic_close(struct usb_serial_port *port) { dbg("%s - port %d", __func__, port->number); generic_cleanup(port); diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c index fb4a73d090f..53ef5996e33 100644 --- a/drivers/usb/serial/io_edgeport.c +++ b/drivers/usb/serial/io_edgeport.c @@ -207,8 +207,7 @@ static void edge_bulk_out_cmd_callback(struct urb *urb); /* function prototypes for the usbserial callbacks */ static int edge_open(struct tty_struct *tty, struct usb_serial_port *port, struct file 
*filp); -static void edge_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp); +static void edge_close(struct usb_serial_port *port); static int edge_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static int edge_write_room(struct tty_struct *tty); @@ -965,7 +964,7 @@ static int edge_open(struct tty_struct *tty, if (!edge_port->txfifo.fifo) { dbg("%s - no memory", __func__); - edge_close(tty, port, filp); + edge_close(port); return -ENOMEM; } @@ -975,7 +974,7 @@ static int edge_open(struct tty_struct *tty, if (!edge_port->write_urb) { dbg("%s - no memory", __func__); - edge_close(tty, port, filp); + edge_close(port); return -ENOMEM; } @@ -1099,8 +1098,7 @@ static void block_until_tx_empty(struct edgeport_port *edge_port) * edge_close * this function is called by the tty driver when a port is closed *****************************************************************************/ -static void edge_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void edge_close(struct usb_serial_port *port) { struct edgeport_serial *edge_serial; struct edgeport_port *edge_port; diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c index 513b25e044c..eabf20eeb37 100644 --- a/drivers/usb/serial/io_ti.c +++ b/drivers/usb/serial/io_ti.c @@ -2009,8 +2009,7 @@ release_es_lock: return status; } -static void edge_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void edge_close(struct usb_serial_port *port) { struct edgeport_serial *edge_serial; struct edgeport_port *edge_port; diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c index cd62825a9ac..c610a99fa47 100644 --- a/drivers/usb/serial/ipaq.c +++ b/drivers/usb/serial/ipaq.c @@ -76,8 +76,7 @@ static int initial_wait; /* Function prototypes for an ipaq */ static int ipaq_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void ipaq_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +static void ipaq_close(struct usb_serial_port *port); static int ipaq_calc_num_ports(struct usb_serial *serial); static int ipaq_startup(struct usb_serial *serial); static void ipaq_shutdown(struct usb_serial *serial); @@ -714,8 +713,7 @@ error: } -static void ipaq_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void ipaq_close(struct usb_serial_port *port) { struct ipaq_private *priv = usb_get_serial_port_data(port); diff --git a/drivers/usb/serial/ipw.c b/drivers/usb/serial/ipw.c index da2a2b46644..29ad038b9c8 100644 --- a/drivers/usb/serial/ipw.c +++ b/drivers/usb/serial/ipw.c @@ -302,23 +302,17 @@ static int ipw_open(struct tty_struct *tty, return 0; } -static void ipw_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void ipw_dtr_rts(struct usb_serial_port *port, int on) { struct usb_device *dev = port->serial->dev; int result; - if (tty_hung_up_p(filp)) { - dbg("%s: tty_hung_up_p ...", __func__); - return; - } - /*--1: drop the dtr */ dbg("%s:dropping dtr", __func__); result = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), IPW_SIO_SET_PIN, USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_OUT, - IPW_PIN_CLRDTR, + on ? 
IPW_PIN_SETDTR : IPW_PIN_CLRDTR, 0, NULL, 0, @@ -332,7 +326,7 @@ static void ipw_close(struct tty_struct *tty, result = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), IPW_SIO_SET_PIN, USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_OUT, - IPW_PIN_CLRRTS, + on ? IPW_PIN_SETRTS : IPW_PIN_CLRRTS, 0, NULL, 0, @@ -340,7 +334,12 @@ static void ipw_close(struct tty_struct *tty, if (result < 0) dev_err(&port->dev, "dropping rts failed (error = %d)\n", result); +} +static void ipw_close(struct usb_serial_port *port) +{ + struct usb_device *dev = port->serial->dev; + int result; /*--3: purge */ dbg("%s:sending purge", __func__); @@ -461,6 +460,7 @@ static struct usb_serial_driver ipw_device = { .num_ports = 1, .open = ipw_open, .close = ipw_close, + .dtr_rts = ipw_dtr_rts, .port_probe = ipw_probe, .port_remove = ipw_disconnect, .write = ipw_write, diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c index 4e2cda93da5..66009b6b763 100644 --- a/drivers/usb/serial/ir-usb.c +++ b/drivers/usb/serial/ir-usb.c @@ -88,8 +88,7 @@ static int xbof = -1; static int ir_startup (struct usb_serial *serial); static int ir_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filep); -static void ir_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filep); +static void ir_close(struct usb_serial_port *port); static int ir_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static void ir_write_bulk_callback (struct urb *urb); @@ -346,8 +345,7 @@ static int ir_open(struct tty_struct *tty, return result; } -static void ir_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file * filp) +static void ir_close(struct usb_serial_port *port) { dbg("%s - port %d", __func__, port->number); diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c index 4473d442b2a..bb572cee6ec 100644 --- a/drivers/usb/serial/iuu_phoenix.c +++ b/drivers/usb/serial/iuu_phoenix.c @@ -70,7 +70,6 @@ static void read_rxcmd_callback(struct urb *urb); struct iuu_private { spinlock_t lock; /* store irq state */ wait_queue_head_t delta_msr_wait; - u8 line_control; u8 line_status; u8 termios_initialized; int tiostatus; /* store IUART SIGNAL for tiocmget call */ @@ -946,19 +945,10 @@ static int iuu_uart_baud(struct usb_serial_port *port, u32 baud, return status; } -static int set_control_lines(struct usb_device *dev, u8 value) -{ - return 0; -} - -static void iuu_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void iuu_close(struct usb_serial_port *port) { /* iuu_led (port,255,0,0,0); */ struct usb_serial *serial; - struct iuu_private *priv = usb_get_serial_port_data(port); - unsigned long flags; - unsigned int c_cflag; serial = port->serial; if (!serial) @@ -968,17 +958,6 @@ static void iuu_close(struct tty_struct *tty, iuu_uart_off(port); if (serial->dev) { - if (tty) { - c_cflag = tty->termios->c_cflag; - if (c_cflag & HUPCL) { - /* drop DTR and RTS */ - priv = usb_get_serial_port_data(port); - spin_lock_irqsave(&priv->lock, flags); - priv->line_control = 0; - spin_unlock_irqrestore(&priv->lock, flags); - set_control_lines(port->serial->dev, 0); - } - } /* free writebuf */ /* shutdown our urbs */ dbg("%s - shutting down urbs", __func__); @@ -1154,7 +1133,7 @@ static int iuu_open(struct tty_struct *tty, if (result) { dev_err(&port->dev, "%s - failed submitting read urb," " error %d\n", __func__, result); - iuu_close(tty, port, NULL); + iuu_close(port); return -EPROTO; 
} else { dbg("%s - rxcmd OK", __func__); diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c index 00daa8f7759..f1195a98f31 100644 --- a/drivers/usb/serial/keyspan.c +++ b/drivers/usb/serial/keyspan.c @@ -1298,8 +1298,16 @@ static inline void stop_urb(struct urb *urb) usb_kill_urb(urb); } -static void keyspan_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void keyspan_dtr_rts(struct usb_serial_port *port, int on) +{ + struct keyspan_port_private *p_priv = usb_get_serial_port_data(port); + + p_priv->rts_state = on; + p_priv->dtr_state = on; + keyspan_send_setup(port, 0); +} + +static void keyspan_close(struct usb_serial_port *port) { int i; struct usb_serial *serial = port->serial; @@ -1336,7 +1344,6 @@ static void keyspan_close(struct tty_struct *tty, stop_urb(p_priv->out_urbs[i]); } } - tty_port_tty_set(&port->port, NULL); } /* download the firmware to a pre-renumeration device */ diff --git a/drivers/usb/serial/keyspan.h b/drivers/usb/serial/keyspan.h index 38b4582e073..0d4569b6076 100644 --- a/drivers/usb/serial/keyspan.h +++ b/drivers/usb/serial/keyspan.h @@ -38,9 +38,8 @@ static int keyspan_open (struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void keyspan_close (struct tty_struct *tty, - struct usb_serial_port *port, - struct file *filp); +static void keyspan_close (struct usb_serial_port *port); +static void keyspan_dtr_rts (struct usb_serial_port *port, int on); static int keyspan_startup (struct usb_serial *serial); static void keyspan_shutdown (struct usb_serial *serial); static int keyspan_write_room (struct tty_struct *tty); @@ -562,6 +561,7 @@ static struct usb_serial_driver keyspan_1port_device = { .num_ports = 1, .open = keyspan_open, .close = keyspan_close, + .dtr_rts = keyspan_dtr_rts, .write = keyspan_write, .write_room = keyspan_write_room, .set_termios = keyspan_set_termios, @@ -582,6 +582,7 @@ static struct usb_serial_driver keyspan_2port_device = { .num_ports = 2, .open = keyspan_open, .close = keyspan_close, + .dtr_rts = keyspan_dtr_rts, .write = keyspan_write, .write_room = keyspan_write_room, .set_termios = keyspan_set_termios, @@ -602,6 +603,7 @@ static struct usb_serial_driver keyspan_4port_device = { .num_ports = 4, .open = keyspan_open, .close = keyspan_close, + .dtr_rts = keyspan_dtr_rts, .write = keyspan_write, .write_room = keyspan_write_room, .set_termios = keyspan_set_termios, diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c index bf1ae247da6..ab769dbea1b 100644 --- a/drivers/usb/serial/keyspan_pda.c +++ b/drivers/usb/serial/keyspan_pda.c @@ -651,6 +651,35 @@ static int keyspan_pda_chars_in_buffer(struct tty_struct *tty) } +static void keyspan_pda_dtr_rts(struct usb_serial_port *port, int on) +{ + struct usb_serial *serial = port->serial; + + if (serial->dev) { + if (on) + keyspan_pda_set_modem_info(serial, (1<<7) | (1<< 2)); + else + keyspan_pda_set_modem_info(serial, 0); + } +} + +static int keyspan_pda_carrier_raised(struct usb_serial_port *port) +{ + struct usb_serial *serial = port->serial; + unsigned char modembits; + + /* If we can read the modem status and the DCD is low then + carrier is not raised yet */ + if (keyspan_pda_get_modem_info(serial, &modembits) >= 0) { + if (!(modembits & (1>>6))) + return 0; + } + /* Carrier raised, or we failed (eg disconnected) so + progress accordingly */ + return 1; +} + + static int keyspan_pda_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp) { @@ -682,13 
+711,6 @@ static int keyspan_pda_open(struct tty_struct *tty, priv->tx_room = room; priv->tx_throttled = room ? 0 : 1; - /* the normal serial device seems to always turn on DTR and RTS here, - so do the same */ - if (tty && (tty->termios->c_cflag & CBAUD)) - keyspan_pda_set_modem_info(serial, (1<<7) | (1<<2)); - else - keyspan_pda_set_modem_info(serial, 0); - /*Start reading from the device*/ port->interrupt_in_urb->dev = serial->dev; rc = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); @@ -700,19 +722,11 @@ static int keyspan_pda_open(struct tty_struct *tty, error: return rc; } - - -static void keyspan_pda_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void keyspan_pda_close(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; if (serial->dev) { - /* the normal serial device seems to always shut - off DTR and RTS now */ - if (tty->termios->c_cflag & HUPCL) - keyspan_pda_set_modem_info(serial, 0); - /* shutdown our bulk reads and writes */ usb_kill_urb(port->write_urb); usb_kill_urb(port->interrupt_in_urb); @@ -839,6 +853,8 @@ static struct usb_serial_driver keyspan_pda_device = { .usb_driver = &keyspan_pda_driver, .id_table = id_table_std, .num_ports = 1, + .dtr_rts = keyspan_pda_dtr_rts, + .carrier_raised = keyspan_pda_carrier_raised, .open = keyspan_pda_open, .close = keyspan_pda_close, .write = keyspan_pda_write, diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c index fcd9082f3e7..fa817c66b3e 100644 --- a/drivers/usb/serial/kl5kusb105.c +++ b/drivers/usb/serial/kl5kusb105.c @@ -76,8 +76,7 @@ static int klsi_105_startup(struct usb_serial *serial); static void klsi_105_shutdown(struct usb_serial *serial); static int klsi_105_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void klsi_105_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +static void klsi_105_close(struct usb_serial_port *port); static int klsi_105_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static void klsi_105_write_bulk_callback(struct urb *urb); @@ -447,8 +446,7 @@ exit: } /* klsi_105_open */ -static void klsi_105_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void klsi_105_close(struct usb_serial_port *port) { struct klsi_105_private *priv = usb_get_serial_port_data(port); int rc; diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c index c148544953b..6b570498287 100644 --- a/drivers/usb/serial/kobil_sct.c +++ b/drivers/usb/serial/kobil_sct.c @@ -72,8 +72,7 @@ static int kobil_startup(struct usb_serial *serial); static void kobil_shutdown(struct usb_serial *serial); static int kobil_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void kobil_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp); +static void kobil_close(struct usb_serial_port *port); static int kobil_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static int kobil_write_room(struct tty_struct *tty); @@ -209,7 +208,7 @@ static void kobil_shutdown(struct usb_serial *serial) for (i = 0; i < serial->num_ports; ++i) { while (serial->port[i]->port.count > 0) - kobil_close(NULL, serial->port[i], NULL); + kobil_close(serial->port[i]); kfree(usb_get_serial_port_data(serial->port[i])); usb_set_serial_port_data(serial->port[i], NULL); } @@ -346,11 +345,11 @@ static int 
kobil_open(struct tty_struct *tty, } -static void kobil_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void kobil_close(struct usb_serial_port *port) { dbg("%s - port %d", __func__, port->number); + /* FIXME: Add rts/dtr methods */ if (port->write_urb) { usb_kill_urb(port->write_urb); usb_free_urb(port->write_urb); diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c index 82930a7d509..873795548fc 100644 --- a/drivers/usb/serial/mct_u232.c +++ b/drivers/usb/serial/mct_u232.c @@ -95,8 +95,8 @@ static int mct_u232_startup(struct usb_serial *serial); static void mct_u232_shutdown(struct usb_serial *serial); static int mct_u232_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void mct_u232_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +static void mct_u232_close(struct usb_serial_port *port); +static void mct_u232_dtr_rts(struct usb_serial_port *port, int on); static void mct_u232_read_int_callback(struct urb *urb); static void mct_u232_set_termios(struct tty_struct *tty, struct usb_serial_port *port, struct ktermios *old); @@ -140,6 +140,7 @@ static struct usb_serial_driver mct_u232_device = { .num_ports = 1, .open = mct_u232_open, .close = mct_u232_close, + .dtr_rts = mct_u232_dtr_rts, .throttle = mct_u232_throttle, .unthrottle = mct_u232_unthrottle, .read_int_callback = mct_u232_read_int_callback, @@ -496,29 +497,29 @@ error: return retval; } /* mct_u232_open */ - -static void mct_u232_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void mct_u232_dtr_rts(struct usb_serial_port *port, int on) { - unsigned int c_cflag; unsigned int control_state; struct mct_u232_private *priv = usb_get_serial_port_data(port); - dbg("%s port %d", __func__, port->number); - if (tty) { - c_cflag = tty->termios->c_cflag; - mutex_lock(&port->serial->disc_mutex); - if (c_cflag & HUPCL && !port->serial->disconnected) { - /* drop DTR and RTS */ - spin_lock_irq(&priv->lock); + mutex_lock(&port->serial->disc_mutex); + if (!port->serial->disconnected) { + /* drop DTR and RTS */ + spin_lock_irq(&priv->lock); + if (on) + priv->control_state |= TIOCM_DTR | TIOCM_RTS; + else priv->control_state &= ~(TIOCM_DTR | TIOCM_RTS); - control_state = priv->control_state; - spin_unlock_irq(&priv->lock); - mct_u232_set_modem_ctrl(port->serial, control_state); - } - mutex_unlock(&port->serial->disc_mutex); + control_state = priv->control_state; + spin_unlock_irq(&priv->lock); + mct_u232_set_modem_ctrl(port->serial, control_state); } + mutex_unlock(&port->serial->disc_mutex); +} +static void mct_u232_close(struct usb_serial_port *port) +{ + dbg("%s port %d", __func__, port->number); if (port->serial->dev) { /* shutdown our urbs */ diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index 24e3b5d4b4d..9e1a013ee7f 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -533,8 +533,7 @@ static int mos7720_chars_in_buffer(struct tty_struct *tty) return chars; } -static void mos7720_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void mos7720_close(struct usb_serial_port *port) { struct usb_serial *serial; struct moschip_port *mos7720_port; diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 84fb1dcd30d..10b78a37214 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -1135,54 +1135,12 @@ static int mos7840_chars_in_buffer(struct 
tty_struct *tty) } -/************************************************************************ - * - * mos7840_block_until_tx_empty - * - * This function will block the close until one of the following: - * 1. TX count are 0 - * 2. The mos7840 has stopped - * 3. A timeout of 3 seconds without activity has expired - * - ************************************************************************/ -static void mos7840_block_until_tx_empty(struct tty_struct *tty, - struct moschip_port *mos7840_port) -{ - int timeout = HZ / 10; - int wait = 30; - int count; - - while (1) { - - count = mos7840_chars_in_buffer(tty); - - /* Check for Buffer status */ - if (count <= 0) - return; - - /* Block the thread for a while */ - interruptible_sleep_on_timeout(&mos7840_port->wait_chase, - timeout); - - /* No activity.. count down section */ - wait--; - if (wait == 0) { - dbg("%s - TIMEOUT", __func__); - return; - } else { - /* Reset timeout value back to seconds */ - wait = 30; - } - } -} - /***************************************************************************** * mos7840_close * this function is called by the tty driver when a port is closed *****************************************************************************/ -static void mos7840_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void mos7840_close(struct usb_serial_port *port) { struct usb_serial *serial; struct moschip_port *mos7840_port; @@ -1223,10 +1181,6 @@ static void mos7840_close(struct tty_struct *tty, } } - if (serial->dev) - /* flush and block until tx is empty */ - mos7840_block_until_tx_empty(tty, mos7840_port); - /* While closing port, shutdown all bulk read, write * * and interrupt read if they exists */ if (serial->dev) { diff --git a/drivers/usb/serial/navman.c b/drivers/usb/serial/navman.c index bcdcbb82270..f5f3751a888 100644 --- a/drivers/usb/serial/navman.c +++ b/drivers/usb/serial/navman.c @@ -98,8 +98,7 @@ static int navman_open(struct tty_struct *tty, return result; } -static void navman_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void navman_close(struct usb_serial_port *port) { dbg("%s - port %d", __func__, port->number); diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c index df653971272..1104617334f 100644 --- a/drivers/usb/serial/omninet.c +++ b/drivers/usb/serial/omninet.c @@ -66,8 +66,7 @@ static int debug; /* function prototypes */ static int omninet_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void omninet_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp); +static void omninet_close(struct usb_serial_port *port); static void omninet_read_bulk_callback(struct urb *urb); static void omninet_write_bulk_callback(struct urb *urb); static int omninet_write(struct tty_struct *tty, struct usb_serial_port *port, @@ -189,8 +188,7 @@ static int omninet_open(struct tty_struct *tty, return result; } -static void omninet_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void omninet_close(struct usb_serial_port *port) { dbg("%s - port %d", __func__, port->number); usb_kill_urb(port->read_urb); diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c index b500ad10b75..c20480aa975 100644 --- a/drivers/usb/serial/opticon.c +++ b/drivers/usb/serial/opticon.c @@ -173,8 +173,7 @@ static int opticon_open(struct tty_struct *tty, struct usb_serial_port *port, return result; } -static void 
opticon_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp) +static void opticon_close(struct usb_serial_port *port) { struct opticon_private *priv = usb_get_serial_data(port->serial); diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 7817b82889c..a16d69fadba 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -45,8 +45,9 @@ /* Function prototypes */ static int option_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void option_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp); +static void option_close(struct usb_serial_port *port); +static void option_dtr_rts(struct usb_serial_port *port, int on); + static int option_startup(struct usb_serial *serial); static void option_shutdown(struct usb_serial *serial); static int option_write_room(struct tty_struct *tty); @@ -61,7 +62,7 @@ static void option_set_termios(struct tty_struct *tty, static int option_tiocmget(struct tty_struct *tty, struct file *file); static int option_tiocmset(struct tty_struct *tty, struct file *file, unsigned int set, unsigned int clear); -static int option_send_setup(struct tty_struct *tty, struct usb_serial_port *port); +static int option_send_setup(struct usb_serial_port *port); static int option_suspend(struct usb_serial *serial, pm_message_t message); static int option_resume(struct usb_serial *serial); @@ -551,6 +552,7 @@ static struct usb_serial_driver option_1port_device = { .num_ports = 1, .open = option_open, .close = option_close, + .dtr_rts = option_dtr_rts, .write = option_write, .write_room = option_write_room, .chars_in_buffer = option_chars_in_buffer, @@ -630,7 +632,7 @@ static void option_set_termios(struct tty_struct *tty, dbg("%s", __func__); /* Doesn't support option setting */ tty_termios_copy_hw(tty->termios, old_termios); - option_send_setup(tty, port); + option_send_setup(port); } static int option_tiocmget(struct tty_struct *tty, struct file *file) @@ -669,7 +671,7 @@ static int option_tiocmset(struct tty_struct *tty, struct file *file, portdata->rts_state = 0; if (clear & TIOCM_DTR) portdata->dtr_state = 0; - return option_send_setup(tty, port); + return option_send_setup(port); } /* Write */ @@ -897,10 +899,6 @@ static int option_open(struct tty_struct *tty, dbg("%s", __func__); - /* Set some sane defaults */ - portdata->rts_state = 1; - portdata->dtr_state = 1; - /* Reset low level data toggle and start reading from endpoints */ for (i = 0; i < N_IN_URB; i++) { urb = portdata->in_urbs[i]; @@ -936,37 +934,43 @@ static int option_open(struct tty_struct *tty, usb_pipeout(urb->pipe), 0); */ } - option_send_setup(tty, port); + option_send_setup(port); return 0; } -static void option_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void option_dtr_rts(struct usb_serial_port *port, int on) { - int i; struct usb_serial *serial = port->serial; struct option_port_private *portdata; dbg("%s", __func__); portdata = usb_get_serial_port_data(port); + mutex_lock(&serial->disc_mutex); + portdata->rts_state = on; + portdata->dtr_state = on; + if (serial->dev) + option_send_setup(port); + mutex_unlock(&serial->disc_mutex); +} - portdata->rts_state = 0; - portdata->dtr_state = 0; - if (serial->dev) { - mutex_lock(&serial->disc_mutex); - if (!serial->disconnected) - option_send_setup(tty, port); - mutex_unlock(&serial->disc_mutex); +static void option_close(struct usb_serial_port *port) +{ + int i; + struct usb_serial 
*serial = port->serial; + struct option_port_private *portdata; + + dbg("%s", __func__); + portdata = usb_get_serial_port_data(port); + if (serial->dev) { /* Stop reading/writing urbs */ for (i = 0; i < N_IN_URB; i++) usb_kill_urb(portdata->in_urbs[i]); for (i = 0; i < N_OUT_URB; i++) usb_kill_urb(portdata->out_urbs[i]); } - tty_port_tty_set(&port->port, NULL); } /* Helper functions used by option_setup_urbs */ @@ -1032,28 +1036,24 @@ static void option_setup_urbs(struct usb_serial *serial) * This is exactly the same as SET_CONTROL_LINE_STATE from the PSTN * CDC. */ -static int option_send_setup(struct tty_struct *tty, - struct usb_serial_port *port) +static int option_send_setup(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct option_port_private *portdata; int ifNum = serial->interface->cur_altsetting->desc.bInterfaceNumber; + int val = 0; dbg("%s", __func__); portdata = usb_get_serial_port_data(port); - if (tty) { - int val = 0; - if (portdata->dtr_state) - val |= 0x01; - if (portdata->rts_state) - val |= 0x02; + if (portdata->dtr_state) + val |= 0x01; + if (portdata->rts_state) + val |= 0x02; - return usb_control_msg(serial->dev, - usb_rcvctrlpipe(serial->dev, 0), - 0x22, 0x21, val, ifNum, NULL, 0, USB_CTRL_SET_TIMEOUT); - } - return 0; + return usb_control_msg(serial->dev, + usb_rcvctrlpipe(serial->dev, 0), + 0x22, 0x21, val, ifNum, NULL, 0, USB_CTRL_SET_TIMEOUT); } static int option_startup(struct usb_serial *serial) diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c index ba551f00f16..7de54781fe6 100644 --- a/drivers/usb/serial/oti6858.c +++ b/drivers/usb/serial/oti6858.c @@ -143,8 +143,7 @@ struct oti6858_control_pkt { /* function prototypes */ static int oti6858_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void oti6858_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +static void oti6858_close(struct usb_serial_port *port); static void oti6858_set_termios(struct tty_struct *tty, struct usb_serial_port *port, struct ktermios *old); static int oti6858_ioctl(struct tty_struct *tty, struct file *file, @@ -622,67 +621,30 @@ static int oti6858_open(struct tty_struct *tty, if (result != 0) { dev_err(&port->dev, "%s(): usb_submit_urb() failed" " with error %d\n", __func__, result); - oti6858_close(tty, port, NULL); + oti6858_close(port); return -EPROTO; } /* setup termios */ if (tty) oti6858_set_termios(tty, port, &tmp_termios); - + port->port.drain_delay = 256; /* FIXME: check the FIFO length */ return 0; } -static void oti6858_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void oti6858_close(struct usb_serial_port *port) { struct oti6858_private *priv = usb_get_serial_port_data(port); unsigned long flags; - long timeout; - wait_queue_t wait; dbg("%s(port = %d)", __func__, port->number); - /* wait for data to drain from the buffer */ spin_lock_irqsave(&priv->lock, flags); - timeout = 30 * HZ; /* PL2303_CLOSING_WAIT */ - init_waitqueue_entry(&wait, current); - add_wait_queue(&tty->write_wait, &wait); - dbg("%s(): entering wait loop", __func__); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (oti6858_buf_data_avail(priv->buf) == 0 - || timeout == 0 || signal_pending(current) - || port->serial->disconnected) - break; - spin_unlock_irqrestore(&priv->lock, flags); - timeout = schedule_timeout(timeout); - spin_lock_irqsave(&priv->lock, flags); - } - set_current_state(TASK_RUNNING); - 
remove_wait_queue(&tty->write_wait, &wait); - dbg("%s(): after wait loop", __func__); - /* clear out any remaining data in the buffer */ oti6858_buf_clear(priv->buf); spin_unlock_irqrestore(&priv->lock, flags); - /* wait for characters to drain from the device */ - /* (this is long enough for the entire 256 byte */ - /* pl2303 hardware buffer to drain with no flow */ - /* control for data rates of 1200 bps or more, */ - /* for lower rates we should really know how much */ - /* data is in the buffer to compute a delay */ - /* that is not unnecessarily long) */ - /* FIXME - bps = tty_get_baud_rate(tty); - if (bps > 1200) - timeout = max((HZ*2560)/bps,HZ/10); - else - */ - timeout = 2*HZ; - schedule_timeout_interruptible(timeout); - dbg("%s(): after schedule_timeout_interruptible()", __func__); + dbg("%s(): after buf_clear()", __func__); /* cancel scheduled setup */ cancel_delayed_work(&priv->delayed_setup_work); @@ -694,15 +656,6 @@ static void oti6858_close(struct tty_struct *tty, usb_kill_urb(port->write_urb); usb_kill_urb(port->read_urb); usb_kill_urb(port->interrupt_in_urb); - - /* - if (tty && (tty->termios->c_cflag) & HUPCL) { - // drop DTR and RTS - spin_lock_irqsave(&priv->lock, flags); - priv->pending_setup.control &= ~CONTROL_MASK; - spin_unlock_irqrestore(&priv->lock, flags); - } - */ } static int oti6858_tiocmset(struct tty_struct *tty, struct file *file, diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c index 751a533a434..e02dc3d643c 100644 --- a/drivers/usb/serial/pl2303.c +++ b/drivers/usb/serial/pl2303.c @@ -652,69 +652,41 @@ static void pl2303_set_termios(struct tty_struct *tty, kfree(buf); } -static void pl2303_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void pl2303_dtr_rts(struct usb_serial_port *port, int on) +{ + struct pl2303_private *priv = usb_get_serial_port_data(port); + unsigned long flags; + u8 control; + + spin_lock_irqsave(&priv->lock, flags); + /* Change DTR and RTS */ + if (on) + priv->line_control |= (CONTROL_DTR | CONTROL_RTS); + else + priv->line_control &= ~(CONTROL_DTR | CONTROL_RTS); + control = priv->line_control; + spin_unlock_irqrestore(&priv->lock, flags); + set_control_lines(port->serial->dev, control); +} + +static void pl2303_close(struct usb_serial_port *port) { struct pl2303_private *priv = usb_get_serial_port_data(port); unsigned long flags; - unsigned int c_cflag; - int bps; - long timeout; - wait_queue_t wait; dbg("%s - port %d", __func__, port->number); - /* wait for data to drain from the buffer */ spin_lock_irqsave(&priv->lock, flags); - timeout = PL2303_CLOSING_WAIT; - init_waitqueue_entry(&wait, current); - add_wait_queue(&tty->write_wait, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (pl2303_buf_data_avail(priv->buf) == 0 || - timeout == 0 || signal_pending(current) || - port->serial->disconnected) - break; - spin_unlock_irqrestore(&priv->lock, flags); - timeout = schedule_timeout(timeout); - spin_lock_irqsave(&priv->lock, flags); - } - set_current_state(TASK_RUNNING); - remove_wait_queue(&tty->write_wait, &wait); /* clear out any remaining data in the buffer */ pl2303_buf_clear(priv->buf); spin_unlock_irqrestore(&priv->lock, flags); - /* wait for characters to drain from the device */ - /* (this is long enough for the entire 256 byte */ - /* pl2303 hardware buffer to drain with no flow */ - /* control for data rates of 1200 bps or more, */ - /* for lower rates we should really know how much */ - /* data is in the buffer to compute a delay */ - /* 
that is not unnecessarily long) */ - bps = tty_get_baud_rate(tty); - if (bps > 1200) - timeout = max((HZ*2560)/bps, HZ/10); - else - timeout = 2*HZ; - schedule_timeout_interruptible(timeout); - /* shutdown our urbs */ dbg("%s - shutting down urbs", __func__); usb_kill_urb(port->write_urb); usb_kill_urb(port->read_urb); usb_kill_urb(port->interrupt_in_urb); - if (tty) { - c_cflag = tty->termios->c_cflag; - if (c_cflag & HUPCL) { - /* drop DTR and RTS */ - spin_lock_irqsave(&priv->lock, flags); - priv->line_control = 0; - spin_unlock_irqrestore(&priv->lock, flags); - set_control_lines(port->serial->dev, 0); - } - } } static int pl2303_open(struct tty_struct *tty, @@ -748,7 +720,7 @@ static int pl2303_open(struct tty_struct *tty, if (result) { dev_err(&port->dev, "%s - failed submitting read urb," " error %d\n", __func__, result); - pl2303_close(tty, port, NULL); + pl2303_close(port); return -EPROTO; } @@ -758,9 +730,10 @@ static int pl2303_open(struct tty_struct *tty, if (result) { dev_err(&port->dev, "%s - failed submitting interrupt urb," " error %d\n", __func__, result); - pl2303_close(tty, port, NULL); + pl2303_close(port); return -EPROTO; } + port->port.drain_delay = 256; return 0; } @@ -821,6 +794,14 @@ static int pl2303_tiocmget(struct tty_struct *tty, struct file *file) return result; } +static int pl2303_carrier_raised(struct usb_serial_port *port) +{ + struct pl2303_private *priv = usb_get_serial_port_data(port); + if (priv->line_status & UART_DCD) + return 1; + return 0; +} + static int wait_modem_info(struct usb_serial_port *port, unsigned int arg) { struct pl2303_private *priv = usb_get_serial_port_data(port); @@ -1125,6 +1106,8 @@ static struct usb_serial_driver pl2303_device = { .num_ports = 1, .open = pl2303_open, .close = pl2303_close, + .dtr_rts = pl2303_dtr_rts, + .carrier_raised = pl2303_carrier_raised, .write = pl2303_write, .ioctl = pl2303_ioctl, .break_ctl = pl2303_break_ctl, diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c index 913225c6161..1319b8968d8 100644 --- a/drivers/usb/serial/sierra.c +++ b/drivers/usb/serial/sierra.c @@ -240,57 +240,39 @@ struct sierra_port_private { int ri_state; }; -static int sierra_send_setup(struct tty_struct *tty, - struct usb_serial_port *port) +static int sierra_send_setup(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct sierra_port_private *portdata; __u16 interface = 0; + int val = 0; dev_dbg(&port->dev, "%s", __func__); portdata = usb_get_serial_port_data(port); - if (tty) { - int val = 0; - if (portdata->dtr_state) - val |= 0x01; - if (portdata->rts_state) - val |= 0x02; - - /* If composite device then properly report interface */ - if (serial->num_ports == 1) { - interface = sierra_calc_interface(serial); - - /* Control message is sent only to interfaces with - * interrupt_in endpoints - */ - if (port->interrupt_in_urb) { - /* send control message */ - return usb_control_msg(serial->dev, - usb_rcvctrlpipe(serial->dev, 0), - 0x22, 0x21, val, interface, - NULL, 0, USB_CTRL_SET_TIMEOUT); - } - } - - /* Otherwise the need to do non-composite mapping */ - else { - if (port->bulk_out_endpointAddress == 2) - interface = 0; - else if (port->bulk_out_endpointAddress == 4) - interface = 1; - else if (port->bulk_out_endpointAddress == 5) - interface = 2; - - return usb_control_msg(serial->dev, - usb_rcvctrlpipe(serial->dev, 0), - 0x22, 0x21, val, interface, - NULL, 0, USB_CTRL_SET_TIMEOUT); - - } + if (portdata->dtr_state) + val |= 0x01; + if (portdata->rts_state) + val |= 0x02; + + /* 
If composite device then properly report interface */ + if (serial->num_ports == 1) + interface = sierra_calc_interface(serial); + + /* Otherwise the need to do non-composite mapping */ + else { + if (port->bulk_out_endpointAddress == 2) + interface = 0; + else if (port->bulk_out_endpointAddress == 4) + interface = 1; + else if (port->bulk_out_endpointAddress == 5) + interface = 2; } - + return usb_control_msg(serial->dev, + usb_rcvctrlpipe(serial->dev, 0), + 0x22, 0x21, val, interface, + NULL, 0, USB_CTRL_SET_TIMEOUT); return 0; } @@ -299,7 +281,7 @@ static void sierra_set_termios(struct tty_struct *tty, { dev_dbg(&port->dev, "%s", __func__); tty_termios_copy_hw(tty->termios, old_termios); - sierra_send_setup(tty, port); + sierra_send_setup(port); } static int sierra_tiocmget(struct tty_struct *tty, struct file *file) @@ -338,7 +320,7 @@ static int sierra_tiocmset(struct tty_struct *tty, struct file *file, portdata->rts_state = 0; if (clear & TIOCM_DTR) portdata->dtr_state = 0; - return sierra_send_setup(tty, port); + return sierra_send_setup(port); } static void sierra_outdat_callback(struct urb *urb) @@ -598,7 +580,7 @@ static int sierra_open(struct tty_struct *tty, } } - sierra_send_setup(tty, port); + sierra_send_setup(port); /* start up the interrupt endpoint if we have one */ if (port->interrupt_in_urb) { @@ -610,32 +592,38 @@ static int sierra_open(struct tty_struct *tty, return 0; } -static void sierra_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void sierra_dtr_rts(struct usb_serial_port *port, int on) { - int i; struct usb_serial *serial = port->serial; struct sierra_port_private *portdata; - dev_dbg(&port->dev, "%s", __func__); portdata = usb_get_serial_port_data(port); - - portdata->rts_state = 0; - portdata->dtr_state = 0; + portdata->rts_state = on; + portdata->dtr_state = on; if (serial->dev) { mutex_lock(&serial->disc_mutex); if (!serial->disconnected) - sierra_send_setup(tty, port); + sierra_send_setup(port); mutex_unlock(&serial->disc_mutex); + } +} + +static void sierra_close(struct usb_serial_port *port) +{ + int i; + struct usb_serial *serial = port->serial; + struct sierra_port_private *portdata; + dev_dbg(&port->dev, "%s", __func__); + portdata = usb_get_serial_port_data(port); + + if (serial->dev) { /* Stop reading/writing urbs */ for (i = 0; i < N_IN_URB; i++) usb_kill_urb(portdata->in_urbs[i]); } - usb_kill_urb(port->interrupt_in_urb); - tty_port_tty_set(&port->port, NULL); } static int sierra_startup(struct usb_serial *serial) @@ -737,6 +725,7 @@ static struct usb_serial_driver sierra_device = { .probe = sierra_probe, .open = sierra_open, .close = sierra_close, + .dtr_rts = sierra_dtr_rts, .write = sierra_write, .write_room = sierra_write_room, .set_termios = sierra_set_termios, diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c index 5e7528cc81a..8f7ed8f1399 100644 --- a/drivers/usb/serial/spcp8x5.c +++ b/drivers/usb/serial/spcp8x5.c @@ -446,66 +446,47 @@ static void spcp8x5_set_workMode(struct usb_device *dev, u16 value, "RTSCTS usb_control_msg(enable flowctrl) = %d\n", ret); } +static int spcp8x5_carrier_raised(struct usb_serial_port *port) +{ + struct spcp8x5_private *priv = usb_get_serial_port_data(port); + if (priv->line_status & MSR_STATUS_LINE_DCD) + return 1; + return 0; +} + +static void spcp8x5_dtr_rts(struct usb_serial_port *port, int on) +{ + struct spcp8x5_private *priv = usb_get_serial_port_data(port); + unsigned long flags; + u8 control; + + spin_lock_irqsave(&priv->lock, flags); + 
if (on) + priv->line_control = MCR_CONTROL_LINE_DTR + | MCR_CONTROL_LINE_RTS; + else + priv->line_control &= ~ (MCR_CONTROL_LINE_DTR + | MCR_CONTROL_LINE_RTS); + control = priv->line_control; + spin_unlock_irqrestore(&priv->lock, flags); + spcp8x5_set_ctrlLine(port->serial->dev, control , priv->type); +} + /* close the serial port. We should wait for data sending to device 1st and * then kill all urb. */ -static void spcp8x5_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void spcp8x5_close(struct usb_serial_port *port) { struct spcp8x5_private *priv = usb_get_serial_port_data(port); unsigned long flags; - unsigned int c_cflag; - int bps; - long timeout; - wait_queue_t wait; int result; dbg("%s - port %d", __func__, port->number); - /* wait for data to drain from the buffer */ spin_lock_irqsave(&priv->lock, flags); - timeout = SPCP8x5_CLOSING_WAIT; - init_waitqueue_entry(&wait, current); - add_wait_queue(&tty->write_wait, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (ringbuf_avail_data(priv->buf) == 0 || - timeout == 0 || signal_pending(current)) - break; - spin_unlock_irqrestore(&priv->lock, flags); - timeout = schedule_timeout(timeout); - spin_lock_irqsave(&priv->lock, flags); - } - set_current_state(TASK_RUNNING); - remove_wait_queue(&tty->write_wait, &wait); - /* clear out any remaining data in the buffer */ clear_ringbuf(priv->buf); spin_unlock_irqrestore(&priv->lock, flags); - /* wait for characters to drain from the device (this is long enough - * for the entire all byte spcp8x5 hardware buffer to drain with no - * flow control for data rates of 1200 bps or more, for lower rates we - * should really know how much data is in the buffer to compute a delay - * that is not unnecessarily long) */ - bps = tty_get_baud_rate(tty); - if (bps > 1200) - timeout = max((HZ*2560) / bps, HZ/10); - else - timeout = 2*HZ; - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(timeout); - - /* clear control lines */ - if (tty) { - c_cflag = tty->termios->c_cflag; - if (c_cflag & HUPCL) { - spin_lock_irqsave(&priv->lock, flags); - priv->line_control = 0; - spin_unlock_irqrestore(&priv->lock, flags); - spcp8x5_set_ctrlLine(port->serial->dev, 0 , priv->type); - } - } - /* kill urb */ if (port->write_urb != NULL) { result = usb_unlink_urb(port->write_urb); @@ -665,13 +646,6 @@ static int spcp8x5_open(struct tty_struct *tty, if (ret) return ret; - spin_lock_irqsave(&priv->lock, flags); - if (tty && (tty->termios->c_cflag & CBAUD)) - priv->line_control = MCR_DTR | MCR_RTS; - else - priv->line_control = 0; - spin_unlock_irqrestore(&priv->lock, flags); - spcp8x5_set_ctrlLine(serial->dev, priv->line_control , priv->type); /* Setup termios */ @@ -691,9 +665,10 @@ static int spcp8x5_open(struct tty_struct *tty, port->read_urb->dev = serial->dev; ret = usb_submit_urb(port->read_urb, GFP_KERNEL); if (ret) { - spcp8x5_close(tty, port, NULL); + spcp8x5_close(port); return -EPROTO; } + port->port.drain_delay = 256; return 0; } @@ -1033,6 +1008,8 @@ static struct usb_serial_driver spcp8x5_device = { .num_ports = 1, .open = spcp8x5_open, .close = spcp8x5_close, + .dtr_rts = spcp8x5_dtr_rts, + .carrier_raised = spcp8x5_carrier_raised, .write = spcp8x5_write, .set_termios = spcp8x5_set_termios, .ioctl = spcp8x5_ioctl, diff --git a/drivers/usb/serial/symbolserial.c b/drivers/usb/serial/symbolserial.c index 69879e43794..8b07ebc6bae 100644 --- a/drivers/usb/serial/symbolserial.c +++ b/drivers/usb/serial/symbolserial.c @@ -152,8 +152,7 @@ static int 
symbol_open(struct tty_struct *tty, struct usb_serial_port *port, return result; } -static void symbol_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp) +static void symbol_close(struct usb_serial_port *port) { struct symbol_private *priv = usb_get_serial_data(port->serial); diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c index 0a64bac306e..42cb04c403b 100644 --- a/drivers/usb/serial/ti_usb_3410_5052.c +++ b/drivers/usb/serial/ti_usb_3410_5052.c @@ -100,8 +100,7 @@ static int ti_startup(struct usb_serial *serial); static void ti_shutdown(struct usb_serial *serial); static int ti_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *file); -static void ti_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *file); +static void ti_close(struct usb_serial_port *port); static int ti_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *data, int count); static int ti_write_room(struct tty_struct *tty); @@ -647,8 +646,7 @@ release_lock: } -static void ti_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *file) +static void ti_close(struct usb_serial_port *port) { struct ti_device *tdev; struct ti_port *tport; diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index f331e2bde88..1967a7edc10 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -238,9 +238,11 @@ static int serial_open (struct tty_struct *tty, struct file *filp) goto bailout_interface_put; mutex_unlock(&serial->disc_mutex); } - mutex_unlock(&port->mutex); - return 0; + /* Now do the correct tty layer semantics */ + retval = tty_port_block_til_ready(&port->port, tty, filp); + if (retval == 0) + return 0; bailout_interface_put: usb_autopm_put_interface(serial->interface); @@ -259,64 +261,89 @@ bailout_serial_put: return retval; } -static void serial_close(struct tty_struct *tty, struct file *filp) +/** + * serial_do_down - shut down hardware + * @port: port to shut down + * + * Shut down a USB port unless it is the console. We never shut down the + * console hardware as it will always be in use. + * + * Don't free any resources at this point + */ +static void serial_do_down(struct usb_serial_port *port) { - struct usb_serial_port *port = tty->driver_data; + struct usb_serial_driver *drv = port->serial->type; struct usb_serial *serial; struct module *owner; - int count; - if (!port) + /* The console is magical, do not hang up the console hardware + or there will be tears */ + if (port->console) return; - dbg("%s - port %d", __func__, port->number); - mutex_lock(&port->mutex); serial = port->serial; owner = serial->type->driver.owner; - if (port->port.count == 0) { - mutex_unlock(&port->mutex); - return; - } - - if (port->port.count == 1) - /* only call the device specific close if this - * port is being closed by the last owner. Ensure we do - * this before we drop the port count. The call is protected - * by the port mutex - */ - serial->type->close(tty, port, filp); - - if (port->port.count == (port->console ? 2 : 1)) { - struct tty_struct *tty = tty_port_tty_get(&port->port); - if (tty) { - /* We must do this before we drop the port count to - zero. 
*/ - if (tty->driver_data) - tty->driver_data = NULL; - tty_port_tty_set(&port->port, NULL); - tty_kref_put(tty); - } - } + if (drv->close) + drv->close(port); - --port->port.count; - count = port->port.count; mutex_unlock(&port->mutex); - put_device(&port->dev); +} + +/** + * serial_do_free - free resources post close/hangup + * @port: port to free up + * + * Do the resource freeing and refcount dropping for the port. We must + * be careful about ordering and we must avoid freeing up the console. + */ +static void serial_do_free(struct usb_serial_port *port) +{ + struct usb_serial *serial; + struct module *owner; + + /* The console is magical, do not hang up the console hardware + or there will be tears */ + if (port->console) + return; + + serial = port->serial; + owner = serial->type->driver.owner; + put_device(&port->dev); /* Mustn't dereference port any more */ - if (count == 0) { - mutex_lock(&serial->disc_mutex); - if (!serial->disconnected) - usb_autopm_put_interface(serial->interface); - mutex_unlock(&serial->disc_mutex); - } + mutex_lock(&serial->disc_mutex); + if (!serial->disconnected) + usb_autopm_put_interface(serial->interface); + mutex_unlock(&serial->disc_mutex); usb_serial_put(serial); - /* Mustn't dereference serial any more */ - if (count == 0) - module_put(owner); + module_put(owner); +} + +static void serial_close(struct tty_struct *tty, struct file *filp) +{ + struct usb_serial_port *port = tty->driver_data; + + dbg("%s - port %d", __func__, port->number); + + + if (tty_port_close_start(&port->port, tty, filp) == 0) + return; + + serial_do_down(port); + tty_port_close_end(&port->port, tty); + tty_port_tty_set(&port->port, NULL); + serial_do_free(port); +} + +static void serial_hangup(struct tty_struct *tty) +{ + struct usb_serial_port *port = tty->driver_data; + serial_do_down(port); + tty_port_hangup(&port->port); + serial_do_free(port); } static int serial_write(struct tty_struct *tty, const unsigned char *buf, @@ -648,6 +675,29 @@ static struct usb_serial_driver *search_serial_device( return NULL; } +static int serial_carrier_raised(struct tty_port *port) +{ + struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); + struct usb_serial_driver *drv = p->serial->type; + if (drv->carrier_raised) + return drv->carrier_raised(p); + /* No carrier control - don't block */ + return 1; +} + +static void serial_dtr_rts(struct tty_port *port, int on) +{ + struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); + struct usb_serial_driver *drv = p->serial->type; + if (drv->dtr_rts) + drv->dtr_rts(p, on); +} + +static const struct tty_port_operations serial_port_ops = { + .carrier_raised = serial_carrier_raised, + .dtr_rts = serial_dtr_rts, +}; + int usb_serial_probe(struct usb_interface *interface, const struct usb_device_id *id) { @@ -841,6 +891,7 @@ int usb_serial_probe(struct usb_interface *interface, if (!port) goto probe_error; tty_port_init(&port->port); + port->port.ops = &serial_port_ops; port->serial = serial; spin_lock_init(&port->lock); mutex_init(&port->mutex); @@ -1071,6 +1122,9 @@ void usb_serial_disconnect(struct usb_interface *interface) if (port) { struct tty_struct *tty = tty_port_tty_get(&port->port); if (tty) { + /* The hangup will occur asynchronously but + the object refcounts will sort out all the + cleanup */ tty_hangup(tty); tty_kref_put(tty); } @@ -1135,6 +1189,7 @@ static const struct tty_operations serial_ops = { .open = serial_open, .close = serial_close, .write = serial_write, + .hangup = serial_hangup, 
.write_room = serial_write_room, .ioctl = serial_ioctl, .set_termios = serial_set_termios, @@ -1147,6 +1202,7 @@ static const struct tty_operations serial_ops = { .proc_fops = &serial_proc_fops, }; + struct tty_driver *usb_serial_tty_driver; static int __init usb_serial_init(void) diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c index 5ac414bda71..b15f1c0e1d4 100644 --- a/drivers/usb/serial/visor.c +++ b/drivers/usb/serial/visor.c @@ -38,8 +38,7 @@ /* function prototypes for a handspring visor */ static int visor_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void visor_close(struct tty_struct *tty, struct usb_serial_port *port, - struct file *filp); +static void visor_close(struct usb_serial_port *port); static int visor_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static int visor_write_room(struct tty_struct *tty); @@ -324,8 +323,7 @@ exit: } -static void visor_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void visor_close(struct usb_serial_port *port) { struct visor_private *priv = usb_get_serial_port_data(port); unsigned char *transfer_buffer; diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c index 5335d3211c0..7c7295d09f3 100644 --- a/drivers/usb/serial/whiteheat.c +++ b/drivers/usb/serial/whiteheat.c @@ -147,8 +147,7 @@ static int whiteheat_attach(struct usb_serial *serial); static void whiteheat_shutdown(struct usb_serial *serial); static int whiteheat_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); -static void whiteheat_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +static void whiteheat_close(struct usb_serial_port *port); static int whiteheat_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); @@ -712,8 +711,7 @@ exit: } -static void whiteheat_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp) +static void whiteheat_close(struct usb_serial_port *port) { struct whiteheat_private *info = usb_get_serial_port_data(port); struct whiteheat_urb_wrap *wrap; @@ -723,31 +721,7 @@ static void whiteheat_close(struct tty_struct *tty, dbg("%s - port %d", __func__, port->number); - mutex_lock(&port->serial->disc_mutex); - /* filp is NULL when called from usb_serial_disconnect */ - if ((filp && (tty_hung_up_p(filp))) || port->serial->disconnected) { - mutex_unlock(&port->serial->disc_mutex); - return; - } - mutex_unlock(&port->serial->disc_mutex); - - tty->closing = 1; - -/* - * Not currently in use; tty_wait_until_sent() calls - * serial_chars_in_buffer() which deadlocks on the second semaphore - * acquisition. This should be fixed at some point. Greg's been - * notified. 
- if ((filp->f_flags & (O_NDELAY | O_NONBLOCK)) == 0) { - tty_wait_until_sent(tty, CLOSING_DELAY); - } -*/ - - tty_driver_flush_buffer(tty); - tty_ldisc_flush(tty); - firm_report_tx_done(port); - firm_close(port); /* shutdown our bulk reads and writes */ @@ -775,10 +749,7 @@ static void whiteheat_close(struct tty_struct *tty, } spin_unlock_irq(&info->lock); mutex_unlock(&info->deathwarrant); - stop_command_port(port->serial); - - tty->closing = 0; } diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 625e9e4639c..8cdfed738fe 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -224,8 +224,7 @@ struct usb_serial_driver { /* Called by console with tty = NULL and by tty */ int (*open)(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); - void (*close)(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); + void (*close)(struct usb_serial_port *port); int (*write)(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); /* Called only by the tty layer */ @@ -241,6 +240,10 @@ struct usb_serial_driver { int (*tiocmget)(struct tty_struct *tty, struct file *file); int (*tiocmset)(struct tty_struct *tty, struct file *file, unsigned int set, unsigned int clear); + /* Called by the tty layer for port level work. There may or may not + be an attached tty at this point */ + void (*dtr_rts)(struct usb_serial_port *port, int on); + int (*carrier_raised)(struct usb_serial_port *port); /* USB events */ void (*read_int_callback)(struct urb *urb); void (*write_int_callback)(struct urb *urb); @@ -283,8 +286,7 @@ extern int usb_serial_generic_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); extern int usb_serial_generic_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); -extern void usb_serial_generic_close(struct tty_struct *tty, - struct usb_serial_port *port, struct file *filp); +extern void usb_serial_generic_close(struct usb_serial_port *port); extern int usb_serial_generic_resume(struct usb_serial *serial); extern int usb_serial_generic_write_room(struct tty_struct *tty); extern int usb_serial_generic_chars_in_buffer(struct tty_struct *tty); -- cgit v1.2.3-70-g09d2 From 97e87f8ebe978e881c7325ba490574bd5500b133 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 11 Jun 2009 12:29:27 +0100 Subject: tty: cyclades, plx9060 casts cleanup Remove ugly all-over-the-code casts of ctl_addr to 9060 space. Add a union to the cyclades_card structure, containing pointers to both the 9050 and 9060 spaces. The 9050 space layout is unknown, so leave it as a void __iomem pointer.
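To make the intent concrete, here is a minimal sketch (illustrative only, not code from the patch itself) of how the control region is addressed once the union is in place; the p9050/p9060 member names and the registers used follow the hunks below, while the two helper function names are made up for the example:

static u32 cyz_read_mailbox_sketch(struct cyclades_card *card)
{
	/* Z (PLX9060) boards now get typed, cast-free register access */
	return readl(&card->ctl_addr.p9060->mail_box_0);
}

static void cy_unmap_ctl_sketch(struct cyclades_card *card)
{
	/* 9050 boards keep an untyped pointer; it also serves as the
	   generic handle for NULL checks and iounmap() */
	if (card->ctl_addr.p9050)
		iounmap(card->ctl_addr.p9050);
}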
Signed-off-by: Jiri Slaby Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/char/cyclades.c | 65 ++++++++++++++++++++++++------------------------ include/linux/cyclades.h | 23 +++++++++-------- 2 files changed, 46 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index ccf68a9e24b..2cbf74134f1 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -666,12 +666,10 @@ static void cy_send_xchar(struct tty_struct *tty, char ch); #define IS_CYC_Z(card) ((card).num_chips == (unsigned int)-1) #define Z_FPGA_CHECK(card) \ - ((readl(&((struct RUNTIME_9060 __iomem *) \ - ((card).ctl_addr))->init_ctrl) & (1<<17)) != 0) + ((readl(&(card).ctl_addr.p9060->init_ctrl) & (1<<17)) != 0) -#define ISZLOADED(card) (((ZO_V1 == readl(&((struct RUNTIME_9060 __iomem *) \ - ((card).ctl_addr))->mail_box_0)) || \ - Z_FPGA_CHECK(card)) && \ +#define ISZLOADED(card) (((ZO_V1 == readl(&(card).ctl_addr.p9060->mail_box_0)) \ + || Z_FPGA_CHECK(card)) && \ (ZFIRM_ID == readl(&((struct FIRM_ID __iomem *) \ ((card).base_addr+ID_ADDRESS))->signature))) @@ -1400,14 +1398,12 @@ cyz_fetch_msg(struct cyclades_card *cinfo, zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff); board_ctrl = &zfw_ctrl->board_ctrl; - loc_doorbell = readl(&((struct RUNTIME_9060 __iomem *) - (cinfo->ctl_addr))->loc_doorbell); + loc_doorbell = readl(&cinfo->ctl_addr.p9060->loc_doorbell); if (loc_doorbell) { *cmd = (char)(0xff & loc_doorbell); *channel = readl(&board_ctrl->fwcmd_channel); *param = (__u32) readl(&board_ctrl->fwcmd_param); - cy_writel(&((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))-> - loc_doorbell, 0xffffffff); + cy_writel(&cinfo->ctl_addr.p9060->loc_doorbell, 0xffffffff); return 1; } return 0; @@ -1431,8 +1427,7 @@ cyz_issue_cmd(struct cyclades_card *cinfo, board_ctrl = &zfw_ctrl->board_ctrl; index = 0; - pci_doorbell = - &((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->pci_doorbell; + pci_doorbell = &cinfo->ctl_addr.p9060->pci_doorbell; while ((readl(pci_doorbell) & 0xff) != 0) { if (index++ == 1000) return (int)(readl(pci_doorbell) & 0xff); @@ -1635,8 +1630,7 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo) zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff); board_ctrl = &zfw_ctrl->board_ctrl; fw_ver = readl(&board_ctrl->fw_version); - hw_ver = readl(&((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))-> - mail_box_0); + hw_ver = readl(&cinfo->ctl_addr.p9060->mail_box_0); while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) { special_count = 0; @@ -2394,8 +2388,8 @@ static int cy_open(struct tty_struct *tty, struct file *filp) struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS; if (!ISZLOADED(*cinfo)) { - if (((ZE_V1 == readl(&((struct RUNTIME_9060 __iomem *) - (cinfo->ctl_addr))->mail_box_0)) && + if (((ZE_V1 == readl(&cinfo->ctl_addr.p9060-> + mail_box_0)) && Z_FPGA_CHECK(*cinfo)) && (ZFIRM_HLT == readl( &firm_id->signature))) { @@ -2417,6 +2411,7 @@ static int cy_open(struct tty_struct *tty, struct file *filp) if (!cinfo->intr_enabled) { struct ZFW_CTRL __iomem *zfw_ctrl; struct BOARD_CTRL __iomem *board_ctrl; + u16 intr; zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & @@ -2425,8 +2420,10 @@ static int cy_open(struct tty_struct *tty, struct file *filp) board_ctrl = &zfw_ctrl->board_ctrl; /* Enable interrupts on the PLX chip */ - cy_writew(cinfo->ctl_addr + 0x68, - readw(cinfo->ctl_addr + 0x68) | 0x0900); + intr = readw(&cinfo->ctl_addr.p9060-> + 
intr_ctrl_stat) | 0x0900; + cy_writew(&cinfo->ctl_addr.p9060-> + intr_ctrl_stat, intr); /* Enable interrupts on the FW */ retval = cyz_issue_cmd(cinfo, 0, C_CM_IRQ_ENBL, 0L); @@ -4347,8 +4344,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo) spin_lock_init(&cinfo->card_lock); if (IS_CYC_Z(*cinfo)) { /* Cyclades-Z */ - mailbox = readl(&((struct RUNTIME_9060 __iomem *) - cinfo->ctl_addr)->mail_box_0); + mailbox = readl(&cinfo->ctl_addr.p9060->mail_box_0); nports = (mailbox == ZE_V1) ? ZE_V1_NPORTS : 8; cinfo->intr_enabled = 0; cinfo->nports = 0; /* Will be correctly set later, after @@ -4613,7 +4609,7 @@ static int __init cy_detect_isa(void) /* set cy_card */ cy_card[j].base_addr = cy_isa_address; - cy_card[j].ctl_addr = NULL; + cy_card[j].ctl_addr.p9050 = NULL; cy_card[j].irq = (int)cy_isa_irq; cy_card[j].bus_index = 0; cy_card[j].first_line = cy_next_channel; @@ -5013,7 +5009,8 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev, } /* Disable interrupts on the PLX before resetting it */ - cy_writew(addr0 + 0x68, readw(addr0 + 0x68) & ~0x0900); + cy_writew(&ctl_addr->intr_ctrl_stat, + readw(&ctl_addr->intr_ctrl_stat) & ~0x0900); plx_init(pdev, irq, addr0); @@ -5109,7 +5106,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev, /* set cy_card */ cy_card[card_no].base_addr = addr2; - cy_card[card_no].ctl_addr = addr0; + cy_card[card_no].ctl_addr.p9050 = addr0; cy_card[card_no].irq = irq; cy_card[card_no].bus_index = 1; cy_card[card_no].first_line = cy_next_channel; @@ -5125,17 +5122,20 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev, plx_ver = readb(addr2 + CyPLX_VER) & 0x0f; switch (plx_ver) { case PLX_9050: - cy_writeb(addr0 + 0x4c, 0x43); break; case PLX_9060: case PLX_9080: default: /* Old boards, use PLX_9060 */ - plx_init(pdev, irq, addr0); - cy_writew(addr0 + 0x68, readw(addr0 + 0x68) | 0x0900); + { + struct RUNTIME_9060 __iomem *ctl_addr = addr0; + plx_init(pdev, irq, ctl_addr); + cy_writew(&ctl_addr->intr_ctrl_stat, + readw(&ctl_addr->intr_ctrl_stat) | 0x0900); break; } + } } dev_info(&pdev->dev, "%s/PCI #%d found: %d channels starting from " @@ -5168,17 +5168,18 @@ static void __devexit cy_pci_remove(struct pci_dev *pdev) /* non-Z with old PLX */ if (!IS_CYC_Z(*cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) == PLX_9050) - cy_writeb(cinfo->ctl_addr + 0x4c, 0); + cy_writeb(cinfo->ctl_addr.p9050 + 0x4c, 0); else #ifndef CONFIG_CYZ_INTR if (!IS_CYC_Z(*cinfo)) #endif - cy_writew(cinfo->ctl_addr + 0x68, - readw(cinfo->ctl_addr + 0x68) & ~0x0900); + cy_writew(&cinfo->ctl_addr.p9060->intr_ctrl_stat, + readw(&cinfo->ctl_addr.p9060->intr_ctrl_stat) & + ~0x0900); iounmap(cinfo->base_addr); - if (cinfo->ctl_addr) - iounmap(cinfo->ctl_addr); + if (cinfo->ctl_addr.p9050) + iounmap(cinfo->ctl_addr.p9050); if (cinfo->irq #ifndef CONFIG_CYZ_INTR && !IS_CYC_Z(*cinfo) @@ -5373,8 +5374,8 @@ static void __exit cy_cleanup_module(void) /* clear interrupt */ cy_writeb(card->base_addr + Cy_ClrIntr, 0); iounmap(card->base_addr); - if (card->ctl_addr) - iounmap(card->ctl_addr); + if (card->ctl_addr.p9050) + iounmap(card->ctl_addr.p9050); if (card->irq #ifndef CONFIG_CYZ_INTR && !IS_CYC_Z(*card) diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h index 788850ba4e7..9ae03d5b359 100644 --- a/include/linux/cyclades.h +++ b/include/linux/cyclades.h @@ -507,16 +507,19 @@ struct ZFW_CTRL { /* Per card data structure */ struct cyclades_card { - void __iomem *base_addr; - void __iomem *ctl_addr; - int irq; - unsigned int num_chips; /* 0 if card absent, -1 if 
Z/PCI, else Y */ - unsigned int first_line; /* minor number of first channel on card */ - unsigned int nports; /* Number of ports in the card */ - int bus_index; /* address shift - 0 for ISA, 1 for PCI */ - int intr_enabled; /* FW Interrupt flag - 0 disabled, 1 enabled */ - spinlock_t card_lock; - struct cyclades_port *ports; + void __iomem *base_addr; + union { + void __iomem *p9050; + struct RUNTIME_9060 __iomem *p9060; + } ctl_addr; + int irq; + unsigned int num_chips; /* 0 if card absent, -1 if Z/PCI, else Y */ + unsigned int first_line; /* minor number of first channel on card */ + unsigned int nports; /* Number of ports in the card */ + int bus_index; /* address shift - 0 for ISA, 1 for PCI */ + int intr_enabled; /* FW Interrupt flag - 0 disabled, 1 enabled */ + spinlock_t card_lock; + struct cyclades_port *ports; }; /*************************************** -- cgit v1.2.3-70-g09d2 From 101b81590d8df0a74c33cf739886247c0a13f4af Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 11 Jun 2009 12:30:10 +0100 Subject: tty: cyclades, cache HW version Store the HW version locally so it does not have to be read all the time in interrupt context and the like. Signed-off-by: Jiri Slaby Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/char/cyclades.c | 32 +++++++++++--------------------- include/linux/cyclades.h | 1 + 2 files changed, 12 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index 2cbf74134f1..cf191cc1c92 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -668,8 +668,7 @@ static void cy_send_xchar(struct tty_struct *tty, char ch); #define Z_FPGA_CHECK(card) \ ((readl(&(card).ctl_addr.p9060->init_ctrl) & (1<<17)) != 0) -#define ISZLOADED(card) (((ZO_V1 == readl(&(card).ctl_addr.p9060->mail_box_0)) \ - || Z_FPGA_CHECK(card)) && \ +#define ISZLOADED(card) ((ZO_V1 == (card).hw_ver || Z_FPGA_CHECK(card)) && \ (ZFIRM_ID == readl(&((struct FIRM_ID __iomem *) \ ((card).base_addr+ID_ADDRESS))->signature))) @@ -1393,8 +1392,6 @@ cyz_fetch_msg(struct cyclades_card *cinfo, unsigned long loc_doorbell; firm_id = cinfo->base_addr + ID_ADDRESS; - if (!ISZLOADED(*cinfo)) - return -1; zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff); board_ctrl = &zfw_ctrl->board_ctrl; @@ -1619,10 +1616,8 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo) static struct BOARD_CTRL __iomem *board_ctrl; static struct CH_CTRL __iomem *ch_ctrl; static struct BUF_CTRL __iomem *buf_ctrl; - __u32 channel; + __u32 channel, param, fw_ver; __u8 cmd; - __u32 param; - __u32 hw_ver, fw_ver; int special_count; int delta_count; @@ -1630,7 +1625,6 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo) zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff); board_ctrl = &zfw_ctrl->board_ctrl; fw_ver = readl(&board_ctrl->fw_version); - hw_ver = readl(&cinfo->ctl_addr.p9060->mail_box_0); while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) { special_count = 0; @@ -2388,11 +2382,9 @@ static int cy_open(struct tty_struct *tty, struct file *filp) struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS; if (!ISZLOADED(*cinfo)) { - if (((ZE_V1 == readl(&cinfo->ctl_addr.p9060-> - mail_box_0)) && - Z_FPGA_CHECK(*cinfo)) && - (ZFIRM_HLT == readl( - &firm_id->signature))) { + if (cinfo->hw_ver == ZE_V1 && Z_FPGA_CHECK(*cinfo) && + readl(&firm_id->signature) == + ZFIRM_HLT) { printk(KERN_ERR "cyc:Cyclades-Z Error: you " "need an external power supply for " "this number of ports.\nFirmware " @@ -4336,7 +4328,6 
@@ static void cy_hangup(struct tty_struct *tty) static int __devinit cy_init_card(struct cyclades_card *cinfo) { struct cyclades_port *info; - u32 uninitialized_var(mailbox); unsigned int nports, port; unsigned short chip_number; int uninitialized_var(index); @@ -4344,8 +4335,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo) spin_lock_init(&cinfo->card_lock); if (IS_CYC_Z(*cinfo)) { /* Cyclades-Z */ - mailbox = readl(&cinfo->ctl_addr.p9060->mail_box_0); - nports = (mailbox == ZE_V1) ? ZE_V1_NPORTS : 8; + nports = (cinfo->hw_ver == ZE_V1) ? ZE_V1_NPORTS : 8; cinfo->intr_enabled = 0; cinfo->nports = 0; /* Will be correctly set later, after Z FW is loaded */ @@ -4377,7 +4367,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo) if (IS_CYC_Z(*cinfo)) { info->type = PORT_STARTECH; - if (mailbox == ZO_V1) + if (cinfo->hw_ver == ZO_V1) info->xmit_fifo_size = CYZ_FIFO_SIZE; else info->xmit_fifo_size = 4 * CYZ_FIFO_SIZE; @@ -4932,7 +4922,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev, { void __iomem *addr0 = NULL, *addr2 = NULL; char *card_name = NULL; - u32 mailbox; + u32 uninitialized_var(mailbox); unsigned int device_id, nchan = 0, card_no, i; unsigned char plx_ver; int retval, irq; @@ -5014,7 +5004,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev, plx_init(pdev, irq, addr0); - mailbox = (u32)readl(&ctl_addr->mail_box_0); + mailbox = readl(&ctl_addr->mail_box_0); addr2 = ioremap_nocache(pci_resource_start(pdev, 2), mailbox == ZE_V1 ? CyPCI_Ze_win : CyPCI_Zwin); @@ -5026,7 +5016,6 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev, if (mailbox == ZE_V1) { card_name = "Cyclades-Ze"; - readl(&ctl_addr->mail_box_0); nchan = ZE_V1_NPORTS; } else { card_name = "Cyclades-8Zo"; @@ -5089,6 +5078,8 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev, } cy_card[card_no].num_chips = nchan / 4; } else { + cy_card[card_no].hw_ver = mailbox; + cy_card[card_no].num_chips = (unsigned int)-1; #ifdef CONFIG_CYZ_INTR /* allocate IRQ only if board has an IRQ */ if (irq != 0 && irq != 255) { @@ -5101,7 +5092,6 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev, } } #endif /* CONFIG_CYZ_INTR */ - cy_card[card_no].num_chips = (unsigned int)-1; } /* set cy_card */ diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h index 9ae03d5b359..a14fe307976 100644 --- a/include/linux/cyclades.h +++ b/include/linux/cyclades.h @@ -518,6 +518,7 @@ struct cyclades_card { unsigned int nports; /* Number of ports in the card */ int bus_index; /* address shift - 0 for ISA, 1 for PCI */ int intr_enabled; /* FW Interrupt flag - 0 disabled, 1 enabled */ + u32 hw_ver; spinlock_t card_lock; struct cyclades_port *ports; }; -- cgit v1.2.3-70-g09d2 From 08a951e1693e324bd035b194002a82b9057fd9c5 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 11 Jun 2009 12:33:40 +0100 Subject: tty: cyclades, remove typedefs They are unused anyway. Signed-off-by: Jiri Slaby Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- include/linux/cyclades.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h index a14fe307976..1fbdea4f08e 100644 --- a/include/linux/cyclades.h +++ b/include/linux/cyclades.h @@ -142,19 +142,6 @@ struct CYZ_BOOT_CTRL { #ifndef DP_WINDOW_SIZE -/* #include "cyclomz.h" */ -/****************** ****************** *******************/ -/* - * The data types defined below are used in all ZFIRM interface - * data structures. 
They accomodate differences between HW - * architectures and compilers. - */ - -typedef __u64 ucdouble; /* 64 bits, unsigned */ -typedef __u32 uclong; /* 32 bits, unsigned */ -typedef __u16 ucshort; /* 16 bits, unsigned */ -typedef __u8 ucchar; /* 8 bits, unsigned */ - /* * Memory Window Sizes */ -- cgit v1.2.3-70-g09d2 From 70beaed22cbe12979e55d99b370e147e2e168562 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 11 Jun 2009 12:39:12 +0100 Subject: serial: refactor ASYNC_ flags Define ASYNCB_* flags which are bit numbers of the ASYNC_* flags. This is useful for {test,set,clear}_bit. Also convert each ASYNC_% to be (1 << ASYNCB_%) and define masks with the macros, not constants. Tested with: #include "PATH_TO_KERNEL/include/linux/serial.h" static struct { unsigned int new, old; } as[] = { { ASYNC_HUP_NOTIFY, 0x0001 }, { ASYNC_FOURPORT, 0x0002 }, ... { ASYNC_BOOT_ONLYMCA, 0x00400000 }, { ASYNC_INTERNAL_FLAGS, 0xFFC00000 } }; ... for (a = 0; a < ARRAY_SIZE(as); a++) if (as[a].old != as[a].new) printf("%.8x != %.8x\n", as[a].old, as[a].new); Signed-off-by: Jiri Slaby Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- include/linux/serial.h | 116 +++++++++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial.h b/include/linux/serial.h index 9136cc5608c..e5bb75a6380 100644 --- a/include/linux/serial.h +++ b/include/linux/serial.h @@ -96,54 +96,76 @@ struct serial_uart_config { /* * Definitions for async_struct (and serial_struct) flags field + * + * Define ASYNCB_* for convenient use with {test,set,clear}_bit. */ -#define ASYNC_HUP_NOTIFY 0x0001 /* Notify getty on hangups and closes - on the callout port */ -#define ASYNC_FOURPORT 0x0002 /* Set OU1, OUT2 per AST Fourport settings */ -#define ASYNC_SAK 0x0004 /* Secure Attention Key (Orange book) */ -#define ASYNC_SPLIT_TERMIOS 0x0008 /* Separate termios for dialin/callout */ - -#define ASYNC_SPD_MASK 0x1030 -#define ASYNC_SPD_HI 0x0010 /* Use 56000 instead of 38400 bps */ - -#define ASYNC_SPD_VHI 0x0020 /* Use 115200 instead of 38400 bps */ -#define ASYNC_SPD_CUST 0x0030 /* Use user-specified divisor */ - -#define ASYNC_SKIP_TEST 0x0040 /* Skip UART test during autoconfiguration */ -#define ASYNC_AUTO_IRQ 0x0080 /* Do automatic IRQ during autoconfiguration */ -#define ASYNC_SESSION_LOCKOUT 0x0100 /* Lock out cua opens based on session */ -#define ASYNC_PGRP_LOCKOUT 0x0200 /* Lock out cua opens based on pgrp */ -#define ASYNC_CALLOUT_NOHUP 0x0400 /* Don't do hangups for cua device */ - -#define ASYNC_HARDPPS_CD 0x0800 /* Call hardpps when CD goes high */ - -#define ASYNC_SPD_SHI 0x1000 /* Use 230400 instead of 38400 bps */ -#define ASYNC_SPD_WARP 0x1010 /* Use 460800 instead of 38400 bps */ - -#define ASYNC_LOW_LATENCY 0x2000 /* Request low latency behaviour */ - -#define ASYNC_BUGGY_UART 0x4000 /* This is a buggy UART, skip some safety - * checks. Note: can be dangerous! 
*/ - -#define ASYNC_AUTOPROBE 0x8000 /* Port was autoprobed by PCI or PNP code */ - -#define ASYNC_FLAGS 0x7FFF /* Possible legal async flags */ -#define ASYNC_USR_MASK 0x3430 /* Legal flags that non-privileged - * users can set or reset */ - -/* Internal flags used only by kernel/chr_drv/serial.c */ -#define ASYNC_INITIALIZED 0x80000000 /* Serial port was initialized */ -#define ASYNC_NORMAL_ACTIVE 0x20000000 /* Normal device is active */ -#define ASYNC_BOOT_AUTOCONF 0x10000000 /* Autoconfigure port on bootup */ -#define ASYNC_CLOSING 0x08000000 /* Serial port is closing */ -#define ASYNC_CTS_FLOW 0x04000000 /* Do CTS flow control */ -#define ASYNC_CHECK_CD 0x02000000 /* i.e., CLOCAL */ -#define ASYNC_SHARE_IRQ 0x01000000 /* for multifunction cards - --- no longer used */ -#define ASYNC_CONS_FLOW 0x00800000 /* flow control for console */ - -#define ASYNC_BOOT_ONLYMCA 0x00400000 /* Probe only if MCA bus */ -#define ASYNC_INTERNAL_FLAGS 0xFFC00000 /* Internal flags */ +#define ASYNCB_HUP_NOTIFY 0 /* Notify getty on hangups and closes + * on the callout port */ +#define ASYNCB_FOURPORT 1 /* Set OU1, OUT2 per AST Fourport settings */ +#define ASYNCB_SAK 2 /* Secure Attention Key (Orange book) */ +#define ASYNCB_SPLIT_TERMIOS 3 /* Separate termios for dialin/callout */ +#define ASYNCB_SPD_HI 4 /* Use 56000 instead of 38400 bps */ +#define ASYNCB_SPD_VHI 5 /* Use 115200 instead of 38400 bps */ +#define ASYNCB_SKIP_TEST 6 /* Skip UART test during autoconfiguration */ +#define ASYNCB_AUTO_IRQ 7 /* Do automatic IRQ during + * autoconfiguration */ +#define ASYNCB_SESSION_LOCKOUT 8 /* Lock out cua opens based on session */ +#define ASYNCB_PGRP_LOCKOUT 9 /* Lock out cua opens based on pgrp */ +#define ASYNCB_CALLOUT_NOHUP 10 /* Don't do hangups for cua device */ +#define ASYNCB_HARDPPS_CD 11 /* Call hardpps when CD goes high */ +#define ASYNCB_SPD_SHI 12 /* Use 230400 instead of 38400 bps */ +#define ASYNCB_LOW_LATENCY 13 /* Request low latency behaviour */ +#define ASYNCB_BUGGY_UART 14 /* This is a buggy UART, skip some safety + * checks. Note: can be dangerous! 
*/ +#define ASYNCB_AUTOPROBE 15 /* Port was autoprobed by PCI or PNP code */ +#define ASYNCB_LAST_USER 15 + +/* Internal flags used only by kernel */ +#define ASYNCB_INITIALIZED 31 /* Serial port was initialized */ +#define ASYNCB_NORMAL_ACTIVE 29 /* Normal device is active */ +#define ASYNCB_BOOT_AUTOCONF 28 /* Autoconfigure port on bootup */ +#define ASYNCB_CLOSING 27 /* Serial port is closing */ +#define ASYNCB_CTS_FLOW 26 /* Do CTS flow control */ +#define ASYNCB_CHECK_CD 25 /* i.e., CLOCAL */ +#define ASYNCB_SHARE_IRQ 24 /* for multifunction cards, no longer used */ +#define ASYNCB_CONS_FLOW 23 /* flow control for console */ +#define ASYNCB_BOOT_ONLYMCA 22 /* Probe only if MCA bus */ +#define ASYNCB_FIRST_KERNEL 22 + +#define ASYNC_HUP_NOTIFY (1U << ASYNCB_HUP_NOTIFY) +#define ASYNC_FOURPORT (1U << ASYNCB_FOURPORT) +#define ASYNC_SAK (1U << ASYNCB_SAK) +#define ASYNC_SPLIT_TERMIOS (1U << ASYNCB_SPLIT_TERMIOS) +#define ASYNC_SPD_HI (1U << ASYNCB_SPD_HI) +#define ASYNC_SPD_VHI (1U << ASYNCB_SPD_VHI) +#define ASYNC_SKIP_TEST (1U << ASYNCB_SKIP_TEST) +#define ASYNC_AUTO_IRQ (1U << ASYNCB_AUTO_IRQ) +#define ASYNC_SESSION_LOCKOUT (1U << ASYNCB_SESSION_LOCKOUT) +#define ASYNC_PGRP_LOCKOUT (1U << ASYNCB_PGRP_LOCKOUT) +#define ASYNC_CALLOUT_NOHUP (1U << ASYNCB_CALLOUT_NOHUP) +#define ASYNC_HARDPPS_CD (1U << ASYNCB_HARDPPS_CD) +#define ASYNC_SPD_SHI (1U << ASYNCB_SPD_SHI) +#define ASYNC_LOW_LATENCY (1U << ASYNCB_LOW_LATENCY) +#define ASYNC_BUGGY_UART (1U << ASYNCB_BUGGY_UART) +#define ASYNC_AUTOPROBE (1U << ASYNCB_AUTOPROBE) + +#define ASYNC_FLAGS ((1U << ASYNCB_LAST_USER) - 1) +#define ASYNC_USR_MASK (ASYNC_SPD_HI|ASYNC_SPD_VHI| \ + ASYNC_CALLOUT_NOHUP|ASYNC_SPD_SHI|ASYNC_LOW_LATENCY) +#define ASYNC_SPD_CUST (ASYNC_SPD_HI|ASYNC_SPD_VHI) +#define ASYNC_SPD_WARP (ASYNC_SPD_HI|ASYNC_SPD_SHI) +#define ASYNC_SPD_MASK (ASYNC_SPD_HI|ASYNC_SPD_VHI|ASYNC_SPD_SHI) + +#define ASYNC_INITIALIZED (1U << ASYNCB_INITIALIZED) +#define ASYNC_NORMAL_ACTIVE (1U << ASYNCB_NORMAL_ACTIVE) +#define ASYNC_BOOT_AUTOCONF (1U << ASYNCB_BOOT_AUTOCONF) +#define ASYNC_CLOSING (1U << ASYNCB_CLOSING) +#define ASYNC_CTS_FLOW (1U << ASYNCB_CTS_FLOW) +#define ASYNC_CHECK_CD (1U << ASYNCB_CHECK_CD) +#define ASYNC_SHARE_IRQ (1U << ASYNCB_SHARE_IRQ) +#define ASYNC_CONS_FLOW (1U << ASYNCB_CONS_FLOW) +#define ASYNC_BOOT_ONLYMCA (1U << ASYNCB_BOOT_ONLYMCA) +#define ASYNC_INTERNAL_FLAGS (~((1U << ASYNCB_FIRST_KERNEL) - 1)) /* * Multiport serial configuration structure --- external structure -- cgit v1.2.3-70-g09d2 From 70fd8fdecc4430ffcede7704dd812d4054d1faf9 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 11 Jun 2009 12:41:57 +0100 Subject: 8250_pci: add the OXCB950 chip to the 8250 PCI driver. 
This adds support for the following serial controller chip: Oxford Semiconductor OXCB950 for PCI Cardbus interface http://www.transdimension.com/products/serial/OXCB950.html on this card: ExSys EX-1370 1 port high-speed serial card for ExpressCard/34 slot Signed-off-by: Andre Przywara Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/serial/8250_pci.c | 3 +++ include/linux/pci_ids.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c index 938bc1b6c3f..e371a9c1534 100644 --- a/drivers/serial/8250_pci.c +++ b/drivers/serial/8250_pci.c @@ -2776,6 +2776,9 @@ static struct pci_device_id serial_pci_tbl[] = { { PCI_VENDOR_ID_OXSEMI, 0x950a, PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_b0_2_1130000 }, + { PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_C950, + PCI_VENDOR_ID_OXSEMI, PCI_SUBDEVICE_ID_OXSEMI_C950, 0, 0, + pbn_b0_1_921600 }, { PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954, PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_b0_4_115200 }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 0f71812d67d..d7d1c41a0b1 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1996,10 +1996,12 @@ #define PCI_DEVICE_ID_OXSEMI_PCIe952_1_U 0xC118 #define PCI_DEVICE_ID_OXSEMI_PCIe952_1_GU 0xC11C #define PCI_DEVICE_ID_OXSEMI_16PCI954 0x9501 +#define PCI_DEVICE_ID_OXSEMI_C950 0x950B #define PCI_DEVICE_ID_OXSEMI_16PCI95N 0x9511 #define PCI_DEVICE_ID_OXSEMI_16PCI954PP 0x9513 #define PCI_DEVICE_ID_OXSEMI_16PCI952 0x9521 #define PCI_DEVICE_ID_OXSEMI_16PCI952PP 0x9523 +#define PCI_SUBDEVICE_ID_OXSEMI_C950 0x0001 #define PCI_VENDOR_ID_CHELSIO 0x1425 -- cgit v1.2.3-70-g09d2 From 38db89799bdf11625a831c5af33938dcb11908b6 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Jun 2009 12:44:17 +0100 Subject: tty: throttling race fix The tty throttling code can race due to the lock drops. It takes very high loads but this has been observed and verified by Rob Duncan. The basic problem is that on an SMP box the following interleaving is possible: CPU #1 decides it needs to throttle and concludes that it should; meanwhile CPU #2 sees buffer space cleared, checks whether the port is throttled (yes) and unthrottles it; CPU #1 then goes on to call the throttle method anyway. This changeset takes the termios lock to protect against this. The termios lock isn't the initial obvious candidate but many implementations of throttle methods already need to poke around their own termios structures (and nobody really locks them against a racing change of flow control). This does mean that anyone who is setting tty->low_latency = 1 and then calling tty_flip_buffer_push from their unthrottle method is going to end up collapsing in a pile of locks. However we've removed all the known bogus users of low_latency = 1 and such use isn't safe anyway for other reasons so catching it would be an improvement. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/char/tty_ioctl.c | 13 +++++++++++++ include/linux/tty_driver.h | 6 ++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index 6f4c7d0a53b..2401dbcbee9 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -97,14 +97,19 @@ EXPORT_SYMBOL(tty_driver_flush_buffer); * @tty: terminal * * Indicate that a tty should stop transmitting data down the stack. + * Takes the termios mutex to protect against parallel throttle/unthrottle + * and also to ensure the driver can consistently reference its own + * termios data at this point when implementing software flow control. 
*/ void tty_throttle(struct tty_struct *tty) { + mutex_lock(&tty->termios_mutex); /* check TTY_THROTTLED first so it indicates our state */ if (!test_and_set_bit(TTY_THROTTLED, &tty->flags) && tty->ops->throttle) tty->ops->throttle(tty); + mutex_unlock(&tty->termios_mutex); } EXPORT_SYMBOL(tty_throttle); @@ -113,13 +118,21 @@ EXPORT_SYMBOL(tty_throttle); * @tty: terminal * * Indicate that a tty may continue transmitting data down the stack. + * Takes the termios mutex to protect against parallel throttle/unthrottle + * and also to ensure the driver can consistently reference its own + * termios data at this point when implementing software flow control. + * + * Drivers should however remember that the stack can issue a throttle, + * then change flow control method, then unthrottle. */ void tty_unthrottle(struct tty_struct *tty) { + mutex_lock(&tty->termios_mutex); if (test_and_clear_bit(TTY_THROTTLED, &tty->flags) && tty->ops->unthrottle) tty->ops->unthrottle(tty); + mutex_unlock(&tty->termios_mutex); } EXPORT_SYMBOL(tty_unthrottle); diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index bcba84ea2d8..3566129384a 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -127,7 +127,8 @@ * the line discipline are close to full, and it should somehow * signal that no more characters should be sent to the tty. * - * Optional: Always invoke via tty_throttle(); + * Optional: Always invoke via tty_throttle(), called under the + * termios lock. * * void (*unthrottle)(struct tty_struct * tty); * @@ -135,7 +136,8 @@ * that characters can now be sent to the tty without fear of * overrunning the input buffers of the line disciplines. * - * Optional: Always invoke via tty_unthrottle(); + * Optional: Always invoke via tty_unthrottle(), called under the + * termios lock. * * void (*stop)(struct tty_struct *tty); * -- cgit v1.2.3-70-g09d2 From e8b70e7d3e86319a8b2aaabde3866833d92cd80f Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Jun 2009 12:48:02 +0100 Subject: tty: Extract various bits of ldisc code Before trying to tackle the ldisc bugs the code needs to be a good deal more readable, so do the simple extractions of routines first. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 24 +++++++++--- drivers/char/tty_ldisc.c | 97 ++++++++++++++++++++++++++++++++---------------- include/linux/tty.h | 3 ++ 3 files changed, 88 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 6c817398232..be49d0730bb 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2481,6 +2481,24 @@ static int tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int return tty->ops->tiocmset(tty, file, set, clear); } +struct tty_struct *tty_pair_get_tty(struct tty_struct *tty) +{ + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_MASTER) + tty = tty->link; + return tty; +} +EXPORT_SYMBOL(tty_pair_get_tty); + +struct tty_struct *tty_pair_get_pty(struct tty_struct *tty) +{ + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_MASTER) + return tty; + return tty->link; +} +EXPORT_SYMBOL(tty_pair_get_pty); + /* * Split this up, as gcc can choke on it otherwise.. 
*/ @@ -2496,11 +2514,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (tty_paranoia_check(tty, inode, "tty_ioctl")) return -EINVAL; - real_tty = tty; - if (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_MASTER) - real_tty = tty->link; - + real_tty = tty_pair_get_tty(tty); /* * Factor out some common prep work diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index f78f5b0127a..e3c6416aa86 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -443,6 +443,50 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) } } +/** + * tty_ldisc_halt - shutdown the line discipline + * @tty: tty device + * + * Shut down the line discipline and work queue for this tty device. + * The TTY_LDISC flag being cleared ensures no further references can + * be obtained while the delayed work queue halt ensures that no more + * data is fed to the ldisc. + * + * In order to wait for any existing references to complete see + * tty_ldisc_wait_idle. + */ + +static void tty_ldisc_halt(struct tty_struct *tty) +{ + clear_bit(TTY_LDISC, &tty->flags); + cancel_delayed_work(&tty->buf.work); + /* + * Wait for ->hangup_work and ->buf.work handlers to terminate + */ + flush_scheduled_work(); +} + +/** + * tty_ldisc_wait_idle - wait for the ldisc to become idle + * @tty: tty to wait for + * + * Wait for the line discipline to become idle. The discipline must + * have been halted for this to guarantee it remains idle. + * + */ + +static void tty_ldisc_wait_idle(struct tty_struct *tty) +{ + unsigned long flags; + spin_lock_irqsave(&tty_ldisc_lock, flags); + while (tty->ldisc.refcount) { + spin_unlock_irqrestore(&tty_ldisc_lock, flags); + wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0); + spin_lock_irqsave(&tty_ldisc_lock, flags); + } + spin_unlock_irqrestore(&tty_ldisc_lock, flags); +} + /** * tty_set_ldisc - set line discipline * @tty: the terminal to set @@ -636,6 +680,21 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty) return 0; } +static void tty_ldisc_reinit(struct tty_struct *tty) +{ + struct tty_ldisc ld; + + if (tty->ldisc.ops->close) + (tty->ldisc.ops->close)(tty); + tty_ldisc_put(tty->ldisc.ops); + /* + * Switch the line discipline back + */ + WARN_ON(tty_ldisc_get(N_TTY, &ld)); + tty_ldisc_assign(tty, &ld); + tty_set_termios_ldisc(tty, N_TTY); +} + /** * tty_ldisc_release - release line discipline * @tty: tty being shut down @@ -647,58 +706,34 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty) void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty) { - unsigned long flags; - struct tty_ldisc ld; + /* * Prevent flush_to_ldisc() from rescheduling the work for later. Then * kill any delayed work. As this is the final close it does not * race with the set_ldisc code path. */ - clear_bit(TTY_LDISC, &tty->flags); - cancel_delayed_work(&tty->buf.work); - /* - * Wait for ->hangup_work and ->buf.work handlers to terminate - */ - - flush_scheduled_work(); + tty_ldisc_halt(tty); /* * Wait for any short term users (we know they are just driver * side waiters as the file is closing so user count on the file * side is zero. 
*/ - spin_lock_irqsave(&tty_ldisc_lock, flags); - while (tty->ldisc.refcount) { - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0); - spin_lock_irqsave(&tty_ldisc_lock, flags); - } - spin_unlock_irqrestore(&tty_ldisc_lock, flags); + + tty_ldisc_wait_idle(tty); + /* * Shutdown the current line discipline, and reset it to N_TTY. * * FIXME: this MUST get fixed for the new reflocking */ - if (tty->ldisc.ops->close) - (tty->ldisc.ops->close)(tty); - tty_ldisc_put(tty->ldisc.ops); - /* - * Switch the line discipline back - */ - WARN_ON(tty_ldisc_get(N_TTY, &ld)); - tty_ldisc_assign(tty, &ld); - tty_set_termios_ldisc(tty, N_TTY); + tty_ldisc_reinit(tty); if (o_tty) { /* FIXME: could o_tty be in setldisc here ? */ clear_bit(TTY_LDISC, &o_tty->flags); - if (o_tty->ldisc.ops->close) - (o_tty->ldisc.ops->close)(o_tty); - tty_ldisc_put(o_tty->ldisc.ops); - WARN_ON(tty_ldisc_get(N_TTY, &ld)); - tty_ldisc_assign(o_tty, &ld); - tty_set_termios_ldisc(o_tty, N_TTY); + tty_ldisc_reinit(o_tty); } } diff --git a/include/linux/tty.h b/include/linux/tty.h index bed5a3d4030..f9c13c83790 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -428,6 +428,9 @@ extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx, extern void tty_release_dev(struct file *filp); extern int tty_init_termios(struct tty_struct *tty); +extern struct tty_struct *tty_pair_get_tty(struct tty_struct *tty); +extern struct tty_struct *tty_pair_get_pty(struct tty_struct *tty); + extern struct mutex tty_mutex; extern void tty_write_unlock(struct tty_struct *tty); -- cgit v1.2.3-70-g09d2 From c65c9bc3efa5589f691276bb9db689119a711222 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Jun 2009 12:50:12 +0100 Subject: tty: rewrite the ldisc locking There are several pretty much unfixable races in the old ldisc code, especially with respect to pty behaviour and also to hangup. It's easier to rewrite the code than to simply patch it up. This patch:

 - splits the ldisc from the tty (so we will be able to refcount it more cleanly later)
 - introduces a mutex lock for ldisc changing on an active device
 - fixes the complete mess that hangup caused
 - implements hopefully correct setldisc/close/hangup locking

There are still some problems around pty pairs that have always been there, but at least it is now possible to understand the code and fix further problems. This fixes the following known bugs:

 - hang up can leak ldisc references
 - hang up may not call open/close on the ldisc in a matched way
 - pty/tty pairs can deadlock during an ldisc change
 - reading the ldisc proc files can cause every ldisc to be loaded

and probably a few more of the mysterious ldisc race reports. I'm sure it also adds the odd new one.
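For illustration, the reference pattern this rewrite enables looks roughly like the sketch below. It is not part of the patch: tty_ldisc_ref()/tty_ldisc_deref() and the receive_buf() signature come from the code in this series, while the surrounding function is hypothetical. Because tty->ldisc is now a separately allocated, refcounted object, callers take a counted reference before calling into the discipline and drop it when done:

	/*
	 * Hypothetical caller, for illustration only. tty_ldisc_ref()
	 * returns NULL while the ldisc is halted or being changed, so
	 * callers must handle the "no ldisc" case rather than poking
	 * tty->ldisc directly.
	 */
	static void example_feed_ldisc(struct tty_struct *tty,
				       const unsigned char *cp, int count)
	{
		struct tty_ldisc *ld = tty_ldisc_ref(tty);

		if (ld == NULL)
			return;
		if (ld->ops->receive_buf)
			ld->ops->receive_buf(tty, cp, NULL, count);
		tty_ldisc_deref(ld);	/* dropping to zero wakes idle waiters */
	}
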
Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/bluetooth/hci_ldisc.c | 4 +- drivers/char/cyclades.c | 2 +- drivers/char/epca.c | 4 +- drivers/char/ip2/i2lib.c | 4 +- drivers/char/ip2/ip2main.c | 4 +- drivers/char/n_hdlc.c | 4 +- drivers/char/pty.c | 10 +- drivers/char/selection.c | 2 +- drivers/char/tty_io.c | 64 +----- drivers/char/tty_ldisc.c | 477 ++++++++++++++++++++++++++---------------- include/linux/tty.h | 9 +- 11 files changed, 330 insertions(+), 254 deletions(-) (limited to 'include/linux') diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index af761dc434f..68801512859 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -277,8 +277,8 @@ static int hci_uart_tty_open(struct tty_struct *tty) /* FIXME: why is this needed. Note don't use ldisc_ref here as the open path is before the ldisc is referencable */ - if (tty->ldisc.ops->flush_buffer) - tty->ldisc.ops->flush_buffer(tty); + if (tty->ldisc->ops->flush_buffer) + tty->ldisc->ops->flush_buffer(tty); tty_driver_flush_buffer(tty); return 0; diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index 45601905174..f3366d3f06c 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -5200,7 +5200,7 @@ static int cyclades_proc_show(struct seq_file *m, void *v) (cur_jifs - info->idle_stats.recv_idle)/ HZ, info->idle_stats.overruns, /* FIXME: double check locking */ - (long)info->port.tty->ldisc.ops->num); + (long)info->port.tty->ldisc->ops->num); else seq_printf(m, "%3d %8lu %10lu %8lu " "%10lu %8lu %9lu %6ld\n", diff --git a/drivers/char/epca.c b/drivers/char/epca.c index 710ee933305..abef1f7d84f 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -2114,8 +2114,8 @@ static int pc_ioctl(struct tty_struct *tty, struct file *file, tty_wait_until_sent(tty, 0); } else { /* ldisc lock already held in ioctl */ - if (tty->ldisc.ops->flush_buffer) - tty->ldisc.ops->flush_buffer(tty); + if (tty->ldisc->ops->flush_buffer) + tty->ldisc->ops->flush_buffer(tty); } unlock_kernel(); /* Fall Thru */ diff --git a/drivers/char/ip2/i2lib.c b/drivers/char/ip2/i2lib.c index 0061e18aff6..0d10b89218e 100644 --- a/drivers/char/ip2/i2lib.c +++ b/drivers/char/ip2/i2lib.c @@ -868,11 +868,11 @@ i2Input(i2ChanStrPtr pCh) amountToMove = count; } // Move the first block - pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY, + pCh->pTTY->ldisc->ops->receive_buf( pCh->pTTY, &(pCh->Ibuf[stripIndex]), NULL, amountToMove ); // If we needed to wrap, do the second data move if (count > amountToMove) { - pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY, + pCh->pTTY->ldisc->ops->receive_buf( pCh->pTTY, pCh->Ibuf, NULL, count - amountToMove ); } // Bump and wrap the stripIndex all at once by the amount of data read. 
This diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c index afd9247cf08..517271c762e 100644 --- a/drivers/char/ip2/ip2main.c +++ b/drivers/char/ip2/ip2main.c @@ -1315,8 +1315,8 @@ static inline void isig(int sig, struct tty_struct *tty, int flush) if (tty->pgrp) kill_pgrp(tty->pgrp, sig, 1); if (flush || !L_NOFLSH(tty)) { - if ( tty->ldisc.ops->flush_buffer ) - tty->ldisc.ops->flush_buffer(tty); + if ( tty->ldisc->ops->flush_buffer ) + tty->ldisc->ops->flush_buffer(tty); i2InputFlush( tty->driver_data ); } } diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c index bacb3e2872a..461ece591a5 100644 --- a/drivers/char/n_hdlc.c +++ b/drivers/char/n_hdlc.c @@ -342,8 +342,8 @@ static int n_hdlc_tty_open (struct tty_struct *tty) #endif /* Flush any pending characters in the driver and discipline. */ - if (tty->ldisc.ops->flush_buffer) - tty->ldisc.ops->flush_buffer(tty); + if (tty->ldisc->ops->flush_buffer) + tty->ldisc->ops->flush_buffer(tty); tty_driver_flush_buffer(tty); diff --git a/drivers/char/pty.c b/drivers/char/pty.c index da2cb8c70c1..5acd29e6e04 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -110,7 +110,7 @@ static int pty_write(struct tty_struct *tty, const unsigned char *buf, c = to->receive_room; if (c > count) c = count; - to->ldisc.ops->receive_buf(to, buf, NULL, c); + to->ldisc->ops->receive_buf(to, buf, NULL, c); return c; } @@ -148,11 +148,11 @@ static int pty_chars_in_buffer(struct tty_struct *tty) int count; /* We should get the line discipline lock for "tty->link" */ - if (!to || !to->ldisc.ops->chars_in_buffer) + if (!to || !to->ldisc->ops->chars_in_buffer) return 0; /* The ldisc must report 0 if no characters available to be read */ - count = to->ldisc.ops->chars_in_buffer(to); + count = to->ldisc->ops->chars_in_buffer(to); if (tty->driver->subtype == PTY_TYPE_SLAVE) return count; @@ -186,8 +186,8 @@ static void pty_flush_buffer(struct tty_struct *tty) if (!to) return; - if (to->ldisc.ops->flush_buffer) - to->ldisc.ops->flush_buffer(to); + if (to->ldisc->ops->flush_buffer) + to->ldisc->ops->flush_buffer(to); if (to->packet) { spin_lock_irqsave(&tty->ctrl_lock, flags); diff --git a/drivers/char/selection.c b/drivers/char/selection.c index cb8ca569896..f97b9e84806 100644 --- a/drivers/char/selection.c +++ b/drivers/char/selection.c @@ -327,7 +327,7 @@ int paste_selection(struct tty_struct *tty) } count = sel_buffer_lth - pasted; count = min(count, tty->receive_room); - tty->ldisc.ops->receive_buf(tty, sel_buffer + pasted, + tty->ldisc->ops->receive_buf(tty, sel_buffer + pasted, NULL, count); pasted += count; } diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index be49d0730bb..2f44b0b241c 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -491,22 +491,6 @@ void tty_ldisc_flush(struct tty_struct *tty) EXPORT_SYMBOL_GPL(tty_ldisc_flush); -/** - * tty_reset_termios - reset terminal state - * @tty: tty to reset - * - * Restore a terminal to the driver default state - */ - -static void tty_reset_termios(struct tty_struct *tty) -{ - mutex_lock(&tty->termios_mutex); - *tty->termios = tty->driver->init_termios; - tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios); - tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios); - mutex_unlock(&tty->termios_mutex); -} - /** * do_tty_hangup - actual handler for hangup events * @work: tty device @@ -536,7 +520,6 @@ static void do_tty_hangup(struct work_struct *work) struct file *cons_filp = NULL; struct file *filp, *f = NULL; struct task_struct *p; - struct 
tty_ldisc *ld; int closecount = 0, n; unsigned long flags; int refs = 0; @@ -567,40 +550,8 @@ static void do_tty_hangup(struct work_struct *work) filp->f_op = &hung_up_tty_fops; } file_list_unlock(); - /* - * FIXME! What are the locking issues here? This may me overdoing - * things... This question is especially important now that we've - * removed the irqlock. - */ - ld = tty_ldisc_ref(tty); - if (ld != NULL) { - /* We may have no line discipline at this point */ - if (ld->ops->flush_buffer) - ld->ops->flush_buffer(tty); - tty_driver_flush_buffer(tty); - if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && - ld->ops->write_wakeup) - ld->ops->write_wakeup(tty); - if (ld->ops->hangup) - ld->ops->hangup(tty); - } - /* - * FIXME: Once we trust the LDISC code better we can wait here for - * ldisc completion and fix the driver call race - */ - wake_up_interruptible_poll(&tty->write_wait, POLLOUT); - wake_up_interruptible_poll(&tty->read_wait, POLLIN); - /* - * Shutdown the current line discipline, and reset it to - * N_TTY. - */ - if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) - tty_reset_termios(tty); - /* Defer ldisc switch */ - /* tty_deferred_ldisc_switch(N_TTY); - This should get done automatically when the port closes and - tty_release is called */ + tty_ldisc_hangup(tty); read_lock(&tasklist_lock); if (tty->session) { @@ -629,12 +580,15 @@ static void do_tty_hangup(struct work_struct *work) read_unlock(&tasklist_lock); spin_lock_irqsave(&tty->ctrl_lock, flags); - tty->flags = 0; + clear_bit(TTY_THROTTLED, &tty->flags); + clear_bit(TTY_PUSH, &tty->flags); + clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); put_pid(tty->session); put_pid(tty->pgrp); tty->session = NULL; tty->pgrp = NULL; tty->ctrl_status = 0; + set_bit(TTY_HUPPED, &tty->flags); spin_unlock_irqrestore(&tty->ctrl_lock, flags); /* Account for the p->signal references we killed */ @@ -660,10 +614,7 @@ static void do_tty_hangup(struct work_struct *work) * can't yet guarantee all that. */ set_bit(TTY_HUPPED, &tty->flags); - if (ld) { - tty_ldisc_enable(tty); - tty_ldisc_deref(ld); - } + tty_ldisc_enable(tty); unlock_kernel(); if (f) fput(f); @@ -2570,7 +2521,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCGSID: return tiocgsid(tty, real_tty, p); case TIOCGETD: - return put_user(tty->ldisc.ops->num, (int __user *)p); + return put_user(tty->ldisc->ops->num, (int __user *)p); case TIOCSETD: return tiocsetd(tty, p); /* @@ -2785,6 +2736,7 @@ void initialize_tty_struct(struct tty_struct *tty, tty->buf.head = tty->buf.tail = NULL; tty_buffer_init(tty); mutex_init(&tty->termios_mutex); + mutex_init(&tty->ldisc_mutex); init_waitqueue_head(&tty->write_wait); init_waitqueue_head(&tty->read_wait); INIT_WORK(&tty->hangup_work, do_tty_hangup); diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index e3c6416aa86..a58a19a6a5e 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -115,19 +115,22 @@ EXPORT_SYMBOL(tty_unregister_ldisc); /** * tty_ldisc_try_get - try and reference an ldisc * @disc: ldisc number - * @ld: tty ldisc structure to complete * * Attempt to open and lock a line discipline into place. Return - * the line discipline refcounted and assigned in ld. On an error - * report the error code back + * the line discipline refcounted or an error. 
*/ -static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld) +static struct tty_ldisc *tty_ldisc_try_get(int disc) { unsigned long flags; + struct tty_ldisc *ld; struct tty_ldisc_ops *ldops; int err = -EINVAL; + ld = kmalloc(sizeof(struct tty_ldisc), GFP_KERNEL); + if (ld == NULL) + return ERR_PTR(-ENOMEM); + spin_lock_irqsave(&tty_ldisc_lock, flags); ld->ops = NULL; ldops = tty_ldiscs[disc]; @@ -140,17 +143,19 @@ static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld) /* lock it */ ldops->refcount++; ld->ops = ldops; + ld->refcount = 0; err = 0; } } spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return err; + if (err) + return ERR_PTR(err); + return ld; } /** * tty_ldisc_get - take a reference to an ldisc * @disc: ldisc number - * @ld: tty line discipline structure to use * * Takes a reference to a line discipline. Deals with refcounts and * module locking counts. Returns NULL if the discipline is not available. @@ -161,44 +166,46 @@ static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld) * takes tty_ldisc_lock to guard against ldisc races */ -static int tty_ldisc_get(int disc, struct tty_ldisc *ld) +static struct tty_ldisc *tty_ldisc_get(int disc) { - int err; + struct tty_ldisc *ld; if (disc < N_TTY || disc >= NR_LDISCS) - return -EINVAL; - err = tty_ldisc_try_get(disc, ld); - if (err < 0) { + return ERR_PTR(-EINVAL); + ld = tty_ldisc_try_get(disc); + if (IS_ERR(ld)) { request_module("tty-ldisc-%d", disc); - err = tty_ldisc_try_get(disc, ld); + ld = tty_ldisc_try_get(disc); } - return err; + return ld; } /** * tty_ldisc_put - drop ldisc reference - * @disc: ldisc number + * @ld: ldisc * * Drop a reference to a line discipline. Manage refcounts and - * module usage counts + * module usage counts. Free the ldisc once the recount hits zero. * * Locking: * takes tty_ldisc_lock to guard against ldisc races */ -static void tty_ldisc_put(struct tty_ldisc_ops *ld) +static void tty_ldisc_put(struct tty_ldisc *ld) { unsigned long flags; - int disc = ld->num; + int disc = ld->ops->num; + struct tty_ldisc_ops *ldo; BUG_ON(disc < N_TTY || disc >= NR_LDISCS); spin_lock_irqsave(&tty_ldisc_lock, flags); - ld = tty_ldiscs[disc]; - BUG_ON(ld->refcount == 0); - ld->refcount--; - module_put(ld->owner); + ldo = tty_ldiscs[disc]; + BUG_ON(ldo->refcount == 0); + ldo->refcount--; + module_put(ldo->owner); spin_unlock_irqrestore(&tty_ldisc_lock, flags); + kfree(ld); } static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) @@ -219,12 +226,13 @@ static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) static int tty_ldiscs_seq_show(struct seq_file *m, void *v) { int i = *(loff_t *)v; - struct tty_ldisc ld; + struct tty_ldisc *ld; - if (tty_ldisc_get(i, &ld) < 0) + ld = tty_ldisc_try_get(i); + if (IS_ERR(ld)) return 0; - seq_printf(m, "%-10s %2d\n", ld.ops->name ? ld.ops->name : "???", i); - tty_ldisc_put(ld.ops); + seq_printf(m, "%-10s %2d\n", ld->ops->name ? 
ld->ops->name : "???", i); + tty_ldisc_put(ld); return 0; } @@ -263,8 +271,7 @@ const struct file_operations tty_ldiscs_proc_fops = { static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld) { - ld->refcount = 0; - tty->ldisc = *ld; + tty->ldisc = ld; } /** @@ -286,7 +293,7 @@ static int tty_ldisc_try(struct tty_struct *tty) int ret = 0; spin_lock_irqsave(&tty_ldisc_lock, flags); - ld = &tty->ldisc; + ld = tty->ldisc; if (test_bit(TTY_LDISC, &tty->flags)) { ld->refcount++; ret = 1; @@ -315,8 +322,8 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) { /* wait_event is a macro */ wait_event(tty_ldisc_wait, tty_ldisc_try(tty)); - WARN_ON(tty->ldisc.refcount == 0); - return &tty->ldisc; + WARN_ON(tty->ldisc->refcount == 0); + return tty->ldisc; } EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); @@ -335,7 +342,7 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty) { if (tty_ldisc_try(tty)) - return &tty->ldisc; + return tty->ldisc; return NULL; } @@ -407,6 +414,39 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) mutex_unlock(&tty->termios_mutex); } +/** + * tty_ldisc_open - open a line discipline + * @tty: tty we are opening the ldisc on + * @ld: discipline to open + * + * A helper opening method. Also a convenient debugging and check + * point. + */ + +static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld) +{ + WARN_ON(test_and_set_bit(TTY_LDISC_OPEN, &tty->flags)); + if (ld->ops->open) + return ld->ops->open(tty); + return 0; +} + +/** + * tty_ldisc_close - close a line discipline + * @tty: tty we are opening the ldisc on + * @ld: discipline to close + * + * A helper close method. Also a convenient debugging and check + * point. + */ + +static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld) +{ + WARN_ON(!test_bit(TTY_LDISC_OPEN, &tty->flags)); + clear_bit(TTY_LDISC_OPEN, &tty->flags); + if (ld->ops->close) + ld->ops->close(tty); +} /** * tty_ldisc_restore - helper for tty ldisc change @@ -420,31 +460,32 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) { char buf[64]; - struct tty_ldisc new_ldisc; + struct tty_ldisc *new_ldisc; + int r; /* There is an outstanding reference here so this is safe */ - tty_ldisc_get(old->ops->num, old); + old = tty_ldisc_get(old->ops->num); + WARN_ON(IS_ERR(old)); tty_ldisc_assign(tty, old); tty_set_termios_ldisc(tty, old->ops->num); - if (old->ops->open && (old->ops->open(tty) < 0)) { - tty_ldisc_put(old->ops); + if (tty_ldisc_open(tty, old) < 0) { + tty_ldisc_put(old); /* This driver is always present */ - if (tty_ldisc_get(N_TTY, &new_ldisc) < 0) + new_ldisc =tty_ldisc_get(N_TTY); + if (IS_ERR(new_ldisc)) panic("n_tty: get"); - tty_ldisc_assign(tty, &new_ldisc); + tty_ldisc_assign(tty, new_ldisc); tty_set_termios_ldisc(tty, N_TTY); - if (new_ldisc.ops->open) { - int r = new_ldisc.ops->open(tty); - if (r < 0) - panic("Couldn't open N_TTY ldisc for " - "%s --- error %d.", - tty_name(tty, buf), r); - } + r = tty_ldisc_open(tty, new_ldisc); + if (r < 0) + panic("Couldn't open N_TTY ldisc for " + "%s --- error %d.", + tty_name(tty, buf), r); } } /** - * tty_ldisc_halt - shutdown the line discipline + * tty_ldisc_halt - shut down the line discipline * @tty: tty device * * Shut down the line discipline and work queue for this tty device. 
@@ -456,14 +497,10 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) * tty_ldisc_wait_idle. */ -static void tty_ldisc_halt(struct tty_struct *tty) +static int tty_ldisc_halt(struct tty_struct *tty) { clear_bit(TTY_LDISC, &tty->flags); - cancel_delayed_work(&tty->buf.work); - /* - * Wait for ->hangup_work and ->buf.work handlers to terminate - */ - flush_scheduled_work(); + return cancel_delayed_work(&tty->buf.work); } /** @@ -473,18 +510,22 @@ static void tty_ldisc_halt(struct tty_struct *tty) * Wait for the line discipline to become idle. The discipline must * have been halted for this to guarantee it remains idle. * + * tty_ldisc_lock protects the ref counts currently. */ -static void tty_ldisc_wait_idle(struct tty_struct *tty) +static int tty_ldisc_wait_idle(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty_ldisc_lock, flags); - while (tty->ldisc.refcount) { + while (tty->ldisc->refcount) { spin_unlock_irqrestore(&tty_ldisc_lock, flags); - wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0); + if (wait_event_timeout(tty_ldisc_wait, + tty->ldisc->refcount == 0, 5 * HZ) == 0) + return -EBUSY; spin_lock_irqsave(&tty_ldisc_lock, flags); } spin_unlock_irqrestore(&tty_ldisc_lock, flags); + return 0; } /** @@ -493,38 +534,63 @@ static void tty_ldisc_wait_idle(struct tty_struct *tty) * @ldisc: the line discipline * * Set the discipline of a tty line. Must be called from a process - * context. + * context. The ldisc change logic has to protect itself against any + * overlapping ldisc change (including on the other end of pty pairs), + * the close of one side of a tty/pty pair, and eventually hangup. * - * Locking: takes tty_ldisc_lock. - * called functions take termios_mutex + * Locking: takes tty_ldisc_lock, termios_mutex */ int tty_set_ldisc(struct tty_struct *tty, int ldisc) { int retval; - struct tty_ldisc o_ldisc, new_ldisc; - int work; - unsigned long flags; + struct tty_ldisc *o_ldisc, *new_ldisc; + int work, o_work = 0; struct tty_struct *o_tty; -restart: - /* This is a bit ugly for now but means we can break the 'ldisc - is part of the tty struct' assumption later */ - retval = tty_ldisc_get(ldisc, &new_ldisc); - if (retval) - return retval; + new_ldisc = tty_ldisc_get(ldisc); + if (IS_ERR(new_ldisc)) + return PTR_ERR(new_ldisc); /* - * Problem: What do we do if this blocks ? + * We need to look at the tty locking here for pty/tty pairs + * when both sides try to change in parallel. */ - tty_wait_until_sent(tty, 0); + o_tty = tty->link; /* o_tty is the pty side or NULL */ + - if (tty->ldisc.ops->num == ldisc) { - tty_ldisc_put(new_ldisc.ops); + /* + * Check the no-op case + */ + + if (tty->ldisc->ops->num == ldisc) { + tty_ldisc_put(new_ldisc); return 0; } + /* + * Problem: What do we do if this blocks ? + * We could deadlock here + */ + + tty_wait_until_sent(tty, 0); + + mutex_lock(&tty->ldisc_mutex); + + /* + * We could be midstream of another ldisc change which has + * dropped the lock during processing. If so we need to wait. + */ + + while (test_bit(TTY_LDISC_CHANGING, &tty->flags)) { + mutex_unlock(&tty->ldisc_mutex); + wait_event(tty_ldisc_wait, + test_bit(TTY_LDISC_CHANGING, &tty->flags) == 0); + mutex_lock(&tty->ldisc_mutex); + } + set_bit(TTY_LDISC_CHANGING, &tty->flags); + /* * No more input please, we are switching. 
The new ldisc * will update this value in the ldisc open function @@ -533,8 +599,6 @@ restart: tty->receive_room = 0; o_ldisc = tty->ldisc; - o_tty = tty->link; - /* * Make sure we don't change while someone holds a * reference to the line discipline. The TTY_LDISC bit @@ -545,108 +609,181 @@ restart: * with a userspace app continually trying to use the tty in * parallel to the change and re-referencing the tty. */ - clear_bit(TTY_LDISC, &tty->flags); - if (o_tty) - clear_bit(TTY_LDISC, &o_tty->flags); - spin_lock_irqsave(&tty_ldisc_lock, flags); - if (tty->ldisc.refcount || (o_tty && o_tty->ldisc.refcount)) { - if (tty->ldisc.refcount) { - /* Free the new ldisc we grabbed. Must drop the lock - first. */ - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - tty_ldisc_put(o_ldisc.ops); - /* - * There are several reasons we may be busy, including - * random momentary I/O traffic. We must therefore - * retry. We could distinguish between blocking ops - * and retries if we made tty_ldisc_wait() smarter. - * That is up for discussion. - */ - if (wait_event_interruptible(tty_ldisc_wait, tty->ldisc.refcount == 0) < 0) - return -ERESTARTSYS; - goto restart; - } - if (o_tty && o_tty->ldisc.refcount) { - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - tty_ldisc_put(o_tty->ldisc.ops); - if (wait_event_interruptible(tty_ldisc_wait, o_tty->ldisc.refcount == 0) < 0) - return -ERESTARTSYS; - goto restart; - } - } - /* - * If the TTY_LDISC bit is set, then we are racing against - * another ldisc change - */ - if (test_bit(TTY_LDISC_CHANGING, &tty->flags)) { - struct tty_ldisc *ld; - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - tty_ldisc_put(new_ldisc.ops); - ld = tty_ldisc_ref_wait(tty); - tty_ldisc_deref(ld); - goto restart; - } - /* - * This flag is used to avoid two parallel ldisc changes. Once - * open and close are fine grained locked this may work better - * as a mutex shared with the open/close/hup paths - */ - set_bit(TTY_LDISC_CHANGING, &tty->flags); + work = tty_ldisc_halt(tty); if (o_tty) - set_bit(TTY_LDISC_CHANGING, &o_tty->flags); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - - /* - * From this point on we know nobody has an ldisc - * usage reference, nor can they obtain one until - * we say so later on. - */ + o_work = tty_ldisc_halt(o_tty); - work = cancel_delayed_work(&tty->buf.work); /* - * Wait for ->hangup_work and ->buf.work handlers to terminate - * MUST NOT hold locks here. + * Wait for ->hangup_work and ->buf.work handlers to terminate. + * We must drop the mutex here in case a hangup is also in process. */ + + mutex_unlock(&tty->ldisc_mutex); + flush_scheduled_work(); + + /* Let any existing reference holders finish */ + retval = tty_ldisc_wait_idle(tty); + if (retval < 0) { + clear_bit(TTY_LDISC_CHANGING, &tty->flags); + tty_ldisc_put(new_ldisc); + return retval; + } + + mutex_lock(&tty->ldisc_mutex); + if (test_bit(TTY_HUPPED, &tty->flags)) { + /* We were raced by the hangup method. It will have stomped + the ldisc data and closed the ldisc down */ + clear_bit(TTY_LDISC_CHANGING, &tty->flags); + mutex_unlock(&tty->ldisc_mutex); + tty_ldisc_put(new_ldisc); + return -EIO; + } + /* Shutdown the current discipline. */ - if (o_ldisc.ops->close) - (o_ldisc.ops->close)(tty); + tty_ldisc_close(tty, o_ldisc); /* Now set up the new line discipline. 
*/ - tty_ldisc_assign(tty, &new_ldisc); + tty_ldisc_assign(tty, new_ldisc); tty_set_termios_ldisc(tty, ldisc); - if (new_ldisc.ops->open) - retval = (new_ldisc.ops->open)(tty); + + retval = tty_ldisc_open(tty, new_ldisc); if (retval < 0) { - tty_ldisc_put(new_ldisc.ops); - tty_ldisc_restore(tty, &o_ldisc); + /* Back to the old one or N_TTY if we can't */ + tty_ldisc_put(new_ldisc); + tty_ldisc_restore(tty, o_ldisc); } + /* At this point we hold a reference to the new ldisc and a a reference to the old ldisc. If we ended up flipping back to the existing ldisc we have two references to it */ - if (tty->ldisc.ops->num != o_ldisc.ops->num && tty->ops->set_ldisc) + if (tty->ldisc->ops->num != o_ldisc->ops->num && tty->ops->set_ldisc) tty->ops->set_ldisc(tty); - tty_ldisc_put(o_ldisc.ops); + tty_ldisc_put(o_ldisc); /* - * Allow ldisc referencing to occur as soon as the driver - * ldisc callback completes. + * Allow ldisc referencing to occur again */ tty_ldisc_enable(tty); if (o_tty) tty_ldisc_enable(o_tty); - /* Restart it in case no characters kick it off. Safe if + /* Restart the work queue in case no characters kick it off. Safe if already running */ if (work) schedule_delayed_work(&tty->buf.work, 1); + if (o_work) + schedule_delayed_work(&o_tty->buf.work, 1); + mutex_unlock(&tty->ldisc_mutex); return retval; } +/** + * tty_reset_termios - reset terminal state + * @tty: tty to reset + * + * Restore a terminal to the driver default state. + */ + +static void tty_reset_termios(struct tty_struct *tty) +{ + mutex_lock(&tty->termios_mutex); + *tty->termios = tty->driver->init_termios; + tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios); + tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios); + mutex_unlock(&tty->termios_mutex); +} + + +/** + * tty_ldisc_reinit - reinitialise the tty ldisc + * @tty: tty to reinit + * + * Switch the tty back to N_TTY line discipline and leave the + * ldisc state closed + */ + +static void tty_ldisc_reinit(struct tty_struct *tty) +{ + struct tty_ldisc *ld; + + tty_ldisc_close(tty, tty->ldisc); + tty_ldisc_put(tty->ldisc); + tty->ldisc = NULL; + /* + * Switch the line discipline back + */ + ld = tty_ldisc_get(N_TTY); + BUG_ON(IS_ERR(ld)); + tty_ldisc_assign(tty, ld); + tty_set_termios_ldisc(tty, N_TTY); +} + +/** + * tty_ldisc_hangup - hangup ldisc reset + * @tty: tty being hung up + * + * Some tty devices reset their termios when they receive a hangup + * event. In that situation we must also switch back to N_TTY properly + * before we reset the termios data. + * + * Locking: We can take the ldisc mutex as the rest of the code is + * careful to allow for this. + * + * In the pty pair case this occurs in the close() path of the + * tty itself so we must be careful about locking rules. + */ + +void tty_ldisc_hangup(struct tty_struct *tty) +{ + struct tty_ldisc *ld; + + /* + * FIXME! What are the locking issues here? This may me overdoing + * things... This question is especially important now that we've + * removed the irqlock. 
+ */ + ld = tty_ldisc_ref(tty); + if (ld != NULL) { + /* We may have no line discipline at this point */ + if (ld->ops->flush_buffer) + ld->ops->flush_buffer(tty); + tty_driver_flush_buffer(tty); + if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && + ld->ops->write_wakeup) + ld->ops->write_wakeup(tty); + if (ld->ops->hangup) + ld->ops->hangup(tty); + tty_ldisc_deref(ld); + } + /* + * FIXME: Once we trust the LDISC code better we can wait here for + * ldisc completion and fix the driver call race + */ + wake_up_interruptible_poll(&tty->write_wait, POLLOUT); + wake_up_interruptible_poll(&tty->read_wait, POLLIN); + /* + * Shutdown the current line discipline, and reset it to + * N_TTY. + */ + if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) { + /* Avoid racing set_ldisc */ + mutex_lock(&tty->ldisc_mutex); + /* Switch back to N_TTY */ + tty_ldisc_reinit(tty); + /* At this point we have a closed ldisc and we want to + reopen it. We could defer this to the next open but + it means auditing a lot of other paths so this is a FIXME */ + WARN_ON(tty_ldisc_open(tty, tty->ldisc)); + tty_ldisc_enable(tty); + mutex_unlock(&tty->ldisc_mutex); + tty_reset_termios(tty); + } +} /** * tty_ldisc_setup - open line discipline @@ -654,24 +791,23 @@ restart: * @o_tty: pair tty for pty/tty pairs * * Called during the initial open of a tty/pty pair in order to set up the - * line discplines and bind them to the tty. + * line disciplines and bind them to the tty. This has no locking issues + * as the device isn't yet active. */ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty) { - struct tty_ldisc *ld = &tty->ldisc; + struct tty_ldisc *ld = tty->ldisc; int retval; - if (ld->ops->open) { - retval = (ld->ops->open)(tty); - if (retval) - return retval; - } - if (o_tty && o_tty->ldisc.ops->open) { - retval = (o_tty->ldisc.ops->open)(o_tty); + retval = tty_ldisc_open(tty, ld); + if (retval) + return retval; + + if (o_tty) { + retval = tty_ldisc_open(o_tty, o_tty->ldisc); if (retval) { - if (ld->ops->close) - (ld->ops->close)(tty); + tty_ldisc_close(tty, ld); return retval; } tty_ldisc_enable(o_tty); @@ -679,34 +815,18 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty) tty_ldisc_enable(tty); return 0; } - -static void tty_ldisc_reinit(struct tty_struct *tty) -{ - struct tty_ldisc ld; - - if (tty->ldisc.ops->close) - (tty->ldisc.ops->close)(tty); - tty_ldisc_put(tty->ldisc.ops); - /* - * Switch the line discipline back - */ - WARN_ON(tty_ldisc_get(N_TTY, &ld)); - tty_ldisc_assign(tty, &ld); - tty_set_termios_ldisc(tty, N_TTY); -} - /** * tty_ldisc_release - release line discipline * @tty: tty being shut down * @o_tty: pair tty for pty/tty pairs * * Called during the final close of a tty/pty pair in order to shut down the - * line discpline layer. + * line discpline layer. On exit the ldisc assigned is N_TTY and the + * ldisc has not been opened. */ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty) { - /* * Prevent flush_to_ldisc() from rescheduling the work for later. Then * kill any delayed work. As this is the final close it does not @@ -714,6 +834,7 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty) */ tty_ldisc_halt(tty); + flush_scheduled_work(); /* * Wait for any short term users (we know they are just driver @@ -730,11 +851,9 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty) */ tty_ldisc_reinit(tty); - if (o_tty) { - /* FIXME: could o_tty be in setldisc here ? 
*/ - clear_bit(TTY_LDISC, &o_tty->flags); - tty_ldisc_reinit(o_tty); - } + /* This will need doing differently if we need to lock */ + if (o_tty) + tty_ldisc_release(o_tty, NULL); } /** @@ -747,10 +866,10 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty) void tty_ldisc_init(struct tty_struct *tty) { - struct tty_ldisc ld; - if (tty_ldisc_get(N_TTY, &ld) < 0) + struct tty_ldisc *ld = tty_ldisc_get(N_TTY); + if (IS_ERR(ld)) panic("n_tty: init_tty"); - tty_ldisc_assign(tty, &ld); + tty_ldisc_assign(tty, ld); } void tty_ldisc_begin(void) diff --git a/include/linux/tty.h b/include/linux/tty.h index f9c13c83790..1488d8c81aa 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -226,8 +226,11 @@ struct tty_struct { struct tty_driver *driver; const struct tty_operations *ops; int index; - /* The ldisc objects are protected by tty_ldisc_lock at the moment */ - struct tty_ldisc ldisc; + + /* Protects ldisc changes: Lock tty not pty */ + struct mutex ldisc_mutex; + struct tty_ldisc *ldisc; + struct mutex termios_mutex; spinlock_t ctrl_lock; /* Termios values are protected by the termios mutex */ @@ -314,6 +317,7 @@ struct tty_struct { #define TTY_CLOSING 7 /* ->close() in progress */ #define TTY_LDISC 9 /* Line discipline attached */ #define TTY_LDISC_CHANGING 10 /* Line discipline changing */ +#define TTY_LDISC_OPEN 11 /* Line discipline is open */ #define TTY_HW_COOK_OUT 14 /* Hardware can do output cooking */ #define TTY_HW_COOK_IN 15 /* Hardware can do input cooking */ #define TTY_PTY_LOCK 16 /* pty private */ @@ -406,6 +410,7 @@ extern int tty_termios_hw_change(struct ktermios *a, struct ktermios *b); extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *); extern void tty_ldisc_deref(struct tty_ldisc *); extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *); +extern void tty_ldisc_hangup(struct tty_struct *tty); extern const struct file_operations tty_ldiscs_proc_fops; extern void tty_wakeup(struct tty_struct *tty); -- cgit v1.2.3-70-g09d2 From 08e0992f60ad44025a8a8b8a821838ca4a562686 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 11 Jun 2009 13:21:24 +0100 Subject: serial: add support for the TI AR7 internal UART This patch adds support for the TI AR7 internal UART. Signed-off-by: Florian Fainelli Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/serial/8250.c | 7 +++++++ include/linux/serial_core.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index a0127e93ade..fb867a9f55e 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -287,6 +287,13 @@ static const struct serial8250_config uart_config[] = { .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO, }, + [PORT_AR7] = { + .name = "AR7", + .fifo_size = 16, + .tx_loadsz = 16, + .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, + .flags = UART_CAP_FIFO | UART_CAP_AFE, + }, }; #if defined (CONFIG_SERIAL_8250_AU1X00) diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 57a97e52e58..48766ea845c 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -41,7 +41,8 @@ #define PORT_XSCALE 15 #define PORT_RM9000 16 /* PMC-Sierra RM9xxx internal UART */ #define PORT_OCTEON 17 /* Cavium OCTEON internal UART */ -#define PORT_MAX_8250 17 /* max port ID */ +#define PORT_AR7 18 /* Texas Instruments AR7 internal UART */ +#define PORT_MAX_8250 18 /* max port ID */ /* * ARM specific type numbers. 
These are not currently guaranteed -- cgit v1.2.3-70-g09d2 From 34aec591847c696339189b070cce2a11f901cfea Mon Sep 17 00:00:00 2001 From: Richard Röjfors Date: Thu, 11 Jun 2009 14:05:39 +0100 Subject: serial: Added Timberdale UART driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Driver for the UART found in the Timberdale FPGA Signed-off-by: Richard Röjfors Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/serial/Kconfig | 7 + drivers/serial/Makefile | 1 + drivers/serial/timbuart.c | 520 ++++++++++++++++++++++++++++++++++++++++++++ drivers/serial/timbuart.h | 58 +++++ include/linux/serial_core.h | 3 + 5 files changed, 589 insertions(+) create mode 100644 drivers/serial/timbuart.c create mode 100644 drivers/serial/timbuart.h (limited to 'include/linux') diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index 343e3a35b6a..3391ef0d761 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -1433,4 +1433,11 @@ config SPORT_BAUD_RATE default 19200 if (SERIAL_SPORT_BAUD_RATE_19200) default 9600 if (SERIAL_SPORT_BAUD_RATE_9600) +config SERIAL_TIMBERDALE + tristate "Support for timberdale UART" + depends on MFD_TIMBERDALE + select SERIAL_CORE + ---help--- + Add support for UART controller on timberdale. + endmenu diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile index d438eb2a73d..45a8658f54d 100644 --- a/drivers/serial/Makefile +++ b/drivers/serial/Makefile @@ -77,3 +77,4 @@ obj-$(CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL) += nwpserial.o obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o obj-$(CONFIG_SERIAL_QE) += ucc_uart.o +obj-$(CONFIG_SERIAL_TIMBERDALE) += timbuart.o diff --git a/drivers/serial/timbuart.c b/drivers/serial/timbuart.c new file mode 100644 index 00000000000..30ba3c4cc18 --- /dev/null +++ b/drivers/serial/timbuart.c @@ -0,0 +1,520 @@ +/* + * timbuart.c timberdale FPGA UART driver + * Copyright (c) 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* Supports: + * Timberdale FPGA UART + */ + +#include +#include +#include +#include +#include +#include + +#include "timbuart.h" + +struct timbuart_port { + struct uart_port port; + struct tasklet_struct tasklet; + int usedma; + u8 last_ier; + struct platform_device *dev; +}; + +static int baudrates[] = {9600, 19200, 38400, 57600, 115200, 230400, 460800, + 921600, 1843200, 3250000}; + +static void timbuart_mctrl_check(struct uart_port *port, u8 isr, u8 *ier); + +static irqreturn_t timbuart_handleinterrupt(int irq, void *devid); + +static void timbuart_stop_rx(struct uart_port *port) +{ + /* spin lock held by upper layer, disable all RX interrupts */ + u8 ier = ioread8(port->membase + TIMBUART_IER) & ~RXFLAGS; + iowrite8(ier, port->membase + TIMBUART_IER); +} + +static void timbuart_stop_tx(struct uart_port *port) +{ + /* spinlock held by upper layer, disable TX interrupt */ + u8 ier = ioread8(port->membase + TIMBUART_IER) & ~TXBAE; + iowrite8(ier, port->membase + TIMBUART_IER); +} + +static void timbuart_start_tx(struct uart_port *port) +{ + struct timbuart_port *uart = + container_of(port, struct timbuart_port, port); + + /* do not transfer anything here -> fire off the tasklet */ + tasklet_schedule(&uart->tasklet); +} + +static void timbuart_flush_buffer(struct uart_port *port) +{ + u8 ctl = ioread8(port->membase + TIMBUART_CTRL) | TIMBUART_CTRL_FLSHTX; + + iowrite8(ctl, port->membase + TIMBUART_CTRL); + iowrite8(TXBF, port->membase + TIMBUART_ISR); +} + +static void timbuart_rx_chars(struct uart_port *port) +{ + struct tty_struct *tty = port->info->port.tty; + + while (ioread8(port->membase + TIMBUART_ISR) & RXDP) { + u8 ch = ioread8(port->membase + TIMBUART_RXFIFO); + port->icount.rx++; + tty_insert_flip_char(tty, ch, TTY_NORMAL); + } + + spin_unlock(&port->lock); + tty_flip_buffer_push(port->info->port.tty); + spin_lock(&port->lock); + + dev_dbg(port->dev, "%s - total read %d bytes\n", + __func__, port->icount.rx); +} + +static void timbuart_tx_chars(struct uart_port *port) +{ + struct circ_buf *xmit = &port->info->xmit; + + while (!(ioread8(port->membase + TIMBUART_ISR) & TXBF) && + !uart_circ_empty(xmit)) { + iowrite8(xmit->buf[xmit->tail], + port->membase + TIMBUART_TXFIFO); + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); + port->icount.tx++; + } + + dev_dbg(port->dev, + "%s - total written %d bytes, CTL: %x, RTS: %x, baud: %x\n", + __func__, + port->icount.tx, + ioread8(port->membase + TIMBUART_CTRL), + port->mctrl & TIOCM_RTS, + ioread8(port->membase + TIMBUART_BAUDRATE)); +} + +static void timbuart_handle_tx_port(struct uart_port *port, u8 isr, u8 *ier) +{ + struct timbuart_port *uart = + container_of(port, struct timbuart_port, port); + struct circ_buf *xmit = &port->info->xmit; + + if (uart_circ_empty(xmit) || uart_tx_stopped(port)) + return; + + if (port->x_char) + return; + + if (isr & TXFLAGS) { + timbuart_tx_chars(port); + /* clear all TX interrupts */ + iowrite8(TXFLAGS, port->membase + TIMBUART_ISR); + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); + } else + /* Re-enable any tx interrupt */ + *ier |= uart->last_ier & TXFLAGS; + + /* enable interrupts if there are chars in the transmit buffer, + * Or if we delivered some bytes and want the almost empty interrupt + * we wake up the upper layer later when we got the interrupt + * to give it some time to go out... 
+ */ + if (!uart_circ_empty(xmit)) + *ier |= TXBAE; + + dev_dbg(port->dev, "%s - leaving\n", __func__); +} + +void timbuart_handle_rx_port(struct uart_port *port, u8 isr, u8 *ier) +{ + if (isr & RXFLAGS) { + /* Some RX status is set */ + if (isr & RXBF) { + u8 ctl = ioread8(port->membase + TIMBUART_CTRL) | + TIMBUART_CTRL_FLSHRX; + iowrite8(ctl, port->membase + TIMBUART_CTRL); + port->icount.overrun++; + } else if (isr & (RXDP)) + timbuart_rx_chars(port); + + /* ack all RX interrupts */ + iowrite8(RXFLAGS, port->membase + TIMBUART_ISR); + } + + /* always have the RX interrupts enabled */ + *ier |= RXBAF | RXBF | RXTT; + + dev_dbg(port->dev, "%s - leaving\n", __func__); +} + +void timbuart_tasklet(unsigned long arg) +{ + struct timbuart_port *uart = (struct timbuart_port *)arg; + u8 isr, ier = 0; + + spin_lock(&uart->port.lock); + + isr = ioread8(uart->port.membase + TIMBUART_ISR); + dev_dbg(uart->port.dev, "%s ISR: %x\n", __func__, isr); + + if (!uart->usedma) + timbuart_handle_tx_port(&uart->port, isr, &ier); + + timbuart_mctrl_check(&uart->port, isr, &ier); + + if (!uart->usedma) + timbuart_handle_rx_port(&uart->port, isr, &ier); + + iowrite8(ier, uart->port.membase + TIMBUART_IER); + + spin_unlock(&uart->port.lock); + dev_dbg(uart->port.dev, "%s leaving\n", __func__); +} + +static unsigned int timbuart_tx_empty(struct uart_port *port) +{ + u8 isr = ioread8(port->membase + TIMBUART_ISR); + + return (isr & TXBAE) ? TIOCSER_TEMT : 0; +} + +static unsigned int timbuart_get_mctrl(struct uart_port *port) +{ + u8 cts = ioread8(port->membase + TIMBUART_CTRL); + dev_dbg(port->dev, "%s - cts %x\n", __func__, cts); + + if (cts & TIMBUART_CTRL_CTS) + return TIOCM_CTS | TIOCM_DSR | TIOCM_CAR; + else + return TIOCM_DSR | TIOCM_CAR; +} + +static void timbuart_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + dev_dbg(port->dev, "%s - %x\n", __func__, mctrl); + + if (mctrl & TIOCM_RTS) + iowrite8(TIMBUART_CTRL_RTS, port->membase + TIMBUART_CTRL); + else + iowrite8(TIMBUART_CTRL_RTS, port->membase + TIMBUART_CTRL); +} + +static void timbuart_mctrl_check(struct uart_port *port, u8 isr, u8 *ier) +{ + unsigned int cts; + + if (isr & CTS_DELTA) { + /* ack */ + iowrite8(CTS_DELTA, port->membase + TIMBUART_ISR); + cts = timbuart_get_mctrl(port); + uart_handle_cts_change(port, cts & TIOCM_CTS); + wake_up_interruptible(&port->info->delta_msr_wait); + } + + *ier |= CTS_DELTA; +} + +static void timbuart_enable_ms(struct uart_port *port) +{ + /* N/A */ +} + +static void timbuart_break_ctl(struct uart_port *port, int ctl) +{ + /* N/A */ +} + +static int timbuart_startup(struct uart_port *port) +{ + struct timbuart_port *uart = + container_of(port, struct timbuart_port, port); + + dev_dbg(port->dev, "%s\n", __func__); + + iowrite8(TIMBUART_CTRL_FLSHRX, port->membase + TIMBUART_CTRL); + iowrite8(0xff, port->membase + TIMBUART_ISR); + /* Enable all but TX interrupts */ + iowrite8(RXBAF | RXBF | RXTT | CTS_DELTA, + port->membase + TIMBUART_IER); + + return request_irq(port->irq, timbuart_handleinterrupt, IRQF_SHARED, + "timb-uart", uart); +} + +static void timbuart_shutdown(struct uart_port *port) +{ + struct timbuart_port *uart = + container_of(port, struct timbuart_port, port); + dev_dbg(port->dev, "%s\n", __func__); + free_irq(port->irq, uart); + iowrite8(0, port->membase + TIMBUART_IER); +} + +static int get_bindex(int baud) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(baudrates); i++) + if (baud == baudrates[i]) + return i; + + return -1; +} + +static void timbuart_set_termios(struct uart_port *port, + 
struct ktermios *termios, + struct ktermios *old) +{ + unsigned int baud; + short bindex; + unsigned long flags; + + baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16); + bindex = get_bindex(baud); + dev_dbg(port->dev, "%s - bindex %d\n", __func__, bindex); + + if (bindex < 0) { + printk(KERN_ALERT "timbuart: Unsupported baud rate\n"); + } else { + spin_lock_irqsave(&port->lock, flags); + iowrite8((u8)bindex, port->membase + TIMBUART_BAUDRATE); + uart_update_timeout(port, termios->c_cflag, baud); + spin_unlock_irqrestore(&port->lock, flags); + } +} + +static const char *timbuart_type(struct uart_port *port) +{ + return port->type == PORT_UNKNOWN ? "timbuart" : NULL; +} + +/* We do not request/release mappings of the registers here, + * currently it's done in the proble function. + */ +static void timbuart_release_port(struct uart_port *port) +{ + struct platform_device *pdev = to_platform_device(port->dev); + int size = + resource_size(platform_get_resource(pdev, IORESOURCE_MEM, 0)); + + if (port->flags & UPF_IOREMAP) { + iounmap(port->membase); + port->membase = NULL; + } + + release_mem_region(port->mapbase, size); +} + +static int timbuart_request_port(struct uart_port *port) +{ + struct platform_device *pdev = to_platform_device(port->dev); + int size = + resource_size(platform_get_resource(pdev, IORESOURCE_MEM, 0)); + + if (!request_mem_region(port->mapbase, size, "timb-uart")) + return -EBUSY; + + if (port->flags & UPF_IOREMAP) { + port->membase = ioremap(port->mapbase, size); + if (port->membase == NULL) { + release_mem_region(port->mapbase, size); + return -ENOMEM; + } + } + + return 0; +} + +static irqreturn_t timbuart_handleinterrupt(int irq, void *devid) +{ + struct timbuart_port *uart = (struct timbuart_port *)devid; + + if (ioread8(uart->port.membase + TIMBUART_IPR)) { + uart->last_ier = ioread8(uart->port.membase + TIMBUART_IER); + + /* disable interrupts, the tasklet enables them again */ + iowrite8(0, uart->port.membase + TIMBUART_IER); + + /* fire off bottom half */ + tasklet_schedule(&uart->tasklet); + + return IRQ_HANDLED; + } else + return IRQ_NONE; +} + +/* + * Configure/autoconfigure the port. 
+ */ +static void timbuart_config_port(struct uart_port *port, int flags) +{ + if (flags & UART_CONFIG_TYPE) { + port->type = PORT_TIMBUART; + timbuart_request_port(port); + } +} + +static int timbuart_verify_port(struct uart_port *port, + struct serial_struct *ser) +{ + /* we don't want the core code to modify any port params */ + return -EINVAL; +} + +static struct uart_ops timbuart_ops = { + .tx_empty = timbuart_tx_empty, + .set_mctrl = timbuart_set_mctrl, + .get_mctrl = timbuart_get_mctrl, + .stop_tx = timbuart_stop_tx, + .start_tx = timbuart_start_tx, + .flush_buffer = timbuart_flush_buffer, + .stop_rx = timbuart_stop_rx, + .enable_ms = timbuart_enable_ms, + .break_ctl = timbuart_break_ctl, + .startup = timbuart_startup, + .shutdown = timbuart_shutdown, + .set_termios = timbuart_set_termios, + .type = timbuart_type, + .release_port = timbuart_release_port, + .request_port = timbuart_request_port, + .config_port = timbuart_config_port, + .verify_port = timbuart_verify_port +}; + +static struct uart_driver timbuart_driver = { + .owner = THIS_MODULE, + .driver_name = "timberdale_uart", + .dev_name = "ttyTU", + .major = TIMBUART_MAJOR, + .minor = TIMBUART_MINOR, + .nr = 1 +}; + +static int timbuart_probe(struct platform_device *dev) +{ + int err; + struct timbuart_port *uart; + struct resource *iomem; + + dev_dbg(&dev->dev, "%s\n", __func__); + + uart = kzalloc(sizeof(*uart), GFP_KERNEL); + if (!uart) { + err = -EINVAL; + goto err_mem; + } + + uart->usedma = 0; + + uart->port.uartclk = 3250000 * 16; + uart->port.fifosize = TIMBUART_FIFO_SIZE; + uart->port.regshift = 2; + uart->port.iotype = UPIO_MEM; + uart->port.ops = &timbuart_ops; + uart->port.irq = 0; + uart->port.flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP; + uart->port.line = 0; + uart->port.dev = &dev->dev; + + iomem = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!iomem) { + err = -ENOMEM; + goto err_register; + } + uart->port.mapbase = iomem->start; + uart->port.membase = NULL; + + uart->port.irq = platform_get_irq(dev, 0); + if (uart->port.irq < 0) { + err = -EINVAL; + goto err_register; + } + + tasklet_init(&uart->tasklet, timbuart_tasklet, (unsigned long)uart); + + err = uart_register_driver(&timbuart_driver); + if (err) + goto err_register; + + err = uart_add_one_port(&timbuart_driver, &uart->port); + if (err) + goto err_add_port; + + platform_set_drvdata(dev, uart); + + return 0; + +err_add_port: + uart_unregister_driver(&timbuart_driver); +err_register: + kfree(uart); +err_mem: + printk(KERN_ERR "timberdale: Failed to register Timberdale UART: %d\n", + err); + + return err; +} + +static int timbuart_remove(struct platform_device *dev) +{ + struct timbuart_port *uart = platform_get_drvdata(dev); + + tasklet_kill(&uart->tasklet); + uart_remove_one_port(&timbuart_driver, &uart->port); + uart_unregister_driver(&timbuart_driver); + kfree(uart); + + return 0; +} + +static struct platform_driver timbuart_platform_driver = { + .driver = { + .name = "timb-uart", + .owner = THIS_MODULE, + }, + .probe = timbuart_probe, + .remove = timbuart_remove, +}; + +/*--------------------------------------------------------------------------*/ + +static int __init timbuart_init(void) +{ + return platform_driver_register(&timbuart_platform_driver); +} + +static void __exit timbuart_exit(void) +{ + platform_driver_unregister(&timbuart_platform_driver); +} + +module_init(timbuart_init); +module_exit(timbuart_exit); + +MODULE_DESCRIPTION("Timberdale UART driver"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:timb-uart"); + diff --git 
a/drivers/serial/timbuart.h b/drivers/serial/timbuart.h new file mode 100644 index 00000000000..7e566766bc4 --- /dev/null +++ b/drivers/serial/timbuart.h @@ -0,0 +1,58 @@ +/* + * timbuart.c timberdale FPGA GPIO driver + * Copyright (c) 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* Supports: + * Timberdale FPGA UART + */ + +#ifndef _TIMBUART_H +#define _TIMBUART_H + +#define TIMBUART_FIFO_SIZE 2048 + +#define TIMBUART_RXFIFO 0x08 +#define TIMBUART_TXFIFO 0x0c +#define TIMBUART_IER 0x10 +#define TIMBUART_IPR 0x14 +#define TIMBUART_ISR 0x18 +#define TIMBUART_CTRL 0x1c +#define TIMBUART_BAUDRATE 0x20 + +#define TIMBUART_CTRL_RTS 0x01 +#define TIMBUART_CTRL_CTS 0x02 +#define TIMBUART_CTRL_FLSHTX 0x40 +#define TIMBUART_CTRL_FLSHRX 0x80 + +#define TXBF 0x01 +#define TXBAE 0x02 +#define CTS_DELTA 0x04 +#define RXDP 0x08 +#define RXBAF 0x10 +#define RXBF 0x20 +#define RXTT 0x40 +#define RXBNAE 0x80 +#define TXBE 0x100 + +#define RXFLAGS (RXDP | RXBAF | RXBF | RXTT | RXBNAE) +#define TXFLAGS (TXBF | TXBAE) + +#define TIMBUART_MAJOR 204 +#define TIMBUART_MINOR 192 + +#endif /* _TIMBUART_H */ + diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 48766ea845c..6fd80c4243f 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -168,6 +168,9 @@ /* MAX3100 */ #define PORT_MAX3100 86 +/* Timberdale UART */ +#define PORT_TIMBUART 87 + #ifdef __KERNEL__ #include -- cgit v1.2.3-70-g09d2 From 8759ef32d992fc6c0bcbe40fca7aa302190918a5 Mon Sep 17 00:00:00 2001 From: Oskar Schirmer Date: Thu, 11 Jun 2009 14:51:15 +0100 Subject: lib: isolate rational fractions helper function Provide a helper function to determine optimum numerator denominator value pairs taking into account restricted register size. Useful especially with PLL and other clock configurations. Signed-off-by: Oskar Schirmer Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- include/linux/rational.h | 19 +++++++++++++++ lib/Kconfig | 3 +++ lib/Makefile | 1 + lib/rational.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+) create mode 100644 include/linux/rational.h create mode 100644 lib/rational.c (limited to 'include/linux') diff --git a/include/linux/rational.h b/include/linux/rational.h new file mode 100644 index 00000000000..4f532fcd9ee --- /dev/null +++ b/include/linux/rational.h @@ -0,0 +1,19 @@ +/* + * rational fractions + * + * Copyright (C) 2009 emlix GmbH, Oskar Schirmer + * + * helper functions when coping with rational numbers, + * e.g. 
when calculating optimum numerator/denominator pairs for + * pll configuration taking into account restricted register size + */ + +#ifndef _LINUX_RATIONAL_H +#define _LINUX_RATIONAL_H + +void rational_best_approximation( + unsigned long given_numerator, unsigned long given_denominator, + unsigned long max_numerator, unsigned long max_denominator, + unsigned long *best_numerator, unsigned long *best_denominator); + +#endif /* _LINUX_RATIONAL_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 8ade0a7a91e..9960be04cbb 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -10,6 +10,9 @@ menu "Library routines" config BITREVERSE tristate +config RATIONAL + boolean + config GENERIC_FIND_FIRST_BIT bool diff --git a/lib/Makefile b/lib/Makefile index 33a40e40e3e..1f6edefebff 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -50,6 +50,7 @@ ifneq ($(CONFIG_HAVE_DEC_LOCK),y) endif obj-$(CONFIG_BITREVERSE) += bitrev.o +obj-$(CONFIG_RATIONAL) += rational.o obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o obj-$(CONFIG_CRC16) += crc16.o obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o diff --git a/lib/rational.c b/lib/rational.c new file mode 100644 index 00000000000..b3c099b5478 --- /dev/null +++ b/lib/rational.c @@ -0,0 +1,62 @@ +/* + * rational fractions + * + * Copyright (C) 2009 emlix GmbH, Oskar Schirmer + * + * helper functions when coping with rational numbers + */ + +#include + +/* + * calculate best rational approximation for a given fraction + * taking into account restricted register size, e.g. to find + * appropriate values for a pll with 5 bit denominator and + * 8 bit numerator register fields, trying to set up with a + * frequency ratio of 3.1415, one would say: + * + * rational_best_approximation(31415, 10000, + * (1 << 8) - 1, (1 << 5) - 1, &n, &d); + * + * you may look at given_numerator as a fixed point number, + * with the fractional part size described in given_denominator. + * + * for theoretical background, see: + * http://en.wikipedia.org/wiki/Continued_fraction + */ + +void rational_best_approximation( + unsigned long given_numerator, unsigned long given_denominator, + unsigned long max_numerator, unsigned long max_denominator, + unsigned long *best_numerator, unsigned long *best_denominator) +{ + unsigned long n, d, n0, d0, n1, d1; + n = given_numerator; + d = given_denominator; + n0 = d1 = 0; + n1 = d0 = 1; + for (;;) { + unsigned long t, a; + if ((n1 > max_numerator) || (d1 > max_denominator)) { + n1 = n0; + d1 = d0; + break; + } + if (d == 0) + break; + t = d; + a = n / d; + d = n % d; + n = t; + t = n0 + a * n1; + n0 = n1; + n1 = t; + t = d0 + a * d1; + d0 = d1; + d1 = t; + } + *best_numerator = n1; + *best_denominator = d1; +} + +EXPORT_SYMBOL(rational_best_approximation); -- cgit v1.2.3-70-g09d2 From 1c432d899d32d36371ee4ee310fa3609cf0e5742 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 13:19:29 +0200 Subject: perf_counter: Rename enums Rename the perf enums to be in the 'perf_' namespace and strictly enumerate the ABI bits. 
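With the values spelled out, the enum constants become a stable ABI that user-space can hard-code; only the *_MAX sentinels (marked "non ABI") are allowed to grow as new entries are appended. As a rough illustration of how a caller would rely on them (a sketch only, not part of this patch; it assumes the struct perf_counter_attr layout used elsewhere in this series):

	struct perf_counter_attr attr = {
		.type	= PERF_TYPE_HARDWARE,		/* = 0, ABI-stable */
		.config	= PERF_COUNT_CPU_CYCLES,	/* = 0, ABI-stable */
	};
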
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 53 +++++++++++++++++++++----------------------- kernel/perf_counter.c | 6 ++--- 2 files changed, 28 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 95c797c480e..d5911b02bc8 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -24,24 +24,21 @@ /* * attr.type */ -enum perf_event_types { +enum perf_type_id { PERF_TYPE_HARDWARE = 0, PERF_TYPE_SOFTWARE = 1, PERF_TYPE_TRACEPOINT = 2, PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, - /* - * available TYPE space, raw is the max value. - */ - - PERF_TYPE_RAW = 128, + PERF_TYPE_MAX, /* non ABI */ }; /* * Generalized performance counter event types, used by the attr.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum attr_ids { +enum perf_hw_id { /* * Common hardware events, generalized by the kernel: */ @@ -53,7 +50,7 @@ enum attr_ids { PERF_COUNT_BRANCH_MISSES = 5, PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 7, + PERF_HW_EVENTS_MAX, /* non ABI */ }; /* @@ -63,30 +60,30 @@ enum attr_ids { * { read, write, prefetch } x * { accesses, misses } */ -enum hw_cache_id { - PERF_COUNT_HW_CACHE_L1D, - PERF_COUNT_HW_CACHE_L1I, - PERF_COUNT_HW_CACHE_L2, - PERF_COUNT_HW_CACHE_DTLB, - PERF_COUNT_HW_CACHE_ITLB, - PERF_COUNT_HW_CACHE_BPU, - - PERF_COUNT_HW_CACHE_MAX, +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_L2 = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non ABI */ }; -enum hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ, - PERF_COUNT_HW_CACHE_OP_WRITE, - PERF_COUNT_HW_CACHE_OP_PREFETCH, +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - PERF_COUNT_HW_CACHE_OP_MAX, + PERF_COUNT_HW_CACHE_OP_MAX, /* non ABI */ }; -enum hw_cache_op_result_id { - PERF_COUNT_HW_CACHE_RESULT_ACCESS, - PERF_COUNT_HW_CACHE_RESULT_MISS, +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - PERF_COUNT_HW_CACHE_RESULT_MAX, + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non ABI */ }; /* @@ -95,7 +92,7 @@ enum hw_cache_op_result_id { * physical and sw events of the kernel (and allow the profiling of them as * well): */ -enum sw_event_ids { +enum perf_sw_ids { PERF_COUNT_CPU_CLOCK = 0, PERF_COUNT_TASK_CLOCK = 1, PERF_COUNT_PAGE_FAULTS = 2, @@ -104,7 +101,7 @@ enum sw_event_ids { PERF_COUNT_PAGE_FAULTS_MIN = 5, PERF_COUNT_PAGE_FAULTS_MAJ = 6, - PERF_SW_EVENTS_MAX = 7, + PERF_SW_EVENTS_MAX, /* non ABI */ }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3b2829de559..c02535bed26 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3162,7 +3162,7 @@ static int perf_swcounter_is_counting(struct perf_counter *counter) } static int perf_swcounter_match(struct perf_counter *counter, - enum perf_event_types type, + enum perf_type_id type, u32 event, struct pt_regs *regs) { if (!perf_swcounter_is_counting(counter)) @@ -3194,7 +3194,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_event_types type, u32 event, + enum perf_type_id type, u32 event, u64 nr, int nmi, 
struct pt_regs *regs, u64 addr) { @@ -3225,7 +3225,7 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) return &cpuctx->recursion[0]; } -static void __perf_swcounter_event(enum perf_event_types type, u32 event, +static void __perf_swcounter_event(enum perf_type_id type, u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { -- cgit v1.2.3-70-g09d2 From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:06:28 +0200 Subject: perf_counter: Standardize event names Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/power4-pmu.c | 12 +++++------ arch/powerpc/kernel/power5+-pmu.c | 12 +++++------ arch/powerpc/kernel/power5-pmu.c | 12 +++++------ arch/powerpc/kernel/power6-pmu.c | 12 +++++------ arch/powerpc/kernel/ppc970-pmu.c | 12 +++++------ arch/powerpc/mm/fault.c | 6 +++--- arch/x86/kernel/cpu/perf_counter.c | 32 +++++++++++++-------------- arch/x86/mm/fault.c | 6 +++--- include/linux/perf_counter.h | 36 +++++++++++++++---------------- kernel/perf_counter.c | 20 ++++++++--------- tools/perf/builtin-record.c | 4 ++-- tools/perf/builtin-stat.c | 31 ++++++++++++++------------- tools/perf/builtin-top.c | 4 ++-- tools/perf/design.txt | 28 ++++++++++++------------ tools/perf/util/parse-events.c | 44 +++++++++++++++++++------------------- 15 files changed, 136 insertions(+), 135 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 0e94b685722..73956f084b2 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -535,12 +535,12 @@ static void p4_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int p4_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 7, - [PERF_COUNT_INSTRUCTIONS] = 0x1001, - [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ + [PERF_COUNT_HW_CPU_CYCLES] = 7, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x1001, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index bbf2cbb0738..5f8b7741e97 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -606,12 +606,12 @@ static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int power5p_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 0xf, - [PERF_COUNT_INSTRUCTIONS] = 0x100009, - [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ + [PERF_COUNT_HW_CPU_CYCLES] = 0xf, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* 
BR_MPRED_CR */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 670cf10b91e..d54723ab627 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -548,12 +548,12 @@ static void power5_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int power5_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 0xf, - [PERF_COUNT_INSTRUCTIONS] = 0x100009, - [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ + [PERF_COUNT_HW_CPU_CYCLES] = 0xf, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index 4da70786609..0cd406ee765 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -466,12 +466,12 @@ static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int power6_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 0x1e, - [PERF_COUNT_INSTRUCTIONS] = 2, - [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ - [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x1e, + [PERF_COUNT_HW_INSTRUCTIONS] = 2, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 336adf1736a..46a20640942 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -419,12 +419,12 @@ static void p970_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int ppc970_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 7, - [PERF_COUNT_INSTRUCTIONS] = 1, - [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ + [PERF_COUNT_HW_CPU_CYCLES] = 7, + [PERF_COUNT_HW_INSTRUCTIONS] = 1, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index ac0e112031b..5beffc8f481 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. 
All other faults represent errors in the @@ -312,7 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -323,7 +323,7 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 57ae1bec81b..572fb434a66 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -69,13 +69,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { */ static const u64 intel_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x003c, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_CACHE_MISSES] = 0x412e, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; static u64 intel_pmu_event_map(int event) @@ -485,12 +485,12 @@ static const u64 amd_0f_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x0076, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_CACHE_MISSES] = 0x0081, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; static u64 amd_pmu_event_map(int event) @@ -970,11 +970,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6f9df2babe4..5c6d816f30b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1142,11 +1142,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, 
address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d5911b02bc8..887df88a9c2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -42,15 +42,15 @@ enum perf_hw_id { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - - PERF_HW_EVENTS_MAX, /* non ABI */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non ABI */ }; /* @@ -93,15 +93,15 @@ enum perf_hw_cache_op_result_id { * well): */ enum perf_sw_ids { - PERF_COUNT_CPU_CLOCK = 0, - PERF_COUNT_TASK_CLOCK = 1, - PERF_COUNT_PAGE_FAULTS = 2, - PERF_COUNT_CONTEXT_SWITCHES = 3, - PERF_COUNT_CPU_MIGRATIONS = 4, - PERF_COUNT_PAGE_FAULTS_MIN = 5, - PERF_COUNT_PAGE_FAULTS_MAJ = 6, - - PERF_SW_EVENTS_MAX, /* non ABI */ + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non ABI */ }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c02535bed26..8859b97390e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1024,7 +1024,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int do_switch = 1; regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -3411,13 +3411,13 @@ void perf_counter_task_migration(struct task_struct *task, int cpu) struct perf_counter_context *ctx; perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_CPU_MIGRATIONS, + PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); ctx = perf_pin_task_context(task); if (ctx) { perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_CPU_MIGRATIONS, + PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); perf_unpin_context(ctx); } @@ -3475,11 +3475,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * events. */ switch (counter->attr.config) { - case PERF_COUNT_CPU_CLOCK: + case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_TASK_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. 
@@ -3490,11 +3490,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_PAGE_FAULTS: - case PERF_COUNT_PAGE_FAULTS_MIN: - case PERF_COUNT_PAGE_FAULTS_MAJ: - case PERF_COUNT_CONTEXT_SWITCHES: - case PERF_COUNT_CPU_MIGRATIONS: + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: pmu = &perf_ops_generic; break; } diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 84cd336ae79..29259e74dcf 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -378,12 +378,12 @@ try_again: * is always available even if no PMU support: */ if (attr->type == PERF_TYPE_HARDWARE - && attr->config == PERF_COUNT_CPU_CYCLES) { + && attr->config == PERF_COUNT_HW_CPU_CYCLES) { if (verbose) warning(" ... trying to fall back to cpu-clock-ticks\n"); attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_CPU_CLOCK; + attr->config = PERF_COUNT_SW_CPU_CLOCK; goto try_again; } printf("\n"); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6404906924f..c43e4a97dc4 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -46,15 +46,16 @@ static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS }, - - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES}, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, + + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES}, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES }, + }; static int system_wide = 0; @@ -120,10 +121,10 @@ static inline int nsec_counter(int counter) if (attrs[counter].type != PERF_TYPE_SOFTWARE) return 0; - if (attrs[counter].config == PERF_COUNT_CPU_CLOCK) + if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK) return 1; - if (attrs[counter].config == PERF_COUNT_TASK_CLOCK) + if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) return 1; return 0; @@ -176,10 +177,10 @@ static void read_counter(int counter) * Save the full runtime - to allow normalization during printout: */ if (attrs[counter].type == PERF_TYPE_SOFTWARE && - attrs[counter].config == PERF_COUNT_TASK_CLOCK) + attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) runtime_nsecs = count[0]; if (attrs[counter].type == PERF_TYPE_HARDWARE && - attrs[counter].config == PERF_COUNT_CPU_CYCLES) + attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES) runtime_cycles = count[0]; } @@ -206,7 +207,7 @@ static void print_counter(int counter) fprintf(stderr, " %14.6f %-20s", msecs, event_name(counter)); if 
(attrs[counter].type == PERF_TYPE_SOFTWARE && - attrs[counter].config == PERF_COUNT_TASK_CLOCK) { + attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) { if (walltime_nsecs) fprintf(stderr, " # %11.3f CPU utilization factor", @@ -220,7 +221,7 @@ static void print_counter(int counter) (double)count[0]/runtime_nsecs*1000.0); if (runtime_cycles && attrs[counter].type == PERF_TYPE_HARDWARE && - attrs[counter].config == PERF_COUNT_INSTRUCTIONS) { + attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) { fprintf(stderr, " # %1.3f per cycle", (double)count[0] / (double)runtime_cycles); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 309dbc76ec8..fe338d3c5d7 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -562,13 +562,13 @@ try_again: * is always available even if no PMU support: */ if (attr->type == PERF_TYPE_HARDWARE - && attr->config == PERF_COUNT_CPU_CYCLES) { + && attr->config == PERF_COUNT_HW_CPU_CYCLES) { if (verbose) warning(" ... trying to fall back to cpu-clock-ticks\n"); attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_CPU_CLOCK; + attr->config = PERF_COUNT_SW_CPU_CLOCK; goto try_again; } printf("\n"); diff --git a/tools/perf/design.txt b/tools/perf/design.txt index d3250763dc9..860e116d979 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -99,13 +99,13 @@ enum hw_event_ids { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, }; These are standardized types of events that work relatively uniformly @@ -130,13 +130,13 @@ software events, selected by 'event_id': * well): */ enum sw_event_ids { - PERF_COUNT_CPU_CLOCK = 0, - PERF_COUNT_TASK_CLOCK = 1, - PERF_COUNT_PAGE_FAULTS = 2, - PERF_COUNT_CONTEXT_SWITCHES = 3, - PERF_COUNT_CPU_MIGRATIONS = 4, - PERF_COUNT_PAGE_FAULTS_MIN = 5, - PERF_COUNT_PAGE_FAULTS_MAJ = 6, + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, }; Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f18a9a006e1..9d5f1ca50e6 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -22,26 +22,26 @@ struct event_symbol { #define CR(x, y) .type = PERF_TYPE_##x, .config = y static struct event_symbol event_symbols[] = { - { C(HARDWARE, CPU_CYCLES), "cpu-cycles", }, - { C(HARDWARE, CPU_CYCLES), "cycles", }, - { C(HARDWARE, INSTRUCTIONS), "instructions", }, - { C(HARDWARE, CACHE_REFERENCES), "cache-references", }, - { C(HARDWARE, CACHE_MISSES), "cache-misses", }, - { C(HARDWARE, BRANCH_INSTRUCTIONS), "branch-instructions", }, - { C(HARDWARE, BRANCH_INSTRUCTIONS), "branches", }, - { C(HARDWARE, BRANCH_MISSES), "branch-misses", }, - { C(HARDWARE, BUS_CYCLES), "bus-cycles", }, - - { C(SOFTWARE, CPU_CLOCK), "cpu-clock", }, - { C(SOFTWARE, TASK_CLOCK), "task-clock", }, - { C(SOFTWARE, PAGE_FAULTS), "page-faults", }, - { 
C(SOFTWARE, PAGE_FAULTS), "faults", }, - { C(SOFTWARE, PAGE_FAULTS_MIN), "minor-faults", }, - { C(SOFTWARE, PAGE_FAULTS_MAJ), "major-faults", }, - { C(SOFTWARE, CONTEXT_SWITCHES), "context-switches", }, - { C(SOFTWARE, CONTEXT_SWITCHES), "cs", }, - { C(SOFTWARE, CPU_MIGRATIONS), "cpu-migrations", }, - { C(SOFTWARE, CPU_MIGRATIONS), "migrations", }, + { C(HARDWARE, HW_CPU_CYCLES), "cpu-cycles", }, + { C(HARDWARE, HW_CPU_CYCLES), "cycles", }, + { C(HARDWARE, HW_INSTRUCTIONS), "instructions", }, + { C(HARDWARE, HW_CACHE_REFERENCES), "cache-references", }, + { C(HARDWARE, HW_CACHE_MISSES), "cache-misses", }, + { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branch-instructions", }, + { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branches", }, + { C(HARDWARE, HW_BRANCH_MISSES), "branch-misses", }, + { C(HARDWARE, HW_BUS_CYCLES), "bus-cycles", }, + + { C(SOFTWARE, SW_CPU_CLOCK), "cpu-clock", }, + { C(SOFTWARE, SW_TASK_CLOCK), "task-clock", }, + { C(SOFTWARE, SW_PAGE_FAULTS), "page-faults", }, + { C(SOFTWARE, SW_PAGE_FAULTS), "faults", }, + { C(SOFTWARE, SW_PAGE_FAULTS_MIN), "minor-faults", }, + { C(SOFTWARE, SW_PAGE_FAULTS_MAJ), "major-faults", }, + { C(SOFTWARE, SW_CONTEXT_SWITCHES), "context-switches", }, + { C(SOFTWARE, SW_CONTEXT_SWITCHES), "cs", }, + { C(SOFTWARE, SW_CPU_MIGRATIONS), "cpu-migrations", }, + { C(SOFTWARE, SW_CPU_MIGRATIONS), "migrations", }, }; #define __PERF_COUNTER_FIELD(config, name) \ @@ -107,7 +107,7 @@ char *event_name(int counter) switch (type) { case PERF_TYPE_HARDWARE: - if (config < PERF_HW_EVENTS_MAX) + if (config < PERF_COUNT_HW_MAX) return hw_event_names[config]; return "unknown-hardware"; @@ -136,7 +136,7 @@ char *event_name(int counter) } case PERF_TYPE_SOFTWARE: - if (config < PERF_SW_EVENTS_MAX) + if (config < PERF_COUNT_SW_MAX) return sw_event_names[config]; return "unknown-software"; -- cgit v1.2.3-70-g09d2 From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:19:11 +0200 Subject: perf_counter: Rename L2 to LL cache The top (fastest) and last level (biggest) caches are the most interesting ones, performance wise. 
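With the rename, a last-level-cache read-miss counter would be requested roughly as below (an illustrative sketch, assuming the usual PERF_TYPE_HW_CACHE config packing of cache-id | op << 8 | result << 16):

	attr.type   = PERF_TYPE_HW_CACHE;
	attr.config = PERF_COUNT_HW_CACHE_LL |
		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
		      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
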
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: [ Fixed the Nehalem LL table to LLC Reference/Miss events ] Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/power4-pmu.c | 2 +- arch/powerpc/kernel/power5+-pmu.c | 2 +- arch/powerpc/kernel/power5-pmu.c | 2 +- arch/powerpc/kernel/power6-pmu.c | 2 +- arch/powerpc/kernel/power7-pmu.c | 2 +- arch/powerpc/kernel/ppc970-pmu.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ include/linux/perf_counter.h | 4 ++-- 8 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 73956f084b2..07bd308a5fa 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -561,7 +561,7 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0 }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0xc34, 0 }, diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 5f8b7741e97..41e5d2d958d 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -632,7 +632,7 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0 }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0xc50c3, 0 }, diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index d54723ab627..05600b66221 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -574,7 +574,7 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0x3c309b }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0xc50c3, 0 }, diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index 0cd406ee765..46f74bebcfd 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -493,7 +493,7 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0x4008c, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0x150730, 0x250532 }, [C(OP_WRITE)] = { 0x250432, 0x150432 }, [C(OP_PREFETCH)] = { 0x810a6, 0 }, diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c index 060e0deb399..b3f7d1216ba 100644 --- a/arch/powerpc/kernel/power7-pmu.c +++ b/arch/powerpc/kernel/power7-pmu.c @@ -320,7 +320,7 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0x408a, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0x6080, 0x6084 }, [C(OP_WRITE)] = { 0x6082, 0x6086 }, [C(OP_PREFETCH)] = { 0, 0 }, diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 46a20640942..ba0a357a89f 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -445,7 +445,7 @@ static int 
ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0 }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0x733, 0 }, diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 572fb434a66..895c82e7845 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -131,7 +131,7 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ @@ -141,8 +141,8 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ - [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ }, }, [ C(DTLB) ] = { @@ -222,7 +222,7 @@ static const u64 core2_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -313,7 +313,7 @@ static const u64 atom_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -422,7 +422,7 @@ static const u64 amd_0f_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0, [ C(RESULT_MISS) ] = 0, diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 887df88a9c2..20cf5af27ad 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -56,14 +56,14 @@ enum perf_hw_id { /* * Generalized hardware cache counters: * - * { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x * { read, write, prefetch } x * { accesses, misses } */ enum perf_hw_cache_id { PERF_COUNT_HW_CACHE_L1D = 0, PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_L2 = 2, + PERF_COUNT_HW_CACHE_LL = 2, PERF_COUNT_HW_CACHE_DTLB = 3, PERF_COUNT_HW_CACHE_ITLB = 4, PERF_COUNT_HW_CACHE_BPU = 5, -- cgit v1.2.3-70-g09d2 From a308444ceb576d3089f9ca0dfd097eba6f1e623f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Jun 2009 14:44:26 +0200 Subject: perf_counter: Better align code Whitespace and comment bits. Also update copyrights. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: --- include/linux/perf_counter.h | 165 ++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 20cf5af27ad..1fa1a26cb1b 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -1,12 +1,13 @@ /* * Performance counters: * - * Copyright(C) 2008, Thomas Gleixner - * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. 
* - * Started by: Thomas Gleixner and Ingo Molnar + * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ @@ -25,18 +26,19 @@ * attr.type */ enum perf_type_id { - PERF_TYPE_HARDWARE = 0, - PERF_TYPE_SOFTWARE = 1, - PERF_TYPE_TRACEPOINT = 2, - PERF_TYPE_HW_CACHE = 3, - PERF_TYPE_RAW = 4, + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, - PERF_TYPE_MAX, /* non ABI */ + PERF_TYPE_MAX, /* non-ABI */ }; /* - * Generalized performance counter event types, used by the attr.event_id - * parameter of the sys_perf_counter_open() syscall: + * Generalized performance counter event types, used by the + * attr.event_id parameter of the sys_perf_counter_open() + * syscall: */ enum perf_hw_id { /* @@ -50,7 +52,7 @@ enum perf_hw_id { PERF_COUNT_HW_BRANCH_MISSES = 5, PERF_COUNT_HW_BUS_CYCLES = 6, - PERF_COUNT_HW_MAX, /* non ABI */ + PERF_COUNT_HW_MAX, /* non-ABI */ }; /* @@ -61,29 +63,29 @@ enum perf_hw_id { * { accesses, misses } */ enum perf_hw_cache_id { - PERF_COUNT_HW_CACHE_L1D = 0, - PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_LL = 2, - PERF_COUNT_HW_CACHE_DTLB = 3, - PERF_COUNT_HW_CACHE_ITLB = 4, - PERF_COUNT_HW_CACHE_BPU = 5, - - PERF_COUNT_HW_CACHE_MAX, /* non ABI */ + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ }; enum perf_hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ = 0, - PERF_COUNT_HW_CACHE_OP_WRITE = 1, - PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - PERF_COUNT_HW_CACHE_OP_MAX, /* non ABI */ + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ }; enum perf_hw_cache_op_result_id { PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - PERF_COUNT_HW_CACHE_RESULT_MAX, /* non ABI */ + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ }; /* @@ -93,15 +95,15 @@ enum perf_hw_cache_op_result_id { * well): */ enum perf_sw_ids { - PERF_COUNT_SW_CPU_CLOCK = 0, - PERF_COUNT_SW_TASK_CLOCK = 1, - PERF_COUNT_SW_PAGE_FAULTS = 2, - PERF_COUNT_SW_CONTEXT_SWITCHES = 3, - PERF_COUNT_SW_CPU_MIGRATIONS = 4, - PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, - PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, - - PERF_COUNT_SW_MAX, /* non ABI */ + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non-ABI */ }; /* @@ -109,15 +111,15 @@ enum perf_sw_ids { * in the overflow packets. */ enum perf_counter_sample_format { - PERF_SAMPLE_IP = 1U << 0, - PERF_SAMPLE_TID = 1U << 1, - PERF_SAMPLE_TIME = 1U << 2, - PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, - PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_ID = 1U << 6, - PERF_SAMPLE_CPU = 1U << 7, - PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, }; /* @@ -126,9 +128,9 @@ enum perf_counter_sample_format { * in increasing order of bit value, after the counter value. 
*/ enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, - PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, - PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, }; /* @@ -229,12 +231,12 @@ struct perf_counter_mmap_page { __u64 data_head; /* head in the data section */ }; -#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) -#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (2 << 0) -#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) -#define PERF_EVENT_MISC_OVERFLOW (1 << 2) +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) +#define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -351,14 +353,14 @@ struct hw_perf_counter { #ifdef CONFIG_PERF_COUNTERS union { struct { /* hardware */ - u64 config; - unsigned long config_base; - unsigned long counter_base; - int idx; + u64 config; + unsigned long config_base; + unsigned long counter_base; + int idx; }; union { /* software */ - atomic64_t count; - struct hrtimer hrtimer; + atomic64_t count; + struct hrtimer hrtimer; }; }; atomic64_t prev_count; @@ -523,37 +525,37 @@ struct perf_counter_context { * Protect the states of the counters in the list, * nr_active, and the list: */ - spinlock_t lock; + spinlock_t lock; /* * Protect the list of counters. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ - struct mutex mutex; + struct mutex mutex; - struct list_head counter_list; - struct list_head event_list; - int nr_counters; - int nr_active; - int is_active; - atomic_t refcount; - struct task_struct *task; + struct list_head counter_list; + struct list_head event_list; + int nr_counters; + int nr_active; + int is_active; + atomic_t refcount; + struct task_struct *task; /* * Context clock, runs when context enabled. */ - u64 time; - u64 timestamp; + u64 time; + u64 timestamp; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. 
*/ - struct perf_counter_context *parent_ctx; - u64 parent_gen; - u64 generation; - int pin_count; - struct rcu_head rcu_head; + struct perf_counter_context *parent_ctx; + u64 parent_gen; + u64 generation; + int pin_count; + struct rcu_head rcu_head; }; /** @@ -604,9 +606,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); struct perf_sample_data { - struct pt_regs *regs; - u64 addr; - u64 period; + struct pt_regs *regs; + u64 addr; + u64 period; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, @@ -636,11 +638,14 @@ extern void perf_counter_fork(struct task_struct *tsk); extern void perf_counter_task_migration(struct task_struct *task, int cpu); -#define MAX_STACK_DEPTH 255 +#define MAX_STACK_DEPTH 255 struct perf_callchain_entry { - u16 nr, hv, kernel, user; - u64 ip[MAX_STACK_DEPTH]; + u16 nr; + u16 hv; + u16 kernel; + u16 user; + u64 ip[MAX_STACK_DEPTH]; }; extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); -- cgit v1.2.3-70-g09d2 From cca3f454a85ff42d426401bce7ac804541b2bd03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:57:55 +0200 Subject: perf_counter: Add counter->id to the throttle event So as to be able to distuinguish between multiple counters. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1fa1a26cb1b..6e133954e2e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -286,6 +286,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; + * u64 id; * }; */ PERF_EVENT_THROTTLE = 5, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8859b97390e..ef5d8a5b245 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2950,13 +2950,15 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct { struct perf_event_header header; u64 time; + u64 id; } throttle_event = { .header = { .type = PERF_EVENT_THROTTLE + 1, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), + .time = sched_clock(), + .id = counter->id, }; ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); -- cgit v1.2.3-70-g09d2 From 3c7b4e6b8be4c16f1e6e5c558e33b7ff0db2dfaf Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 11 Jun 2009 13:22:39 +0100 Subject: kmemleak: Add the base support This patch adds the base support for the kernel memory leak detector. It traces the memory allocation/freeing in a way similar to the Boehm's conservative garbage collector, the difference being that the unreferenced objects are not freed but only shown in /sys/kernel/debug/kmemleak. Enabling this feature introduces an overhead to memory allocations. Signed-off-by: Catalin Marinas Cc: Ingo Molnar Acked-by: Pekka Enberg Cc: Andrew Morton Reviewed-by: Paul E. 
McKenney --- include/linux/kmemleak.h | 96 +++ init/main.c | 4 +- mm/kmemleak.c | 1498 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1597 insertions(+), 1 deletion(-) create mode 100644 include/linux/kmemleak.h create mode 100644 mm/kmemleak.c (limited to 'include/linux') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h new file mode 100644 index 00000000000..7796aed6cdd --- /dev/null +++ b/include/linux/kmemleak.h @@ -0,0 +1,96 @@ +/* + * include/linux/kmemleak.h + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KMEMLEAK_H +#define __KMEMLEAK_H + +#ifdef CONFIG_DEBUG_KMEMLEAK + +extern void kmemleak_init(void); +extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp); +extern void kmemleak_free(const void *ptr); +extern void kmemleak_padding(const void *ptr, unsigned long offset, + size_t size); +extern void kmemleak_not_leak(const void *ptr); +extern void kmemleak_ignore(const void *ptr); +extern void kmemleak_scan_area(const void *ptr, unsigned long offset, + size_t length, gfp_t gfp); +extern void kmemleak_no_scan(const void *ptr); + +static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, + int min_count, unsigned long flags, + gfp_t gfp) +{ + if (!(flags & SLAB_NOLEAKTRACE)) + kmemleak_alloc(ptr, size, min_count, gfp); +} + +static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +{ + if (!(flags & SLAB_NOLEAKTRACE)) + kmemleak_free(ptr); +} + +static inline void kmemleak_erase(void **ptr) +{ + *ptr = NULL; +} + +#else + +static inline void kmemleak_init(void) +{ +} +static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) +{ +} +static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, + int min_count, unsigned long flags, + gfp_t gfp) +{ +} +static inline void kmemleak_free(const void *ptr) +{ +} +static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +{ +} +static inline void kmemleak_not_leak(const void *ptr) +{ +} +static inline void kmemleak_ignore(const void *ptr) +{ +} +static inline void kmemleak_scan_area(const void *ptr, unsigned long offset, + size_t length, gfp_t gfp) +{ +} +static inline void kmemleak_erase(void **ptr) +{ +} +static inline void kmemleak_no_scan(const void *ptr) +{ +} + +#endif /* CONFIG_DEBUG_KMEMLEAK */ + +#endif /* __KMEMLEAK_H */ diff --git a/init/main.c b/init/main.c index bb7dc57eee3..9d759d68d7a 100644 --- a/init/main.c +++ b/init/main.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -603,6 +604,7 @@ asmlinkage void __init start_kernel(void) /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); + prio_tree_init(); pidhash_init(); init_timers(); hrtimers_init(); @@ -654,6 +656,7 @@ asmlinkage void __init start_kernel(void) 
cpu_hotplug_init(); kmem_cache_init(); kmemtrace_init(); + kmemleak_init(); debug_objects_mem_init(); idr_init_cache(); setup_per_cpu_pageset(); @@ -663,7 +666,6 @@ asmlinkage void __init start_kernel(void) calibrate_delay(); pidmap_init(); pgtable_cache_init(); - prio_tree_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled) diff --git a/mm/kmemleak.c b/mm/kmemleak.c new file mode 100644 index 00000000000..58ec86c9e58 --- /dev/null +++ b/mm/kmemleak.c @@ -0,0 +1,1498 @@ +/* + * mm/kmemleak.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * For more information on the algorithm and kmemleak usage, please see + * Documentation/kmemleak.txt. + * + * Notes on locking + * ---------------- + * + * The following locks and mutexes are used by kmemleak: + * + * - kmemleak_lock (rwlock): protects the object_list modifications and + * accesses to the object_tree_root. The object_list is the main list + * holding the metadata (struct kmemleak_object) for the allocated memory + * blocks. The object_tree_root is a priority search tree used to look-up + * metadata based on a pointer to the corresponding memory block. The + * kmemleak_object structures are added to the object_list and + * object_tree_root in the create_object() function called from the + * kmemleak_alloc() callback and removed in delete_object() called from the + * kmemleak_free() callback + * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to + * the metadata (e.g. count) are protected by this lock. Note that some + * members of this structure may be protected by other means (atomic or + * kmemleak_lock). This lock is also held when scanning the corresponding + * memory block to avoid the kernel freeing it via the kmemleak_free() + * callback. This is less heavyweight than holding a global lock like + * kmemleak_lock during scanning + * - scan_mutex (mutex): ensures that only one thread may scan the memory for + * unreferenced objects at a time. The gray_list contains the objects which + * are already referenced or marked as false positives and need to be + * scanned. This list is only modified during a scanning episode when the + * scan_mutex is held. At the end of a scan, the gray_list is always empty. + * Note that the kmemleak_object.use_count is incremented when an object is + * added to the gray_list and therefore cannot be freed + * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs + * file together with modifications to the memory scanning parameters + * including the scan_thread pointer + * + * The kmemleak_object structures have a use_count incremented or decremented + * using the get_object()/put_object() functions. When the use_count becomes + * 0, this count can no longer be incremented and put_object() schedules the + * kmemleak_object freeing via an RCU callback. 
All calls to the get_object() + * function must be protected by rcu_read_lock() to avoid accessing a freed + * structure. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +/* + * Kmemleak configuration and common defines. + */ +#define MAX_TRACE 16 /* stack trace length */ +#define REPORTS_NR 50 /* maximum number of reported leaks */ +#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ +#define MSECS_SCAN_YIELD 10 /* CPU yielding period */ +#define SECS_FIRST_SCAN 60 /* delay before the first scan */ +#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ + +#define BYTES_PER_POINTER sizeof(void *) + +/* scanning area inside a memory block */ +struct kmemleak_scan_area { + struct hlist_node node; + unsigned long offset; + size_t length; +}; + +/* + * Structure holding the metadata for each allocated memory block. + * Modifications to such objects should be made while holding the + * object->lock. Insertions or deletions from object_list, gray_list or + * tree_node are already protected by the corresponding locks or mutex (see + * the notes on locking above). These objects are reference-counted + * (use_count) and freed using the RCU mechanism. + */ +struct kmemleak_object { + spinlock_t lock; + unsigned long flags; /* object status flags */ + struct list_head object_list; + struct list_head gray_list; + struct prio_tree_node tree_node; + struct rcu_head rcu; /* object_list lockless traversal */ + /* object usage count; object freed when use_count == 0 */ + atomic_t use_count; + unsigned long pointer; + size_t size; + /* minimum number of pointers found before it is considered a leak */ + int min_count; + /* the total number of pointers found pointing to this object */ + int count; + /* memory ranges to be scanned inside an object (empty for all) */ + struct hlist_head area_list; + unsigned long trace[MAX_TRACE]; + unsigned int trace_len; + unsigned long jiffies; /* creation timestamp */ + pid_t pid; /* pid of the current task */ + char comm[TASK_COMM_LEN]; /* executable name */ +}; + +/* flag representing the memory block allocation status */ +#define OBJECT_ALLOCATED (1 << 0) +/* flag set after the first reporting of an unreferenced object */ +#define OBJECT_REPORTED (1 << 1) +/* flag set to not scan the object */ +#define OBJECT_NO_SCAN (1 << 2) + +/* the list of all allocated objects */ +static LIST_HEAD(object_list); +/* the list of gray-colored objects (see color_gray comment below) */ +static LIST_HEAD(gray_list); +/* prio search tree for object boundaries */ +static struct prio_tree_root object_tree_root; +/* rw_lock protecting the access to object_list and prio_tree_root */ +static DEFINE_RWLOCK(kmemleak_lock); + +/* allocation caches for kmemleak internal data */ +static struct kmem_cache *object_cache; +static struct kmem_cache *scan_area_cache; + +/* set if tracing memory operations is enabled */ +static atomic_t kmemleak_enabled = ATOMIC_INIT(0); +/* set in the late_initcall if there were no errors */ +static atomic_t kmemleak_initialized = ATOMIC_INIT(0); +/* enables or disables early logging of the memory operations */ +static atomic_t kmemleak_early_log = ATOMIC_INIT(1); +/* set if a fatal kmemleak error has occurred */ +static atomic_t kmemleak_error =
ATOMIC_INIT(0); + +/* minimum and maximum address that may be valid pointers */ +static unsigned long min_addr = ULONG_MAX; +static unsigned long max_addr; + +/* used for yielding the CPU to other tasks during scanning */ +static unsigned long next_scan_yield; +static struct task_struct *scan_thread; +static unsigned long jiffies_scan_yield; +static unsigned long jiffies_min_age; +/* delay between automatic memory scans */ +static signed long jiffies_scan_wait; +/* enables or disables the task stack scanning */ +static int kmemleak_stack_scan; +/* mutex protecting the memory scanning */ +static DEFINE_MUTEX(scan_mutex); +/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */ +static DEFINE_MUTEX(kmemleak_mutex); + +/* number of leaks reported (for limitation purposes) */ +static int reported_leaks; + +/* + * Early object allocation/freeing logging. Kmemleak is initialized after the + * kernel allocator. However, both the kernel allocator and kmemleak may + * allocate memory blocks which need to be tracked. Kmemleak defines an + * arbitrary buffer to hold the allocation/freeing information before it is + * fully initialized. + */ + +/* kmemleak operation type for early logging */ +enum { + KMEMLEAK_ALLOC, + KMEMLEAK_FREE, + KMEMLEAK_NOT_LEAK, + KMEMLEAK_IGNORE, + KMEMLEAK_SCAN_AREA, + KMEMLEAK_NO_SCAN +}; + +/* + * Structure holding the information passed to kmemleak callbacks during the + * early logging. + */ +struct early_log { + int op_type; /* kmemleak operation type */ + const void *ptr; /* allocated/freed memory block */ + size_t size; /* memory block size */ + int min_count; /* minimum reference count */ + unsigned long offset; /* scan area offset */ + size_t length; /* scan area length */ +}; + +/* early logging buffer and current position */ +static struct early_log early_log[200]; +static int crt_early_log; + +static void kmemleak_disable(void); + +/* + * Print a warning and dump the stack trace. + */ +#define kmemleak_warn(x...) do { \ + pr_warning(x); \ + dump_stack(); \ +} while (0) + +/* + * Macro invoked when a serious kmemleak condition occurred and cannot be + * recovered from. Kmemleak will be disabled and further allocation/freeing + * tracing is no longer available. + */ +#define kmemleak_panic(x...) do { \ + kmemleak_warn(x); \ + kmemleak_disable(); \ +} while (0) + +/* + * Object colors, encoded with count and min_count: + * - white - orphan object, not enough references to it (count < min_count) + * - gray - not orphan, not marked as false positive (min_count == 0) or + * sufficient references to it (count >= min_count) + * - black - ignore, it doesn't contain references (e.g. text section) + * (min_count == -1). No function defined for this color. + * Newly created objects don't have any color assigned (object->count == -1) + * before the next memory scan when they become white. + */ +static int color_white(const struct kmemleak_object *object) +{ + return object->count != -1 && object->count < object->min_count; +} + +static int color_gray(const struct kmemleak_object *object) +{ + return object->min_count != -1 && object->count >= object->min_count; +} + +/* + * Objects are considered referenced if their color is gray and they have not + * been deleted.
+ */
+static int referenced_object(struct kmemleak_object *object)
+{
+	return (object->flags & OBJECT_ALLOCATED) && color_gray(object);
+}
+
+/*
+ * Objects are considered unreferenced only if their color is white, they have
+ * not been deleted and have a minimum age to avoid false positives caused by
+ * pointers temporarily stored in CPU registers.
+ */
+static int unreferenced_object(struct kmemleak_object *object)
+{
+	return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
+		time_is_before_eq_jiffies(object->jiffies + jiffies_min_age);
+}
+
+/*
+ * Printing of the (un)referenced objects information, either to the seq file
+ * or to the kernel log. The print_referenced/print_unreferenced functions
+ * must be called with the object->lock held.
+ */
+#define print_helper(seq, x...) do { \
+	struct seq_file *s = (seq); \
+	if (s) \
+		seq_printf(s, x); \
+	else \
+		pr_info(x); \
+} while (0)
+
+static void print_referenced(struct kmemleak_object *object)
+{
+	pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n",
+		object->pointer, object->size);
+}
+
+static void print_unreferenced(struct seq_file *seq,
+			       struct kmemleak_object *object)
+{
+	int i;
+
+	print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n",
+		     object->pointer, object->size);
+	print_helper(seq, "  comm \"%s\", pid %d, jiffies %lu\n",
+		     object->comm, object->pid, object->jiffies);
+	print_helper(seq, "  backtrace:\n");
+
+	for (i = 0; i < object->trace_len; i++) {
+		void *ptr = (void *)object->trace[i];
+		print_helper(seq, "    [<%p>] %pS\n", ptr, ptr);
+	}
+}
+
+/*
+ * Print the kmemleak_object information. This function is used mainly for
+ * debugging special cases of kmemleak operations. It must be called with
+ * the object->lock held.
+ */
+static void dump_object_info(struct kmemleak_object *object)
+{
+	struct stack_trace trace;
+
+	trace.nr_entries = object->trace_len;
+	trace.entries = object->trace;
+
+	pr_notice("kmemleak: Object 0x%08lx (size %zu):\n",
+		  object->tree_node.start, object->size);
+	pr_notice("  comm \"%s\", pid %d, jiffies %lu\n",
+		  object->comm, object->pid, object->jiffies);
+	pr_notice("  min_count = %d\n", object->min_count);
+	pr_notice("  count = %d\n", object->count);
+	pr_notice("  backtrace:\n");
+	print_stack_trace(&trace, 4);
+}
+
+/*
+ * Look up a memory block's metadata (kmemleak_object) in the priority search
+ * tree based on a pointer value. If alias is 0, only values pointing to the
+ * beginning of the memory block are allowed. The kmemleak_lock must be held
+ * when calling this function.
+ */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+	struct prio_tree_node *node;
+	struct prio_tree_iter iter;
+	struct kmemleak_object *object;
+
+	prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
+	node = prio_tree_next(&iter);
+	if (node) {
+		object = prio_tree_entry(node, struct kmemleak_object,
+					 tree_node);
+		if (!alias && object->pointer != ptr) {
+			kmemleak_warn("kmemleak: Found object by alias");
+			object = NULL;
+		}
+	} else
+		object = NULL;
+
+	return object;
+}
+
+/*
+ * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
+ * that once an object's use_count reaches 0, the RCU freeing has already been
+ * registered and the object should no longer be used. This function must be
+ * called under the protection of rcu_read_lock().
+ */
+static int get_object(struct kmemleak_object *object)
+{
+	return atomic_inc_not_zero(&object->use_count);
+}
+
+/*
+ * RCU callback to free a kmemleak_object.
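+ * It runs only after the final put_object() call has dropped use_count
+ * to 0. The surrounding lifetime protocol (editor's sketch, not part of
+ * the original patch) pairs the helpers defined below as:
+ *
+ *	object = find_and_get_object(ptr, 0);	use_count: 1 -> 2
+ *	... inspect object under object->lock ...
+ *	put_object(object);			use_count: 2 -> 1
+ *	delete_object(ptr);			drops the initial reference
+ *						and queues this callback
+ *						via call_rcu()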
+ */ +static void free_object_rcu(struct rcu_head *rcu) +{ + struct hlist_node *elem, *tmp; + struct kmemleak_scan_area *area; + struct kmemleak_object *object = + container_of(rcu, struct kmemleak_object, rcu); + + /* + * Once use_count is 0 (guaranteed by put_object), there is no other + * code accessing this object, hence no need for locking. + */ + hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { + hlist_del(elem); + kmem_cache_free(scan_area_cache, area); + } + kmem_cache_free(object_cache, object); +} + +/* + * Decrement the object use_count. Once the count is 0, free the object using + * an RCU callback. Since put_object() may be called via the kmemleak_free() -> + * delete_object() path, the delayed RCU freeing ensures that there is no + * recursive call to the kernel allocator. Lock-less RCU object_list traversal + * is also possible. + */ +static void put_object(struct kmemleak_object *object) +{ + if (!atomic_dec_and_test(&object->use_count)) + return; + + /* should only get here after delete_object was called */ + WARN_ON(object->flags & OBJECT_ALLOCATED); + + call_rcu(&object->rcu, free_object_rcu); +} + +/* + * Look up an object in the prio search tree and increase its use_count. + */ +static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) +{ + unsigned long flags; + struct kmemleak_object *object = NULL; + + rcu_read_lock(); + read_lock_irqsave(&kmemleak_lock, flags); + if (ptr >= min_addr && ptr < max_addr) + object = lookup_object(ptr, alias); + read_unlock_irqrestore(&kmemleak_lock, flags); + + /* check whether the object is still available */ + if (object && !get_object(object)) + object = NULL; + rcu_read_unlock(); + + return object; +} + +/* + * Create the metadata (struct kmemleak_object) corresponding to an allocated + * memory block and add it to the object_list and object_tree_root. + */ +static void create_object(unsigned long ptr, size_t size, int min_count, + gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object; + struct prio_tree_node *node; + struct stack_trace trace; + + object = kmem_cache_alloc(object_cache, gfp & ~GFP_SLAB_BUG_MASK); + if (!object) { + kmemleak_panic("kmemleak: Cannot allocate a kmemleak_object " + "structure\n"); + return; + } + + INIT_LIST_HEAD(&object->object_list); + INIT_LIST_HEAD(&object->gray_list); + INIT_HLIST_HEAD(&object->area_list); + spin_lock_init(&object->lock); + atomic_set(&object->use_count, 1); + object->flags = OBJECT_ALLOCATED; + object->pointer = ptr; + object->size = size; + object->min_count = min_count; + object->count = -1; /* no color initially */ + object->jiffies = jiffies; + + /* task information */ + if (in_irq()) { + object->pid = 0; + strncpy(object->comm, "hardirq", sizeof(object->comm)); + } else if (in_softirq()) { + object->pid = 0; + strncpy(object->comm, "softirq", sizeof(object->comm)); + } else { + object->pid = current->pid; + /* + * There is a small chance of a race with set_task_comm(), + * however using get_task_comm() here may cause locking + * dependency issues with current->alloc_lock. In the worst + * case, the command line is not correct. 
+ */
+		strncpy(object->comm, current->comm, sizeof(object->comm));
+	}
+
+	/* kernel backtrace */
+	trace.max_entries = MAX_TRACE;
+	trace.nr_entries = 0;
+	trace.entries = object->trace;
+	trace.skip = 1;
+	save_stack_trace(&trace);
+	object->trace_len = trace.nr_entries;
+
+	INIT_PRIO_TREE_NODE(&object->tree_node);
+	object->tree_node.start = ptr;
+	object->tree_node.last = ptr + size - 1;
+
+	write_lock_irqsave(&kmemleak_lock, flags);
+	min_addr = min(min_addr, ptr);
+	max_addr = max(max_addr, ptr + size);
+	node = prio_tree_insert(&object_tree_root, &object->tree_node);
+	/*
+	 * The code calling the kernel does not yet have the pointer to the
+	 * memory block to be able to free it. However, we still hold the
+	 * kmemleak_lock here in case parts of the kernel started freeing
+	 * random memory blocks.
+	 */
+	if (node != &object->tree_node) {
+		unsigned long flags;
+
+		kmemleak_panic("kmemleak: Cannot insert 0x%lx into the object "
+			       "search tree (already existing)\n", ptr);
+		object = lookup_object(ptr, 1);
+		spin_lock_irqsave(&object->lock, flags);
+		dump_object_info(object);
+		spin_unlock_irqrestore(&object->lock, flags);
+
+		goto out;
+	}
+	list_add_tail_rcu(&object->object_list, &object_list);
+out:
+	write_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
+/*
+ * Remove the metadata (struct kmemleak_object) for a memory block from the
+ * object_list and object_tree_root and decrement its use_count.
+ */
+static void delete_object(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	write_lock_irqsave(&kmemleak_lock, flags);
+	object = lookup_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n",
+			      ptr);
+		write_unlock_irqrestore(&kmemleak_lock, flags);
+		return;
+	}
+	prio_tree_remove(&object_tree_root, &object->tree_node);
+	list_del_rcu(&object->object_list);
+	write_unlock_irqrestore(&kmemleak_lock, flags);
+
+	WARN_ON(!(object->flags & OBJECT_ALLOCATED));
+	WARN_ON(atomic_read(&object->use_count) < 1);
+
+	/*
+	 * Locking here also ensures that the corresponding memory block
+	 * cannot be freed when it is being scanned.
+	 */
+	spin_lock_irqsave(&object->lock, flags);
+	if (object->flags & OBJECT_REPORTED)
+		print_referenced(object);
+	object->flags &= ~OBJECT_ALLOCATED;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Mark an object permanently as gray-colored so that it can no longer be
+ * reported as a leak. This is used in general to mark a false positive.
+ */
+static void make_gray_object(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	object->min_count = 0;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Mark the object as black-colored so that it is ignored from scans and
+ * reporting.
+ */
+static void make_black_object(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	object->min_count = -1;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Add a scanning area to the object.
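+ * (Editor's illustrative example, not part of the original patch: a
+ * caller that only wants the pointer-bearing header of a large buffer
+ * scanned could register
+ *	kmemleak_scan_area(obj, offsetof(struct foo, list),
+ *			   sizeof(struct list_head), GFP_KERNEL);
+ * where struct foo is a hypothetical structure embedding a list_head;
+ * the slab allocator makes the same call for off-slab management
+ * structures later in this series.)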
+ * If at least one such area is added, kmemleak will only scan these
+ * ranges rather than the whole memory block.
+ */
+static void add_scan_area(unsigned long ptr, unsigned long offset,
+			  size_t length, gfp_t gfp)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+	struct kmemleak_scan_area *area;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Adding scan area to unknown "
+			      "object at 0x%08lx\n", ptr);
+		return;
+	}
+
+	area = kmem_cache_alloc(scan_area_cache, gfp & ~GFP_SLAB_BUG_MASK);
+	if (!area) {
+		kmemleak_warn("kmemleak: Cannot allocate a scan area\n");
+		goto out;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	if (offset + length > object->size) {
+		kmemleak_warn("kmemleak: Scan area larger than object "
+			      "0x%08lx\n", ptr);
+		dump_object_info(object);
+		kmem_cache_free(scan_area_cache, area);
+		goto out_unlock;
+	}
+
+	INIT_HLIST_NODE(&area->node);
+	area->offset = offset;
+	area->length = length;
+
+	hlist_add_head(&area->node, &object->area_list);
+out_unlock:
+	spin_unlock_irqrestore(&object->lock, flags);
+out:
+	put_object(object);
+}
+
+/*
+ * Set the OBJECT_NO_SCAN flag for the object corresponding to the given
+ * pointer. Such an object will not be scanned by kmemleak but references
+ * to it are searched.
+ */
+static void object_no_scan(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Not scanning unknown object at "
+			      "0x%08lx\n", ptr);
+		return;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	object->flags |= OBJECT_NO_SCAN;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Log an early kmemleak_* call to the early_log buffer. These calls will be
+ * processed later once kmemleak is fully initialized.
+ */
+static void log_early(int op_type, const void *ptr, size_t size,
+		      int min_count, unsigned long offset, size_t length)
+{
+	unsigned long flags;
+	struct early_log *log;
+
+	if (crt_early_log >= ARRAY_SIZE(early_log)) {
+		kmemleak_panic("kmemleak: Early log buffer exceeded\n");
+		return;
+	}
+
+	/*
+	 * There is no need for locking since the kernel is still in UP mode
+	 * at this stage. Disabling the IRQs is enough.
+	 */
+	local_irq_save(flags);
+	log = &early_log[crt_early_log];
+	log->op_type = op_type;
+	log->ptr = ptr;
+	log->size = size;
+	log->min_count = min_count;
+	log->offset = offset;
+	log->length = length;
+	crt_early_log++;
+	local_irq_restore(flags);
+}
+
+/*
+ * Memory allocation function callback. This function is called from the
+ * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
+ * vmalloc etc.).
+ */
+void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp)
+{
+	pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		create_object((unsigned long)ptr, size, min_count, gfp);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc);
+
+/*
+ * Memory freeing function callback. This function is called from the kernel
+ * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
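+ * Together with kmemleak_alloc() above, this forms the pair of hooks an
+ * allocator is expected to call (editor's sketch of the intended hook
+ * placement, not part of the original patch; my_alloc/my_free are
+ * hypothetical):
+ *
+ *	ptr = my_alloc(size);			allocate the real block
+ *	kmemleak_alloc(ptr, size, 1, gfp);	start tracking it
+ *	...
+ *	kmemleak_free(ptr);			stop tracking it
+ *	my_free(ptr);				release the real block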
+ */
+void kmemleak_free(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		delete_object((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free);
+
+/*
+ * Mark an already allocated memory block as a false positive. This will cause
+ * the block to no longer be reported as a leak and always be scanned.
+ */
+void kmemleak_not_leak(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		make_gray_object((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_not_leak);
+
+/*
+ * Ignore a memory block. This is usually done when it is known that the
+ * corresponding block is not a leak and does not contain any references to
+ * other allocated memory blocks.
+ */
+void kmemleak_ignore(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		make_black_object((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_ignore);
+
+/*
+ * Limit the range to be scanned in an allocated memory block.
+ */
+void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length,
+			gfp_t gfp)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		add_scan_area((unsigned long)ptr, offset, length, gfp);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
+}
+EXPORT_SYMBOL(kmemleak_scan_area);
+
+/*
+ * Inform kmemleak not to scan the given memory block.
+ */
+void kmemleak_no_scan(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		object_no_scan((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_no_scan);
+
+/*
+ * Yield the CPU so that other tasks get a chance to run. The yielding is
+ * rate-limited to avoid an excessive number of calls to the schedule()
+ * function during memory scanning.
+ */
+static void scan_yield(void)
+{
+	might_sleep();
+
+	if (time_is_before_eq_jiffies(next_scan_yield)) {
+		schedule();
+		next_scan_yield = jiffies + jiffies_scan_yield;
+	}
+}
+
+/*
+ * Memory scanning is a long process and it needs to be interruptible. This
+ * function checks whether such an interrupt condition has occurred.
+ */
+static int scan_should_stop(void)
+{
+	if (!atomic_read(&kmemleak_enabled))
+		return 1;
+
+	/*
+	 * This function may be called from either process or kthread context,
+	 * hence the need to check for both stop conditions.
+	 */
+	if (current->mm)
+		return signal_pending(current);
+	else
+		return kthread_should_stop();
+
+	return 0;
+}
+
+/*
+ * Scan a memory block (exclusive range) for valid pointers and add those
+ * found to the gray list.
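+ * The range is walked one pointer-aligned, pointer-sized word at a time
+ * and each value is looked up in the object tree. As a concrete example
+ * (editor's illustration, not part of the original patch), on a 64-bit
+ * build a 64-byte block contributes at most 8 candidate values, and only
+ * those falling within [min_addr, max_addr) are looked up at all.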
+ */
+static void scan_block(void *_start, void *_end,
+		       struct kmemleak_object *scanned)
+{
+	unsigned long *ptr;
+	unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
+	unsigned long *end = _end - (BYTES_PER_POINTER - 1);
+
+	for (ptr = start; ptr < end; ptr++) {
+		unsigned long flags;
+		unsigned long pointer = *ptr;
+		struct kmemleak_object *object;
+
+		if (scan_should_stop())
+			break;
+
+		/*
+		 * When scanning a memory block with a corresponding
+		 * kmemleak_object, the CPU yielding is handled in the calling
+		 * code since it holds the object->lock to avoid the block
+		 * freeing.
+		 */
+		if (!scanned)
+			scan_yield();
+
+		object = find_and_get_object(pointer, 1);
+		if (!object)
+			continue;
+		if (object == scanned) {
+			/* self referenced, ignore */
+			put_object(object);
+			continue;
+		}
+
+		/*
+		 * Avoid the lockdep recursive warning on object->lock being
+		 * previously acquired in scan_object(). These locks are
+		 * enclosed by scan_mutex.
+		 */
+		spin_lock_irqsave_nested(&object->lock, flags,
+					 SINGLE_DEPTH_NESTING);
+		if (!color_white(object)) {
+			/* non-orphan, ignored or new */
+			spin_unlock_irqrestore(&object->lock, flags);
+			put_object(object);
+			continue;
+		}
+
+		/*
+		 * Increase the object's reference count (number of pointers
+		 * to the memory block). If this count reaches the required
+		 * minimum, the object's color will become gray and it will be
+		 * added to the gray_list.
+		 */
+		object->count++;
+		if (color_gray(object))
+			list_add_tail(&object->gray_list, &gray_list);
+		else
+			put_object(object);
+		spin_unlock_irqrestore(&object->lock, flags);
+	}
+}
+
+/*
+ * Scan a memory block corresponding to a kmemleak_object. A precondition is
+ * that object->use_count >= 1.
+ */
+static void scan_object(struct kmemleak_object *object)
+{
+	struct kmemleak_scan_area *area;
+	struct hlist_node *elem;
+	unsigned long flags;
+
+	/*
+	 * Once the object->lock is acquired, the corresponding memory block
+	 * cannot be freed (the same lock is acquired in delete_object).
+	 */
+	spin_lock_irqsave(&object->lock, flags);
+	if (object->flags & OBJECT_NO_SCAN)
+		goto out;
+	if (!(object->flags & OBJECT_ALLOCATED))
+		/* already freed object */
+		goto out;
+	if (hlist_empty(&object->area_list))
+		scan_block((void *)object->pointer,
+			   (void *)(object->pointer + object->size), object);
+	else
+		hlist_for_each_entry(area, elem, &object->area_list, node)
+			scan_block((void *)(object->pointer + area->offset),
+				   (void *)(object->pointer + area->offset
+					    + area->length), object);
+out:
+	spin_unlock_irqrestore(&object->lock, flags);
+}
+
+/*
+ * Scan data sections and all the referenced memory blocks allocated via the
+ * kernel's standard allocators. This function must be called with the
+ * scan_mutex held.
+ */
+static void kmemleak_scan(void)
+{
+	unsigned long flags;
+	struct kmemleak_object *object, *tmp;
+	struct task_struct *task;
+	int i;
+
+	/* prepare the kmemleak_object's */
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		spin_lock_irqsave(&object->lock, flags);
+#ifdef DEBUG
+		/*
+		 * With a few exceptions there should be a maximum of
+		 * 1 reference to any object at this point.
+ */ + if (atomic_read(&object->use_count) > 1) { + pr_debug("kmemleak: object->use_count = %d\n", + atomic_read(&object->use_count)); + dump_object_info(object); + } +#endif + /* reset the reference count (whiten the object) */ + object->count = 0; + if (color_gray(object) && get_object(object)) + list_add_tail(&object->gray_list, &gray_list); + + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + /* data/bss scanning */ + scan_block(_sdata, _edata, NULL); + scan_block(__bss_start, __bss_stop, NULL); + +#ifdef CONFIG_SMP + /* per-cpu sections scanning */ + for_each_possible_cpu(i) + scan_block(__per_cpu_start + per_cpu_offset(i), + __per_cpu_end + per_cpu_offset(i), NULL); +#endif + + /* + * Struct page scanning for each node. The code below is not yet safe + * with MEMORY_HOTPLUG. + */ + for_each_online_node(i) { + pg_data_t *pgdat = NODE_DATA(i); + unsigned long start_pfn = pgdat->node_start_pfn; + unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* only scan if page is in use */ + if (page_count(page) == 0) + continue; + scan_block(page, page + 1, NULL); + } + } + + /* + * Scanning the task stacks may introduce false negatives and it is + * not enabled by default. + */ + if (kmemleak_stack_scan) { + read_lock(&tasklist_lock); + for_each_process(task) + scan_block(task_stack_page(task), + task_stack_page(task) + THREAD_SIZE, NULL); + read_unlock(&tasklist_lock); + } + + /* + * Scan the objects already referenced from the sections scanned + * above. More objects will be referenced and, if there are no memory + * leaks, all the objects will be scanned. The list traversal is safe + * for both tail additions and removals from inside the loop. The + * kmemleak objects cannot be freed from outside the loop because their + * use_count was increased. + */ + object = list_entry(gray_list.next, typeof(*object), gray_list); + while (&object->gray_list != &gray_list) { + scan_yield(); + + /* may add new objects to the list */ + if (!scan_should_stop()) + scan_object(object); + + tmp = list_entry(object->gray_list.next, typeof(*object), + gray_list); + + /* remove the object from the list and release it */ + list_del(&object->gray_list); + put_object(object); + + object = tmp; + } + WARN_ON(!list_empty(&gray_list)); +} + +/* + * Thread function performing automatic memory scanning. Unreferenced objects + * at the end of a memory scan are reported but only the first time. + */ +static int kmemleak_scan_thread(void *arg) +{ + static int first_run = 1; + + pr_info("kmemleak: Automatic memory scanning thread started\n"); + + /* + * Wait before the first scan to allow the system to fully initialize. 
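+	 * (SECS_FIRST_SCAN, i.e. 60 seconds with the defaults above.
+	 * Editor's note: first_run is a static local, so the delay applies
+	 * only to the first thread instance; a thread restarted later via
+	 * "scan=on" starts scanning immediately.)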
+ */ + if (first_run) { + first_run = 0; + ssleep(SECS_FIRST_SCAN); + } + + while (!kthread_should_stop()) { + struct kmemleak_object *object; + signed long timeout = jiffies_scan_wait; + + mutex_lock(&scan_mutex); + + kmemleak_scan(); + reported_leaks = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + unsigned long flags; + + if (reported_leaks >= REPORTS_NR) + break; + spin_lock_irqsave(&object->lock, flags); + if (!(object->flags & OBJECT_REPORTED) && + unreferenced_object(object)) { + print_unreferenced(NULL, object); + object->flags |= OBJECT_REPORTED; + reported_leaks++; + } else if ((object->flags & OBJECT_REPORTED) && + referenced_object(object)) { + print_referenced(object); + object->flags &= ~OBJECT_REPORTED; + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + mutex_unlock(&scan_mutex); + /* wait before the next scan */ + while (timeout && !kthread_should_stop()) + timeout = schedule_timeout_interruptible(timeout); + } + + pr_info("kmemleak: Automatic memory scanning thread ended\n"); + + return 0; +} + +/* + * Start the automatic memory scanning thread. This function must be called + * with the kmemleak_mutex held. + */ +void start_scan_thread(void) +{ + if (scan_thread) + return; + scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); + if (IS_ERR(scan_thread)) { + pr_warning("kmemleak: Failed to create the scan thread\n"); + scan_thread = NULL; + } +} + +/* + * Stop the automatic memory scanning thread. This function must be called + * with the kmemleak_mutex held. + */ +void stop_scan_thread(void) +{ + if (scan_thread) { + kthread_stop(scan_thread); + scan_thread = NULL; + } +} + +/* + * Iterate over the object_list and return the first valid object at or after + * the required position with its use_count incremented. The function triggers + * a memory scanning when the pos argument points to the first position. + */ +static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct kmemleak_object *object; + loff_t n = *pos; + + if (!n) { + kmemleak_scan(); + reported_leaks = 0; + } + if (reported_leaks >= REPORTS_NR) + return NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + if (n-- > 0) + continue; + if (get_object(object)) + goto out; + } + object = NULL; +out: + rcu_read_unlock(); + return object; +} + +/* + * Return the next object in the object_list. The function decrements the + * use_count of the previous object and increases that of the next one. + */ +static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct kmemleak_object *prev_obj = v; + struct kmemleak_object *next_obj = NULL; + struct list_head *n = &prev_obj->object_list; + + ++(*pos); + if (reported_leaks >= REPORTS_NR) + goto out; + + rcu_read_lock(); + list_for_each_continue_rcu(n, &object_list) { + next_obj = list_entry(n, struct kmemleak_object, object_list); + if (get_object(next_obj)) + break; + } + rcu_read_unlock(); +out: + put_object(prev_obj); + return next_obj; +} + +/* + * Decrement the use_count of the last object required, if any. + */ +static void kmemleak_seq_stop(struct seq_file *seq, void *v) +{ + if (v) + put_object(v); +} + +/* + * Print the information for an unreferenced object to the seq file. 
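+ * Together with the start/next/stop callbacks above this completes the
+ * seq_file iterator, so pending leaks can be read from userspace once
+ * the debugfs file is registered (editor's illustration, not part of
+ * the original patch):
+ *
+ *	# mount -t debugfs nodev /sys/kernel/debug
+ *	# cat /sys/kernel/debug/kmemleak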
+ */ +static int kmemleak_seq_show(struct seq_file *seq, void *v) +{ + struct kmemleak_object *object = v; + unsigned long flags; + + spin_lock_irqsave(&object->lock, flags); + if (!unreferenced_object(object)) + goto out; + print_unreferenced(seq, object); + reported_leaks++; +out: + spin_unlock_irqrestore(&object->lock, flags); + return 0; +} + +static const struct seq_operations kmemleak_seq_ops = { + .start = kmemleak_seq_start, + .next = kmemleak_seq_next, + .stop = kmemleak_seq_stop, + .show = kmemleak_seq_show, +}; + +static int kmemleak_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (!atomic_read(&kmemleak_enabled)) + return -EBUSY; + + ret = mutex_lock_interruptible(&kmemleak_mutex); + if (ret < 0) + goto out; + if (file->f_mode & FMODE_READ) { + ret = mutex_lock_interruptible(&scan_mutex); + if (ret < 0) + goto kmemleak_unlock; + ret = seq_open(file, &kmemleak_seq_ops); + if (ret < 0) + goto scan_unlock; + } + return ret; + +scan_unlock: + mutex_unlock(&scan_mutex); +kmemleak_unlock: + mutex_unlock(&kmemleak_mutex); +out: + return ret; +} + +static int kmemleak_release(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (file->f_mode & FMODE_READ) { + seq_release(inode, file); + mutex_unlock(&scan_mutex); + } + mutex_unlock(&kmemleak_mutex); + + return ret; +} + +/* + * File write operation to configure kmemleak at run-time. The following + * commands can be written to the /sys/kernel/debug/kmemleak file: + * off - disable kmemleak (irreversible) + * stack=on - enable the task stacks scanning + * stack=off - disable the tasks stacks scanning + * scan=on - start the automatic memory scanning thread + * scan=off - stop the automatic memory scanning thread + * scan=... - set the automatic memory scanning period in seconds (0 to + * disable it) + */ +static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, + size_t size, loff_t *ppos) +{ + char buf[64]; + int buf_size; + + if (!atomic_read(&kmemleak_enabled)) + return -EBUSY; + + buf_size = min(size, (sizeof(buf) - 1)); + if (strncpy_from_user(buf, user_buf, buf_size) < 0) + return -EFAULT; + buf[buf_size] = 0; + + if (strncmp(buf, "off", 3) == 0) + kmemleak_disable(); + else if (strncmp(buf, "stack=on", 8) == 0) + kmemleak_stack_scan = 1; + else if (strncmp(buf, "stack=off", 9) == 0) + kmemleak_stack_scan = 0; + else if (strncmp(buf, "scan=on", 7) == 0) + start_scan_thread(); + else if (strncmp(buf, "scan=off", 8) == 0) + stop_scan_thread(); + else if (strncmp(buf, "scan=", 5) == 0) { + unsigned long secs; + int err; + + err = strict_strtoul(buf + 5, 0, &secs); + if (err < 0) + return err; + stop_scan_thread(); + if (secs) { + jiffies_scan_wait = msecs_to_jiffies(secs * 1000); + start_scan_thread(); + } + } else + return -EINVAL; + + /* ignore the rest of the buffer, only one command at a time */ + *ppos += size; + return size; +} + +static const struct file_operations kmemleak_fops = { + .owner = THIS_MODULE, + .open = kmemleak_open, + .read = seq_read, + .write = kmemleak_write, + .llseek = seq_lseek, + .release = kmemleak_release, +}; + +/* + * Perform the freeing of the kmemleak internal objects after waiting for any + * current memory scan to complete. 
+ */
+static int kmemleak_cleanup_thread(void *arg)
+{
+	struct kmemleak_object *object;
+
+	mutex_lock(&kmemleak_mutex);
+	stop_scan_thread();
+	mutex_unlock(&kmemleak_mutex);
+
+	mutex_lock(&scan_mutex);
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list)
+		delete_object(object->pointer);
+	rcu_read_unlock();
+	mutex_unlock(&scan_mutex);
+
+	return 0;
+}
+
+/*
+ * Start the clean-up thread.
+ */
+static void kmemleak_cleanup(void)
+{
+	struct task_struct *cleanup_thread;
+
+	cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
+				     "kmemleak-clean");
+	if (IS_ERR(cleanup_thread))
+		pr_warning("kmemleak: Failed to create the clean-up thread\n");
+}
+
+/*
+ * Disable kmemleak. No memory allocation/freeing will be traced once this
+ * function is called. Disabling kmemleak is an irreversible operation.
+ */
+static void kmemleak_disable(void)
+{
+	/* atomically check whether it was already invoked */
+	if (atomic_cmpxchg(&kmemleak_error, 0, 1))
+		return;
+
+	/* stop any memory operation tracing */
+	atomic_set(&kmemleak_early_log, 0);
+	atomic_set(&kmemleak_enabled, 0);
+
+	/* check whether it is too early for a kernel thread */
+	if (atomic_read(&kmemleak_initialized))
+		kmemleak_cleanup();
+
+	pr_info("Kernel memory leak detector disabled\n");
+}
+
+/*
+ * Allow boot-time kmemleak disabling (enabled by default).
+ */
+static int kmemleak_boot_config(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (strcmp(str, "off") == 0)
+		kmemleak_disable();
+	else if (strcmp(str, "on") != 0)
+		return -EINVAL;
+	return 0;
+}
+early_param("kmemleak", kmemleak_boot_config);
+
+/*
+ * Kmemleak initialization.
+ */
+void __init kmemleak_init(void)
+{
+	int i;
+	unsigned long flags;
+
+	jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD);
+	jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
+	jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+
+	object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
+	scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
+	INIT_PRIO_TREE_ROOT(&object_tree_root);
+
+	/* the kernel is still in UP mode, so disabling the IRQs is enough */
+	local_irq_save(flags);
+	if (!atomic_read(&kmemleak_error)) {
+		atomic_set(&kmemleak_enabled, 1);
+		atomic_set(&kmemleak_early_log, 0);
+	}
+	local_irq_restore(flags);
+
+	/*
+	 * This is the point where tracking allocations is safe. Automatic
+	 * scanning is started during the late initcall. Add the early logged
+	 * callbacks to the kmemleak infrastructure.
+	 */
+	for (i = 0; i < crt_early_log; i++) {
+		struct early_log *log = &early_log[i];
+
+		switch (log->op_type) {
+		case KMEMLEAK_ALLOC:
+			kmemleak_alloc(log->ptr, log->size, log->min_count,
+				       GFP_KERNEL);
+			break;
+		case KMEMLEAK_FREE:
+			kmemleak_free(log->ptr);
+			break;
+		case KMEMLEAK_NOT_LEAK:
+			kmemleak_not_leak(log->ptr);
+			break;
+		case KMEMLEAK_IGNORE:
+			kmemleak_ignore(log->ptr);
+			break;
+		case KMEMLEAK_SCAN_AREA:
+			kmemleak_scan_area(log->ptr, log->offset, log->length,
+					   GFP_KERNEL);
+			break;
+		case KMEMLEAK_NO_SCAN:
+			kmemleak_no_scan(log->ptr);
+			break;
+		default:
+			WARN_ON(1);
+		}
+	}
+}
+
+/*
+ * Late initialization function.
+ */
+static int __init kmemleak_late_init(void)
+{
+	struct dentry *dentry;
+
+	atomic_set(&kmemleak_initialized, 1);
+
+	if (atomic_read(&kmemleak_error)) {
+		/*
+		 * Some error occurred and kmemleak was disabled.
There is a + * small chance that kmemleak_disable() was called immediately + * after setting kmemleak_initialized and we may end up with + * two clean-up threads but serialized by scan_mutex. + */ + kmemleak_cleanup(); + return -ENOMEM; + } + + dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, + &kmemleak_fops); + if (!dentry) + pr_warning("kmemleak: Failed to create the debugfs kmemleak " + "file\n"); + mutex_lock(&kmemleak_mutex); + start_scan_thread(); + mutex_unlock(&kmemleak_mutex); + + pr_info("Kernel memory leak detector initialized\n"); + + return 0; +} +late_initcall(kmemleak_late_init); -- cgit v1.2.3-70-g09d2 From d5cff635290aec9ad7e6ee546aa4fae895361cbb Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 11 Jun 2009 13:22:40 +0100 Subject: kmemleak: Add the slab memory allocation/freeing hooks This patch adds the callbacks to kmemleak_(alloc|free) functions from the slab allocator. The patch also adds the SLAB_NOLEAKTRACE flag to avoid recursive calls to kmemleak when it allocates its own data structures. Signed-off-by: Catalin Marinas Reviewed-by: Pekka Enberg --- include/linux/slab.h | 2 ++ mm/slab.c | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 24c5602bee9..48803064ced 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -62,6 +62,8 @@ # define SLAB_DEBUG_OBJECTS 0x00000000UL #endif +#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ + /* The following flags affect the page allocator grouping pages by mobility */ #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ diff --git a/mm/slab.c b/mm/slab.c index f85831da908..859067f8e4f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -107,6 +107,7 @@ #include #include #include +#include #include #include #include @@ -178,13 +179,13 @@ SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS) + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS) + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) #endif /* @@ -964,6 +965,14 @@ static struct array_cache *alloc_arraycache(int node, int entries, struct array_cache *nc = NULL; nc = kmalloc_node(memsize, GFP_KERNEL, node); + /* + * The array_cache structures contain pointers to free object. + * However, when such objects are allocated or transfered to another + * cache the pointers are not cleared and they could be counted as + * valid references during a kmemleak scan. Therefore, kmemleak must + * not scan such objects. + */ + kmemleak_no_scan(nc); if (nc) { nc->avail = 0; nc->limit = entries; @@ -2621,6 +2630,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc_node(cachep->slabp_cache, local_flags, nodeid); + /* + * If the first object in the slab is leaked (it's allocated + * but no one has a reference to it), we want to make sure + * kmemleak does not treat the ->s_mem pointer as a reference + * to the object. Otherwise we will not report the leak. 
+ */ + kmemleak_scan_area(slabp, offsetof(struct slab, list), + sizeof(struct list_head), local_flags); if (!slabp) return NULL; } else { @@ -3141,6 +3158,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) STATS_INC_ALLOCMISS(cachep); objp = cache_alloc_refill(cachep, flags); } + /* + * To avoid a false negative, if an object that is in one of the + * per-CPU caches is leaked, we need to make sure kmemleak doesn't + * treat the array pointers as a reference to the object. + */ + kmemleak_erase(&ac->entry[ac->avail]); return objp; } @@ -3360,6 +3383,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, + flags); if (unlikely((flags & __GFP_ZERO) && ptr)) memset(ptr, 0, obj_size(cachep)); @@ -3415,6 +3440,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, + flags); prefetchw(objp); if (unlikely((flags & __GFP_ZERO) && objp)) @@ -3530,6 +3557,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); + kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); /* -- cgit v1.2.3-70-g09d2 From 2e1483c995bbd0fa6cbd055ad76088a520799ba4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 11 Jun 2009 13:24:13 +0100 Subject: kmemleak: Remove some of the kmemleak false positives There are allocations for which the main pointer cannot be found but they are not memory leaks. This patch fixes some of them. For more information on false positives, see Documentation/kmemleak.txt. Signed-off-by: Catalin Marinas --- drivers/char/vt.c | 7 +++++++ fs/block_dev.c | 6 ++++++ include/linux/percpu.h | 5 +++++ 3 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 08151d4de48..961c1a788c6 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -104,6 +104,7 @@ #include #include #include +#include #define MAX_NR_CON_DRIVER 16 @@ -2880,6 +2881,12 @@ static int __init con_init(void) */ for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) { vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data)); + /* + * Kmemleak does not track the memory allocated via + * alloc_bootmem() but this block contains pointers to + * other blocks allocated via kmalloc. + */ + kmemleak_alloc(vc, sizeof(struct vc_data), 1, GFP_ATOMIC); INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); visual_init(vc, currcons, 1); vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size); diff --git a/fs/block_dev.c b/fs/block_dev.c index f45dbc18dd1..d250f807fd8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "internal.h" @@ -492,6 +493,11 @@ void __init bdev_cache_init(void) bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); + /* + * This vfsmount structure is only used to obtain the + * blockdev_superblock, so tell kmemleak not to report it. 
+ */ + kmemleak_not_leak(bd_mnt); blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1581ff235c7..26fd9d12f05 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -86,7 +86,12 @@ struct percpu_data { void *ptrs[1]; }; +/* pointer disguising messes up the kmemleak objects tracking */ +#ifndef CONFIG_DEBUG_KMEMLEAK #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) +#else +#define __percpu_disguise(pdata) (struct percpu_data *)(pdata) +#endif #define per_cpu_ptr(ptr, cpu) \ ({ \ -- cgit v1.2.3-70-g09d2 From 38c7fed2f5ffee17e1fa3e0f78b0e1bf43d52d13 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 25 May 2009 15:10:58 +0300 Subject: x86: remove some alloc_bootmem_cpumask_var calling Now that we set up the slab allocator earlier, we can get rid of some alloc_bootmem_cpumask_var() calls in boot code. Cc: Ingo Molnar Cc: Johannes Weiner Cc: Linus Torvalds Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- arch/x86/kernel/apic/io_apic.c | 4 ++-- include/linux/irq.h | 18 +++++++----------- kernel/cpuset.c | 2 +- kernel/profile.c | 6 ------ lib/cpumask.c | 11 ++--------- 5 files changed, 12 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1946fac42ab..139201a562b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -185,8 +185,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_bootmem_cpumask_var(&cfg[i].domain); - alloc_bootmem_cpumask_var(&cfg[i].old_domain); + alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT); + alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } diff --git a/include/linux/irq.h b/include/linux/irq.h index eedbb8e5e0c..1e50c34f006 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -430,23 +430,19 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); * Returns true if successful (or not required). 
*/ static inline bool alloc_desc_masks(struct irq_desc *desc, int node, - bool boot) + bool boot) { -#ifdef CONFIG_CPUMASK_OFFSTACK - if (boot) { - alloc_bootmem_cpumask_var(&desc->affinity); + gfp_t gfp = GFP_ATOMIC; -#ifdef CONFIG_GENERIC_PENDING_IRQ - alloc_bootmem_cpumask_var(&desc->pending_mask); -#endif - return true; - } + if (boot) + gfp = GFP_NOWAIT; - if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!alloc_cpumask_var_node(&desc->affinity, gfp, node)) return false; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) { + if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { free_cpumask_var(desc->affinity); return false; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 026faccca86..d5a7e17474e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = { int __init cpuset_init_early(void) { - alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); + alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT); top_cpuset.mems_generation = cpuset_mems_generation++; return 0; diff --git a/kernel/profile.c b/kernel/profile.c index 7724e0409ba..28cf26ad2d2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -111,12 +111,6 @@ int __ref profile_init(void) /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; buffer_bytes = prof_len*sizeof(atomic_t); - if (!slab_is_available()) { - prof_buffer = alloc_bootmem(buffer_bytes); - alloc_bootmem_cpumask_var(&prof_cpu_mask); - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - return 0; - } if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; diff --git a/lib/cpumask.c b/lib/cpumask.c index eb23aaa0c7b..7bb4142a502 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -92,15 +92,8 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) */ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { - if (likely(slab_is_available())) - *mask = kmalloc_node(cpumask_size(), flags, node); - else { -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - printk(KERN_ERR - "=> alloc_cpumask_var: kmalloc not available!\n"); -#endif - *mask = NULL; - } + *mask = kmalloc_node(cpumask_size(), flags, node); + #ifdef CONFIG_DEBUG_PER_CPU_MAPS if (!*mask) { printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); -- cgit v1.2.3-70-g09d2 From 670025478c2a687453cd1bac697d7d765843f59d Mon Sep 17 00:00:00 2001 From: Karsten Keil Date: Thu, 11 Jun 2009 15:52:10 +0200 Subject: mISDN: cleanup mISDNhw.h Remove unused stuff. 
Signed-off-by: Karsten Keil --- include/linux/mISDNhw.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mISDNhw.h b/include/linux/mISDNhw.h index ce900f4c245..7f9831da847 100644 --- a/include/linux/mISDNhw.h +++ b/include/linux/mISDNhw.h @@ -89,11 +89,6 @@ struct dchannel { void (*phfunc) (struct dchannel *); u_int state; void *l1; - /* HW access */ - u_char (*read_reg) (void *, u_char); - void (*write_reg) (void *, u_char, u_char); - void (*read_fifo) (void *, u_char *, int); - void (*write_fifo) (void *, u_char *, int); void *hw; int slot; /* multiport card channel slot */ struct timer_list timer; @@ -151,11 +146,6 @@ struct bchannel { u_long Flags; struct work_struct workq; u_int state; - /* HW access */ - u_char (*read_reg) (void *, u_char); - void (*write_reg) (void *, u_char, u_char); - void (*read_fifo) (void *, u_char *, int); - void (*write_fifo) (void *, u_char *, int); void *hw; int slot; /* multiport card channel slot */ struct timer_list timer; -- cgit v1.2.3-70-g09d2 From 90586523eb4b349806887c62ee70685a49415124 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:20 -0400 Subject: fsnotify: unified filesystem notification backend fsnotify is a backend for filesystem notification. fsnotify does not provide any userspace interface but does provide the basis needed for other notification schemes such as dnotify. fsnotify can be extended to be the backend for inotify or the upcoming fanotify. fsnotify provides a mechanism for "groups" to register for some set of filesystem events and to then deliver those events to those groups for processing. fsnotify has a number of benefits, the first being actually shrinking the size of an inode. Before fsnotify to support both dnotify and inotify an inode had unsigned long i_dnotify_mask; /* Directory notify events */ struct dnotify_struct *i_dnotify; /* for directory notifications */ struct list_head inotify_watches; /* watches on this inode */ struct mutex inotify_mutex; /* protects the watches list But with fsnotify this same functionallity (and more) is done with just __u32 i_fsnotify_mask; /* all events for this inode */ struct hlist_head i_fsnotify_mark_entries; /* marks on this inode */ That's right, inotify, dnotify, and fanotify all in 64 bits. We used that much space just in inotify_watches alone, before this patch set. fsnotify object lifetime and locking is MUCH better than what we have today. inotify locking is incredibly complex. See 8f7b0ba1c8539 as an example of what's been busted since inception. inotify needs to know internal semantics of superblock destruction and unmounting to function. The inode pinning and vfs contortions are horrible. no fsnotify implementers do allocation under locks. This means things like f04b30de3 which (due to an overabundance of caution) changes GFP_KERNEL to GFP_NOFS can be reverted. There are no longer any allocation rules when using or implementing your own fsnotify listener. fsnotify paves the way for fanotify. In brief fanotify is a notification mechanism that delivers the lisener both an 'event' and an open file descriptor to the object in question. This means that fanotify is pathname agnostic. Some on lkml may not care for the original companies or users that pushed for TALPA, but fanotify was designed with flexibility and input for other users in mind. The readahead group expressed interest in fanotify as it could be used to profile disk access on boot without breaking the audit system. 
The desktop search groups have also expressed interest in fanotify as it solves a number of the race conditions and problems present with managing inotify when more than a limited number of specific files are of interest. fanotify can provide for a userspace access control system which makes it a clean interface for AV vendors to hook without trying to do binary patching on the syscall table, LSM, and everywhere else they do their things today. With this patch series fanotify can be implemented in less than 1200 lines of easy to review code. Almost all of which is the socket based user interface. This patch series builds fsnotify to the point that it can implement dnotify and inotify_user. Patches exist and will be sent soon after acceptance to finish the in kernel inotify conversion (audit) and implement fanotify. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/Kconfig | 13 +++ fs/notify/Makefile | 2 + fs/notify/fsnotify.c | 79 ++++++++++++++++ fs/notify/fsnotify.h | 15 +++ fs/notify/group.c | 198 +++++++++++++++++++++++++++++++++++++++ fs/notify/inotify/inotify.c | 20 ++++ fs/notify/notification.c | 121 ++++++++++++++++++++++++ include/linux/fsnotify.h | 115 ++++++++++++++++------- include/linux/fsnotify_backend.h | 177 ++++++++++++++++++++++++++++++++++ 9 files changed, 705 insertions(+), 35 deletions(-) create mode 100644 fs/notify/fsnotify.c create mode 100644 fs/notify/fsnotify.h create mode 100644 fs/notify/group.c create mode 100644 fs/notify/notification.c create mode 100644 include/linux/fsnotify_backend.h (limited to 'include/linux') diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 50914d7303c..31dac7e3b0f 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,2 +1,15 @@ +config FSNOTIFY + bool "Filesystem notification backend" + default y + ---help--- + fsnotify is a backend for filesystem notification. fsnotify does + not provide any userspace interface but does provide the basis + needed for other notification schemes such as dnotify, inotify, + and fanotify. + + Say Y here to enable fsnotify suport. + + If unsure, say Y. + source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/Makefile b/fs/notify/Makefile index 5a95b6010ce..db5467b5b58 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,2 +1,4 @@ +obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o + obj-y += dnotify/ obj-y += inotify/ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c new file mode 100644 index 00000000000..56bee0f10c3 --- /dev/null +++ b/fs/notify/fsnotify.c @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" + +/* + * This is the main call to fsnotify. 
The VFS calls into hook specific functions + * in linux/fsnotify.h. Those functions then in turn call here. Here will call + * out to all of the registered fsnotify_group. Those groups can then use the + * notification event in whatever means they feel necessary. + */ +void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) +{ + struct fsnotify_group *group; + struct fsnotify_event *event = NULL; + int idx; + + if (list_empty(&fsnotify_groups)) + return; + + if (!(mask & fsnotify_mask)) + return; + + /* + * SRCU!! the groups list is very very much read only and the path is + * very hot. The VAST majority of events are not going to need to do + * anything other than walk the list so it's crazy to pre-allocate. + */ + idx = srcu_read_lock(&fsnotify_grp_srcu); + list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { + if (mask & group->mask) { + if (!event) { + event = fsnotify_create_event(to_tell, mask, data, data_is); + /* shit, we OOM'd and now we can't tell, maybe + * someday someone else will want to do something + * here */ + if (!event) + break; + } + group->ops->handle_event(group, event); + } + } + srcu_read_unlock(&fsnotify_grp_srcu, idx); + /* + * fsnotify_create_event() took a reference so the event can't be cleaned + * up while we are still trying to add it to lists, drop that one. + */ + if (event) + fsnotify_put_event(event); +} +EXPORT_SYMBOL_GPL(fsnotify); + +static __init int fsnotify_init(void) +{ + return init_srcu_struct(&fsnotify_grp_srcu); +} +subsys_initcall(fsnotify_init); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h new file mode 100644 index 00000000000..c6a8bd47657 --- /dev/null +++ b/fs/notify/fsnotify.h @@ -0,0 +1,15 @@ +#ifndef __FS_NOTIFY_FSNOTIFY_H_ +#define __FS_NOTIFY_FSNOTIFY_H_ + +#include +#include +#include +#include + +/* protects reads of fsnotify_groups */ +extern struct srcu_struct fsnotify_grp_srcu; +/* all groups which receive fsnotify events */ +extern struct list_head fsnotify_groups; +/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ +extern __u32 fsnotify_mask; +#endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/group.c b/fs/notify/group.c new file mode 100644 index 00000000000..c6812953b96 --- /dev/null +++ b/fs/notify/group.c @@ -0,0 +1,198 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" + +#include + +/* protects writes to fsnotify_groups and fsnotify_mask */ +static DEFINE_MUTEX(fsnotify_grp_mutex); +/* protects reads while running the fsnotify_groups list */ +struct srcu_struct fsnotify_grp_srcu; +/* all groups registered to receive filesystem notifications */ +LIST_HEAD(fsnotify_groups); +/* bitwise OR of all events (FS_*) interesting to some group on this system */ +__u32 fsnotify_mask; + +/* + * When a new group registers or changes it's set of interesting events + * this function updates the fsnotify_mask to contain all interesting events + */ +void fsnotify_recalc_global_mask(void) +{ + struct fsnotify_group *group; + __u32 mask = 0; + int idx; + + idx = srcu_read_lock(&fsnotify_grp_srcu); + list_for_each_entry_rcu(group, &fsnotify_groups, group_list) + mask |= group->mask; + srcu_read_unlock(&fsnotify_grp_srcu, idx); + fsnotify_mask = mask; +} + +/* + * Take a reference to a group so things found under the fsnotify_grp_mutex + * can't get freed under us + */ +static void fsnotify_get_group(struct fsnotify_group *group) +{ + atomic_inc(&group->refcnt); +} + +/* + * Final freeing of a group + */ +static void fsnotify_destroy_group(struct fsnotify_group *group) +{ + if (group->ops->free_group_priv) + group->ops->free_group_priv(group); + + kfree(group); +} + +/* + * Remove this group from the global list of groups that will get events + * this can be done even if there are still references and things still using + * this group. This just stops the group from getting new events. + */ +static void __fsnotify_evict_group(struct fsnotify_group *group) +{ + BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); + + if (group->on_group_list) + list_del_rcu(&group->group_list); + group->on_group_list = 0; +} + +/* + * Called when a group is no longer interested in getting events. This can be + * used if a group is misbehaving or if for some reason a group should no longer + * get any filesystem events. + */ +void fsnotify_evict_group(struct fsnotify_group *group) +{ + mutex_lock(&fsnotify_grp_mutex); + __fsnotify_evict_group(group); + mutex_unlock(&fsnotify_grp_mutex); +} + +/* + * Drop a reference to a group. Free it if it's through. + */ +void fsnotify_put_group(struct fsnotify_group *group) +{ + if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) + return; + + /* + * OK, now we know that there's no other users *and* we hold mutex, + * so no new references will appear + */ + __fsnotify_evict_group(group); + + /* + * now it's off the list, so the only thing we might care about is + * srcu access.... + */ + mutex_unlock(&fsnotify_grp_mutex); + synchronize_srcu(&fsnotify_grp_srcu); + + /* and now it is really dead. _Nothing_ could be seeing it */ + fsnotify_recalc_global_mask(); + fsnotify_destroy_group(group); +} + +/* + * Simply run the fsnotify_groups list and find a group which matches + * the given parameters. If a group is found we take a reference to that + * group. 
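+ * Backends are expected to reach this through fsnotify_obtain_group()
+ * below rather than calling it directly (editor's sketch, not part of
+ * the original patch; MY_GROUP_NUM and my_ops stand in for a backend's
+ * unique group number and its struct fsnotify_ops):
+ *
+ *	group = fsnotify_obtain_group(MY_GROUP_NUM,
+ *				      FS_DELETE | FS_MODIFY, &my_ops);
+ *	if (IS_ERR(group))
+ *		return PTR_ERR(group);
+ *	...
+ *	fsnotify_put_group(group);	releases the obtained reference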
+ */ +static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask, + const struct fsnotify_ops *ops) +{ + struct fsnotify_group *group_iter; + struct fsnotify_group *group = NULL; + + BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); + + list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) { + if (group_iter->group_num == group_num) { + if ((group_iter->mask == mask) && + (group_iter->ops == ops)) { + fsnotify_get_group(group_iter); + group = group_iter; + } else + group = ERR_PTR(-EEXIST); + } + } + return group; +} + +/* + * Either finds an existing group which matches the group_num, mask, and ops or + * creates a new group and adds it to the global group list. In either case we + * take a reference for the group returned. + */ +struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, + const struct fsnotify_ops *ops) +{ + struct fsnotify_group *group, *tgroup; + + /* very low use, simpler locking if we just always alloc */ + group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + atomic_set(&group->refcnt, 1); + + group->on_group_list = 0; + group->group_num = group_num; + group->mask = mask; + + group->ops = ops; + + mutex_lock(&fsnotify_grp_mutex); + tgroup = fsnotify_find_group(group_num, mask, ops); + if (tgroup) { + /* group already exists */ + mutex_unlock(&fsnotify_grp_mutex); + /* destroy the new one we made */ + fsnotify_put_group(group); + return tgroup; + } + + /* group not found, add a new one */ + list_add_rcu(&group->group_list, &fsnotify_groups); + group->on_group_list = 1; + + mutex_unlock(&fsnotify_grp_mutex); + + if (mask) + fsnotify_recalc_global_mask(); + + return group; +} diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index 220c13f0d73..40b1cf914cc 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -32,6 +32,7 @@ #include #include #include +#include static atomic_t inotify_cookie; @@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch); */ static int __init inotify_setup(void) { + BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); + BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(IN_OPEN != FS_OPEN); + BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); + BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); + BUILD_BUG_ON(IN_CREATE != FS_CREATE); + BUILD_BUG_ON(IN_DELETE != FS_DELETE); + BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); + BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); + BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); + + BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); + BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); + BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); + BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); + atomic_set(&inotify_cookie, 0); return 0; diff --git a/fs/notify/notification.c b/fs/notify/notification.c new file mode 100644 index 00000000000..b8e9a87f8f5 --- /dev/null +++ b/fs/notify/notification.c @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +static struct kmem_cache *fsnotify_event_cachep; + +void fsnotify_get_event(struct fsnotify_event *event) +{ + atomic_inc(&event->refcnt); +} + +void fsnotify_put_event(struct fsnotify_event *event) +{ + if (!event) + return; + + if (atomic_dec_and_test(&event->refcnt)) { + if (event->data_type == FSNOTIFY_EVENT_PATH) + path_put(&event->path); + + kmem_cache_free(fsnotify_event_cachep, event); + } +} + +/* + * Allocate a new event which will be sent to each group's handle_event function + * if the group was interested in this particular event. + */ +struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, + void *data, int data_type) +{ + struct fsnotify_event *event; + + event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; + + atomic_set(&event->refcnt, 1); + + spin_lock_init(&event->lock); + + event->path.dentry = NULL; + event->path.mnt = NULL; + event->inode = NULL; + + event->to_tell = to_tell; + + switch (data_type) { + case FSNOTIFY_EVENT_FILE: { + struct file *file = data; + struct path *path = &file->f_path; + event->path.dentry = path->dentry; + event->path.mnt = path->mnt; + path_get(&event->path); + event->data_type = FSNOTIFY_EVENT_PATH; + break; + } + case FSNOTIFY_EVENT_PATH: { + struct path *path = data; + event->path.dentry = path->dentry; + event->path.mnt = path->mnt; + path_get(&event->path); + event->data_type = FSNOTIFY_EVENT_PATH; + break; + } + case FSNOTIFY_EVENT_INODE: + event->inode = data; + event->data_type = FSNOTIFY_EVENT_INODE; + break; + case FSNOTIFY_EVENT_NONE: + event->inode = NULL; + event->path.dentry = NULL; + event->path.mnt = NULL; + break; + default: + BUG(); + } + + event->mask = mask; + + return event; +} + +__init int fsnotify_notification_init(void) +{ + fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); + + return 0; +} +subsys_initcall(fsnotify_notification_init); + diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 00fbd5b245c..6c9ebefdac8 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -13,6 +13,7 @@ #include #include +#include #include /* @@ -34,6 +35,16 @@ static inline void fsnotify_d_move(struct dentry *entry) inotify_d_move(entry); } +/* + * fsnotify_link_count - inode's link count changed + */ +static inline void fsnotify_link_count(struct inode *inode) +{ + inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL); + + fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE); +} + /* * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir */ @@ -43,28 +54,47 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, { struct inode *source = moved->d_inode; u32 cookie = inotify_get_cookie(); + __u32 old_dir_mask = 0; + __u32 new_dir_mask = 0; - if (old_dir == new_dir) + if (old_dir == new_dir) { inode_dir_notify(old_dir, DN_RENAME); - else { + old_dir_mask = 
FS_DN_RENAME; + } else { inode_dir_notify(old_dir, DN_DELETE); + old_dir_mask = FS_DELETE; inode_dir_notify(new_dir, DN_CREATE); + new_dir_mask = FS_CREATE; } - if (isdir) + if (isdir) { isdir = IN_ISDIR; + old_dir_mask |= FS_IN_ISDIR; + new_dir_mask |= FS_IN_ISDIR; + } + + old_dir_mask |= FS_MOVED_FROM; + new_dir_mask |= FS_MOVED_TO; + inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name, source); inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name, source); + fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE); + fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE); + if (target) { inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL); inotify_inode_is_dead(target); + + /* this is really a link_count change not a removal */ + fsnotify_link_count(target); } if (source) { inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL); + fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE); } audit_inode_child(new_name, moved, new_dir); } @@ -74,10 +104,12 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, */ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir) { + __u32 mask = FS_DELETE; + if (isdir) - isdir = IN_ISDIR; + mask |= FS_IN_ISDIR; dnotify_parent(dentry, DN_DELETE); - inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name); + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); } /* @@ -87,14 +119,8 @@ static inline void fsnotify_inoderemove(struct inode *inode) { inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL); inotify_inode_is_dead(inode); -} -/* - * fsnotify_link_count - inode's link count changed - */ -static inline void fsnotify_link_count(struct inode *inode) -{ - inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL); + fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE); } /* @@ -106,6 +132,8 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name, dentry->d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); + + fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE); } /* @@ -120,6 +148,8 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct inode); fsnotify_link_count(inode); audit_inode_child(new_dentry->d_name.name, new_dentry, dir); + + fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE); } /* @@ -127,10 +157,14 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct */ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) { + __u32 mask = (FS_CREATE | FS_IN_ISDIR); + struct inode *d_inode = dentry->d_inode; + inode_dir_notify(inode, DN_CREATE); - inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, - dentry->d_name.name, dentry->d_inode); + inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); + + fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE); } /* @@ -139,14 +173,16 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) static inline void fsnotify_access(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - u32 mask = IN_ACCESS; + __u32 mask = FS_ACCESS; if (S_ISDIR(inode->i_mode)) - mask |= IN_ISDIR; + mask |= FS_IN_ISDIR; dnotify_parent(dentry, DN_ACCESS); inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); 
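	/* the access event is delivered three ways: dnotify just above, the
	 * legacy inotify queues here, and the new generic fsnotify() call
	 * added at the end of each helper */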
inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); } /* @@ -155,14 +191,16 @@ static inline void fsnotify_access(struct dentry *dentry) static inline void fsnotify_modify(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - u32 mask = IN_MODIFY; + __u32 mask = FS_MODIFY; if (S_ISDIR(inode->i_mode)) - mask |= IN_ISDIR; + mask |= FS_IN_ISDIR; dnotify_parent(dentry, DN_MODIFY); inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); } /* @@ -171,13 +209,15 @@ static inline void fsnotify_modify(struct dentry *dentry) static inline void fsnotify_open(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - u32 mask = IN_OPEN; + __u32 mask = FS_OPEN; if (S_ISDIR(inode->i_mode)) - mask |= IN_ISDIR; + mask |= FS_IN_ISDIR; inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); } /* @@ -189,13 +229,15 @@ static inline void fsnotify_close(struct file *file) struct inode *inode = dentry->d_inode; const char *name = dentry->d_name.name; fmode_t mode = file->f_mode; - u32 mask = (mode & FMODE_WRITE) ? IN_CLOSE_WRITE : IN_CLOSE_NOWRITE; + __u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; if (S_ISDIR(inode->i_mode)) - mask |= IN_ISDIR; + mask |= FS_IN_ISDIR; inotify_dentry_parent_queue_event(dentry, mask, 0, name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + + fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE); } /* @@ -204,13 +246,15 @@ static inline void fsnotify_close(struct file *file) static inline void fsnotify_xattr(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - u32 mask = IN_ATTRIB; + __u32 mask = FS_ATTRIB; if (S_ISDIR(inode->i_mode)) - mask |= IN_ISDIR; + mask |= FS_IN_ISDIR; inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); } /* @@ -221,34 +265,34 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) { struct inode *inode = dentry->d_inode; int dn_mask = 0; - u32 in_mask = 0; + __u32 in_mask = 0; if (ia_valid & ATTR_UID) { - in_mask |= IN_ATTRIB; + in_mask |= FS_ATTRIB; dn_mask |= DN_ATTRIB; } if (ia_valid & ATTR_GID) { - in_mask |= IN_ATTRIB; + in_mask |= FS_ATTRIB; dn_mask |= DN_ATTRIB; } if (ia_valid & ATTR_SIZE) { - in_mask |= IN_MODIFY; + in_mask |= FS_MODIFY; dn_mask |= DN_MODIFY; } /* both times implies a utime(s) call */ if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) { - in_mask |= IN_ATTRIB; + in_mask |= FS_ATTRIB; dn_mask |= DN_ATTRIB; } else if (ia_valid & ATTR_ATIME) { - in_mask |= IN_ACCESS; + in_mask |= FS_ACCESS; dn_mask |= DN_ACCESS; } else if (ia_valid & ATTR_MTIME) { - in_mask |= IN_MODIFY; + in_mask |= FS_MODIFY; dn_mask |= DN_MODIFY; } if (ia_valid & ATTR_MODE) { - in_mask |= IN_ATTRIB; + in_mask |= FS_ATTRIB; dn_mask |= DN_ATTRIB; } @@ -256,14 +300,15 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) dnotify_parent(dentry, dn_mask); if (in_mask) { if (S_ISDIR(inode->i_mode)) - in_mask |= IN_ISDIR; + in_mask |= FS_IN_ISDIR; inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL); inotify_dentry_parent_queue_event(dentry, in_mask, 0, dentry->d_name.name); + fsnotify(inode, 
in_mask, inode, FSNOTIFY_EVENT_INODE);
	}
}

-#ifdef CONFIG_INOTIFY	/* inotify helpers */
+#if defined(CONFIG_INOTIFY) || defined(CONFIG_FSNOTIFY)	/* notify helpers */

/*
 * fsnotify_oldname_init - save off the old filename before we change it
 */
@@ -281,7 +326,7 @@ static inline void fsnotify_oldname_free(const char *old_name)
{
	kfree(old_name);
}

-#else	/* CONFIG_INOTIFY */
+#else	/* CONFIG_INOTIFY || CONFIG_FSNOTIFY */

static inline const char *fsnotify_oldname_init(const char *name)
{
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
new file mode 100644
index 00000000000..1a55718b38a
--- /dev/null
+++ b/include/linux/fsnotify_backend.h
@@ -0,0 +1,177 @@
+/*
+ * Filesystem access notification for Linux
+ *
+ * Copyright (C) 2008 Red Hat, Inc., Eric Paris
+ */
+
+#ifndef __LINUX_FSNOTIFY_BACKEND_H
+#define __LINUX_FSNOTIFY_BACKEND_H
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>		/* struct inode */
+#include <linux/list.h>
+#include <linux/path.h>		/* struct path */
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <asm/atomic.h>
+
+/*
+ * IN_* from inotify.h lines up EXACTLY with FS_*, this is so we can easily
+ * convert between them.  dnotify only needs conversion at watch creation
+ * so no perf loss there.  fanotify isn't defined yet, so it can use the
+ * holes if it needs more events.
+ */
+#define FS_ACCESS		0x00000001	/* File was accessed */
+#define FS_MODIFY		0x00000002	/* File was modified */
+#define FS_ATTRIB		0x00000004	/* Metadata changed */
+#define FS_CLOSE_WRITE		0x00000008	/* Writable file was closed */
+#define FS_CLOSE_NOWRITE	0x00000010	/* Unwritable file closed */
+#define FS_OPEN			0x00000020	/* File was opened */
+#define FS_MOVED_FROM		0x00000040	/* File was moved from X */
+#define FS_MOVED_TO		0x00000080	/* File was moved to Y */
+#define FS_CREATE		0x00000100	/* Subfile was created */
+#define FS_DELETE		0x00000200	/* Subfile was deleted */
+#define FS_DELETE_SELF		0x00000400	/* Self was deleted */
+#define FS_MOVE_SELF		0x00000800	/* Self was moved */
+
+#define FS_UNMOUNT		0x00002000	/* inode on umount fs */
+#define FS_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
+#define FS_IN_IGNORED		0x00008000	/* last inotify event here */
+
+#define FS_IN_ISDIR		0x40000000	/* event occurred against dir */
+#define FS_IN_ONESHOT		0x80000000	/* only send event once */
+
+#define FS_DN_RENAME		0x10000000	/* file renamed */
+#define FS_DN_MULTISHOT		0x20000000	/* dnotify multishot */
+
+struct fsnotify_group;
+struct fsnotify_event;
+
+/*
+ * Each group must define these ops.  The fsnotify infrastructure will call
+ * these operations for each relevant group.
+ *
+ * handle_event - main call for a group to handle an fs event
+ * free_group_priv - called when a group refcnt hits 0 to clean up the private union
+ */
+struct fsnotify_ops {
+	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
+	void (*free_group_priv)(struct fsnotify_group *group);
+};
+
+/*
+ * A group is a "thing" that wants to receive notification about filesystem
+ * events.  The mask holds the subset of event types this group cares about.
+ * refcnt on a group is up to the implementor and at any moment if it goes 0
+ * everything will be cleaned up.
+ */
+struct fsnotify_group {
+	/*
+	 * global list of all groups receiving events from fsnotify.
+	 * anchored by fsnotify_groups and protected by either fsnotify_grp_mutex
+	 * or fsnotify_grp_srcu depending on write vs read.
+	 */
+	struct list_head group_list;
+
+	/*
+	 * Defines all of the event types in which this group is interested.
+	 * This mask is a bitwise OR of the FS_* events from above.
Each time + * this mask changes for a group (if it changes) the correct functions + * must be called to update the global structures which indicate global + * interest in event types. + */ + __u32 mask; + + /* + * How the refcnt is used is up to each group. When the refcnt hits 0 + * fsnotify will clean up all of the resources associated with this group. + * As an example, the dnotify group will always have a refcnt=1 and that + * will never change. Inotify, on the other hand, has a group per + * inotify_init() and the refcnt will hit 0 only when that fd has been + * closed. + */ + atomic_t refcnt; /* things with interest in this group */ + unsigned int group_num; /* simply prevents accidental group collision */ + + const struct fsnotify_ops *ops; /* how this group handles things */ + + /* prevents double list_del of group_list. protected by global fsnotify_gr_mutex */ + bool on_group_list; + + /* groups can define private fields here or use the void *private */ + union { + void *private; + }; +}; + +/* + * all of the information about the original object we want to now send to + * a group. If you want to carry more info from the accessing task to the + * listener this structure is where you need to be adding fields. + */ +struct fsnotify_event { + spinlock_t lock; /* protection for the associated event_holder and private_list */ + /* to_tell may ONLY be dereferenced during handle_event(). */ + struct inode *to_tell; /* either the inode the event happened to or its parent */ + /* + * depending on the event type we should have either a path or inode + * We hold a reference on path, but NOT on inode. Since we have the ref on + * the path, it may be dereferenced at any point during this object's + * lifetime. That reference is dropped when this object's refcnt hits + * 0. If this event contains an inode instead of a path, the inode may + * ONLY be used during handle_event(). 
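+ * (Note: fsnotify_create_event() converts FSNOTIFY_EVENT_FILE into
+ * FSNOTIFY_EVENT_PATH up front, so a stored event only ever carries a
+ * path or a bare inode.)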
+ */ + union { + struct path path; + struct inode *inode; + }; +/* when calling fsnotify tell it if the data is a path or inode */ +#define FSNOTIFY_EVENT_NONE 0 +#define FSNOTIFY_EVENT_PATH 1 +#define FSNOTIFY_EVENT_INODE 2 +#define FSNOTIFY_EVENT_FILE 3 + int data_type; /* which of the above union we have */ + atomic_t refcnt; /* how many groups still are using/need to send this event */ + __u32 mask; /* the type of access, bitwise OR for FS_* event types */ +}; + +#ifdef CONFIG_FSNOTIFY + +/* called from the vfs helpers */ + +/* main fsnotify call to send events */ +extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is); + + +/* called from fsnotify listeners, such as fanotify or dnotify */ + +/* must call when a group changes its ->mask */ +extern void fsnotify_recalc_global_mask(void); +/* get a reference to an existing or create a new group */ +extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, + __u32 mask, + const struct fsnotify_ops *ops); +/* drop reference on a group from fsnotify_obtain_group */ +extern void fsnotify_put_group(struct fsnotify_group *group); + +/* take a reference to an event */ +extern void fsnotify_get_event(struct fsnotify_event *event); +extern void fsnotify_put_event(struct fsnotify_event *event); +/* find private data previously attached to an event */ +extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group, + struct fsnotify_event *event); + +/* put here because inotify does some weird stuff when destroying watches */ +extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, + void *data, int data_is); +#else + +static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) +{} +#endif /* CONFIG_FSNOTIFY */ + +#endif /* __KERNEL __ */ + +#endif /* __LINUX_FSNOTIFY_BACKEND_H */ -- cgit v1.2.3-70-g09d2 From 3be25f49b9d6a97eae9bcb96d3292072b7658bd8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:26 -0400 Subject: fsnotify: add marks to inodes so groups can interpret how to handle those inodes This patch creates a way for fsnotify groups to attach marks to inodes. These marks have little meaning to the generic fsnotify infrastructure and thus their meaning should be interpreted by the group that attached them to the inode's list. dnotify and inotify will make use of these markings to indicate which inodes are of interest to their respective groups. But this implementation has the useful property that in the future other listeners could actually use the marks for the exact opposite reason, aka to indicate which inodes it had NO interest in. 
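For illustration, a listener is expected to use the new calls roughly like
this (a sketch only; my_watch_inode() and my_free_mark() are made-up names,
not part of this patch):

	static void my_free_mark(struct fsnotify_mark_entry *entry)
	{
		kfree(entry);
	}

	static int my_watch_inode(struct fsnotify_group *group, struct inode *inode)
	{
		struct fsnotify_mark_entry *entry;
		int ret;

		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry)
			return -ENOMEM;

		fsnotify_init_mark(entry, my_free_mark);	/* refcnt starts at 1 */
		entry->mask = FS_MODIFY | FS_DELETE_SELF;

		/* hooks the mark onto inode->i_fsnotify_mark_entries and
		 * group->mark_entries; fails with -EEXIST if this group
		 * already marked this inode */
		ret = fsnotify_add_mark(entry, group, inode);

		/* the two lists hold their own reference on success; either
		 * way drop the one fsnotify_init_mark() gave us (on failure
		 * this drops the count to zero and my_free_mark() runs) */
		fsnotify_put_mark(entry);
		return ret;
	}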
Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/inode.c | 9 ++ fs/notify/Makefile | 2 +- fs/notify/fsnotify.c | 13 ++ fs/notify/fsnotify.h | 5 + fs/notify/group.c | 49 +++++- fs/notify/inode_mark.c | 329 +++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 5 + include/linux/fsnotify.h | 9 ++ include/linux/fsnotify_backend.h | 65 +++++++- 9 files changed, 483 insertions(+), 3 deletions(-) create mode 100644 fs/notify/inode_mark.c (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index bca0c618fdb..54c63ce3de2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -189,6 +190,10 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_private = NULL; inode->i_mapping = mapping; +#ifdef CONFIG_FSNOTIFY + inode->i_fsnotify_mask = 0; +#endif + return inode; out_free_security: @@ -221,6 +226,7 @@ void destroy_inode(struct inode *inode) BUG_ON(inode_has_buffers(inode)); ima_inode_free(inode); security_inode_free(inode); + fsnotify_inode_delete(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else @@ -252,6 +258,9 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->inotify_watches); mutex_init(&inode->inotify_mutex); #endif +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries); +#endif } EXPORT_SYMBOL(inode_init_once); diff --git a/fs/notify/Makefile b/fs/notify/Makefile index db5467b5b58..0922cc826c4 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o +obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o obj-y += dnotify/ obj-y += inotify/ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 56bee0f10c3..d5654629c65 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -25,6 +25,15 @@ #include #include "fsnotify.h" +/* + * Clear all of the marks on an inode when it is being evicted from core + */ +void __fsnotify_inode_delete(struct inode *inode) +{ + fsnotify_clear_marks_by_inode(inode); +} +EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); + /* * This is the main call to fsnotify. The VFS calls into hook specific functions * in linux/fsnotify.h. Those functions then in turn call here. Here will call @@ -43,6 +52,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) if (!(mask & fsnotify_mask)) return; + if (!(mask & to_tell->i_fsnotify_mask)) + return; /* * SRCU!! the groups list is very very much read only and the path is * very hot. 
The VAST majority of events are not going to need to do @@ -51,6 +62,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) idx = srcu_read_lock(&fsnotify_grp_srcu); list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { if (mask & group->mask) { + if (!group->ops->should_send_event(group, to_tell, mask)) + continue; if (!event) { event = fsnotify_create_event(to_tell, mask, data, data_is); /* shit, we OOM'd and now we can't tell, maybe diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index c6a8bd47657..8ebcbe893c9 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -12,4 +12,9 @@ extern struct srcu_struct fsnotify_grp_srcu; extern struct list_head fsnotify_groups; /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ extern __u32 fsnotify_mask; + +/* final kfree of a group */ +extern void fsnotify_final_destroy_group(struct fsnotify_group *group); +/* run the list of all marks associated with inode and flag them to be freed */ +extern void fsnotify_clear_marks_by_inode(struct inode *inode); #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/group.c b/fs/notify/group.c index c6812953b96..a29d2fa6792 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -54,6 +54,29 @@ void fsnotify_recalc_global_mask(void) fsnotify_mask = mask; } +/* + * Update the group->mask by running all of the marks associated with this + * group and finding the bitwise | of all of the mark->mask. If we change + * the group->mask we need to update the global mask of events interesting + * to the system. + */ +void fsnotify_recalc_group_mask(struct fsnotify_group *group) +{ + __u32 mask = 0; + __u32 old_mask = group->mask; + struct fsnotify_mark_entry *entry; + + spin_lock(&group->mark_lock); + list_for_each_entry(entry, &group->mark_entries, g_list) + mask |= entry->mask; + spin_unlock(&group->mark_lock); + + group->mask = mask; + + if (old_mask != mask) + fsnotify_recalc_global_mask(); +} + /* * Take a reference to a group so things found under the fsnotify_grp_mutex * can't get freed under us @@ -66,7 +89,7 @@ static void fsnotify_get_group(struct fsnotify_group *group) /* * Final freeing of a group */ -static void fsnotify_destroy_group(struct fsnotify_group *group) +void fsnotify_final_destroy_group(struct fsnotify_group *group) { if (group->ops->free_group_priv) group->ops->free_group_priv(group); @@ -74,6 +97,24 @@ static void fsnotify_destroy_group(struct fsnotify_group *group) kfree(group); } +/* + * Trying to get rid of a group. We need to first get rid of any outstanding + * allocations and then free the group. Remember that fsnotify_clear_marks_by_group + * could miss marks that are being freed by inode and those marks could still + * hold a reference to this group (via group->num_marks) If we get into that + * situtation, the fsnotify_final_destroy_group will get called when that final + * mark is freed. 
+ */ +static void fsnotify_destroy_group(struct fsnotify_group *group) +{ + /* clear all inode mark entries for this group */ + fsnotify_clear_marks_by_group(group); + + /* past the point of no return, matches the initial value of 1 */ + if (atomic_dec_and_test(&group->num_marks)) + fsnotify_final_destroy_group(group); +} + /* * Remove this group from the global list of groups that will get events * this can be done even if there are still references and things still using @@ -173,6 +214,10 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, group->group_num = group_num; group->mask = mask; + spin_lock_init(&group->mark_lock); + atomic_set(&group->num_marks, 0); + INIT_LIST_HEAD(&group->mark_entries); + group->ops = ops; mutex_lock(&fsnotify_grp_mutex); @@ -188,6 +233,8 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, /* group not found, add a new one */ list_add_rcu(&group->group_list, &fsnotify_groups); group->on_group_list = 1; + /* being on the fsnotify_groups list holds one num_marks */ + atomic_inc(&group->num_marks); mutex_unlock(&fsnotify_grp_mutex); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c new file mode 100644 index 00000000000..cdc15414697 --- /dev/null +++ b/fs/notify/inode_mark.c @@ -0,0 +1,329 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * fsnotify inode mark locking/lifetime/and refcnting + * + * REFCNT: + * The mark->refcnt tells how many "things" in the kernel currently are + * referencing this object. The object typically will live inside the kernel + * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task + * which can find this object holding the appropriete locks, can take a reference + * and the object itself is guarenteed to survive until the reference is dropped. + * + * LOCKING: + * There are 3 spinlocks involved with fsnotify inode marks and they MUST + * be taken in order as follows: + * + * entry->lock + * group->mark_lock + * inode->i_lock + * + * entry->lock protects 2 things, entry->group and entry->inode. You must hold + * that lock to dereference either of these things (they could be NULL even with + * the lock) + * + * group->mark_lock protects the mark_entries list anchored inside a given group + * and each entry is hooked via the g_list. It also sorta protects the + * free_g_list, which when used is anchored by a private list on the stack of the + * task which held the group->mark_lock. + * + * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a + * given inode and each entry is hooked via the i_list. (and sorta the + * free_i_list) + * + * + * LIFETIME: + * Inode marks survive between when they are added to an inode and when their + * refcnt==0. 
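+ * A task holding a reference via fsnotify_get_mark() can therefore keep
+ * using a mark even after it has been unhooked from both lists.  The detach
+ * path in fsnotify_destroy_mark_by_entry() below nests the three locks in
+ * exactly the order documented above before unhooking:
+ *
+ *	spin_lock(&entry->lock);
+ *	spin_lock(&group->mark_lock);
+ *	spin_lock(&inode->i_lock);
+ *	hlist_del_init(&entry->i_list);
+ *	list_del_init(&entry->g_list);
+ *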
+ * + * The inode mark can be cleared for a number of different reasons including: + * - The inode is unlinked for the last time. (fsnotify_inode_remove) + * - The inode is being evicted from cache. (fsnotify_inode_delete) + * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) + * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry) + * - The fsnotify_group associated with the mark is going away and all such marks + * need to be cleaned up. (fsnotify_clear_marks_by_group) + * + * Worst case we are given an inode and need to clean up all the marks on that + * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each + * mark on the list we take a reference (so the mark can't disappear under us). + * We remove that mark form the inode's list of marks and we add this mark to a + * private list anchored on the stack using i_free_list; At this point we no + * longer fear anything finding the mark using the inode's list of marks. + * + * We can safely and locklessly run the private list on the stack of everything + * we just unattached from the original inode. For each mark on the private list + * we grab the mark-> and can thus dereference mark->group and mark->inode. If + * we see the group and inode are not NULL we take those locks. Now holding all + * 3 locks we can completely remove the mark from other tasks finding it in the + * future. Remember, 10 things might already be referencing this mark, but they + * better be holding a ref. We drop our reference we took before we unhooked it + * from the inode. When the ref hits 0 we can free the mark. + * + * Very similarly for freeing by group, except we use free_g_list. + * + * This has the very interesting property of being able to run concurrently with + * any (or all) other directions. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +void fsnotify_get_mark(struct fsnotify_mark_entry *entry) +{ + atomic_inc(&entry->refcnt); +} + +void fsnotify_put_mark(struct fsnotify_mark_entry *entry) +{ + if (atomic_dec_and_test(&entry->refcnt)) + entry->free_mark(entry); +} + +/* + * Recalculate the mask of events relevant to a given inode locked. + */ +static void fsnotify_recalc_inode_mask_locked(struct inode *inode) +{ + struct fsnotify_mark_entry *entry; + struct hlist_node *pos; + __u32 new_mask = 0; + + assert_spin_locked(&inode->i_lock); + + hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) + new_mask |= entry->mask; + inode->i_fsnotify_mask = new_mask; +} + +/* + * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types + * any notifier is interested in hearing for this inode. + */ +void fsnotify_recalc_inode_mask(struct inode *inode) +{ + spin_lock(&inode->i_lock); + fsnotify_recalc_inode_mask_locked(inode); + spin_unlock(&inode->i_lock); +} + +/* + * Any time a mark is getting freed we end up here. 
+ * The caller had better be holding a reference to this mark so we don't actually + * do the final put under the entry->lock + */ +void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) +{ + struct fsnotify_group *group; + struct inode *inode; + + spin_lock(&entry->lock); + + group = entry->group; + inode = entry->inode; + + BUG_ON(group && !inode); + BUG_ON(!group && inode); + + /* if !group something else already marked this to die */ + if (!group) { + spin_unlock(&entry->lock); + return; + } + + /* 1 from caller and 1 for being on i_list/g_list */ + BUG_ON(atomic_read(&entry->refcnt) < 2); + + spin_lock(&group->mark_lock); + spin_lock(&inode->i_lock); + + hlist_del_init(&entry->i_list); + entry->inode = NULL; + + list_del_init(&entry->g_list); + entry->group = NULL; + + fsnotify_put_mark(entry); /* for i_list and g_list */ + + /* + * this mark is now off the inode->i_fsnotify_mark_entries list and we + * hold the inode->i_lock, so this is the perfect time to update the + * inode->i_fsnotify_mask + */ + fsnotify_recalc_inode_mask_locked(inode); + + spin_unlock(&inode->i_lock); + spin_unlock(&group->mark_lock); + spin_unlock(&entry->lock); + + /* + * Some groups like to know that marks are being freed. This is a + * callback to the group function to let it know that this entry + * is being freed. + */ + group->ops->freeing_mark(entry, group); + + /* + * it's possible that this group tried to destroy itself, but this + * this mark was simultaneously being freed by inode. If that's the + * case, we finish freeing the group here. + */ + if (unlikely(atomic_dec_and_test(&group->num_marks))) + fsnotify_final_destroy_group(group); +} + +/* + * Given a group, destroy all of the marks associated with that group. + */ +void fsnotify_clear_marks_by_group(struct fsnotify_group *group) +{ + struct fsnotify_mark_entry *lentry, *entry; + LIST_HEAD(free_list); + + spin_lock(&group->mark_lock); + list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) { + list_add(&entry->free_g_list, &free_list); + list_del_init(&entry->g_list); + fsnotify_get_mark(entry); + } + spin_unlock(&group->mark_lock); + + list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) { + fsnotify_destroy_mark_by_entry(entry); + fsnotify_put_mark(entry); + } +} + +/* + * Given an inode, destroy all of the marks associated with that inode. + */ +void fsnotify_clear_marks_by_inode(struct inode *inode) +{ + struct fsnotify_mark_entry *entry, *lentry; + struct hlist_node *pos, *n; + LIST_HEAD(free_list); + + spin_lock(&inode->i_lock); + hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { + list_add(&entry->free_i_list, &free_list); + hlist_del_init(&entry->i_list); + fsnotify_get_mark(entry); + } + spin_unlock(&inode->i_lock); + + list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { + fsnotify_destroy_mark_by_entry(entry); + fsnotify_put_mark(entry); + } +} + +/* + * given a group and inode, find the mark associated with that combination. 
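+ * (each group attaches at most one mark to a given inode; this is what
+ * lets fsnotify_add_mark() below return -EEXIST on a duplicate)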
+ * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, + struct inode *inode) +{ + struct fsnotify_mark_entry *entry; + struct hlist_node *pos; + + assert_spin_locked(&inode->i_lock); + + hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { + if (entry->group == group) { + fsnotify_get_mark(entry); + return entry; + } + } + return NULL; +} + +/* + * Nothing fancy, just initialize lists and locks and counters. + */ +void fsnotify_init_mark(struct fsnotify_mark_entry *entry, + void (*free_mark)(struct fsnotify_mark_entry *entry)) + +{ + spin_lock_init(&entry->lock); + atomic_set(&entry->refcnt, 1); + INIT_HLIST_NODE(&entry->i_list); + entry->group = NULL; + entry->mask = 0; + entry->inode = NULL; + entry->free_mark = free_mark; +} + +/* + * Attach an initialized mark entry to a given group and inode. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which group and for which inodes. + */ +int fsnotify_add_mark(struct fsnotify_mark_entry *entry, + struct fsnotify_group *group, struct inode *inode) +{ + struct fsnotify_mark_entry *lentry; + int ret = 0; + + /* + * LOCKING ORDER!!!! + * entry->lock + * group->mark_lock + * inode->i_lock + */ + spin_lock(&entry->lock); + spin_lock(&group->mark_lock); + spin_lock(&inode->i_lock); + + entry->group = group; + entry->inode = inode; + + lentry = fsnotify_find_mark_entry(group, inode); + if (!lentry) { + hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); + list_add(&entry->g_list, &group->mark_entries); + + fsnotify_get_mark(entry); /* for i_list and g_list */ + + atomic_inc(&group->num_marks); + + fsnotify_recalc_inode_mask_locked(inode); + } + + spin_unlock(&inode->i_lock); + spin_unlock(&group->mark_lock); + spin_unlock(&entry->lock); + + if (lentry) { + ret = -EEXIST; + fsnotify_put_mark(lentry); + } + + return ret; +} diff --git a/include/linux/fs.h b/include/linux/fs.h index 83d6b439724..275b0860044 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -755,6 +755,11 @@ struct inode { __u32 i_generation; +#ifdef CONFIG_FSNOTIFY + __u32 i_fsnotify_mask; /* all events this inode cares about */ + struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ +#endif + #ifdef CONFIG_DNOTIFY unsigned long i_dnotify_mask; /* Directory notify events */ struct dnotify_struct *i_dnotify; /* for directory notifications */ diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 6c9ebefdac8..3856eb6e597 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -99,6 +99,14 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, audit_inode_child(new_name, moved, new_dir); } +/* + * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed + */ +static inline void fsnotify_inode_delete(struct inode *inode) +{ + __fsnotify_inode_delete(inode); +} + /* * fsnotify_nameremove - a filename was removed from a directory */ @@ -121,6 +129,7 @@ static inline void fsnotify_inoderemove(struct inode *inode) inotify_inode_is_dead(inode); fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE); + __fsnotify_inode_delete(inode); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 1a55718b38a..cad5c4d75c1 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -48,17 +48,25 @@ struct fsnotify_group; 
struct fsnotify_event; +struct fsnotify_mark_entry; /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * + * should_send_event - given a group, inode, and mask this function determines + * if the group is interested in this event. * handle_event - main call for a group to handle an fs event * free_group_priv - called when a group refcnt hits 0 to clean up the private union + * freeing-mark - this means that a mark has been flagged to die when everything + * finishes using it. The function is supplied with what must be a + * valid group and inode to use to clean up. */ struct fsnotify_ops { + bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, __u32 mask); int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event); void (*free_group_priv)(struct fsnotify_group *group); + void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); }; /* @@ -97,7 +105,14 @@ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ - /* prevents double list_del of group_list. protected by global fsnotify_gr_mutex */ + /* stores all fastapth entries assoc with this group so they can be cleaned on unregister */ + spinlock_t mark_lock; /* protect mark_entries list */ + atomic_t num_marks; /* 1 for each mark entry and 1 for not being + * past the point of no return when freeing + * a group */ + struct list_head mark_entries; /* all inode mark entries for this group */ + + /* prevents double list_del of group_list. protected by global fsnotify_grp_mutex */ bool on_group_list; /* groups can define private fields here or use the void *private */ @@ -137,12 +152,38 @@ struct fsnotify_event { __u32 mask; /* the type of access, bitwise OR for FS_* event types */ }; +/* + * a mark is simply an entry attached to an in core inode which allows an + * fsnotify listener to indicate they are either no longer interested in events + * of a type matching mask or only interested in those events. + * + * these are flushed when an inode is evicted from core and may be flushed + * when the inode is modified (as seen by fsnotify_access). Some fsnotify users + * (such as dnotify) will flush these when the open fd is closed and not at + * inode eviction or modification. + */ +struct fsnotify_mark_entry { + __u32 mask; /* mask this mark entry is for */ + /* we hold ref for each i_list and g_list. also one ref for each 'thing' + * in kernel that found and may be using this mark. 
*/ + atomic_t refcnt; /* active things looking at this mark */ + struct inode *inode; /* inode this entry is associated with */ + struct fsnotify_group *group; /* group this mark entry is for */ + struct hlist_node i_list; /* list of mark_entries by inode->i_fsnotify_mark_entries */ + struct list_head g_list; /* list of mark_entries by group->i_fsnotify_mark_entries */ + spinlock_t lock; /* protect group, inode, and killme */ + struct list_head free_i_list; /* tmp list used when freeing this mark */ + struct list_head free_g_list; /* tmp list used when freeing this mark */ + void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */ +}; + #ifdef CONFIG_FSNOTIFY /* called from the vfs helpers */ /* main fsnotify call to send events */ extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is); +extern void __fsnotify_inode_delete(struct inode *inode); /* called from fsnotify listeners, such as fanotify or dnotify */ @@ -153,6 +194,8 @@ extern void fsnotify_recalc_global_mask(void); extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, const struct fsnotify_ops *ops); +/* run all marks associated with this group and update group->mask */ +extern void fsnotify_recalc_group_mask(struct fsnotify_group *group); /* drop reference on a group from fsnotify_obtain_group */ extern void fsnotify_put_group(struct fsnotify_group *group); @@ -163,6 +206,22 @@ extern void fsnotify_put_event(struct fsnotify_event *event); extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event); +/* functions used to manipulate the marks attached to inodes */ + +/* run all marks associated with an inode and update inode->i_fsnotify_mask */ +extern void fsnotify_recalc_inode_mask(struct inode *inode); +extern void fsnotify_init_mark(struct fsnotify_mark_entry *entry, void (*free_mark)(struct fsnotify_mark_entry *entry)); +/* find (and take a reference) to a mark associated with group and inode */ +extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode); +/* attach the mark to both the group and the inode */ +extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode); +/* given a mark, flag it to be freed when all references are dropped */ +extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry); +/* run all the marks in a group, and flag them to be freed */ +extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); +extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry); +extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry); + /* put here because inotify does some weird stuff when destroying watches */ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, int data_is); @@ -170,6 +229,10 @@ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) {} + +static inline void __fsnotify_inode_delete(struct inode *inode) +{} + #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ -- cgit v1.2.3-70-g09d2 From c28f7e56e9d95fb531dc3be8df2e7f52bee76d21 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:29 -0400 Subject: fsnotify: parent event notification inotify and dnotify both use a similar parent notification mechanism. 
We add a generic parent notification mechanism to fsnotify for both of these to use. This new machanism also adds the dentry flag optimization which exists for inotify to dnotify. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/fsnotify.c | 91 ++++++++++++++++++++++++++++++++++++++++ fs/notify/fsnotify.h | 5 +++ fs/notify/inode_mark.c | 17 ++++++++ include/linux/dcache.h | 4 +- include/linux/fsnotify.h | 34 +++++++++++---- include/linux/fsnotify_backend.h | 64 ++++++++++++++++++++++++++++ 6 files changed, 205 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index d5654629c65..7fc760067a6 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -34,6 +34,97 @@ void __fsnotify_inode_delete(struct inode *inode) } EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); +/* + * Given an inode, first check if we care what happens to our children. Inotify + * and dnotify both tell their parents about events. If we care about any event + * on a child we run all of our children and set a dentry flag saying that the + * parent cares. Thus when an event happens on a child it can quickly tell if + * if there is a need to find a parent and send the event to the parent. + */ +void __fsnotify_update_child_dentry_flags(struct inode *inode) +{ + struct dentry *alias; + int watched; + + if (!S_ISDIR(inode->i_mode)) + return; + + /* determine if the children should tell inode about their events */ + watched = fsnotify_inode_watches_children(inode); + + spin_lock(&dcache_lock); + /* run all of the dentries associated with this inode. Since this is a + * directory, there damn well better only be one item on this list */ + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + struct dentry *child; + + /* run all of the children of the original inode and fix their + * d_flags to indicate parental interest (their parent is the + * original inode) */ + list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { + if (!child->d_inode) + continue; + + spin_lock(&child->d_lock); + if (watched) + child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; + else + child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; + spin_unlock(&child->d_lock); + } + } + spin_unlock(&dcache_lock); +} + +/* Notify this dentry's parent about a child's events. */ +void __fsnotify_parent(struct dentry *dentry, __u32 mask) +{ + struct dentry *parent; + struct inode *p_inode; + bool send = false; + bool should_update_children = false; + + if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) + return; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent; + p_inode = parent->d_inode; + + if (fsnotify_inode_watches_children(p_inode)) { + if (p_inode->i_fsnotify_mask & mask) { + dget(parent); + send = true; + } + } else { + /* + * The parent doesn't care about events on it's children but + * at least one child thought it did. We need to run all the + * children and update their d_flags to let them know p_inode + * doesn't care about them any more. + */ + dget(parent); + should_update_children = true; + } + + spin_unlock(&dentry->d_lock); + + if (send) { + /* we are notifying a parent so come up with the new mask which + * specifies these are events which came from a child. 
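+ * (this lets a handler distinguish "something happened to an entry in
+ * this directory" from "something happened to the directory itself")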
*/ + mask |= FS_EVENT_ON_CHILD; + + fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE); + dput(parent); + } + + if (unlikely(should_update_children)) { + __fsnotify_update_child_dentry_flags(p_inode); + dput(parent); + } +} +EXPORT_SYMBOL_GPL(__fsnotify_parent); + /* * This is the main call to fsnotify. The VFS calls into hook specific functions * in linux/fsnotify.h. Those functions then in turn call here. Here will call diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 8ebcbe893c9..83b8ec0a8ec 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -17,4 +17,9 @@ extern __u32 fsnotify_mask; extern void fsnotify_final_destroy_group(struct fsnotify_group *group); /* run the list of all marks associated with inode and flag them to be freed */ extern void fsnotify_clear_marks_by_inode(struct inode *inode); +/* + * update the dentry->d_flags of all of inode's children to indicate if inode cares + * about events that happen to its children. + */ +extern void __fsnotify_update_child_dentry_flags(struct inode *inode); #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index cdc15414697..a39534845b2 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -131,6 +131,8 @@ void fsnotify_recalc_inode_mask(struct inode *inode) spin_lock(&inode->i_lock); fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); + + __fsnotify_update_child_dentry_flags(inode); } /* @@ -189,6 +191,19 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) */ group->ops->freeing_mark(entry, group); + /* + * __fsnotify_update_child_dentry_flags(inode); + * + * I really want to call that, but we can't, we have no idea if the inode + * still exists the second we drop the entry->lock. + * + * The next time an event arrive to this inode from one of it's children + * __fsnotify_parent will see that the inode doesn't care about it's + * children and will update all of these flags then. So really this + * is just a lazy update (and could be a perf win...) + */ + + /* * it's possible that this group tried to destroy itself, but this * this mark was simultaneously being freed by inode. If that's the @@ -323,6 +338,8 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, if (lentry) { ret = -EEXIST; fsnotify_put_mark(lentry); + } else { + __fsnotify_update_child_dentry_flags(inode); } return ret; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 15156364d19..97978004338 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -180,10 +180,12 @@ d_iput: no no no yes #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ #define DCACHE_UNHASHED 0x0010 -#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */ +#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched by inotify */ #define DCACHE_COOKIE 0x0040 /* For use by dcookie subsystem */ +#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ + extern spinlock_t dcache_lock; extern seqlock_t rename_lock; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 3856eb6e597..6a662ed0bc8 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -23,15 +23,31 @@ static inline void fsnotify_d_instantiate(struct dentry *entry, struct inode *inode) { + __fsnotify_d_instantiate(entry, inode); + inotify_d_instantiate(entry, inode); } +/* Notify this dentry's parent about a child's events. 
*/ +static inline void fsnotify_parent(struct dentry *dentry, __u32 mask) +{ + __fsnotify_parent(dentry, mask); + + inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); +} + /* * fsnotify_d_move - entry has been moved * Called with dcache_lock and entry->d_lock held. */ static inline void fsnotify_d_move(struct dentry *entry) { + /* + * On move we need to update entry->d_flags to indicate if the new parent + * cares about events from this entry. + */ + __fsnotify_update_dcache_flags(entry); + inotify_d_move(entry); } @@ -117,7 +133,8 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir) if (isdir) mask |= FS_IN_ISDIR; dnotify_parent(dentry, DN_DELETE); - inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); + + fsnotify_parent(dentry, mask); } /* @@ -188,9 +205,9 @@ static inline void fsnotify_access(struct dentry *dentry) mask |= FS_IN_ISDIR; dnotify_parent(dentry, DN_ACCESS); - inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + fsnotify_parent(dentry, mask); fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); } @@ -206,9 +223,9 @@ static inline void fsnotify_modify(struct dentry *dentry) mask |= FS_IN_ISDIR; dnotify_parent(dentry, DN_MODIFY); - inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + fsnotify_parent(dentry, mask); fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); } @@ -223,9 +240,9 @@ static inline void fsnotify_open(struct dentry *dentry) if (S_ISDIR(inode->i_mode)) mask |= FS_IN_ISDIR; - inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + fsnotify_parent(dentry, mask); fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); } @@ -236,16 +253,15 @@ static inline void fsnotify_close(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; - const char *name = dentry->d_name.name; fmode_t mode = file->f_mode; __u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; if (S_ISDIR(inode->i_mode)) mask |= FS_IN_ISDIR; - inotify_dentry_parent_queue_event(dentry, mask, 0, name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + fsnotify_parent(dentry, mask); fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE); } @@ -260,9 +276,9 @@ static inline void fsnotify_xattr(struct dentry *dentry) if (S_ISDIR(inode->i_mode)) mask |= FS_IN_ISDIR; - inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + fsnotify_parent(dentry, mask); fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); } @@ -311,8 +327,8 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) if (S_ISDIR(inode->i_mode)) in_mask |= FS_IN_ISDIR; inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL); - inotify_dentry_parent_queue_event(dentry, in_mask, 0, - dentry->d_name.name); + + fsnotify_parent(dentry, in_mask); fsnotify(inode, in_mask, inode, FSNOTIFY_EVENT_INODE); } } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index cad5c4d75c1..13d2dd57004 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -46,6 +46,17 @@ #define FS_DN_RENAME 0x10000000 /* file renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ +/* This inode cares about things that happen to its children. 
Always set for + * dnotify and inotify. */ +#define FS_EVENT_ON_CHILD 0x08000000 + +/* This is a list of all events that may get sent to a parent based on fs events + * happening to inodes inside that directory */ +#define FS_EVENTS_POSS_ON_CHILD (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\ + FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\ + FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\ + FS_DELETE) + struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark_entry; @@ -183,8 +194,52 @@ struct fsnotify_mark_entry { /* main fsnotify call to send events */ extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is); +extern void __fsnotify_parent(struct dentry *dentry, __u32 mask); extern void __fsnotify_inode_delete(struct inode *inode); +static inline int fsnotify_inode_watches_children(struct inode *inode) +{ + /* FS_EVENT_ON_CHILD is set if the inode may care */ + if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)) + return 0; + /* this inode might care about child events, does it care about the + * specific set of events that can happen on a child? */ + return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD; +} + +/* + * Update the dentry with a flag indicating the interest of its parent to receive + * filesystem events when those events happen to this dentry->d_inode. + */ +static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) +{ + struct dentry *parent; + + assert_spin_locked(&dcache_lock); + assert_spin_locked(&dentry->d_lock); + + parent = dentry->d_parent; + if (fsnotify_inode_watches_children(parent->d_inode)) + dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; + else + dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; +} + +/* + * fsnotify_d_instantiate - instantiate a dentry for inode + * Called with dcache_lock held. + */ +static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) +{ + if (!inode) + return; + + assert_spin_locked(&dcache_lock); + + spin_lock(&dentry->d_lock); + __fsnotify_update_dcache_flags(dentry); + spin_unlock(&dentry->d_lock); +} /* called from fsnotify listeners, such as fanotify or dnotify */ @@ -230,9 +285,18 @@ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) {} +static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask) +{} + static inline void __fsnotify_inode_delete(struct inode *inode) {} +static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) +{} + +static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) +{} + #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ -- cgit v1.2.3-70-g09d2 From 3c5119c05d624f95f4967d16b38c9624b816bdb9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:33 -0400 Subject: dnotify: reimplement dnotify using fsnotify Reimplement dnotify using fsnotify.
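The userspace ABI is untouched by this rewrite: a watch is still registered with fcntl(fd, F_NOTIFY, ...) and delivered via a signal; only the in-kernel plumbing moves onto fsnotify marks. As a rough illustration of the ABI the rewrite must keep working, a minimal dnotify consumer might look like the sketch below (standard fcntl(2) usage, not part of this patch; note that fcntl_dirnotify sets the fd's owner itself via __f_setown, so the caller needs no explicit F_SETOWN):

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_event;

static void on_sigio(int sig)
{
	got_event = 1;
}

int main(void)
{
	/* dnotify only works on directories; anything else gets -ENOTDIR */
	int fd = open("/tmp", O_RDONLY);

	if (fd < 0)
		return 1;
	signal(SIGIO, on_sigio);

	/* DN_MULTISHOT keeps the watch armed after delivery; without it
	 * the kernel frees the dnotify_struct after the first event */
	if (fcntl(fd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_MODIFY | DN_MULTISHOT) < 0)
		return 1;

	pause();
	if (got_event)
		printf("something changed in the watched directory\n");
	close(fd);
	return 0;
}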
Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- MAINTAINERS | 6 +- fs/notify/dnotify/Kconfig | 1 + fs/notify/dnotify/dnotify.c | 469 ++++++++++++++++++++++++++++++--------- include/linux/dnotify.h | 29 +-- include/linux/fs.h | 5 - include/linux/fsnotify.h | 68 ++---- include/linux/fsnotify_backend.h | 3 + 7 files changed, 398 insertions(+), 183 deletions(-) (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index ccdb57524e3..96e0c8c6079 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1802,10 +1802,10 @@ F: drivers/char/epca* F: drivers/char/digi* DIRECTORY NOTIFICATION (DNOTIFY) -P: Stephen Rothwell -M: sfr@canb.auug.org.au +P: Eric Paris +M: eparis@parisplace.org L: linux-kernel@vger.kernel.org -S: Supported +S: Maintained F: Documentation/filesystems/dnotify.txt F: fs/notify/dnotify/ F: include/linux/dnotify.h diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 26adf5dfa64..904ff8d5405 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig @@ -1,5 +1,6 @@ config DNOTIFY bool "Dnotify support" + depends on FSNOTIFY default y help Dnotify is a directory-based per-fd file change notification system diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index b0aa2cde80b..d9d80f502c6 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -3,6 +3,9 @@ * * Copyright (C) 2000,2001,2002 Stephen Rothwell * + * Copyright (C) 2009 Eric Paris + * dnotify was largely rewritten to use the new fsnotify infrastructure + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any @@ -21,24 +24,178 @@ #include #include #include +#include int dir_notify_enable __read_mostly = 1; -static struct kmem_cache *dn_cache __read_mostly; +static struct kmem_cache *dnotify_struct_cache __read_mostly; +static struct kmem_cache *dnotify_mark_entry_cache __read_mostly; +static struct fsnotify_group *dnotify_group __read_mostly; +static DEFINE_MUTEX(dnotify_mark_mutex); + +/* + * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which + * is being watched by dnotify. If multiple userspace applications are watching + * the same directory with dnotify, their information is chained in dn + */ +struct dnotify_mark_entry { + struct fsnotify_mark_entry fsn_entry; + struct dnotify_struct *dn; +}; -static void redo_inode_mask(struct inode *inode) +/* + * When a process starts or stops watching an inode the set of events which + * dnotify cares about for that inode may change. This function runs the + * list of everything receiving dnotify events about this directory and calculates + * the set of all those events. After it updates what dnotify is interested in + * it calls the fsnotify function so it can update the set of all events relevant + * to this inode.
+ */ +static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry) { - unsigned long new_mask; + __u32 new_mask, old_mask; struct dnotify_struct *dn; + struct dnotify_mark_entry *dnentry = container_of(entry, + struct dnotify_mark_entry, + fsn_entry); + + assert_spin_locked(&entry->lock); + old_mask = entry->mask; new_mask = 0; - for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next) - new_mask |= dn->dn_mask & ~DN_MULTISHOT; - inode->i_dnotify_mask = new_mask; + for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next) + new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); + entry->mask = new_mask; + + if (old_mask == new_mask) + return; + + if (entry->inode) + fsnotify_recalc_inode_mask(entry->inode); } +/* + * Main fsnotify call where events are delivered to dnotify. + * Find the dnotify mark on the relevant inode, run the list of dnotify structs + * on that mark and determine which of them has expressed interest in receiving + * events of this type. When one is found, send the owning process the correct + * signal and destroy the dnotify struct if it was not registered to receive + * multiple events. + */ +static int dnotify_handle_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + struct fsnotify_mark_entry *entry = NULL; + struct dnotify_mark_entry *dnentry; + struct inode *to_tell; + struct dnotify_struct *dn; + struct dnotify_struct **prev; + struct fown_struct *fown; + + to_tell = event->to_tell; + + spin_lock(&to_tell->i_lock); + entry = fsnotify_find_mark_entry(group, to_tell); + spin_unlock(&to_tell->i_lock); + + /* unlikely since we already passed dnotify_should_send_event() */ + if (unlikely(!entry)) + return 0; + dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); + + spin_lock(&entry->lock); + prev = &dnentry->dn; + while ((dn = *prev) != NULL) { + if ((dn->dn_mask & event->mask) == 0) { + prev = &dn->dn_next; + continue; + } + fown = &dn->dn_filp->f_owner; + send_sigio(fown, dn->dn_fd, POLL_MSG); + if (dn->dn_mask & FS_DN_MULTISHOT) + prev = &dn->dn_next; + else { + *prev = dn->dn_next; + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(entry); + } + } + + spin_unlock(&entry->lock); + fsnotify_put_mark(entry); + + return 0; +} + +/* + * Given an inode and mask determine if dnotify would be interested in sending + * userspace notification for that pair. + */ +static bool dnotify_should_send_event(struct fsnotify_group *group, + struct inode *inode, __u32 mask) +{ + struct fsnotify_mark_entry *entry; + bool send; + + /* !dir_notify_enable should never get here, don't waste time checking + if (!dir_notify_enable) + return 0; */ + + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return false; + + spin_lock(&inode->i_lock); + entry = fsnotify_find_mark_entry(group, inode); + spin_unlock(&inode->i_lock); + + /* no mark means no dnotify watch */ + if (!entry) + return false; + + spin_lock(&entry->lock); + send = (mask & entry->mask) ?
true : false; + spin_unlock(&entry->lock); + fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */ + + return send; +} + +static void dnotify_freeing_mark(struct fsnotify_mark_entry *entry, + struct fsnotify_group *group) +{ + /* dnotify doesn't care that an inode is on the way out */ +} + +static void dnotify_free_mark(struct fsnotify_mark_entry *entry) +{ + struct dnotify_mark_entry *dnentry = container_of(entry, + struct dnotify_mark_entry, + fsn_entry); + + BUG_ON(dnentry->dn); + + kmem_cache_free(dnotify_mark_entry_cache, dnentry); +} + +static struct fsnotify_ops dnotify_fsnotify_ops = { + .handle_event = dnotify_handle_event, + .should_send_event = dnotify_should_send_event, + .free_group_priv = NULL, + .freeing_mark = dnotify_freeing_mark, +}; + +/* + * Called every time a file is closed. Looks first for a dnotify mark on the + * inode. If one is found, run the list of ->dn entries attached to that + * mark looking for the one relevant to this process closing the file, and + * remove that dnotify_struct. If that was the last dnotify_struct also remove + * the fsnotify_mark_entry. + */ void dnotify_flush(struct file *filp, fl_owner_t id) { + struct fsnotify_mark_entry *entry; + struct dnotify_mark_entry *dnentry; struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; @@ -46,145 +203,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id) inode = filp->f_path.dentry->d_inode; if (!S_ISDIR(inode->i_mode)) return; + spin_lock(&inode->i_lock); - prev = &inode->i_dnotify; + entry = fsnotify_find_mark_entry(dnotify_group, inode); + spin_unlock(&inode->i_lock); + if (!entry) + return; + dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); + + mutex_lock(&dnotify_mark_mutex); + + spin_lock(&entry->lock); + prev = &dnentry->dn; while ((dn = *prev) != NULL) { if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { *prev = dn->dn_next; - redo_inode_mask(inode); - kmem_cache_free(dn_cache, dn); + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(entry); break; } prev = &dn->dn_next; } - spin_unlock(&inode->i_lock); + + spin_unlock(&entry->lock); + + /* nothing else could have found us thanks to the dnotify_mark_mutex */ + if (dnentry->dn == NULL) + fsnotify_destroy_mark_by_entry(entry); + + fsnotify_recalc_group_mask(dnotify_group); + + mutex_unlock(&dnotify_mark_mutex); + + fsnotify_put_mark(entry); +} + +/* this conversion is done only at watch creation */ +static __u32 convert_arg(unsigned long arg) +{ + __u32 new_mask = FS_EVENT_ON_CHILD; + + if (arg & DN_MULTISHOT) + new_mask |= FS_DN_MULTISHOT; + if (arg & DN_DELETE) + new_mask |= (FS_DELETE | FS_MOVED_FROM); + if (arg & DN_MODIFY) + new_mask |= FS_MODIFY; + if (arg & DN_ACCESS) + new_mask |= FS_ACCESS; + if (arg & DN_ATTRIB) + new_mask |= FS_ATTRIB; + if (arg & DN_RENAME) + new_mask |= FS_DN_RENAME; + if (arg & DN_CREATE) + new_mask |= (FS_CREATE | FS_MOVED_TO); + + return new_mask; } +/* + * If multiple processes watch the same inode with dnotify there is only one + * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct + * onto that mark. This function either attaches the new dnotify_struct onto + * that list, or it |= the mask onto an existing dnotify_struct. + */ +static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry, + fl_owner_t id, int fd, struct file *filp, __u32 mask) +{ + struct dnotify_struct *odn; + + odn = dnentry->dn; + while (odn != NULL) { + /* adding more events to an existing dnotify_struct?
*/ + if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { + odn->dn_fd = fd; + odn->dn_mask |= mask; + return -EEXIST; + } + odn = odn->dn_next; + } + + dn->dn_mask = mask; + dn->dn_fd = fd; + dn->dn_filp = filp; + dn->dn_owner = id; + dn->dn_next = dnentry->dn; + dnentry->dn = dn; + + return 0; +} + +/* + * When a process calls fcntl to attach a dnotify watch to a directory it ends + * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be + * attached to the fsnotify_mark. + */ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) { + struct dnotify_mark_entry *new_dnentry, *dnentry; + struct fsnotify_mark_entry *new_entry, *entry; struct dnotify_struct *dn; - struct dnotify_struct *odn; - struct dnotify_struct **prev; struct inode *inode; fl_owner_t id = current->files; struct file *f; - int error = 0; + int destroy = 0, error = 0; + __u32 mask; + + /* we use these to tell if we need to kfree */ + new_entry = NULL; + dn = NULL; + if (!dir_notify_enable) { + error = -EINVAL; + goto out_err; + } + + /* a 0 mask means we are explicitly removing the watch */ if ((arg & ~DN_MULTISHOT) == 0) { dnotify_flush(filp, id); - return 0; + error = 0; + goto out_err; } - if (!dir_notify_enable) - return -EINVAL; + + /* dnotify only works on directories */ inode = filp->f_path.dentry->d_inode; - if (!S_ISDIR(inode->i_mode)) - return -ENOTDIR; - dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); - if (dn == NULL) - return -ENOMEM; - spin_lock(&inode->i_lock); - prev = &inode->i_dnotify; - while ((odn = *prev) != NULL) { - if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { - odn->dn_fd = fd; - odn->dn_mask |= arg; - inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; - goto out_free; - } - prev = &odn->dn_next; + if (!S_ISDIR(inode->i_mode)) { + error = -ENOTDIR; + goto out_err; } - rcu_read_lock(); - f = fcheck(fd); - rcu_read_unlock(); - /* we'd lost the race with close(), sod off silently */ - /* note that inode->i_lock prevents reordering problems - * between accesses to descriptor table and ->i_dnotify */ - if (f != filp) - goto out_free; + /* expect most fcntl to add new rather than augment old */ + dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); + if (!dn) { + error = -ENOMEM; + goto out_err; + } - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - if (error) - goto out_free; + /* new fsnotify mark, we expect most fcntl calls to add a new mark */ + new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL); + if (!new_dnentry) { + error = -ENOMEM; + goto out_err; + } - dn->dn_mask = arg; - dn->dn_fd = fd; - dn->dn_filp = filp; - dn->dn_owner = id; - inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; - dn->dn_next = inode->i_dnotify; - inode->i_dnotify = dn; - spin_unlock(&inode->i_lock); - return 0; + /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ + mask = convert_arg(arg); -out_free: - spin_unlock(&inode->i_lock); - kmem_cache_free(dn_cache, dn); - return error; -} + /* set up the new_entry and new_dnentry */ + new_entry = &new_dnentry->fsn_entry; + fsnotify_init_mark(new_entry, dnotify_free_mark); + new_entry->mask = mask; + new_dnentry->dn = NULL; -void __inode_dir_notify(struct inode *inode, unsigned long event) -{ - struct dnotify_struct * dn; - struct dnotify_struct **prev; - struct fown_struct * fown; - int changed = 0; + /* this is needed to prevent the fcntl/close race described below */ + mutex_lock(&dnotify_mark_mutex); + /* add the new_entry or find an old one. 
*/ spin_lock(&inode->i_lock); - prev = &inode->i_dnotify; - while ((dn = *prev) != NULL) { - if ((dn->dn_mask & event) == 0) { - prev = &dn->dn_next; - continue; - } - fown = &dn->dn_filp->f_owner; - send_sigio(fown, dn->dn_fd, POLL_MSG); - if (dn->dn_mask & DN_MULTISHOT) - prev = &dn->dn_next; - else { - *prev = dn->dn_next; - changed = 1; - kmem_cache_free(dn_cache, dn); - } - } - if (changed) - redo_inode_mask(inode); + entry = fsnotify_find_mark_entry(dnotify_group, inode); spin_unlock(&inode->i_lock); -} - -EXPORT_SYMBOL(__inode_dir_notify); + if (entry) { + dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); + spin_lock(&entry->lock); + } else { + fsnotify_add_mark(new_entry, dnotify_group, inode); + spin_lock(&new_entry->lock); + entry = new_entry; + dnentry = new_dnentry; + /* we used new_entry, so don't free it */ + new_entry = NULL; + } -/* - * This is hopelessly wrong, but unfixable without API changes. At - * least it doesn't oops the kernel... - * - * To safely access ->d_parent we need to keep d_move away from it. Use the - * dentry's d_lock for this. - */ -void dnotify_parent(struct dentry *dentry, unsigned long event) -{ - struct dentry *parent; + rcu_read_lock(); + f = fcheck(fd); + rcu_read_unlock(); - if (!dir_notify_enable) - return; + /* if (f != filp) means that we lost a race and another task/thread + * actually closed the fd we are still playing with before we grabbed + * the dnotify_mark_mutex and entry->lock. Since closing the fd is the + * only time we clean up the mark entries we need to get our mark off + * the list. */ + if (f != filp) { + /* if we added ourselves, shoot ourselves, it's possible that + * the flush actually did shoot this entry. That's fine too + * since multiple calls to destroy_mark are perfectly safe. If + * we found a dnentry already attached to the inode, just sod + * off silently as the flush at close time dealt with it. + */ + if (dnentry == new_dnentry) + destroy = 1; + goto out; + } - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; - if (parent->d_inode->i_dnotify_mask & event) { - dget(parent); - spin_unlock(&dentry->d_lock); - __inode_dir_notify(parent->d_inode, event); - dput(parent); - } else { - spin_unlock(&dentry->d_lock); + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + if (error) { + /* if we added, we must shoot */ + if (dnentry == new_dnentry) + destroy = 1; + goto out; } + + error = attach_dn(dn, dnentry, id, fd, filp, mask); + /* !error means that we attached the dn to the dnentry, so don't free it */ + if (!error) + dn = NULL; + /* -EEXIST means that we didn't add this new dn and used an old one.
+ * that isn't an error (and the unused dn should be freed) */ + else if (error == -EEXIST) + error = 0; + + dnotify_recalc_inode_mask(entry); +out: + spin_unlock(&entry->lock); + + if (destroy) + fsnotify_destroy_mark_by_entry(entry); + + fsnotify_recalc_group_mask(dnotify_group); + + mutex_unlock(&dnotify_mark_mutex); + fsnotify_put_mark(entry); +out_err: + if (new_entry) + fsnotify_put_mark(new_entry); + if (dn) + kmem_cache_free(dnotify_struct_cache, dn); + return error; } -EXPORT_SYMBOL_GPL(dnotify_parent); static int __init dnotify_init(void) { - dn_cache = kmem_cache_create("dnotify_cache", - sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); + dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC); + + dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM, + 0, &dnotify_fsnotify_ops); + if (IS_ERR(dnotify_group)) + panic("unable to allocate fsnotify group for dnotify\n"); return 0; } diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h index 102a902b439..ecc06286226 100644 --- a/include/linux/dnotify.h +++ b/include/linux/dnotify.h @@ -10,7 +10,7 @@ struct dnotify_struct { struct dnotify_struct * dn_next; - unsigned long dn_mask; + __u32 dn_mask; int dn_fd; struct file * dn_filp; fl_owner_t dn_owner; @@ -21,23 +21,18 @@ struct dnotify_struct { #ifdef CONFIG_DNOTIFY -extern void __inode_dir_notify(struct inode *, unsigned long); +#define DNOTIFY_ALL_EVENTS (FS_DELETE | FS_DELETE_CHILD |\ + FS_MODIFY | FS_MODIFY_CHILD |\ + FS_ACCESS | FS_ACCESS_CHILD |\ + FS_ATTRIB | FS_ATTRIB_CHILD |\ + FS_CREATE | FS_DN_RENAME |\ + FS_MOVED_FROM | FS_MOVED_TO) + extern void dnotify_flush(struct file *, fl_owner_t); extern int fcntl_dirnotify(int, struct file *, unsigned long); -extern void dnotify_parent(struct dentry *, unsigned long); - -static inline void inode_dir_notify(struct inode *inode, unsigned long event) -{ - if (inode->i_dnotify_mask & (event)) - __inode_dir_notify(inode, event); -} #else -static inline void __inode_dir_notify(struct inode *inode, unsigned long event) -{ -} - static inline void dnotify_flush(struct file *filp, fl_owner_t id) { } @@ -47,14 +42,6 @@ static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) return -EINVAL; } -static inline void dnotify_parent(struct dentry *dentry, unsigned long event) -{ -} - -static inline void inode_dir_notify(struct inode *inode, unsigned long event) -{ -} - #endif /* CONFIG_DNOTIFY */ #endif /* __KERNEL __ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 275b0860044..323b5ce474c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -760,11 +760,6 @@ struct inode { struct hlist_head i_fsnotify_mark_entries; /* fsnotify mark entries */ #endif -#ifdef CONFIG_DNOTIFY - unsigned long i_dnotify_mask; /* Directory notify events */ - struct dnotify_struct *i_dnotify; /* for directory notifications */ -#endif - #ifdef CONFIG_INOTIFY struct list_head inotify_watches; /* watches on this inode */ struct mutex inotify_mutex; /* protects the watches list */ diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 6a662ed0bc8..db12d9de352 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -74,13 +74,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, __u32 new_dir_mask = 0; if (old_dir == new_dir) { - inode_dir_notify(old_dir, DN_RENAME); old_dir_mask = FS_DN_RENAME; - } else { - inode_dir_notify(old_dir, DN_DELETE); - old_dir_mask = FS_DELETE; - 
inode_dir_notify(new_dir, DN_CREATE); - new_dir_mask = FS_CREATE; } if (isdir) { @@ -132,7 +126,6 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir) if (isdir) mask |= FS_IN_ISDIR; - dnotify_parent(dentry, DN_DELETE); fsnotify_parent(dentry, mask); } @@ -154,7 +147,6 @@ static inline void fsnotify_inoderemove(struct inode *inode) */ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) { - inode_dir_notify(inode, DN_CREATE); inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name, dentry->d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); @@ -169,7 +161,6 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) */ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry) { - inode_dir_notify(dir, DN_CREATE); inotify_inode_queue_event(dir, IN_CREATE, 0, new_dentry->d_name.name, inode); fsnotify_link_count(inode); @@ -186,7 +177,6 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) __u32 mask = (FS_CREATE | FS_IN_ISDIR); struct inode *d_inode = dentry->d_inode; - inode_dir_notify(inode, DN_CREATE); inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); @@ -204,7 +194,6 @@ static inline void fsnotify_access(struct dentry *dentry) if (S_ISDIR(inode->i_mode)) mask |= FS_IN_ISDIR; - dnotify_parent(dentry, DN_ACCESS); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); @@ -222,7 +211,6 @@ static inline void fsnotify_modify(struct dentry *dentry) if (S_ISDIR(inode->i_mode)) mask |= FS_IN_ISDIR; - dnotify_parent(dentry, DN_MODIFY); inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); @@ -289,47 +277,33 @@ static inline void fsnotify_xattr(struct dentry *dentry) static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) { struct inode *inode = dentry->d_inode; - int dn_mask = 0; - __u32 in_mask = 0; + __u32 mask = 0; + + if (ia_valid & ATTR_UID) + mask |= FS_ATTRIB; + if (ia_valid & ATTR_GID) + mask |= FS_ATTRIB; + if (ia_valid & ATTR_SIZE) + mask |= FS_MODIFY; - if (ia_valid & ATTR_UID) { - in_mask |= FS_ATTRIB; - dn_mask |= DN_ATTRIB; - } - if (ia_valid & ATTR_GID) { - in_mask |= FS_ATTRIB; - dn_mask |= DN_ATTRIB; - } - if (ia_valid & ATTR_SIZE) { - in_mask |= FS_MODIFY; - dn_mask |= DN_MODIFY; - } /* both times implies a utime(s) call */ if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) - { - in_mask |= FS_ATTRIB; - dn_mask |= DN_ATTRIB; - } else if (ia_valid & ATTR_ATIME) { - in_mask |= FS_ACCESS; - dn_mask |= DN_ACCESS; - } else if (ia_valid & ATTR_MTIME) { - in_mask |= FS_MODIFY; - dn_mask |= DN_MODIFY; - } - if (ia_valid & ATTR_MODE) { - in_mask |= FS_ATTRIB; - dn_mask |= DN_ATTRIB; - } + mask |= FS_ATTRIB; + else if (ia_valid & ATTR_ATIME) + mask |= FS_ACCESS; + else if (ia_valid & ATTR_MTIME) + mask |= FS_MODIFY; + + if (ia_valid & ATTR_MODE) + mask |= FS_ATTRIB; - if (dn_mask) - dnotify_parent(dentry, dn_mask); - if (in_mask) { + if (mask) { if (S_ISDIR(inode->i_mode)) - in_mask |= FS_IN_ISDIR; - inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL); + mask |= FS_IN_ISDIR; + inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - fsnotify_parent(dentry, in_mask); - fsnotify(inode, in_mask, inode, FSNOTIFY_EVENT_INODE); + fsnotify_parent(dentry, mask); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); } } diff --git 
a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 13d2dd57004..9ea800e840f 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -57,6 +57,9 @@ FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\ FS_DELETE) +/* listeners that hard code group numbers near the top */ +#define DNOTIFY_GROUP_NUM UINT_MAX + struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark_entry; -- cgit v1.2.3-70-g09d2 From a2d8bc6cb4a3024661baf877242f123787d0c054 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:37 -0400 Subject: fsnotify: generic notification queue and waitq inotify needs to do async notification in which event information is stored on a queue until the listener is ready to receive it. This patch implements a generic notification queue for inotify (and later fanotify) to store events to be sent at a later time. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/fsnotify.h | 9 ++ fs/notify/group.c | 9 ++ fs/notify/notification.c | 230 +++++++++++++++++++++++++++++++++++++-- include/linux/fsnotify_backend.h | 37 +++++++ 4 files changed, 278 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 83b8ec0a8ec..4dc240824b2 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -13,8 +13,12 @@ extern struct list_head fsnotify_groups; /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ extern __u32 fsnotify_mask; +/* destroy all events sitting in this group's notification queue */ +extern void fsnotify_flush_notify(struct fsnotify_group *group); + /* final kfree of a group */ extern void fsnotify_final_destroy_group(struct fsnotify_group *group); + /* run the list of all marks associated with inode and flag them to be freed */ extern void fsnotify_clear_marks_by_inode(struct inode *inode); /* @@ -22,4 +26,9 @@ extern void fsnotify_clear_marks_by_inode(struct inode *inode); * about events that happen to its children. */ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); + +/* allocate and destroy an event holder to attach events to notification/access queues */ +extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); +extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); + #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/group.c b/fs/notify/group.c index a29d2fa6792..0e1677144bc 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -91,6 +91,9 @@ static void fsnotify_get_group(struct fsnotify_group *group) */ void fsnotify_final_destroy_group(struct fsnotify_group *group) { + /* clear the notification queue of all events */ + fsnotify_flush_notify(group); + if (group->ops->free_group_priv) group->ops->free_group_priv(group); @@ -214,6 +217,12 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, group->group_num = group_num; group->mask = mask; + mutex_init(&group->notification_mutex); + INIT_LIST_HEAD(&group->notification_list); + init_waitqueue_head(&group->notification_waitq); + group->q_len = 0; + group->max_events = UINT_MAX; + spin_lock_init(&group->mark_lock); atomic_set(&group->num_marks, 0); INIT_LIST_HEAD(&group->mark_entries); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index b8e9a87f8f5..dddecc74e63 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -16,6 +16,21 @@ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/ +/* + * Basic idea behind the notification queue: An fsnotify group (like inotify) + * sends the userspace notification about events asynchronously some time after + * the event happened. When inotify gets an event it will need to add that + * event to the group notify queue. Since a single event might need to be on + * multiple groups' notification queues we can't add the event directly to each + * queue and instead add a small "event_holder" to each queue. This event_holder + * has a pointer back to the original event. Since the majority of events are + * going to end up on one, and only one, notification queue we embed one + * event_holder into each event. This means we have a single allocation instead + * of always needing two. If the embedded event_holder is already in use by + * another group a new event_holder (from fsnotify_event_holder_cachep) will be + * allocated and used. + */ + #include #include #include @@ -33,6 +48,21 @@ #include "fsnotify.h" static struct kmem_cache *fsnotify_event_cachep; +static struct kmem_cache *fsnotify_event_holder_cachep; +/* + * This is a magic event we send when the q is too full. Since it doesn't + * hold real event information we just keep one system wide and use it any time + * it is needed. Its refcnt is set to 1 at kernel init time and will never + * get set to 0 so it will never get 'freed' + */ +static struct fsnotify_event q_overflow_event; + +/* return true if the notify queue is empty, false otherwise */ +bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) +{ + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + return list_empty(&group->notification_list) ? true : false; +} void fsnotify_get_event(struct fsnotify_event *event) { @@ -52,19 +82,176 @@ void fsnotify_put_event(struct fsnotify_event *event) } } +struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) +{ + return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); +} + +void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) +{ + kmem_cache_free(fsnotify_event_holder_cachep, holder); +} + +/* + * check if 2 events contain the same information. + */ +static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +{ + if ((old->mask == new->mask) && + (old->to_tell == new->to_tell) && + (old->data_type == new->data_type)) { + switch (old->data_type) { + case (FSNOTIFY_EVENT_INODE): + if (old->inode == new->inode) + return true; + break; + case (FSNOTIFY_EVENT_PATH): + if ((old->path.mnt == new->path.mnt) && + (old->path.dentry == new->path.dentry)) + return true; + case (FSNOTIFY_EVENT_NONE): + return true; + }; + } + return false; +} + /* - * Allocate a new event which will be sent to each group's handle_event function - * if the group was interested in this particular event. + * Add an event to the group notification queue. The group can later pull this + * event off the queue to deal with. If the event is successfully added to the + * group's notification queue, a reference is taken on event. */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_type) +int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event) +{ + struct fsnotify_event_holder *holder = NULL; + struct list_head *list = &group->notification_list; + struct fsnotify_event_holder *last_holder; + struct fsnotify_event *last_event; + + /* + * There is one fsnotify_event_holder embedded inside each fsnotify_event.
+ * Check if we expect to be able to use that holder. If not alloc a new + * holder. + * For the overflow event it's possible that something will use the in + * event holder before we get the lock so we may need to jump back and + * alloc a new holder, this can't happen for most events... + */ + if (!list_empty(&event->holder.event_list)) { +alloc_holder: + holder = fsnotify_alloc_event_holder(); + if (!holder) + return -ENOMEM; + } + + mutex_lock(&group->notification_mutex); + + if (group->q_len >= group->max_events) + event = &q_overflow_event; + + spin_lock(&event->lock); + + if (list_empty(&event->holder.event_list)) { + if (unlikely(holder)) + fsnotify_destroy_event_holder(holder); + holder = &event->holder; + } else if (unlikely(!holder)) { + /* between the time we checked above and got the lock the in + * event holder was used, go back and get a new one */ + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + goto alloc_holder; + } + + if (!list_empty(list)) { + last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); + last_event = last_holder->event; + if (event_compare(last_event, event)) { + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + if (holder != &event->holder) + fsnotify_destroy_event_holder(holder); + return 0; + } + } + + group->q_len++; + holder->event = event; + + fsnotify_get_event(event); + list_add_tail(&holder->event_list, list); + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + + wake_up(&group->notification_waitq); + return 0; +} + +/* + * Remove and return the first event from the notification list. There is a + * reference held on this event since it was on the list. It is the responsibility + * of the caller to drop this reference. + */ +struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) { struct fsnotify_event *event; + struct fsnotify_event_holder *holder; - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); + + event = holder->event; + + spin_lock(&event->lock); + holder->event = NULL; + list_del_init(&holder->event_list); + spin_unlock(&event->lock); + + /* event == holder means we are referenced through the in event holder */ + if (holder != &event->holder) + fsnotify_destroy_event_holder(holder); + + group->q_len--; + + return event; +} + +/* + * This will not remove the event, that must be done with fsnotify_remove_notify_event() + */ +struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + struct fsnotify_event_holder *holder; + + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); + event = holder->event; + + return event; +} + +/* + * Called when a group is being torn down to clean up any outstanding + * event notifications. 
+ */ +void fsnotify_flush_notify(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + event = fsnotify_remove_notify_event(group); + fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + } + mutex_unlock(&group->notification_mutex); +} + +static void initialize_event(struct fsnotify_event *event) +{ + event->holder.event = NULL; + INIT_LIST_HEAD(&event->holder.event_list); atomic_set(&event->refcnt, 1); spin_lock_init(&event->lock); @@ -72,7 +259,32 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, event->path.dentry = NULL; event->path.mnt = NULL; event->inode = NULL; + event->data_type = FSNOTIFY_EVENT_NONE; + event->to_tell = NULL; +} + +/* + * fsnotify_create_event - Allocate a new event which will be sent to each + * group's handle_event function if the group was interested in this + * particular event. + * + * @to_tell the inode which is supposed to receive the event (sometimes a + * parent of the inode to which the event happened). + * @mask what actually happened. + * @data pointer to the object which was actually affected + * @data_type flag indication if the data is a file, path, inode, nothing... + */ +struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, + void *data, int data_type) +{ + struct fsnotify_event *event; + + event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; + + initialize_event(event); event->to_tell = to_tell; switch (data_type) { @@ -114,6 +326,10 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, __init int fsnotify_notification_init(void) { fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); + fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); + + initialize_event(&q_overflow_event); + q_overflow_event.mask = FS_Q_OVERFLOW; return 0; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 9ea800e840f..15f8f82a5c5 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -119,6 +119,13 @@ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ + /* needed to send notification to userspace */ + struct mutex notification_mutex; /* protect the notification_list */ + struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ + wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ + unsigned int q_len; /* events on the queue */ + unsigned int max_events; /* maximum events allowed on the list */ + /* stores all fastpath entries assoc with this group so they can be cleaned on unregister */ spinlock_t mark_lock; /* protect mark_entries list */ atomic_t num_marks; /* 1 for each mark entry and 1 for not being @@ -135,12 +142,33 @@ struct fsnotify_group { }; }; +/* + * A single event can be queued in multiple group->notification_lists. + * + * each group->notification_list will point to an event_holder which in turn points + * to the actual event that needs to be sent to userspace.
+ * + * Seemed cheaper to create a refcnt'd event and a small holder for every group + * than create a different event for every group + * + */ +struct fsnotify_event_holder { + struct fsnotify_event *event; + struct list_head event_list; +}; + /* * all of the information about the original object we want to now send to * a group. If you want to carry more info from the accessing task to the * listener this structure is where you need to be adding fields. */ struct fsnotify_event { + /* + * If we create an event we are also likely going to need a holder + * to link to a group. So embed one holder in the event. Means only + * one allocation for the common case where we only have one group + */ + struct fsnotify_event_holder holder; spinlock_t lock; /* protection for the associated event_holder and private_list */ /* to_tell may ONLY be dereferenced during handle_event(). */ struct inode *to_tell; /* either the inode the event happened to or its parent */ @@ -264,6 +292,15 @@ extern void fsnotify_put_event(struct fsnotify_event *event); extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event); +/* attach the event to the group notification queue */ +extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event); +/* true if the group notification queue is empty */ +extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); +/* return, but do not dequeue the first event on the notification queue */ +extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group); +/* return AND dequeue the first event on the notification queue */ +extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group); + /* functions used to manipulate the marks attached to inodes */ /* run all marks associated with an inode and update inode->i_fsnotify_mask */ -- cgit v1.2.3-70-g09d2 From 62ffe5dfba056f7ba81d710fee9f28c58a42fdd6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:43 -0400 Subject: fsnotify: include pathnames with entries when possible When inotify wants to send events to a directory about a child, it includes the name of the original file. This patch collects that filename and makes it available for notification. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/fsnotify.c | 7 ++++--- fs/notify/notification.c | 16 +++++++++++++++- include/linux/fsnotify.h | 28 ++++++++++++++-------------- include/linux/fsnotify_backend.h | 11 ++++++++--- 4 files changed, 41 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7fc760067a6..675129fa9fd 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -114,7 +114,8 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask) * specifies these are events which came from a child. */ mask |= FS_EVENT_ON_CHILD; - fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE); + fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, + dentry->d_name.name); dput(parent); } @@ -131,7 +132,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent); * out to all of the registered fsnotify_group. Those groups can then use the * notification event in whatever means they feel necessary.
*/ -void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) +void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name) { struct fsnotify_group *group; struct fsnotify_event *event = NULL; @@ -156,7 +157,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is); + event = fsnotify_create_event(to_tell, mask, data, data_is, file_name); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/notification.c b/fs/notify/notification.c index dddecc74e63..c69b18b9aba 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -78,6 +78,7 @@ void fsnotify_put_event(struct fsnotify_event *event) if (event->data_type == FSNOTIFY_EVENT_PATH) path_put(&event->path); + kfree(event->file_name); kmem_cache_free(fsnotify_event_cachep, event); } } @@ -262,6 +263,9 @@ static void initialize_event(struct fsnotify_event *event) event->data_type = FSNOTIFY_EVENT_NONE; event->to_tell = NULL; + + event->file_name = NULL; + event->name_len = 0; } /* @@ -274,9 +278,10 @@ static void initialize_event(struct fsnotify_event *event) * @mask what actually happened. * @data pointer to the object which was actually affected * @data_type flag indication if the data is a file, path, inode, nothing... + * @name the filename, if available */ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_type) + void *data, int data_type, const char *name) { struct fsnotify_event *event; @@ -285,6 +290,15 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, return NULL; initialize_event(event); + + if (name) { + event->file_name = kstrdup(name, GFP_KERNEL); + if (!event->file_name) { + kmem_cache_free(fsnotify_event_cachep, event); + return NULL; + } + event->name_len = strlen(event->file_name); + } event->to_tell = to_tell; switch (data_type) { diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index db12d9de352..180740e9ec8 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -58,7 +58,7 @@ static inline void fsnotify_link_count(struct inode *inode) { inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL); - fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE); + fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL); } /* @@ -91,8 +91,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name, source); - fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE); - fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE); + fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name); + fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name); if (target) { inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL); @@ -104,7 +104,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, if (source) { inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL); - fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE); + fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL); } audit_inode_child(new_name, moved, new_dir); } @@ -138,7 +138,7 @@ static inline void fsnotify_inoderemove(struct inode *inode) 
inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL); inotify_inode_is_dead(inode); - fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE); + fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL); __fsnotify_inode_delete(inode); } @@ -151,7 +151,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) dentry->d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); - fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE); + fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name); } /* @@ -166,7 +166,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct fsnotify_link_count(inode); audit_inode_child(new_dentry->d_name.name, new_dentry, dir); - fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE); + fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name); } /* @@ -180,7 +180,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); - fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE); + fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name); } /* @@ -197,7 +197,7 @@ static inline void fsnotify_access(struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); } /* @@ -214,7 +214,7 @@ static inline void fsnotify_modify(struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); } /* @@ -231,7 +231,7 @@ static inline void fsnotify_open(struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); } /* @@ -250,7 +250,7 @@ static inline void fsnotify_close(struct file *file) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE); + fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL); } /* @@ -267,7 +267,7 @@ static inline void fsnotify_xattr(struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); } /* @@ -303,7 +303,7 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); } } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 15f8f82a5c5..52692f40589 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -192,6 +192,9 @@ struct fsnotify_event { int data_type; /* which of the above union we have */ atomic_t refcnt; /* how many groups still are using/need to send this event */ __u32 mask; /* the type of access, bitwise OR for FS_* event types */ + + char *file_name; + size_t name_len; }; /* @@ -224,7 +227,7 @@ struct fsnotify_mark_entry { /* called 
from the vfs helpers */ /* main fsnotify call to send events */ -extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is); +extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *name); extern void __fsnotify_parent(struct dentry *dentry, __u32 mask); extern void __fsnotify_inode_delete(struct inode *inode); @@ -319,10 +322,12 @@ extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry); /* put here because inotify does some weird stuff when destroying watches */ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_is); + void *data, int data_is, const char *name); + #else -static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) +static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, + const char *name); {} static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask) -- cgit v1.2.3-70-g09d2 From 47882c6f51e8ef41fbbe2bbb746a1ea3228dd7ca Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:47 -0400 Subject: fsnotify: add correlations between events The standard inotify event stream includes a correlation cookie which ties together the two halves of a dentry move operation. This patch adds the same behaviour to fsnotify events; it is needed so that the inotify userspace interface can be implemented on top of fsnotify. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/fsnotify.c | 6 +++--- fs/notify/notification.c | 20 ++++++++++++++++++-- include/linux/fsnotify.h | 35 ++++++++++++++++++----------------- include/linux/fsnotify_backend.h | 15 ++++++++++++--- 4 files changed, 51 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 675129fa9fd..f11d75f0236 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -115,7 +115,7 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask) mask |= FS_EVENT_ON_CHILD; fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name); + dentry->d_name.name, 0); dput(parent); } @@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent); * out to all of the registered fsnotify_group. Those groups can then use the * notification event in whatever means they feel necessary.
*/ -void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name) +void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) { struct fsnotify_group *group; struct fsnotify_event *event = NULL; @@ -157,7 +157,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name); + event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/notification.c b/fs/notify/notification.c index c69b18b9aba..346f6e5c355 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,17 @@ static struct kmem_cache *fsnotify_event_holder_cachep; * get set to 0 so it will never get 'freed' */ static struct fsnotify_event q_overflow_event; +static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); + +/** + * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. + * Called from fsnotify_move, which is inlined into filesystem modules. + */ +u32 fsnotify_get_cookie(void) +{ + return atomic_inc_return(&fsnotify_sync_cookie); +} +EXPORT_SYMBOL_GPL(fsnotify_get_cookie); /* return true if the notify queue is empty, false otherwise */ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) @@ -266,6 +278,8 @@ static void initialize_event(struct fsnotify_event *event) event->file_name = NULL; event->name_len = 0; + + event->sync_cookie = 0; } /* @@ -280,8 +294,8 @@ static void initialize_event(struct fsnotify_event *event) * @data_type flag indication if the data is a file, path, inode, nothing... 
* @name the filename, if available */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_type, const char *name) +struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, + int data_type, const char *name, u32 cookie) { struct fsnotify_event *event; @@ -299,6 +313,8 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, } event->name_len = strlen(event->file_name); } + + event->sync_cookie = cookie; event->to_tell = to_tell; switch (data_type) { diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 180740e9ec8..c25b39ddd62 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -58,7 +58,7 @@ static inline void fsnotify_link_count(struct inode *inode) { inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL); - fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL); + fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -69,7 +69,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, int isdir, struct inode *target, struct dentry *moved) { struct inode *source = moved->d_inode; - u32 cookie = inotify_get_cookie(); + u32 in_cookie = inotify_get_cookie(); + u32 fs_cookie = fsnotify_get_cookie(); __u32 old_dir_mask = 0; __u32 new_dir_mask = 0; @@ -86,13 +87,13 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, old_dir_mask |= FS_MOVED_FROM; new_dir_mask |= FS_MOVED_TO; - inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name, + inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name, source); - inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name, + inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name, source); - fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name); - fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name); + fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie); + fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie); if (target) { inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL); @@ -104,7 +105,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, if (source) { inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL); - fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL); + fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); } audit_inode_child(new_name, moved, new_dir); } @@ -138,7 +139,7 @@ static inline void fsnotify_inoderemove(struct inode *inode) inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL); inotify_inode_is_dead(inode); - fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL); + fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0); __fsnotify_inode_delete(inode); } @@ -151,7 +152,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) dentry->d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); - fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name); + fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); } /* @@ -166,7 +167,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct fsnotify_link_count(inode); audit_inode_child(new_dentry->d_name.name, new_dentry, dir); - 
fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name); + fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name, 0); } /* @@ -180,7 +181,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode); audit_inode_child(dentry->d_name.name, dentry, inode); - fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name); + fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); } /* @@ -197,7 +198,7 @@ static inline void fsnotify_access(struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -214,7 +215,7 @@ static inline void fsnotify_modify(struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -231,7 +232,7 @@ static inline void fsnotify_open(struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -250,7 +251,7 @@ static inline void fsnotify_close(struct file *file) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL); + fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0); } /* @@ -267,7 +268,7 @@ static inline void fsnotify_xattr(struct dentry *dentry) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } /* @@ -303,7 +304,7 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) inotify_inode_queue_event(inode, mask, 0, NULL, NULL); fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL); + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); } } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 52692f40589..b78b5573d22 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -193,6 +193,7 @@ struct fsnotify_event { atomic_t refcnt; /* how many groups still are using/need to send this event */ __u32 mask; /* the type of access, bitwise OR for FS_* event types */ + u32 sync_cookie; /* used to corrolate events, namely inotify mv events */ char *file_name; size_t name_len; }; @@ -227,9 +228,11 @@ struct fsnotify_mark_entry { /* called from the vfs helpers */ /* main fsnotify call to send events */ -extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *name); +extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, + const char *name, u32 cookie); extern void __fsnotify_parent(struct dentry *dentry, __u32 mask); extern void __fsnotify_inode_delete(struct inode *inode); +extern u32 fsnotify_get_cookie(void); static inline int fsnotify_inode_watches_children(struct inode *inode) { @@ -322,12 +325,13 @@ extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry); /* put here because inotify does some weird stuff when destroying 
watches */ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_is, const char *name); + void *data, int data_is, const char *name, + u32 cookie); #else static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, - const char *name); + const char *name, u32 cookie) {} static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask) @@ -342,6 +346,11 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) {} +static inline u32 fsnotify_get_cookie(void) +{ + return 0; +} + #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ -- cgit v1.2.3-70-g09d2 From e4aff117368cfdd3567ee41844d216d079b55173 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:50 -0400 Subject: fsnotify: allow groups to add private data to events inotify needs per group information attached to events. This patch allows groups to attach private information and implements a callback so that information can be freed when an event is being destroyed. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/dnotify/dnotify.c | 1 + fs/notify/notification.c | 52 ++++++++++++++++++++++++++++++++++++---- include/linux/fsnotify_backend.h | 24 +++++++++++++++---- 3 files changed, 68 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index d9d80f502c6..12f9e6b1ffe 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -183,6 +183,7 @@ static struct fsnotify_ops dnotify_fsnotify_ops = { .should_send_event = dnotify_should_send_event, .free_group_priv = NULL, .freeing_mark = dnotify_freeing_mark, + .free_event_priv = NULL, }; /* diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 346f6e5c355..959b73e756f 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -90,6 +90,8 @@ void fsnotify_put_event(struct fsnotify_event *event) if (event->data_type == FSNOTIFY_EVENT_PATH) path_put(&event->path); + BUG_ON(!list_empty(&event->private_data_list)); + kfree(event->file_name); kmem_cache_free(fsnotify_event_cachep, event); } @@ -106,7 +108,29 @@ void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) } /* - * check if 2 events contain the same information. + * Find the private data that the group previously attached to this event when + * the group added the event to the notification queue (fsnotify_add_notify_event) + */ +struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) +{ + struct fsnotify_event_private_data *lpriv; + struct fsnotify_event_private_data *priv = NULL; + + assert_spin_locked(&event->lock); + + list_for_each_entry(lpriv, &event->private_data_list, event_list) { + if (lpriv->group == group) { + priv = lpriv; + list_del(&priv->event_list); + break; + } + } + return priv; +} + +/* + * Check if 2 events contain the same information. We do not compare private data + * but at this moment that isn't a problem for any know fsnotify listeners. */ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) { @@ -134,13 +158,17 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new * event off the queue to deal with. If the event is successfully added to the * group's notification queue, a reference is taken on event. 
*/ -int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event) +int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, + struct fsnotify_event_private_data *priv) { struct fsnotify_event_holder *holder = NULL; struct list_head *list = &group->notification_list; struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; + /* easy to tell if priv was attached to the event */ + INIT_LIST_HEAD(&priv->event_list); + /* * There is one fsnotify_event_holder embedded inside each fsnotify_event. * Check if we expect to be able to use that holder. If not alloc a new @@ -158,8 +186,11 @@ alloc_holder: mutex_lock(&group->notification_mutex); - if (group->q_len >= group->max_events) + if (group->q_len >= group->max_events) { event = &q_overflow_event; + /* sorry, no private data on the overflow event */ + priv = NULL; + } spin_lock(&event->lock); @@ -183,7 +214,7 @@ alloc_holder: mutex_unlock(&group->notification_mutex); if (holder != &event->holder) fsnotify_destroy_event_holder(holder); - return 0; + return -EEXIST; } } @@ -192,6 +223,8 @@ alloc_holder: fsnotify_get_event(event); list_add_tail(&holder->event_list, list); + if (priv) + list_add_tail(&priv->event_list, &event->private_data_list); spin_unlock(&event->lock); mutex_unlock(&group->notification_mutex); @@ -252,10 +285,19 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; + struct fsnotify_event_private_data *priv; mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_notify_event(group); + /* if they don't implement free_event_priv they better not have attached any */ + if (group->ops->free_event_priv) { + spin_lock(&event->lock); + priv = fsnotify_remove_priv_from_event(group, event); + spin_unlock(&event->lock); + if (priv) + group->ops->free_event_priv(priv); + } fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ } mutex_unlock(&group->notification_mutex); @@ -274,6 +316,8 @@ static void initialize_event(struct fsnotify_event *event) event->inode = NULL; event->data_type = FSNOTIFY_EVENT_NONE; + INIT_LIST_HEAD(&event->private_data_list); + event->to_tell = NULL; event->file_name = NULL; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b78b5573d22..efdf9e442d8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -63,6 +63,7 @@ struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark_entry; +struct fsnotify_event_private_data; /* * Each group much define these ops. The fsnotify infrastructure will call @@ -81,6 +82,7 @@ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); + void (*free_event_priv)(struct fsnotify_event_private_data *priv); }; /* @@ -157,6 +159,15 @@ struct fsnotify_event_holder { struct list_head event_list; }; +/* + * Inotify needs to tack data onto an event. This struct lets us later find the + * correct private data of the correct group. + */ +struct fsnotify_event_private_data { + struct fsnotify_group *group; + struct list_head event_list; +}; + /* * all of the information about the original object we want to now send to * a group. 
If you want to carry more info from the accessing task to the @@ -196,6 +207,8 @@ struct fsnotify_event { u32 sync_cookie; /* used to corrolate events, namely inotify mv events */ char *file_name; size_t name_len; + + struct list_head private_data_list; /* groups can store private data here */ }; /* @@ -294,17 +307,18 @@ extern void fsnotify_put_group(struct fsnotify_group *group); /* take a reference to an event */ extern void fsnotify_get_event(struct fsnotify_event *event); extern void fsnotify_put_event(struct fsnotify_event *event); -/* find private data previously attached to an event */ -extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group, - struct fsnotify_event *event); +/* find private data previously attached to an event and unlink it */ +extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, + struct fsnotify_event *event); /* attach the event to the group notification queue */ -extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event); +extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, + struct fsnotify_event_private_data *priv); /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group); -/* reutnr AND dequeue the first event on the notification queue */ +/* return AND dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group); /* functions used to manipulate the marks attached to inodes */ -- cgit v1.2.3-70-g09d2 From 164bc6195139047faaf5ada1278332e99494803b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:58 -0400 Subject: fsnotify: handle filesystem unmounts with fsnotify marks When an fs is unmounted with an fsnotify mark entry attached to one of its inodes we need to destroy that mark entry and we also (like inotify) send an unmount event. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/inode.c | 1 + fs/notify/inode_mark.c | 72 ++++++++++++++++++++++++++++++++++++++++ include/linux/fsnotify_backend.h | 4 +++ 3 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index 54c63ce3de2..ca337014ae2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -407,6 +407,7 @@ int invalidate_inodes(struct super_block *sb) mutex_lock(&iprune_mutex); spin_lock(&inode_lock); inotify_unmount_inodes(&sb->s_inodes); + fsnotify_unmount_inodes(&sb->s_inodes); busy = invalidate_list(&sb->s_inodes, &throw_away); spin_unlock(&inode_lock); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 282150f74cf..0a499d2c619 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -89,6 +89,7 @@ #include #include #include +#include /* for inode_lock */ #include @@ -351,3 +352,74 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, return ret; } + +/** + * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @list: list of inodes being unmounted (sb->s_inodes) + * + * Called with inode_lock held, protecting the unmounting super block's list + * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. + * We temporarily drop inode_lock, however, and CAN block. 
+ */ +void fsnotify_unmount_inodes(struct list_head *list) +{ + struct inode *inode, *next_i, *need_iput = NULL; + + list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + struct inode *need_iput_tmp; + + /* + * We cannot __iget() an inode in state I_CLEAR, I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) + continue; + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!atomic_read(&inode->i_count)) + continue; + + need_iput_tmp = need_iput; + need_iput = NULL; + + /* In case fsnotify_inode_delete() drops a reference. */ + if (inode != need_iput_tmp) + __iget(inode); + else + need_iput_tmp = NULL; + + /* In case the dropping of a reference would nuke next_i. */ + if ((&next_i->i_sb_list != list) && + atomic_read(&next_i->i_count) && + !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + + /* + * We can safely drop inode_lock here because we hold + * references on both inode and next_i. Also no new inodes + * will be added since the umount has begun. Finally, + * iprune_mutex keeps shrink_icache_memory() away. + */ + spin_unlock(&inode_lock); + + if (need_iput_tmp) + iput(need_iput_tmp); + + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + + fsnotify_inode_delete(inode); + + iput(inode); + + spin_lock(&inode_lock); + } +} diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index efdf9e442d8..d2c0ee30e61 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -336,6 +336,7 @@ extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry); extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry); extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry); +extern void fsnotify_unmount_inodes(struct list_head *list); /* put here because inotify does some weird stuff when destroying watches */ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, @@ -365,6 +366,9 @@ static inline u32 fsnotify_get_cookie(void) return 0; } +static inline void fsnotify_unmount_inodes(struct list_head *list) +{} + #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ -- cgit v1.2.3-70-g09d2 From 63c882a05416e18de6fb59f7dd6da48f3bbe8273 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:02:01 -0400 Subject: inotify: reimplement inotify using fsnotify Reimplement inotify_user using fsnotify. This should be feature for feature exactly the same as the original inotify_user. This does not make any changes to the in kernel inotify feature used by audit. Those patches (and the eventual removal of in kernel inotify) will come after the new inotify_user proves to be working correctly. 
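For reference, the userspace-visible interface that must keep behaving identically is the classic inotify fd API. A minimal test program along these lines (hypothetical, error handling mostly omitted; not part of this patch) exercises the code paths this rewrite replaces:

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	/* glibc wrapper for the inotify_init syscall below */
	int fd = inotify_init();
	int wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	/* read() blocks until the notification queue has an event */
	ssize_t len = read(fd, buf, sizeof(buf));
	struct inotify_event *ev = (struct inotify_event *)buf;

	if (len >= (ssize_t)sizeof(*ev))
		printf("wd=%d mask=%x cookie=%u len=%u\n",
		       ev->wd, ev->mask, ev->cookie, ev->len);
	/* removing the watch queues an IN_IGNORED event for wd */
	inotify_rm_watch(fd, wd);
	close(fd);
	return 0;
}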
Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- MAINTAINERS | 2 + fs/notify/inotify/Kconfig | 20 +- fs/notify/inotify/Makefile | 2 +- fs/notify/inotify/inotify.h | 21 + fs/notify/inotify/inotify_fsnotify.c | 137 ++++++ fs/notify/inotify/inotify_user.c | 837 +++++++++++++++++------------------ include/linux/fsnotify_backend.h | 11 + init/Kconfig | 3 +- 8 files changed, 585 insertions(+), 448 deletions(-) create mode 100644 fs/notify/inotify/inotify.h create mode 100644 fs/notify/inotify/inotify_fsnotify.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 96e0c8c6079..e697b67031a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2858,6 +2858,8 @@ P: John McCutchan M: john@johnmccutchan.com P: Robert Love M: rlove@rlove.org +P: Eric Paris +M: eparis@parisplace.org L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/filesystems/inotify.txt diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 44679284102..5356884289a 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -1,26 +1,30 @@ config INOTIFY bool "Inotify file change notification support" - default y + default n ---help--- - Say Y here to enable inotify support. Inotify is a file change - notification system and a replacement for dnotify. Inotify fixes - numerous shortcomings in dnotify and introduces several new features - including multiple file events, one-shot support, and unmount - notification. + Say Y here to enable legacy in kernel inotify support. Inotify is a + file change notification system. It is a replacement for dnotify. + This option only provides the legacy inotify in kernel API. There + are no in tree kernel users of this interface since it is deprecated. + You only need this if you are loading an out of tree kernel module + that uses inotify. For more information, see - If unsure, say Y. + If unsure, say N. config INOTIFY_USER bool "Inotify support for userspace" - depends on INOTIFY + depends on FSNOTIFY default y ---help--- Say Y here to enable inotify support for userspace, including the associated system calls. Inotify allows monitoring of both files and directories via a single open fd. Events are read from the file descriptor, which is also select()- and poll()-able. + Inotify fixes numerous shortcomings in dnotify and introduces several + new features including multiple file events, one-shot support, and + unmount notification. 
For more information, see diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile index e290f3bb9d8..94382817136 100644 --- a/fs/notify/inotify/Makefile +++ b/fs/notify/inotify/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_INOTIFY) += inotify.o -obj-$(CONFIG_INOTIFY_USER) += inotify_user.o +obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h new file mode 100644 index 00000000000..ea2605a58b8 --- /dev/null +++ b/fs/notify/inotify/inotify.h @@ -0,0 +1,21 @@ +#include +#include +#include /* struct kmem_cache */ + +extern struct kmem_cache *event_priv_cachep; + +struct inotify_event_private_data { + struct fsnotify_event_private_data fsnotify_event_priv_data; + int wd; +}; + +struct inotify_inode_mark_entry { + /* fsnotify_mark_entry MUST be the first thing */ + struct fsnotify_mark_entry fsn_entry; + int wd; +}; + +extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); +extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); + +extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c new file mode 100644 index 00000000000..160da548683 --- /dev/null +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -0,0 +1,137 @@ +/* + * fs/inotify_user.c - inotify support for userspace + * + * Authors: + * John McCutchan + * Robert Love + * + * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. + * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include /* struct inode */ +#include +#include +#include /* struct path */ +#include /* kmem_* */ +#include + +#include "inotify.h" + +static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) +{ + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *ientry; + struct inode *to_tell; + struct inotify_event_private_data *event_priv; + struct fsnotify_event_private_data *fsn_event_priv; + int wd, ret; + + to_tell = event->to_tell; + + spin_lock(&to_tell->i_lock); + entry = fsnotify_find_mark_entry(group, to_tell); + spin_unlock(&to_tell->i_lock); + /* race with watch removal? We already passes should_send */ + if (unlikely(!entry)) + return 0; + ientry = container_of(entry, struct inotify_inode_mark_entry, + fsn_entry); + wd = ientry->wd; + + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + if (unlikely(!event_priv)) + return -ENOMEM; + + fsn_event_priv = &event_priv->fsnotify_event_priv_data; + + fsn_event_priv->group = group; + event_priv->wd = wd; + + ret = fsnotify_add_notify_event(group, event, fsn_event_priv); + /* EEXIST is not an error */ + if (ret == -EEXIST) + ret = 0; + + /* did event_priv get attached? 
*/ + if (list_empty(&fsn_event_priv->event_list)) + inotify_free_event_priv(fsn_event_priv); + + /* + * If we hold the entry until after the event is on the queue + * IN_IGNORED won't be able to pass this event in the queue + */ + fsnotify_put_mark(entry); + + return ret; +} + +static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) +{ + inotify_destroy_mark_entry(entry, group); +} + +static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) +{ + struct fsnotify_mark_entry *entry; + bool send; + + spin_lock(&inode->i_lock); + entry = fsnotify_find_mark_entry(group, inode); + spin_unlock(&inode->i_lock); + if (!entry) + return false; + + send = (entry->mask & mask); + + /* find took a reference */ + fsnotify_put_mark(entry); + + return send; +} + +static int idr_callback(int id, void *p, void *data) +{ + BUG(); + return 0; +} + +static void inotify_free_group_priv(struct fsnotify_group *group) +{ + /* ideally the idr is empty and we won't hit the BUG in teh callback */ + idr_for_each(&group->inotify_data.idr, idr_callback, NULL); + idr_remove_all(&group->inotify_data.idr); + idr_destroy(&group->inotify_data.idr); +} + +void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +{ + struct inotify_event_private_data *event_priv; + + + event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, + fsnotify_event_priv_data); + + kmem_cache_free(event_priv_cachep, event_priv); +} + +const struct fsnotify_ops inotify_fsnotify_ops = { + .handle_event = inotify_handle_event, + .should_send_event = inotify_should_send_event, + .free_group_priv = inotify_free_group_priv, + .free_event_priv = inotify_free_event_priv, + .freeing_mark = inotify_freeing_mark, +}; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1634319e240..982a412ac5b 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -8,6 +8,9 @@ * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any @@ -19,94 +22,48 @@ * General Public License for more details. */ -#include -#include -#include -#include #include -#include -#include -#include -#include -#include +#include /* struct inode */ +#include +#include +#include /* module_init */ #include +#include /* roundup() */ +#include /* superblock magic number */ +#include /* mntget */ +#include /* LOOKUP_FOLLOW */ +#include /* struct path */ +#include /* struct user */ +#include /* struct kmem_cache */ #include -#include +#include +#include +#include +#include -#include +#include "inotify.h" -static struct kmem_cache *watch_cachep __read_mostly; -static struct kmem_cache *event_cachep __read_mostly; +#include static struct vfsmount *inotify_mnt __read_mostly; +/* this just sits here and wastes global memory. 
used to just pad userspace messages with zeros */ +static struct inotify_event nul_inotify_event; + /* these are configurable via /proc/sys/fs/inotify/ */ static int inotify_max_user_instances __read_mostly; -static int inotify_max_user_watches __read_mostly; static int inotify_max_queued_events __read_mostly; +int inotify_max_user_watches __read_mostly; -/* - * Lock ordering: - * - * inotify_dev->up_mutex (ensures we don't re-add the same watch) - * inode->inotify_mutex (protects inode's watch list) - * inotify_handle->mutex (protects inotify_handle's watch list) - * inotify_dev->ev_mutex (protects device's event queue) - */ +static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *event_priv_cachep __read_mostly; +static struct fsnotify_event *inotify_ignored_event; /* - * Lifetimes of the main data structures: - * - * inotify_device: Lifetime is managed by reference count, from - * sys_inotify_init() until release. Additional references can bump the count - * via get_inotify_dev() and drop the count via put_inotify_dev(). - * - * inotify_user_watch: Lifetime is from create_watch() to the receipt of an - * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the - * first event, or to inotify_destroy(). + * When inotify registers a new group it increments this and uses that + * value as an offset to set the fsnotify group "name" and priority. */ - -/* - * struct inotify_device - represents an inotify instance - * - * This structure is protected by the mutex 'mutex'. - */ -struct inotify_device { - wait_queue_head_t wq; /* wait queue for i/o */ - struct mutex ev_mutex; /* protects event queue */ - struct mutex up_mutex; /* synchronizes watch updates */ - struct list_head events; /* list of queued events */ - struct user_struct *user; /* user who opened this dev */ - struct inotify_handle *ih; /* inotify handle */ - struct fasync_struct *fa; /* async notification */ - atomic_t count; /* reference count */ - unsigned int queue_size; /* size of the queue (bytes) */ - unsigned int event_count; /* number of pending events */ - unsigned int max_events; /* maximum number of events */ -}; - -/* - * struct inotify_kernel_event - An inotify event, originating from a watch and - * queued for user-space. A list of these is attached to each instance of the - * device. In read(), this list is walked and all events that can fit in the - * buffer are returned. - * - * Protected by dev->ev_mutex of the device in which we are queued. - */ -struct inotify_kernel_event { - struct inotify_event event; /* the user-space event */ - struct list_head list; /* entry in inotify_device's list */ - char *name; /* filename, if any */ -}; - -/* - * struct inotify_user_watch - our version of an inotify_watch, we add - * a reference to the associated inotify_device. 
- */ -struct inotify_user_watch { - struct inotify_device *dev; /* associated device */ - struct inotify_watch wdata; /* inotify watch data */ -}; +static atomic_t inotify_grp_num; #ifdef CONFIG_SYSCTL @@ -149,280 +106,36 @@ ctl_table inotify_table[] = { }; #endif /* CONFIG_SYSCTL */ -static inline void get_inotify_dev(struct inotify_device *dev) -{ - atomic_inc(&dev->count); -} - -static inline void put_inotify_dev(struct inotify_device *dev) -{ - if (atomic_dec_and_test(&dev->count)) { - atomic_dec(&dev->user->inotify_devs); - free_uid(dev->user); - kfree(dev); - } -} - -/* - * free_inotify_user_watch - cleans up the watch and its references - */ -static void free_inotify_user_watch(struct inotify_watch *w) -{ - struct inotify_user_watch *watch; - struct inotify_device *dev; - - watch = container_of(w, struct inotify_user_watch, wdata); - dev = watch->dev; - - atomic_dec(&dev->user->inotify_watches); - put_inotify_dev(dev); - kmem_cache_free(watch_cachep, watch); -} - -/* - * kernel_event - create a new kernel event with the given parameters - * - * This function can sleep. - */ -static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, - const char *name) -{ - struct inotify_kernel_event *kevent; - - kevent = kmem_cache_alloc(event_cachep, GFP_NOFS); - if (unlikely(!kevent)) - return NULL; - - /* we hand this out to user-space, so zero it just in case */ - memset(&kevent->event, 0, sizeof(struct inotify_event)); - - kevent->event.wd = wd; - kevent->event.mask = mask; - kevent->event.cookie = cookie; - - INIT_LIST_HEAD(&kevent->list); - - if (name) { - size_t len, rem, event_size = sizeof(struct inotify_event); - - /* - * We need to pad the filename so as to properly align an - * array of inotify_event structures. Because the structure is - * small and the common case is a small filename, we just round - * up to the next multiple of the structure's sizeof. This is - * simple and safe for all architectures. - */ - len = strlen(name) + 1; - rem = event_size - len; - if (len > event_size) { - rem = event_size - (len % event_size); - if (len % event_size == 0) - rem = 0; - } - - kevent->name = kmalloc(len + rem, GFP_NOFS); - if (unlikely(!kevent->name)) { - kmem_cache_free(event_cachep, kevent); - return NULL; - } - memcpy(kevent->name, name, len); - if (rem) - memset(kevent->name + len, 0, rem); - kevent->event.len = len + rem; - } else { - kevent->event.len = 0; - kevent->name = NULL; - } - - return kevent; -} - -/* - * inotify_dev_get_event - return the next event in the given dev's queue - * - * Caller must hold dev->ev_mutex. - */ -static inline struct inotify_kernel_event * -inotify_dev_get_event(struct inotify_device *dev) -{ - return list_entry(dev->events.next, struct inotify_kernel_event, list); -} - -/* - * inotify_dev_get_last_event - return the last event in the given dev's queue - * - * Caller must hold dev->ev_mutex. - */ -static inline struct inotify_kernel_event * -inotify_dev_get_last_event(struct inotify_device *dev) +static inline __u32 inotify_arg_to_mask(u32 arg) { - if (list_empty(&dev->events)) - return NULL; - return list_entry(dev->events.prev, struct inotify_kernel_event, list); -} + __u32 mask; -/* - * inotify_dev_queue_event - event handler registered with core inotify, adds - * a new event to the given device - * - * Can sleep (calls kernel_event()). 
- */ -static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask, - u32 cookie, const char *name, - struct inode *ignored) -{ - struct inotify_user_watch *watch; - struct inotify_device *dev; - struct inotify_kernel_event *kevent, *last; + /* everything should accept their own ignored and cares about children */ + mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); - watch = container_of(w, struct inotify_user_watch, wdata); - dev = watch->dev; + /* mask off the flags used to open the fd */ + mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); - mutex_lock(&dev->ev_mutex); - - /* we can safely put the watch as we don't reference it while - * generating the event - */ - if (mask & IN_IGNORED || w->mask & IN_ONESHOT) - put_inotify_watch(w); /* final put */ - - /* coalescing: drop this event if it is a dupe of the previous */ - last = inotify_dev_get_last_event(dev); - if (last && last->event.mask == mask && last->event.wd == wd && - last->event.cookie == cookie) { - const char *lastname = last->name; - - if (!name && !lastname) - goto out; - if (name && lastname && !strcmp(lastname, name)) - goto out; - } - - /* the queue overflowed and we already sent the Q_OVERFLOW event */ - if (unlikely(dev->event_count > dev->max_events)) - goto out; - - /* if the queue overflows, we need to notify user space */ - if (unlikely(dev->event_count == dev->max_events)) - kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL); - else - kevent = kernel_event(wd, mask, cookie, name); - - if (unlikely(!kevent)) - goto out; - - /* queue the event and wake up anyone waiting */ - dev->event_count++; - dev->queue_size += sizeof(struct inotify_event) + kevent->event.len; - list_add_tail(&kevent->list, &dev->events); - wake_up_interruptible(&dev->wq); - kill_fasync(&dev->fa, SIGIO, POLL_IN); - -out: - mutex_unlock(&dev->ev_mutex); -} - -/* - * remove_kevent - cleans up the given kevent - * - * Caller must hold dev->ev_mutex. - */ -static void remove_kevent(struct inotify_device *dev, - struct inotify_kernel_event *kevent) -{ - list_del(&kevent->list); - - dev->event_count--; - dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; -} - -/* - * free_kevent - frees the given kevent. - */ -static void free_kevent(struct inotify_kernel_event *kevent) -{ - kfree(kevent->name); - kmem_cache_free(event_cachep, kevent); -} - -/* - * inotify_dev_event_dequeue - destroy an event on the given device - * - * Caller must hold dev->ev_mutex. - */ -static void inotify_dev_event_dequeue(struct inotify_device *dev) -{ - if (!list_empty(&dev->events)) { - struct inotify_kernel_event *kevent; - kevent = inotify_dev_get_event(dev); - remove_kevent(dev, kevent); - free_kevent(kevent); - } -} - -/* - * find_inode - resolve a user-given path to a specific inode - */ -static int find_inode(const char __user *dirname, struct path *path, - unsigned flags) -{ - int error; - - error = user_path_at(AT_FDCWD, dirname, flags, path); - if (error) - return error; - /* you can only watch an inode if you have read permissions on it */ - error = inode_permission(path->dentry->d_inode, MAY_READ); - if (error) - path_put(path); - return error; + return mask; } -/* - * create_watch - creates a watch on the given device. - * - * Callers must hold dev->up_mutex. 
- */ -static int create_watch(struct inotify_device *dev, struct inode *inode, - u32 mask) +static inline u32 inotify_mask_to_arg(__u32 mask) { - struct inotify_user_watch *watch; - int ret; - - if (atomic_read(&dev->user->inotify_watches) >= - inotify_max_user_watches) - return -ENOSPC; - - watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL); - if (unlikely(!watch)) - return -ENOMEM; - - /* save a reference to device and bump the count to make it official */ - get_inotify_dev(dev); - watch->dev = dev; - - atomic_inc(&dev->user->inotify_watches); - - inotify_init_watch(&watch->wdata); - ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); - if (ret < 0) - free_inotify_user_watch(&watch->wdata); - - return ret; + return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | + IN_Q_OVERFLOW); } -/* Device Interface */ - +/* intofiy userspace file descriptor functions */ static unsigned int inotify_poll(struct file *file, poll_table *wait) { - struct inotify_device *dev = file->private_data; + struct fsnotify_group *group = file->private_data; int ret = 0; - poll_wait(file, &dev->wq, wait); - mutex_lock(&dev->ev_mutex); - if (!list_empty(&dev->events)) + poll_wait(file, &group->notification_waitq, wait); + mutex_lock(&group->notification_mutex); + if (!fsnotify_notify_queue_is_empty(group)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&dev->ev_mutex); + mutex_unlock(&group->notification_mutex); return ret; } @@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) * enough to fit in "count". Return an error pointer if * not large enough. * - * Called with the device ev_mutex held. + * Called with the group->notification_mutex held. */ -static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, - size_t count) +static struct fsnotify_event *get_one_event(struct fsnotify_group *group, + size_t count) { size_t event_size = sizeof(struct inotify_event); - struct inotify_kernel_event *kevent; + struct fsnotify_event *event; - if (list_empty(&dev->events)) + if (fsnotify_notify_queue_is_empty(group)) return NULL; - kevent = inotify_dev_get_event(dev); - if (kevent->name) - event_size += kevent->event.len; + event = fsnotify_peek_notify_event(group); + + event_size += roundup(event->name_len, event_size); if (event_size > count) return ERR_PTR(-EINVAL); - remove_kevent(dev, kevent); - return kevent; + /* held the notification_mutex the whole time, so this is the + * same event we peeked above */ + fsnotify_remove_notify_event(group); + + return event; } /* @@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, * We already checked that the event size is smaller than the * buffer we had in "get_one_event()" above. 
*/ -static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, +static ssize_t copy_event_to_user(struct fsnotify_group *group, + struct fsnotify_event *event, char __user *buf) { + struct inotify_event inotify_event; + struct fsnotify_event_private_data *fsn_priv; + struct inotify_event_private_data *priv; size_t event_size = sizeof(struct inotify_event); + size_t name_len; + + /* we get the inotify watch descriptor from the event private data */ + spin_lock(&event->lock); + fsn_priv = fsnotify_remove_priv_from_event(group, event); + spin_unlock(&event->lock); + + if (!fsn_priv) + inotify_event.wd = -1; + else { + priv = container_of(fsn_priv, struct inotify_event_private_data, + fsnotify_event_priv_data); + inotify_event.wd = priv->wd; + inotify_free_event_priv(fsn_priv); + } + + /* round up event->name_len so it is a multiple of event_size */ + name_len = roundup(event->name_len, event_size); + inotify_event.len = name_len; + + inotify_event.mask = inotify_mask_to_arg(event->mask); + inotify_event.cookie = event->sync_cookie; - if (copy_to_user(buf, &kevent->event, event_size)) + /* send the main event */ + if (copy_to_user(buf, &inotify_event, event_size)) return -EFAULT; - if (kevent->name) { - buf += event_size; + buf += event_size; - if (copy_to_user(buf, kevent->name, kevent->event.len)) + /* + * fsnotify only stores the pathname, so here we have to send the pathname + * and then pad that pathname out to a multiple of sizeof(inotify_event) + * with zeros. I get my zeros from the nul_inotify_event. + */ + if (name_len) { + unsigned int len_to_zero = name_len - event->name_len; + /* copy the path name */ + if (copy_to_user(buf, event->file_name, event->name_len)) return -EFAULT; + buf += event->name_len; - event_size += kevent->event.len; + /* fill userspace with 0's from nul_inotify_event */ + if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) + return -EFAULT; + buf += len_to_zero; + event_size += name_len; } + return event_size; } static ssize_t inotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { - struct inotify_device *dev; + struct fsnotify_group *group; + struct fsnotify_event *kevent; char __user *start; int ret; DEFINE_WAIT(wait); start = buf; - dev = file->private_data; + group = file->private_data; while (1) { - struct inotify_kernel_event *kevent; + prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); - prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); - - mutex_lock(&dev->ev_mutex); - kevent = get_one_event(dev, count); - mutex_unlock(&dev->ev_mutex); + mutex_lock(&group->notification_mutex); + kevent = get_one_event(group, count); + mutex_unlock(&group->notification_mutex); if (kevent) { ret = PTR_ERR(kevent); if (IS_ERR(kevent)) break; - ret = copy_event_to_user(kevent, buf); - free_kevent(kevent); + ret = copy_event_to_user(group, kevent, buf); + fsnotify_put_event(kevent); if (ret < 0) break; buf += ret; @@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, schedule(); } - finish_wait(&dev->wq, &wait); + finish_wait(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; @@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf, static int inotify_fasync(int fd, struct file *file, int on) { - struct inotify_device *dev = file->private_data; + struct fsnotify_group *group = file->private_data; - return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 
0 : -EIO; + return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO; } static int inotify_release(struct inode *ignored, struct file *file) { - struct inotify_device *dev = file->private_data; - - inotify_destroy(dev->ih); + struct fsnotify_group *group = file->private_data; - /* destroy all of the events on this device */ - mutex_lock(&dev->ev_mutex); - while (!list_empty(&dev->events)) - inotify_dev_event_dequeue(dev); - mutex_unlock(&dev->ev_mutex); + fsnotify_clear_marks_by_group(group); - /* free this device: the put matching the get in inotify_init() */ - put_inotify_dev(dev); + /* free this group, matching get was inotify_init->fsnotify_obtain_group */ + fsnotify_put_group(group); return 0; } @@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file) static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inotify_device *dev; + struct fsnotify_group *group; + struct fsnotify_event_holder *holder; + struct fsnotify_event *event; void __user *p; int ret = -ENOTTY; + size_t send_len = 0; - dev = file->private_data; + group = file->private_data; p = (void __user *) arg; switch (cmd) { case FIONREAD: - ret = put_user(dev->queue_size, (int __user *) p); + mutex_lock(&group->notification_mutex); + list_for_each_entry(holder, &group->notification_list, event_list) { + event = holder->event; + send_len += sizeof(struct inotify_event); + send_len += roundup(event->name_len, + sizeof(struct inotify_event)); + } + mutex_unlock(&group->notification_mutex); + ret = put_user(send_len, (int __user *) p); break; } @@ -576,23 +336,233 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, } static const struct file_operations inotify_fops = { - .poll = inotify_poll, - .read = inotify_read, - .fasync = inotify_fasync, - .release = inotify_release, - .unlocked_ioctl = inotify_ioctl, + .poll = inotify_poll, + .read = inotify_read, + .fasync = inotify_fasync, + .release = inotify_release, + .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, }; -static const struct inotify_operations inotify_user_ops = { - .handle_event = inotify_dev_queue_event, - .destroy_watch = free_inotify_user_watch, -}; +/* + * find_inode - resolve a user-given path to a specific inode + */ +static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) +{ + int error; + + error = user_path_at(AT_FDCWD, dirname, flags, path); + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ + error = inode_permission(path->dentry->d_inode, MAY_READ); + if (error) + path_put(path); + return error; +} + +/* + * When, for whatever reason, inotify is done with a mark (or what used to be a + * watch) we need to remove that watch from the idr and we need to send IN_IGNORED + * for the given wd. + * + * There is a bit of recursion here. The loop looks like: + * inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry -> + * inotify_freeing_mark -> inotify_destory_mark_entry -> restart + * But the loop is broken in 2 places. fsnotify_destroy_mark_by_entry sets + * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup) + * test below will not call back to fsnotify again. But even if that test wasn't + * there this would still be safe since fsnotify_destroy_mark_by_entry() is + * safe from recursion. 
+ */ +void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) +{ + struct inotify_inode_mark_entry *ientry; + struct inotify_event_private_data *event_priv; + struct fsnotify_event_private_data *fsn_event_priv; + struct fsnotify_group *egroup; + struct idr *idr; + + spin_lock(&entry->lock); + egroup = entry->group; + + /* if egroup we aren't really done and something might still send events + * for this inode, on the callback we'll send the IN_IGNORED */ + if (egroup) { + spin_unlock(&entry->lock); + fsnotify_destroy_mark_by_entry(entry); + return; + } + spin_unlock(&entry->lock); + + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + if (unlikely(!event_priv)) + goto skip_send_ignore; + + fsn_event_priv = &event_priv->fsnotify_event_priv_data; + + fsn_event_priv->group = group; + event_priv->wd = ientry->wd; + + fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); + + /* did the private data get added? */ + if (list_empty(&fsn_event_priv->event_list)) + inotify_free_event_priv(fsn_event_priv); + +skip_send_ignore: + + /* remove this entry from the idr */ + spin_lock(&group->inotify_data.idr_lock); + idr = &group->inotify_data.idr; + idr_remove(idr, ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + + /* removed from idr, drop that reference */ + fsnotify_put_mark(entry); +} + +/* ding dong the mark is dead */ +static void inotify_free_mark(struct fsnotify_mark_entry *entry) +{ + struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; + + kmem_cache_free(inotify_inode_mark_cachep, ientry); +} + +static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +{ + struct fsnotify_mark_entry *entry = NULL; + struct inotify_inode_mark_entry *ientry; + int ret = 0; + int add = (arg & IN_MASK_ADD); + __u32 mask; + __u32 old_mask, new_mask; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!mask)) + return -EINVAL; + + ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!ientry)) + return -ENOMEM; + /* we set the mask at the end after attaching it */ + fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); + ientry->wd = 0; + +find_entry: + spin_lock(&inode->i_lock); + entry = fsnotify_find_mark_entry(group, inode); + spin_unlock(&inode->i_lock); + if (entry) { + kmem_cache_free(inotify_inode_mark_cachep, ientry); + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + } else { + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { + ret = -ENOSPC; + goto out_err; + } + + ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); + if (ret == -EEXIST) + goto find_entry; + else if (ret) + goto out_err; + + entry = &ientry->fsn_entry; +retry: + ret = -ENOMEM; + if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) + goto out_err; + + spin_lock(&group->inotify_data.idr_lock); + /* if entry is added to the idr we keep the reference obtained + * through fsnotify_mark_add. 
remember to drop this reference + * when entry is removed from idr */ + ret = idr_get_new_above(&group->inotify_data.idr, entry, + ++group->inotify_data.last_wd, + &ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + if (ret) { + if (ret == -EAGAIN) + goto retry; + goto out_err; + } + atomic_inc(&group->inotify_data.user->inotify_watches); + } + + spin_lock(&entry->lock); + + old_mask = entry->mask; + if (add) { + entry->mask |= mask; + new_mask = entry->mask; + } else { + entry->mask = mask; + new_mask = entry->mask; + } + + spin_unlock(&entry->lock); + + if (old_mask != new_mask) { + /* more bits in old than in new? */ + int dropped = (old_mask & ~new_mask); + /* more bits in this entry than the inode's mask? */ + int do_inode = (new_mask & ~inode->i_fsnotify_mask); + /* more bits in this entry than the group? */ + int do_group = (new_mask & ~group->mask); + + /* update the inode with this new entry */ + if (dropped || do_inode) + fsnotify_recalc_inode_mask(inode); + + /* update the group mask with the new mask */ + if (dropped || do_group) + fsnotify_recalc_group_mask(group); + } + + return ientry->wd; + +out_err: + /* see this isn't supposed to happen, just kill the watch */ + if (entry) { + fsnotify_destroy_mark_by_entry(entry); + fsnotify_put_mark(entry); + } + return ret; +} + +static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) +{ + struct fsnotify_group *group; + unsigned int grp_num; + + /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ + grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num)); + group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops); + if (IS_ERR(group)) + return group; + + group->max_events = max_events; + + spin_lock_init(&group->inotify_data.idr_lock); + idr_init(&group->inotify_data.idr); + group->inotify_data.last_wd = 0; + group->inotify_data.user = user; + group->inotify_data.fa = NULL; + + return group; +} + + +/* inotify syscalls */ SYSCALL_DEFINE1(inotify_init1, int, flags) { - struct inotify_device *dev; - struct inotify_handle *ih; + struct fsnotify_group *group; struct user_struct *user; struct file *filp; int fd, ret; @@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) goto out_free_uid; } - dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); - if (unlikely(!dev)) { - ret = -ENOMEM; + /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ + group = inotify_new_group(user, inotify_max_queued_events); + if (IS_ERR(group)) { + ret = PTR_ERR(group); goto out_free_uid; } - ih = inotify_init(&inotify_user_ops); - if (IS_ERR(ih)) { - ret = PTR_ERR(ih); - goto out_free_dev; - } - dev->ih = ih; - dev->fa = NULL; - filp->f_op = &inotify_fops; filp->f_path.mnt = mntget(inotify_mnt); filp->f_path.dentry = dget(inotify_mnt->mnt_root); filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; filp->f_mode = FMODE_READ; filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); - filp->private_data = dev; - - INIT_LIST_HEAD(&dev->events); - init_waitqueue_head(&dev->wq); - mutex_init(&dev->ev_mutex); - mutex_init(&dev->up_mutex); - dev->event_count = 0; - dev->queue_size = 0; - dev->max_events = inotify_max_queued_events; - dev->user = user; - atomic_set(&dev->count, 0); - - get_inotify_dev(dev); + filp->private_data = group; + atomic_inc(&user->inotify_devs); + fd_install(fd, filp); return fd; -out_free_dev: - kfree(dev); + out_free_uid: free_uid(user); put_filp(filp); @@ -676,8 
+628,8 @@ SYSCALL_DEFINE0(inotify_init) SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, u32, mask) { + struct fsnotify_group *group; struct inode *inode; - struct inotify_device *dev; struct path path; struct file *filp; int ret, fput_needed; @@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; - ret = find_inode(pathname, &path, flags); - if (unlikely(ret)) + ret = inotify_find_inode(pathname, &path, flags); + if (ret) goto fput_and_out; - /* inode held in place by reference to path; dev by fget on fd */ + /* inode held in place by reference to path; group by fget on fd */ inode = path.dentry->d_inode; - dev = filp->private_data; + group = filp->private_data; - mutex_lock(&dev->up_mutex); - ret = inotify_find_update_watch(dev->ih, inode, mask); - if (ret == -ENOENT) - ret = create_watch(dev, inode, mask); - mutex_unlock(&dev->up_mutex); + /* create/update an inode mark */ + ret = inotify_update_watch(group, inode, mask); + if (unlikely(ret)) + goto path_put_and_out; +path_put_and_out: path_put(&path); fput_and_out: fput_light(filp, fput_needed); @@ -720,9 +672,10 @@ fput_and_out: SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { + struct fsnotify_group *group; + struct fsnotify_mark_entry *entry; struct file *filp; - struct inotify_device *dev; - int ret, fput_needed; + int ret = 0, fput_needed; filp = fget_light(fd, &fput_needed); if (unlikely(!filp)) @@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) goto out; } - dev = filp->private_data; + group = filp->private_data; - /* we free our watch data when we get IN_IGNORED */ - ret = inotify_rm_wd(dev->ih, wd); + spin_lock(&group->inotify_data.idr_lock); + entry = idr_find(&group->inotify_data.idr, wd); + if (unlikely(!entry)) { + spin_unlock(&group->inotify_data.idr_lock); + ret = -EINVAL; + goto out; + } + fsnotify_get_mark(entry); + spin_unlock(&group->inotify_data.idr_lock); + + inotify_destroy_mark_entry(entry, group); + fsnotify_put_mark(entry); out: fput_light(filp, fput_needed); @@ -753,9 +716,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags, } static struct file_system_type inotify_fs_type = { - .name = "inotifyfs", - .get_sb = inotify_get_sb, - .kill_sb = kill_anon_super, + .name = "inotifyfs", + .get_sb = inotify_get_sb, + .kill_sb = kill_anon_super, }; /* @@ -775,18 +738,16 @@ static int __init inotify_user_setup(void) if (IS_ERR(inotify_mnt)) panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); + event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); + inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); + if (!inotify_ignored_event) + panic("unable to allocate the inotify ignored event\n"); + inotify_max_queued_events = 16384; inotify_max_user_instances = 128; inotify_max_user_watches = 8192; - watch_cachep = kmem_cache_create("inotify_watch_cache", - sizeof(struct inotify_user_watch), - 0, SLAB_PANIC, NULL); - event_cachep = kmem_cache_create("inotify_event_cache", - sizeof(struct inotify_kernel_event), - 0, SLAB_PANIC, NULL); - return 0; } - module_init(inotify_user_setup); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index d2c0ee30e61..44848aa830d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -9,6 +9,7 @@ #ifdef __KERNEL__ 
+#include /* inotify uses this */ #include /* struct inode */ #include #include /* struct path */ @@ -59,6 +60,7 @@ /* listeners that hard code group numbers near the top */ #define DNOTIFY_GROUP_NUM UINT_MAX +#define INOTIFY_GROUP_NUM (DNOTIFY_GROUP_NUM-1) struct fsnotify_group; struct fsnotify_event; @@ -141,6 +143,15 @@ struct fsnotify_group { /* groups can define private fields here or use the void *private */ union { void *private; +#ifdef CONFIG_INOTIFY_USER + struct inotify_group_private_data { + spinlock_t idr_lock; + struct idr idr; + u32 last_wd; + struct fasync_struct *fa; /* async notification */ + struct user_struct *user; + } inotify_data; +#endif }; }; diff --git a/init/Kconfig b/init/Kconfig index d4e9671347e..5de1c17c51e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -302,7 +302,8 @@ config AUDITSYSCALL config AUDIT_TREE def_bool y - depends on AUDITSYSCALL && INOTIFY + depends on AUDITSYSCALL + select INOTIFY menu "RCU Subsystem" -- cgit v1.2.3-70-g09d2 From ff52cc2158b32b3b979ca7802b1fd7c70f36e13c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 11:09:47 -0400 Subject: fsnotify: move events should indicate the event was on a child fsnotify tells its listeners explicitly when an event happened on the given inode versus on the child of the given inode (see __fsnotify_parent). However, the semantics of fsnotify_move() are such that we deliver events directly to the two parent directories in question (old_dir and new_dir) without using the __fsnotify_parent() call. fsnotify should be adding FS_EVENT_ON_CHILD for the notifications to these parents. Signed-off-by: Eric Paris --- include/linux/fsnotify.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index c25b39ddd62..936f9aa8bb9 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -71,12 +71,11 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, struct inode *source = moved->d_inode; u32 in_cookie = inotify_get_cookie(); u32 fs_cookie = fsnotify_get_cookie(); - __u32 old_dir_mask = 0; - __u32 new_dir_mask = 0; + __u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM); + __u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO); - if (old_dir == new_dir) { - old_dir_mask = FS_DN_RENAME; - } + if (old_dir == new_dir) + old_dir_mask |= FS_DN_RENAME; if (isdir) { isdir = IN_ISDIR; @@ -84,9 +83,6 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, new_dir_mask |= FS_IN_ISDIR; } - old_dir_mask |= FS_MOVED_FROM; - new_dir_mask |= FS_MOVED_TO; - inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name, source); inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name, -- cgit v1.2.3-70-g09d2 From 04846b5b8112e53b588038349b3e92b8485c1807 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 20 Apr 2009 10:54:52 +0900 Subject: PCI MSI: Remove unused/obsolete macros and definitions Impact: cleanup, spec compliance This patch does: - Remove unused msi/msix_enable/disable macros. Users should use the msi/msix_set_enable() functions instead. - Remove unused msix_mask/unmask/pending macros. These macros are useless because they are not properly based on any of the PCI Local Bus Specifications. It seems that they were written based on a draft of the PCI spec, and that the draft was the MSI-X ECN that underwent membership review in September 2002.
(* In the draft, the size of an entry in the MSI-X table was 64bit, containing 32bit message data and a DWORD-aligned lower address plus a pending bit and a mask bit (30+1+1 bits). The higher address was placed in the MSI-X capability structure and shared by all entries.) - Remove PCI_MSIX_FLAGS_BITMASK. This definition also comes from the draft ECN. Signed-off-by: Hidetoshi Seto Reviewed-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.h | 8 +------- include/linux/pci_regs.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h index 71f4df2ef65..4fed5926195 100644 --- a/drivers/pci/msi.h +++ b/drivers/pci/msi.h @@ -19,18 +19,12 @@ ( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 ) #define msi_mask_bits_reg(base, is64bit) \ ( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4) -#define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) #define msix_table_offset_reg(base) (base + 0x04) #define msix_pba_offset_reg(base) (base + 0x08) -#define msix_enable(control) control |= PCI_MSIX_FLAGS_ENABLE -#define msix_disable(control) control &= ~PCI_MSIX_FLAGS_ENABLE #define msix_table_size(control) ((control & PCI_MSIX_FLAGS_QSIZE)+1) -#define multi_msix_capable msix_table_size -#define msix_unmask(address) (address & ~PCI_MSIX_FLAGS_BITMASK) -#define msix_mask(address) (address | PCI_MSIX_FLAGS_BITMASK) -#define msix_is_pending(address) (address & PCI_MSIX_FLAGS_PENDMASK) +#define multi_msix_capable(control) msix_table_size((control)) #endif /* MSI_H */ diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index 616bf8b3c8b..dcba7668e0c 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -304,7 +304,6 @@ #define PCI_MSIX_FLAGS_ENABLE (1 << 15) #define PCI_MSIX_FLAGS_MASKALL (1 << 14) #define PCI_MSIX_FLAGS_BIRMASK (7 << 0) -#define PCI_MSIX_FLAGS_BITMASK (1 << 0) /* CompactPCI Hotswap Register */ -- cgit v1.2.3-70-g09d2 From 67b5db6502ddd27d65dea43bf036abbd82d0dfc9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 20 Apr 2009 10:54:59 +0900 Subject: PCI MSI: Define PCI_MSI_MASK_32/64 Impact: cleanup, improve readability Define PCI_MSI_MASK_32/64 for 32/64bit devices, instead of using implicit offset (-4), "PCI_MSI_MASK_BIT - 4" and "PCI_MSI_MASK_BIT".
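(Illustrative sketch, not part of the patch: with explicit PCI_MSI_MASK_32/64 offsets, the mask register location falls out directly from the capability base, which is what the msi_mask_reg() macro below encodes; the helper name here is hypothetical.)

	/* Sketch only: PCI_MSI_MASK_32 (12) and PCI_MSI_MASK_64 (16) are
	 * the offsets this patch defines in pci_regs.h. */
	static inline int msi_mask_offset(int cap_base, int is_64bit)
	{
		return cap_base + (is_64bit ? PCI_MSI_MASK_64 : PCI_MSI_MASK_32);
	}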
Signed-off-by: Hidetoshi Seto Reviewed-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 2 +- drivers/pci/msi.h | 6 +++--- include/linux/pci_regs.h | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 362773247fb..7ffac27d5d4 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -381,7 +381,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec) entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ entry->msi_attrib.pos = pos; - entry->mask_pos = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); + entry->mask_pos = msi_mask_reg(pos, entry->msi_attrib.is_64); /* All MSIs are unmasked by default, Mask them all */ if (entry->msi_attrib.maskbit) pci_read_config_dword(dev, entry->mask_pos, &entry->masked); diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h index 4fed5926195..a0662842550 100644 --- a/drivers/pci/msi.h +++ b/drivers/pci/msi.h @@ -16,9 +16,9 @@ #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) #define msi_data_reg(base, is64bit) \ - ( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 ) -#define msi_mask_bits_reg(base, is64bit) \ - ( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4) + (base + ((is64bit == 1) ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32)) +#define msi_mask_reg(base, is64bit) \ + (base + ((is64bit == 1) ? PCI_MSI_MASK_64 : PCI_MSI_MASK_32)) #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index dcba7668e0c..83b02f5a25b 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -295,8 +295,9 @@ #define PCI_MSI_ADDRESS_LO 4 /* Lower 32 bits */ #define PCI_MSI_ADDRESS_HI 8 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */ #define PCI_MSI_DATA_32 8 /* 16 bits of data for 32-bit devices */ +#define PCI_MSI_MASK_32 12 /* Mask bits register for 32-bit devices */ #define PCI_MSI_DATA_64 12 /* 16 bits of data for 64-bit devices */ -#define PCI_MSI_MASK_BIT 16 /* Mask bits register */ +#define PCI_MSI_MASK_64 16 /* Mask bits register for 64-bit devices */ /* MSI-X registers (these are at offset PCI_MSIX_FLAGS) */ #define PCI_MSIX_FLAGS 2 -- cgit v1.2.3-70-g09d2 From 1f82de10d6b1d845155363c895c552e61b36b51a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 23 Apr 2009 20:48:32 -0700 Subject: PCI/x86: don't assume prefetchable ranges are 64bit We should not assign 64bit ranges to PCI devices that only take 32bit prefetchable addresses. Try to set IORESOURCE_MEM_64 in 64bit resource of pci_device/pci_bridge and make the bus resource only have that bit set when all devices under it support 64bit prefetchable memory. Use that flag to allocate resources from that range. 
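(Illustrative sketch, not from the patch: the invariant behind the mem64_mask accumulation in pbus_size_mem() below is that a bridge window may keep IORESOURCE_MEM_64 only if the window itself and every child resource carry the bit; the helper name is hypothetical.)

	/* Sketch: a window is 64bit-capable iff all children are. */
	static unsigned long window_mem64_flag(unsigned long window_flags,
					       const unsigned long *child_flags,
					       int nr_children)
	{
		unsigned long mask = window_flags & IORESOURCE_MEM_64;
		int i;

		for (i = 0; i < nr_children; i++)
			mask &= child_flags[i] & IORESOURCE_MEM_64;
		return mask;
	}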
Reported-by: Yannick Reviewed-by: Ivan Kokshaysky Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci.h | 1 + drivers/pci/bus.c | 7 ++++++- drivers/pci/probe.c | 9 ++++++-- drivers/pci/setup-bus.c | 52 +++++++++++++++++++++++++++++++++++++--------- include/linux/ioport.h | 2 ++ include/linux/pci.h | 4 ++++ 6 files changed, 62 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b51a1e8b0ba..927958d13c1 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -130,6 +130,7 @@ extern void pci_iommu_alloc(void); /* generic pci stuff */ #include +#define PCIBIOS_MAX_MEM_32 0xffffffff #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 97a8194063b..40af27f3104 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -41,9 +41,14 @@ pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res, void *alignf_data) { int i, ret = -ENOMEM; + resource_size_t max = -1; type_mask |= IORESOURCE_IO | IORESOURCE_MEM; + /* don't allocate too high if the pref mem doesn't support 64bit*/ + if (!(res->flags & IORESOURCE_MEM_64)) + max = PCIBIOS_MAX_MEM_32; + for (i = 0; i < PCI_BUS_NUM_RESOURCES; i++) { struct resource *r = bus->resource[i]; if (!r) @@ -62,7 +67,7 @@ pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res, /* Ok, try it out.. */ ret = allocate_resource(r, res, size, r->start ? : min, - -1, align, + max, align, alignf, alignf_data); if (ret == 0) break; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index f1ae2475fff..b962326e3d9 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -193,7 +193,7 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, res->flags |= pci_calc_resource_flags(l) | IORESOURCE_SIZEALIGN; if (type == pci_bar_io) { l &= PCI_BASE_ADDRESS_IO_MASK; - mask = PCI_BASE_ADDRESS_IO_MASK & 0xffff; + mask = PCI_BASE_ADDRESS_IO_MASK & IO_SPACE_LIMIT; } else { l &= PCI_BASE_ADDRESS_MEM_MASK; mask = (u32)PCI_BASE_ADDRESS_MEM_MASK; @@ -237,6 +237,8 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, dev_printk(KERN_DEBUG, &dev->dev, "reg %x 64bit mmio: %pR\n", pos, res); } + + res->flags |= IORESOURCE_MEM_64; } else { sz = pci_size(l, sz, mask); @@ -362,7 +364,10 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child) } } if (base <= limit) { - res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM | IORESOURCE_PREFETCH; + res->flags = (mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) | + IORESOURCE_MEM | IORESOURCE_PREFETCH; + if (res->flags & PCI_PREF_RANGE_TYPE_64) + res->flags |= IORESOURCE_MEM_64; res->start = base; res->end = limit + 0xfffff; dev_printk(KERN_DEBUG, &dev->dev, "bridge %sbit mmio pref: %pR\n", diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index a00f85471b6..e1c360a5b0d 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -143,6 +143,7 @@ static void pci_setup_bridge(struct pci_bus *bus) struct pci_dev *bridge = bus->self; struct pci_bus_region region; u32 l, bu, lu, io_upper16; + int pref_mem64; if (pci_is_enabled(bridge)) return; @@ -198,16 +199,22 @@ static void pci_setup_bridge(struct pci_bus *bus) pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, 0); /* Set up PREF base/limit. 
*/ + pref_mem64 = 0; bu = lu = 0; pcibios_resource_to_bus(bridge, &region, bus->resource[2]); if (bus->resource[2]->flags & IORESOURCE_PREFETCH) { + int width = 8; l = (region.start >> 16) & 0xfff0; l |= region.end & 0xfff00000; - bu = upper_32_bits(region.start); - lu = upper_32_bits(region.end); - dev_info(&bridge->dev, " PREFETCH window: %#016llx-%#016llx\n", - (unsigned long long)region.start, - (unsigned long long)region.end); + if (bus->resource[2]->flags & IORESOURCE_MEM_64) { + pref_mem64 = 1; + bu = upper_32_bits(region.start); + lu = upper_32_bits(region.end); + width = 16; + } + dev_info(&bridge->dev, " PREFETCH window: %#0*llx-%#0*llx\n", + width, (unsigned long long)region.start, + width, (unsigned long long)region.end); } else { l = 0x0000fff0; @@ -215,9 +222,11 @@ } pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, l); - /* Set the upper 32 bits of PREF base & limit. */ - pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, bu); - pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, lu); + if (pref_mem64) { + /* Set the upper 32 bits of PREF base & limit. */ + pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, bu); + pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, lu); + } pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, bus->bridge_ctl); } @@ -255,8 +264,25 @@ static void pci_bridge_check_ranges(struct pci_bus *bus) pci_read_config_dword(bridge, PCI_PREF_MEMORY_BASE, &pmem); pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, 0x0); } - if (pmem) + if (pmem) { b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH; + if ((pmem & PCI_PREF_RANGE_TYPE_MASK) == PCI_PREF_RANGE_TYPE_64) + b_res[2].flags |= IORESOURCE_MEM_64; + } + + /* double check if bridge does support 64 bit pref */ + if (b_res[2].flags & IORESOURCE_MEM_64) { + u32 mem_base_hi, tmp; + pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32, + &mem_base_hi); + pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, + 0xffffffff); + pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32, &tmp); + if (!tmp) + b_res[2].flags &= ~IORESOURCE_MEM_64; + pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, + mem_base_hi); + } } /* Helper function for sizing routines: find first available @@ -336,6 +362,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long resource_size_t aligns[12]; /* Alignments from 1Mb to 2Gb */ int order, max_order; struct resource *b_res = find_free_bus_resource(bus, type); + unsigned int mem64_mask = 0; if (!b_res) return 0; @@ -344,9 +371,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long max_order = 0; size = 0; + mem64_mask = b_res->flags & IORESOURCE_MEM_64; + b_res->flags &= ~IORESOURCE_MEM_64; + list_for_each_entry(dev, &bus->devices, bus_list) { int i; - + for (i = 0; i < PCI_NUM_RESOURCES; i++) { struct resource *r = &dev->resource[i]; resource_size_t r_size; @@ -372,6 +402,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long aligns[order] += align; if (order > max_order) max_order = order; + mem64_mask &= r->flags & IORESOURCE_MEM_64; } } @@ -396,6 +427,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long b_res->start = min_align; b_res->end = size + min_align - 1; b_res->flags |= IORESOURCE_STARTALIGN; + b_res->flags |= mem64_mask; return 1; } diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 32e4b2f7229..786e7b8cece 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ 
-49,6 +49,8 @@ struct resource_list { #define IORESOURCE_SIZEALIGN 0x00020000 /* size indicates alignment */ #define IORESOURCE_STARTALIGN 0x00040000 /* start field is alignment */ +#define IORESOURCE_MEM_64 0x00100000 + #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 #define IORESOURCE_UNSET 0x20000000 diff --git a/include/linux/pci.h b/include/linux/pci.h index 72698d89e76..6dfa47d25ba 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1097,6 +1097,10 @@ static inline struct pci_dev *pci_get_bus_and_slot(unsigned int bus, #include +#ifndef PCIBIOS_MAX_MEM_32 +#define PCIBIOS_MAX_MEM_32 (-1) +#endif + /* these helpers provide future and backwards compatibility * for accessing popular PCI BAR info */ #define pci_resource_start(dev, bar) ((dev)->resource[(bar)].start) -- cgit v1.2.3-70-g09d2 From 3b073eda9557975a87a27b08a46a545fe8da66fb Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Tue, 31 Mar 2009 09:24:22 -0600 Subject: PCI: remove deprecated pci_find_slot() interface The last in-tree caller of pci_find_slot has been converted, so let's get rid of this deprecated interface. Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/search.c | 30 ------------------------------ include/linux/pci.h | 8 -------- 2 files changed, 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 710d4ea6956..650bc0a538d 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -114,36 +114,6 @@ pci_find_next_bus(const struct pci_bus *from) } #ifdef CONFIG_PCI_LEGACY -/** - * pci_find_slot - locate PCI device from a given PCI slot - * @bus: number of PCI bus on which desired PCI device resides - * @devfn: encodes number of PCI slot in which the desired PCI - * device resides and the logical device number within that slot - * in case of multi-function devices. - * - * Given a PCI bus and slot/function number, the desired PCI device - * is located in system global list of PCI devices. If the device - * is found, a pointer to its data structure is returned. If no - * device is found, %NULL is returned. - * - * NOTE: Do not use this function any more; use pci_get_slot() instead, as - * the PCI device returned by this function can disappear at any moment in - * time. 
- */ -struct pci_dev *pci_find_slot(unsigned int bus, unsigned int devfn) -{ - struct pci_dev *dev = NULL; - - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - if (dev->bus->number == bus && dev->devfn == devfn) { - pci_dev_put(dev); - return dev; - } - } - return NULL; -} -EXPORT_SYMBOL(pci_find_slot); - /** * pci_find_device - begin or continue searching for a PCI device by vendor/device id * @vendor: PCI vendor id to match, or %PCI_ANY_ID to match all vendor ids diff --git a/include/linux/pci.h b/include/linux/pci.h index 6dfa47d25ba..19ee92c53ef 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -599,8 +599,6 @@ extern void pci_sort_breadthfirst(void); struct pci_dev __deprecated *pci_find_device(unsigned int vendor, unsigned int device, struct pci_dev *from); -struct pci_dev __deprecated *pci_find_slot(unsigned int bus, - unsigned int devfn); #endif /* CONFIG_PCI_LEGACY */ enum pci_lost_interrupt_reason { @@ -936,12 +934,6 @@ static inline struct pci_dev *pci_find_device(unsigned int vendor, return NULL; } -static inline struct pci_dev *pci_find_slot(unsigned int bus, - unsigned int devfn) -{ - return NULL; -} - static inline struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, struct pci_dev *from) -- cgit v1.2.3-70-g09d2 From 43c16408842b0eeb367c23a6fa540ce69f99e347 Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Wed, 22 Apr 2009 16:52:09 -0600 Subject: PCI: Add support for turning PCIe ECRC on or off Adds support for PCI Express transaction layer end-to-end CRC checking (ECRC). This patch will enable/disable ECRC checking by setting/clearing the ECRC Check Enable and/or ECRC Generation Enable bits for devices that support ECRC. The ECRC setting is controlled by the "pci=ecrc=" command-line option. If this option is not set or is set to "bios", the enable and generation bits are left in whatever state the firmware/BIOS set them to. The "off" setting turns them off, and the "on" option turns them on (if the device supports it). Turning ECRC on or off can be a data integrity versus performance tradeoff. In theory, turning it on will catch more data errors, turning it off means possibly better performance since CRC does not need to be calculated by the PCIe hardware and packet sizes are reduced. Signed-off-by: Andrew Patterson Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 6 ++ drivers/pci/pci.c | 2 + drivers/pci/pcie/aer/Kconfig | 13 ++++ drivers/pci/pcie/aer/Makefile | 2 + drivers/pci/pcie/aer/aerdrv_core.c | 16 +++-- drivers/pci/pcie/aer/ecrc.c | 131 ++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 11 +++ 7 files changed, 174 insertions(+), 7 deletions(-) create mode 100644 drivers/pci/pcie/aer/ecrc.c (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7bdaf508040..395d1a013eb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1824,6 +1824,12 @@ and is between 256 and 4096 characters. It is defined in the file PAGE_SIZE is used as alignment. PCI-PCI bridge can be specified, if resource windows need to be expanded. + ecrc= Enable/disable PCIe ECRC (transaction layer + end-to-end CRC checking). + bios: Use BIOS/firmware settings. This is + the default. + off: Turn ECRC off + on: Turn ECRC on. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management.
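(Usage note, not part of the patch: since the option is parsed by pci_setup() below, it is given as a sub-option of the pci= boot parameter; for example, booting with "pci=ecrc=on" asks the kernel to enable ECRC generation and checking on every device whose AER capability advertises support.)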
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 761557688b1..56fb18d2cb5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2588,6 +2588,8 @@ static int __init pci_setup(char *str) } else if (!strncmp(str, "resource_alignment=", 19)) { pci_set_resource_alignment_param(str + 19, strlen(str + 19)); + } else if (!strncmp(str, "ecrc=", 5)) { + pcie_ecrc_get_policy(str + 5); } else { printk(KERN_ERR "PCI: Unknown option `%s'\n", str); diff --git a/drivers/pci/pcie/aer/Kconfig b/drivers/pci/pcie/aer/Kconfig index c3bde588aa1..db4cb950933 100644 --- a/drivers/pci/pcie/aer/Kconfig +++ b/drivers/pci/pcie/aer/Kconfig @@ -10,3 +10,16 @@ config PCIEAER This enables PCI Express Root Port Advanced Error Reporting (AER) driver support. Error reporting messages sent to Root Port will be handled by PCI Express AER driver. + + +# +# PCI Express ECRC +# +config PCIE_ECRC + bool "PCI Express ECRC settings control" + depends on PCIEAER + help + Used to override firmware/bios settings for PCI Express ECRC + (transaction layer end-to-end CRC checking). + + When in doubt, say N. diff --git a/drivers/pci/pcie/aer/Makefile b/drivers/pci/pcie/aer/Makefile index 8da3bd8455a..7f93411c56e 100644 --- a/drivers/pci/pcie/aer/Makefile +++ b/drivers/pci/pcie/aer/Makefile @@ -4,6 +4,8 @@ obj-$(CONFIG_PCIEAER) += aerdriver.o +obj-$(CONFIG_PCIE_ECRC) += ecrc.o + aerdriver-objs := aerdrv_errprint.o aerdrv_core.o aerdrv.o aerdriver-$(CONFIG_ACPI) += aerdrv_acpi.o diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 307452f3003..dd3829e68e3 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -113,15 +113,17 @@ static void set_device_error_reporting(struct pci_dev *dev, void *data) { bool enable = *((bool *)data); - if (dev->pcie_type != PCIE_RC_PORT && - dev->pcie_type != PCIE_SW_UPSTREAM_PORT && - dev->pcie_type != PCIE_SW_DOWNSTREAM_PORT) - return; + if (dev->pcie_type == PCIE_RC_PORT || + dev->pcie_type == PCIE_SW_UPSTREAM_PORT || + dev->pcie_type == PCIE_SW_DOWNSTREAM_PORT) { + if (enable) + pci_enable_pcie_error_reporting(dev); + else + pci_disable_pcie_error_reporting(dev); + } if (enable) - pci_enable_pcie_error_reporting(dev); - else - pci_disable_pcie_error_reporting(dev); + pcie_set_ecrc_checking(dev); } /** diff --git a/drivers/pci/pcie/aer/ecrc.c b/drivers/pci/pcie/aer/ecrc.c new file mode 100644 index 00000000000..ece97df4df6 --- /dev/null +++ b/drivers/pci/pcie/aer/ecrc.c @@ -0,0 +1,131 @@ +/* + * Enables/disables PCIe ECRC checking. + * + * (C) Copyright 2009 Hewlett-Packard Development Company, L.P. + * Andrew Patterson + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include "../../pci.h" + +#define ECRC_POLICY_DEFAULT 0 /* ECRC set by BIOS */ +#define ECRC_POLICY_OFF 1 /* ECRC off for performance */ +#define ECRC_POLICY_ON 2 /* ECRC on for data integrity */ + +static int ecrc_policy = ECRC_POLICY_DEFAULT; + +static const char *ecrc_policy_str[] = { + [ECRC_POLICY_DEFAULT] = "bios", + [ECRC_POLICY_OFF] = "off", + [ECRC_POLICY_ON] = "on" +}; + +/** + * enable_ecrc_checking - enable PCIe ECRC checking for a device + * @dev: the PCI device + * + * Returns 0 on success, or negative on failure. + */ +static int enable_ecrc_checking(struct pci_dev *dev) +{ + int pos; + u32 reg32; + + if (!dev->is_pcie) + return -ENODEV; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + if (!pos) + return -ENODEV; + + pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32); + if (reg32 & PCI_ERR_CAP_ECRC_GENC) + reg32 |= PCI_ERR_CAP_ECRC_GENE; + if (reg32 & PCI_ERR_CAP_ECRC_CHKC) + reg32 |= PCI_ERR_CAP_ECRC_CHKE; + pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + + return 0; +} + +/** + * disable_ecrc_checking - disables PCIe ECRC checking for a device + * @dev: the PCI device + * + * Returns 0 on success, or negative on failure. + */ +static int disable_ecrc_checking(struct pci_dev *dev) +{ + int pos; + u32 reg32; + + if (!dev->is_pcie) + return -ENODEV; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + if (!pos) + return -ENODEV; + + pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32); + reg32 &= ~(PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE); + pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + + return 0; +} + +/** + * pcie_set_ecrc_checking - set/unset PCIe ECRC checking for a device based on global policy + * @dev: the PCI device + */ +void pcie_set_ecrc_checking(struct pci_dev *dev) +{ + switch (ecrc_policy) { + case ECRC_POLICY_DEFAULT: + return; + case ECRC_POLICY_OFF: + disable_ecrc_checking(dev); + break; + case ECRC_POLICY_ON: + enable_ecrc_checking(dev); + break; + default: + return; + } +} + +/** + * pcie_ecrc_get_policy - parse kernel command-line ecrc option + */ +void pcie_ecrc_get_policy(char *str) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ecrc_policy_str); i++) + if (!strncmp(str, ecrc_policy_str[i], + strlen(ecrc_policy_str[i]))) + break; + if (i >= ARRAY_SIZE(ecrc_policy_str)) + return; + + ecrc_policy = i; +} diff --git a/include/linux/pci.h b/include/linux/pci.h index 19ee92c53ef..ec03b90d351 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -878,6 +878,17 @@ static inline int pcie_aspm_enabled(void) extern int pcie_aspm_enabled(void); #endif +#ifndef CONFIG_PCIE_ECRC +static inline void pcie_set_ecrc_checking(struct pci_dev *dev) +{ + return; +} +static inline void pcie_ecrc_get_policy(char *str) {} +#else +extern void pcie_set_ecrc_checking(struct pci_dev *dev); +extern void pcie_ecrc_get_policy(char *str); +#endif + #define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1) #ifdef CONFIG_HT_IRQ -- cgit v1.2.3-70-g09d2 From 73422811d290c628b4ddbf6830e5cd6fa42e84f1 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Sun, 10 May 2009 16:05:39 -0400 Subject: reiserfs: allow exposing privroot w/ xattrs enabled This patch adds an -oexpose_privroot option to allow access to the privroot.
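(Usage note, not part of the patch: with this option the otherwise-hidden private metadata directory at the root of the volume remains visible, so a mount such as "mount -t reiserfs -o expose_privroot /dev/sdb1 /mnt" -- device path hypothetical -- allows it to be inspected.)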
Signed-off-by: Jeff Mahoney Signed-off-by: Al Viro --- fs/reiserfs/dir.c | 10 ++++------ fs/reiserfs/super.c | 1 + fs/reiserfs/xattr.c | 3 ++- include/linux/reiserfs_fs_sb.h | 2 ++ 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 45ee3d357c7..6d2668fdc38 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -44,13 +44,11 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, static inline bool is_privroot_deh(struct dentry *dir, struct reiserfs_de_head *deh) { - int ret = 0; -#ifdef CONFIG_REISERFS_FS_XATTR struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; - ret = (dir == dir->d_parent && privroot->d_inode && - deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); -#endif - return ret; + if (reiserfs_expose_privroot(dir->d_sb)) + return 0; + return (dir == dir->d_parent && privroot->d_inode && + deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); } int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3567fb9e3fb..9dbdcfb5d31 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -898,6 +898,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin {"conv",.setmask = 1 << REISERFS_CONVERT}, {"attrs",.setmask = 1 << REISERFS_ATTRS}, {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, + {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, #ifdef CONFIG_REISERFS_FS_XATTR {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8e7deb0e696..f3d47d85684 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -981,7 +981,8 @@ int reiserfs_lookup_privroot(struct super_block *s) strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; - s->s_root->d_op = &xattr_lookup_poison_ops; + if (!reiserfs_expose_privroot(s)) + s->s_root->d_op = &xattr_lookup_poison_ops; if (dentry->d_inode) dentry->d_inode->i_flags |= S_PRIVATE; } else diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 6473650c28f..dab68bbed67 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -453,6 +453,7 @@ enum reiserfs_mount_options { REISERFS_ATTRS, REISERFS_XATTRS_USER, REISERFS_POSIXACL, + REISERFS_EXPOSE_PRIVROOT, REISERFS_BARRIER_NONE, REISERFS_BARRIER_FLUSH, @@ -490,6 +491,7 @@ enum reiserfs_mount_options { #define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK)) #define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER)) #define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL)) +#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT)) #define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s)) #define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE)) #define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH)) -- cgit v1.2.3-70-g09d2 From 2a737871108de9ba8930f7650d549f1383767f8b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 11:49:53 -0400 Subject: Cache root in nameidata New field: nd->root. 
When pathname resolution wants to know the root, check if nd->root.mnt is non-NULL; use nd->root if it is, otherwise copy current->fs->root there. After path_walk() is finished, we check if we'd got a cached value in nd->root and drop it. Before calling path_walk() we should either set nd->root.mnt to NULL *or* copy (and pin down) some path to nd->root. In the latter case we won't be looking at current->fs->root at all. Signed-off-by: Al Viro --- fs/namei.c | 53 +++++++++++++++++++++++++++++++++------------------ include/linux/namei.h | 1 + 2 files changed, 35 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 895733efc6b..88baaf2b916 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -552,6 +552,17 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd return result; } +static __always_inline void set_root(struct nameidata *nd) +{ + if (!nd->root.mnt) { + struct fs_struct *fs = current->fs; + read_lock(&fs->lock); + nd->root = fs->root; + path_get(&nd->root); + read_unlock(&fs->lock); + } +} + static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { int res = 0; @@ -560,14 +571,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l goto fail; if (*link == '/') { - struct fs_struct *fs = current->fs; - + set_root(nd); path_put(&nd->path); - - read_lock(&fs->lock); - nd->path = fs->root; - path_get(&fs->root); - read_unlock(&fs->lock); + nd->path = nd->root; + path_get(&nd->root); } res = link_path_walk(link, nd); @@ -741,19 +748,16 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry) static __always_inline void follow_dotdot(struct nameidata *nd) { - struct fs_struct *fs = current->fs; + set_root(nd); while(1) { struct vfsmount *parent; struct dentry *old = nd->path.dentry; - read_lock(&fs->lock); - if (nd->path.dentry == fs->root.dentry && - nd->path.mnt == fs->root.mnt) { - read_unlock(&fs->lock); + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { break; } - read_unlock(&fs->lock); spin_lock(&dcache_lock); if (nd->path.dentry != nd->path.mnt->mnt_root) { nd->path.dentry = dget(nd->path.dentry->d_parent); @@ -1022,18 +1026,18 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei int retval = 0; int fput_needed; struct file *file; - struct fs_struct *fs = current->fs; nd->last_type = LAST_ROOT; /* if there are only slashes... 
*/ nd->flags = flags; nd->depth = 0; + nd->root.mnt = NULL; if (*name=='/') { - read_lock(&fs->lock); - nd->path = fs->root; - path_get(&fs->root); - read_unlock(&fs->lock); + set_root(nd); + nd->path = nd->root; + path_get(&nd->root); } else if (dfd == AT_FDCWD) { + struct fs_struct *fs = current->fs; read_lock(&fs->lock); nd->path = fs->pwd; path_get(&fs->pwd); @@ -1079,6 +1083,10 @@ static int do_path_lookup(int dfd, const char *name, if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && nd->path.dentry->d_inode)) audit_inode(name, nd->path.dentry); + if (nd->root.mnt) { + path_put(&nd->root); + nd->root.mnt = NULL; + } return retval; } @@ -1115,6 +1123,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, nd->last_type = LAST_ROOT; nd->flags = flags; nd->depth = 0; + nd->root.mnt = NULL; nd->path.dentry = dentry; nd->path.mnt = mnt; @@ -1125,8 +1134,12 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, nd->path.dentry->d_inode)) audit_inode(name, nd->path.dentry); - return retval; + if (nd->root.mnt) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + return retval; } /** @@ -1817,6 +1830,8 @@ exit: if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); exit_parent: + if (nd.root.mnt) + path_put(&nd.root); path_put(&nd.path); return ERR_PTR(error); diff --git a/include/linux/namei.h b/include/linux/namei.h index 518098fe63a..325dd3ad39a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -18,6 +18,7 @@ enum { MAX_NESTED_LINKS = 8 }; struct nameidata { struct path path; struct qstr last; + struct path root; unsigned int flags; int last_type; unsigned depth; -- cgit v1.2.3-70-g09d2 From 91c9fa8f75877c0c1e455c23e8f8206c91c8f77f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 02:42:05 -0400 Subject: switch rqst_exp_get_by_name() Signed-off-by: Al Viro --- fs/nfsd/export.c | 15 ++++++--------- fs/nfsd/vfs.c | 32 ++++++++++++++++---------------- include/linux/nfsd/export.h | 3 +-- 3 files changed, 23 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 5149dabde55..84f5e5cb086 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1240,18 +1240,15 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) * use exp_get_by_name() or exp_find(). 
*/ struct svc_export * -rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, - struct dentry *dentry) +rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); - struct path path = {.mnt = mnt, .dentry = dentry}; if (rqstp->rq_client == NULL) goto gss; /* First try the auth_unix client: */ - exp = exp_get_by_name(rqstp->rq_client, &path, - &rqstp->rq_chandle); + exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) @@ -1263,8 +1260,7 @@ gss: /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; - gssexp = exp_get_by_name(rqstp->rq_gssclient, &path, - &rqstp->rq_chandle); + gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) @@ -1307,9 +1303,10 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, struct dentry *dentry) { struct svc_export *exp; + struct path path = {.mnt = mnt, .dentry = dentry}; dget(dentry); - exp = rqst_exp_get_by_name(rqstp, mnt, dentry); + exp = rqst_exp_get_by_name(rqstp, &path); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { struct dentry *parent; @@ -1317,7 +1314,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, parent = dget_parent(dentry); dput(dentry); dentry = parent; - exp = rqst_exp_get_by_name(rqstp, mnt, dentry); + exp = rqst_exp_get_by_name(rqstp, &path); } dput(dentry); return exp; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index bd584bcf1d9..d84c4eaa526 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -101,36 +101,36 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, { struct svc_export *exp = *expp, *exp2 = NULL; struct dentry *dentry = *dpp; - struct vfsmount *mnt = mntget(exp->ex_path.mnt); - struct dentry *mounts = dget(dentry); + struct path path = {.mnt = mntget(exp->ex_path.mnt), + .dentry = dget(dentry)}; int err = 0; - while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); + while (follow_down(&path.mnt, &path.dentry) && + d_mountpoint(path.dentry)) + ; - exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); + exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { if (PTR_ERR(exp2) != -ENOENT) err = PTR_ERR(exp2); - dput(mounts); - mntput(mnt); + path_put(&path); goto out; } if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ /* - * This is subtle: dentry is *not* under mnt at this point. - * The only reason we are safe is that original mnt is pinned - * down by exp, so we should dput before putting exp. + * This is subtle: path.dentry is *not* on path.mnt + * at this point. 
The only reason we are safe is that + * original mnt is pinned down by exp, so we should + * put path *before* putting exp */ - dput(dentry); - *dpp = mounts; - exp_put(exp); + *dpp = path.dentry; + path.dentry = dentry; *expp = exp2; - } else { - exp_put(exp2); - dput(mounts); + exp2 = exp; } - mntput(mnt); + path_put(&path); + exp_put(exp2); out: return err; } diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index bcd0201589f..98f6fd584d5 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -125,8 +125,7 @@ void nfsd_export_flush(void); void exp_readlock(void); void exp_readunlock(void); struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, - struct vfsmount *, - struct dentry *); + struct path *); struct svc_export * rqst_exp_parent(struct svc_rqst *, struct vfsmount *mnt, struct dentry *dentry); -- cgit v1.2.3-70-g09d2 From e64c390ca0b60fd2119331ef1fa888d7ea27e424 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 03:00:46 -0400 Subject: switch rqst_exp_parent() Signed-off-by: Al Viro --- fs/nfsd/export.c | 25 ++++++++++--------------- fs/nfsd/vfs.c | 23 ++++++++++++----------- include/linux/nfsd/export.h | 3 +-- 3 files changed, 23 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 84f5e5cb086..8b1f8efb469 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1299,24 +1299,19 @@ gss: } struct svc_export * -rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, - struct dentry *dentry) +rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) { - struct svc_export *exp; - struct path path = {.mnt = mnt, .dentry = dentry}; - - dget(dentry); - exp = rqst_exp_get_by_name(rqstp, &path); - - while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { - struct dentry *parent; + struct dentry *saved = dget(path->dentry); + struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); - parent = dget_parent(dentry); - dput(dentry); - dentry = parent; - exp = rqst_exp_get_by_name(rqstp, &path); + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = rqst_exp_get_by_name(rqstp, path); } - dput(dentry); + dput(path->dentry); + path->dentry = saved; return exp; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d84c4eaa526..9f1ea3127f5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -169,28 +169,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, /* checking mountpoint crossing is very different when stepping up */ struct svc_export *exp2 = NULL; struct dentry *dp; - struct vfsmount *mnt = mntget(exp->ex_path.mnt); - dentry = dget(dparent); - while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) + struct path path = {.mnt = mntget(exp->ex_path.mnt), + .dentry = dget(dparent)}; + + while (path.dentry == path.mnt->mnt_root && + follow_up(&path.mnt, &path.dentry)) ; - dp = dget_parent(dentry); - dput(dentry); - dentry = dp; + dp = dget_parent(path.dentry); + dput(path.dentry); + path.dentry = dp; - exp2 = rqst_exp_parent(rqstp, mnt, dentry); + exp2 = rqst_exp_parent(rqstp, &path); if (PTR_ERR(exp2) == -ENOENT) { - dput(dentry); dentry = dget(dparent); } else if (IS_ERR(exp2)) { host_err = PTR_ERR(exp2); - dput(dentry); - mntput(mnt); + path_put(&path); goto out_nfserr; } else { + dentry = dget(path.dentry); exp_put(exp); exp = exp2; } - mntput(mnt); + path_put(&path); } } else { fh_lock(fhp); diff --git a/include/linux/nfsd/export.h 
b/include/linux/nfsd/export.h index 98f6fd584d5..a6d9ef2bb34 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -127,8 +127,7 @@ void exp_readunlock(void); struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, struct path *); struct svc_export * rqst_exp_parent(struct svc_rqst *, - struct vfsmount *mnt, - struct dentry *dentry); + struct path *); int exp_rootfh(struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); -- cgit v1.2.3-70-g09d2 From bab77ebf51e3902f608ecf08c9d34a0a52ac35a9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 03:26:48 -0400 Subject: switch follow_up() to struct path Signed-off-by: Al Viro --- fs/autofs4/dev-ioctl.c | 2 +- fs/namei.c | 16 ++++++++-------- fs/nfsd/vfs.c | 2 +- include/linux/namei.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index f71dac9986f..670407576b2 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -212,7 +212,7 @@ static int find_autofs_mount(const char *pathname, err = 0; } } - if (!follow_up(&path.mnt, &path.dentry)) + if (!follow_up(&path)) break; } path_put(&path); diff --git a/fs/namei.c b/fs/namei.c index 4379ef98970..8c1f48ae68e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -675,23 +675,23 @@ loop: return err; } -int follow_up(struct vfsmount **mnt, struct dentry **dentry) +int follow_up(struct path *path) { struct vfsmount *parent; struct dentry *mountpoint; spin_lock(&vfsmount_lock); - parent=(*mnt)->mnt_parent; - if (parent == *mnt) { + parent = path->mnt->mnt_parent; + if (parent == path->mnt) { spin_unlock(&vfsmount_lock); return 0; } mntget(parent); - mountpoint=dget((*mnt)->mnt_mountpoint); + mountpoint = dget(path->mnt->mnt_mountpoint); spin_unlock(&vfsmount_lock); - dput(*dentry); - *dentry = mountpoint; - mntput(*mnt); - *mnt = parent; + dput(path->dentry); + path->dentry = mountpoint; + mntput(path->mnt); + path->mnt = parent; return 1; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9f1ea3127f5..7b2b3f77532 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -173,7 +173,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, .dentry = dget(dparent)}; while (path.dentry == path.mnt->mnt_root && - follow_up(&path.mnt, &path.dentry)) + follow_up(&path)) ; dp = dget_parent(path.dentry); dput(path.dentry); diff --git a/include/linux/namei.h b/include/linux/namei.h index 325dd3ad39a..9cd5a717be3 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -79,7 +79,7 @@ extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_noperm(const char *, struct dentry *); extern int follow_down(struct vfsmount **, struct dentry **); -extern int follow_up(struct vfsmount **, struct dentry **); +extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); -- cgit v1.2.3-70-g09d2 From 589ff870ed60a9ebdd5ec99ec3f5afe1282fe151 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 03:28:19 -0400 Subject: Switch collect_mounts() to struct path Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/fs.h | 2 +- kernel/audit_tree.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 88a904d5aa2..c85962206aa 100644 --- a/fs/namespace.c +++ 
b/fs/namespace.c @@ -1253,11 +1253,11 @@ Enomem: return NULL; } -struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) +struct vfsmount *collect_mounts(struct path *path) { struct vfsmount *tree; down_write(&namespace_sem); - tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); + tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); up_write(&namespace_sem); return tree; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 323b5ce474c..03fb2102b8f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1800,7 +1800,7 @@ extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *); +extern struct vfsmount *collect_mounts(struct path *); extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 6e7351739a8..1f6396d7668 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -568,7 +568,7 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(path.mnt, path.dentry); + root_mnt = collect_mounts(&path); path_put(&path); if (!root_mnt) goto skip_it; @@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(path.mnt, path.dentry); + mnt = collect_mounts(&path); path_put(&path); if (!mnt) { err = -ENOMEM; @@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new) err = kern_path(new, 0, &path); if (err) return err; - tagged = collect_mounts(path.mnt, path.dentry); + tagged = collect_mounts(&path); path_put(&path); if (!tagged) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 9393bd07cf218ca51d0e627653f906a9d76a9131 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 13:58:15 -0400 Subject: switch follow_down() Signed-off-by: Al Viro --- fs/afs/mntpt.c | 2 +- fs/autofs/dirhash.c | 5 ++--- fs/autofs4/autofs_i.h | 6 +++--- fs/autofs4/dev-ioctl.c | 2 +- fs/autofs4/expire.c | 15 +++++++-------- fs/autofs4/root.c | 7 +++---- fs/cifs/cifs_dfs_ref.c | 2 +- fs/namei.c | 12 ++++++------ fs/namespace.c | 4 ++-- fs/nfs/namespace.c | 2 +- fs/nfsd/vfs.c | 3 +-- include/linux/namei.h | 2 +- 12 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 2b9e2d03a39..c52be53f694 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -244,7 +244,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) case -EBUSY: /* someone else made a mount here whilst we were busy */ while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + follow_down(&nd->path)) ; err = 0; default: diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c index 4eb4d8dfb2f..2316e944a10 100644 --- a/fs/autofs/dirhash.c +++ b/fs/autofs/dirhash.c @@ -85,13 +85,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, } path.mnt = mnt; path_get(&path); - if (!follow_down(&path.mnt, &path.dentry)) { + if (!follow_down(&path)) { path_put(&path); DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(path.dentry) && - follow_down(&path.mnt, &path.dentry)) + while (d_mountpoint(path.dentry) && follow_down(&path)); 
; umount_ok = may_umount(path.mnt); path_put(&path); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index b7ff33c6310..8f7cdde4173 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); -static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry) +static inline int autofs4_follow_mount(struct path *path) { int res = 0; - while (d_mountpoint(*dentry)) { - int followed = follow_down(mnt, dentry); + while (d_mountpoint(path->dentry)) { + int followed = follow_down(path); if (!followed) break; res = 1; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 670407576b2..f3da2eb51f5 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -562,7 +562,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) { - if (follow_down(&path.mnt, &path.dentry)) + if (follow_down(&path)) magic = path.mnt->mnt_sb->s_magic; } } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 3077d8f1652..aa39ae83f01 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry, static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry; + struct path path = {.mnt = mnt, .dentry = dentry}; int status = 1; DPRINTK("dentry %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); - mntget(mnt); - dget(dentry); + path_get(&path); - if (!follow_down(&mnt, &dentry)) + if (!follow_down(&path)) goto done; - if (is_autofs4_dentry(dentry)) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + if (is_autofs4_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) @@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) * Otherwise it's an offset mount and we need to check * if we can umount its mount, if there is one. */ - if (!d_mountpoint(dentry)) { + if (!d_mountpoint(path.dentry)) { status = 0; goto done; } @@ -86,8 +86,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) status = 0; done: DPRINTK("returning = %d", status); - dput(dentry); - mntput(mnt); + path_put(&path); return status; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index e383bf0334f..b96a3c57359 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) nd->flags); /* * For an expire of a covered direct or offset mount we need - * to beeak out of follow_down() at the autofs mount trigger + * to break out of follow_down() at the autofs mount trigger * (d_mounted--), so we can see the expiring flag, and manage * the blocking and following here until the expire is completed. */ @@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); /* Follow down to our covering mount. */ - if (!follow_down(&nd->path.mnt, &nd->path.dentry)) + if (!follow_down(&nd->path)) goto done; goto follow; } @@ -230,8 +230,7 @@ follow: * to follow it. 
*/ if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path.mnt, - &nd->path.dentry)) { + if (!autofs4_follow_mount(&nd->path)) { status = -ENOENT; goto out_error; } diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 83d62759c7c..3bb11be8b6a 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -275,7 +275,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, case -EBUSY: /* someone else made a mount here whilst we were busy */ while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + follow_down(&nd->path)) ; err = 0; default: diff --git a/fs/namei.c b/fs/namei.c index 8c1f48ae68e..4d49a3eee6d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -731,16 +731,16 @@ static void follow_mount(struct vfsmount **mnt, struct dentry **dentry) /* no need for dcache_lock, as serialization is taken care in * namespace.c */ -int follow_down(struct vfsmount **mnt, struct dentry **dentry) +int follow_down(struct path *path) { struct vfsmount *mounted; - mounted = lookup_mnt(*mnt, *dentry); + mounted = lookup_mnt(path->mnt, path->dentry); if (mounted) { - dput(*dentry); - mntput(*mnt); - *mnt = mounted; - *dentry = dget(mounted->mnt_root); + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); return 1; } return 0; diff --git a/fs/namespace.c b/fs/namespace.c index c85962206aa..ba5237be1cf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1601,7 +1601,7 @@ static int do_move_mount(struct path *path, char *old_name) down_write(&namespace_sem); while (d_mountpoint(path->dentry) && - follow_down(&path->mnt, &path->dentry)) + follow_down(path)) ; err = -EINVAL; if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) @@ -1695,7 +1695,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, down_write(&namespace_sem); /* Something was mounted here while we slept */ while (d_mountpoint(path->dentry) && - follow_down(&path->mnt, &path->dentry)) + follow_down(path)) ; err = -EINVAL; if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 64a288ee046..f01caec8446 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -154,7 +154,7 @@ out_err: goto out; out_follow: while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + follow_down(&nd->path)) ; err = 0; goto out; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7b2b3f77532..99f83575359 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -105,8 +105,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, .dentry = dget(dentry)}; int err = 0; - while (follow_down(&path.mnt, &path.dentry) && - d_mountpoint(path.dentry)) + while (d_mountpoint(path.dentry) && follow_down(&path)) ; exp2 = rqst_exp_get_by_name(rqstp, &path); diff --git a/include/linux/namei.h b/include/linux/namei.h index 9cd5a717be3..d870ae2faed 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -78,7 +78,7 @@ extern void release_open_intent(struct nameidata *); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_noperm(const char *, struct dentry *); -extern int follow_down(struct vfsmount **, struct dentry **); +extern int follow_down(struct path *); extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); -- cgit v1.2.3-70-g09d2 From 1c755af4df75996b0dd4b7e6cacaf9d57a6ef2ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 
Apr 2009 14:06:57 -0400 Subject: switch lookup_mnt() Signed-off-by: Al Viro --- fs/namei.c | 6 +++--- fs/namespace.c | 4 ++-- include/linux/dcache.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index c006bc61d1e..527119afb6a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -702,7 +702,7 @@ static int __follow_mount(struct path *path) { int res = 0; while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); + struct vfsmount *mounted = lookup_mnt(path); if (!mounted) break; dput(path->dentry); @@ -718,7 +718,7 @@ static int __follow_mount(struct path *path) static void follow_mount(struct path *path) { while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); + struct vfsmount *mounted = lookup_mnt(path); if (!mounted) break; dput(path->dentry); @@ -735,7 +735,7 @@ int follow_down(struct path *path) { struct vfsmount *mounted; - mounted = lookup_mnt(path->mnt, path->dentry); + mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); diff --git a/fs/namespace.c b/fs/namespace.c index ba5237be1cf..b94ad3d685f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -442,11 +442,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, * lookup_mnt increments the ref count before returning * the vfsmount struct. */ -struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) +struct vfsmount *lookup_mnt(struct path *path) { struct vfsmount *child_mnt; spin_lock(&vfsmount_lock); - if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) + if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) mntget(child_mnt); spin_unlock(&vfsmount_lock); return child_mnt; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 97978004338..72ce2ae8859 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -370,7 +370,7 @@ static inline int d_mountpoint(struct dentry *dentry) return dentry->d_mounted; } -extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *); +extern struct vfsmount *lookup_mnt(struct path *); extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); extern int sysctl_vfs_cache_pressure; -- cgit v1.2.3-70-g09d2 From 3174c21b74b56c6a53fddd41a30fd6f757a32bd0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 13:19:18 -0400 Subject: Move junk from proc_fs.h to fs/proc/internal.h Signed-off-by: Al Viro --- fs/proc/internal.h | 25 +++++++++++++++++++++++++ fs/proc/proc_devtree.c | 1 + include/linux/proc_fs.h | 24 ------------------------ 3 files changed, 26 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index f6db9618a88..753ca37002c 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -92,3 +92,28 @@ struct pde_opener { struct list_head lh; }; void pde_users_dec(struct proc_dir_entry *pde); + +extern spinlock_t proc_subdir_lock; + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); +unsigned long task_vsize(struct mm_struct *); +int task_statm(struct mm_struct *, int *, int *, int *, int *); +void task_mem(struct seq_file *, struct mm_struct *); + +struct proc_dir_entry *de_get(struct proc_dir_entry *de); +void de_put(struct proc_dir_entry *de); + +extern struct vfsmount *proc_mnt; +int proc_fill_super(struct super_block *); +struct 
inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); + +/* + * These are generic /proc routines that use the internal + * "struct proc_dir_entry" tree to traverse the filesystem. + * + * The /proc root directory has extended versions to take care + * of the /proc/ subdirectories. + */ +int proc_readdir(struct file *, void *, filldir_t); +struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index de2bba5a344..fc6c3025bef 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -11,6 +11,7 @@ #include #include #include +#include "internal.h" #ifndef HAVE_ARCH_DEVTREE_FIXUPS static inline void set_node_proc_entry(struct device_node *np, diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index fbfa3d44d33..e6e77d31c41 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -93,20 +93,9 @@ struct vmcore { #ifdef CONFIG_PROC_FS -extern spinlock_t proc_subdir_lock; - extern void proc_root_init(void); void proc_flush_task(struct task_struct *task); -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); -unsigned long task_vsize(struct mm_struct *); -int task_statm(struct mm_struct *, int *, int *, int *, int *); -void task_mem(struct seq_file *, struct mm_struct *); -void clear_refs_smap(struct mm_struct *mm); - -struct proc_dir_entry *de_get(struct proc_dir_entry *de); -void de_put(struct proc_dir_entry *de); extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent); @@ -116,20 +105,7 @@ struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, void *data); extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); -extern struct vfsmount *proc_mnt; struct pid_namespace; -extern int proc_fill_super(struct super_block *); -extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); - -/* - * These are generic /proc routines that use the internal - * "struct proc_dir_entry" tree to traverse the filesystem. - * - * The /proc root directory has extended versions to take care - * of the /proc/ subdirectories. - */ -extern int proc_readdir(struct file *, void *, filldir_t); -extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); extern int pid_ns_prepare_proc(struct pid_namespace *ns); extern void pid_ns_release_proc(struct pid_namespace *ns); -- cgit v1.2.3-70-g09d2 From d3ef3d7351ccfbef3e5d926efc5ee332136f40d4 Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Sun, 26 Apr 2009 20:25:54 +1000 Subject: fs: mnt_want_write speedup This patch speeds up lmbench lat_mmap test by about 8%. lat_mmap is set up basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it. A microbenchmark yes, but it exercises some important paths in the mm. Before: avg = 501.9 std = 14.7773 After: avg = 462.286 std = 5.46106 (50 runs of each, stddev gives a reasonable confidence, but there is quite a bit of variation there still) It does this by removing the complex per-cpu locking and counter-cache and replaces it with a percpu counter in struct vfsmount. This makes the code much simpler, and avoids spinlocks (although the msync is still pretty costly, unfortunately). It results in about 900 bytes smaller code too. It does increase the size of a vfsmount, however. 
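A condensed sketch of the fast path this scheme yields (illustrative only, not the literal patch text; this_cpu_writers() is hypothetical shorthand for the per_cpu_ptr()/smp_processor_id() increment the patch spells out, and the slow path that sets MNT_WRITE_HOLD appears further down in the diff):

    int mnt_want_write(struct vfsmount *mnt)
    {
            int ret = 0;

            preempt_disable();
            this_cpu_writers(mnt)++;        /* hypothetical shorthand */
            smp_mb();                       /* count visible before hold test */
            while (mnt->mnt_flags & MNT_WRITE_HOLD)
                    cpu_relax();            /* r/o transition in progress */
            smp_rmb();                      /* don't read MNT_READONLY early */
            if (__mnt_is_readonly(mnt)) {
                    this_cpu_writers(mnt)--;
                    ret = -EROFS;
            }
            preempt_enable();
            return ret;
    }

No spinlock and no shared cacheline on the common path; the only cost is the barrier pair coordinating with mnt_make_readonly().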
It should also give a speedup on large systems if CPUs are frequently operating on different mounts (because the existing scheme has to operate on an atomic in the struct vfsmount when switching between mounts). But I'm most interested in the single threaded path performance for the moment. [AV: minor cleanup] Cc: Dave Hansen Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/namespace.c | 268 +++++++++++++++++--------------------------------- include/linux/mount.h | 21 ++-- 2 files changed, 106 insertions(+), 183 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index b94ad3d685f..22ae06ad751 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); - atomic_set(&mnt->__mnt_writers, 0); +#ifdef CONFIG_SMP + mnt->mnt_writers = alloc_percpu(int); + if (!mnt->mnt_writers) + goto out_free_devname; +#else + mnt->mnt_writers = 0; +#endif } return mnt; +#ifdef CONFIG_SMP +out_free_devname: + kfree(mnt->mnt_devname); +#endif out_free_id: mnt_free_id(mnt); out_free_cache: @@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt) } EXPORT_SYMBOL_GPL(__mnt_is_readonly); -struct mnt_writer { - /* - * If holding multiple instances of this lock, they - * must be ordered by cpu number. - */ - spinlock_t lock; - struct lock_class_key lock_class; /* compiles out with !lockdep */ - unsigned long count; - struct vfsmount *mnt; -} ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct mnt_writer, mnt_writers); +static inline void inc_mnt_writers(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; +#else + mnt->mnt_writers++; +#endif +} -static int __init init_mnt_writers(void) +static inline void dec_mnt_writers(struct vfsmount *mnt) { - int cpu; - for_each_possible_cpu(cpu) { - struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); - spin_lock_init(&writer->lock); - lockdep_set_class(&writer->lock, &writer->lock_class); - writer->count = 0; - } - return 0; +#ifdef CONFIG_SMP + (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; +#else + mnt->mnt_writers--; +#endif } -fs_initcall(init_mnt_writers); -static void unlock_mnt_writers(void) +static unsigned int count_mnt_writers(struct vfsmount *mnt) { +#ifdef CONFIG_SMP + unsigned int count = 0; int cpu; - struct mnt_writer *cpu_writer; for_each_possible_cpu(cpu) { - cpu_writer = &per_cpu(mnt_writers, cpu); - spin_unlock(&cpu_writer->lock); + count += *per_cpu_ptr(mnt->mnt_writers, cpu); } -} -static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) -{ - if (!cpu_writer->mnt) - return; - /* - * This is in case anyone ever leaves an invalid, - * old ->mnt and a count of 0. 
- */ - if (!cpu_writer->count) - return; - atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers); - cpu_writer->count = 0; -} - /* - * must hold cpu_writer->lock - */ -static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, - struct vfsmount *mnt) -{ - if (cpu_writer->mnt == mnt) - return; - __clear_mnt_count(cpu_writer); - cpu_writer->mnt = mnt; + return count; +#else + return mnt->mnt_writers; +#endif } /* @@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, int mnt_want_write(struct vfsmount *mnt) { int ret = 0; - struct mnt_writer *cpu_writer; - cpu_writer = &get_cpu_var(mnt_writers); - spin_lock(&cpu_writer->lock); + preempt_disable(); + inc_mnt_writers(mnt); + /* + * The store to inc_mnt_writers must be visible before we pass + * MNT_WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); + while (mnt->mnt_flags & MNT_WRITE_HOLD) + cpu_relax(); + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until + * MNT_WRITE_HOLD is cleared. + */ + smp_rmb(); if (__mnt_is_readonly(mnt)) { + dec_mnt_writers(mnt); ret = -EROFS; goto out; } - use_cpu_writer_for_mount(cpu_writer, mnt); - cpu_writer->count++; out: - spin_unlock(&cpu_writer->lock); - put_cpu_var(mnt_writers); + preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); -static void lock_mnt_writers(void) -{ - int cpu; - struct mnt_writer *cpu_writer; - - for_each_possible_cpu(cpu) { - cpu_writer = &per_cpu(mnt_writers, cpu); - spin_lock(&cpu_writer->lock); - __clear_mnt_count(cpu_writer); - cpu_writer->mnt = NULL; - } -} - -/* - * These per-cpu write counts are not guaranteed to have - * matched increments and decrements on any given cpu. - * A file open()ed for write on one cpu and close()d on - * another cpu will imbalance this count. Make sure it - * does not get too far out of whack. - */ -static void handle_write_count_underflow(struct vfsmount *mnt) -{ - if (atomic_read(&mnt->__mnt_writers) >= - MNT_WRITER_UNDERFLOW_LIMIT) - return; - /* - * It isn't necessary to hold all of the locks - * at the same time, but doing it this way makes - * us share a lot more code. - */ - lock_mnt_writers(); - /* - * vfsmount_lock is for mnt_flags. - */ - spin_lock(&vfsmount_lock); - /* - * If coalescing the per-cpu writer counts did not - * get us back to a positive writer count, we have - * a bug. 
- */ - if ((atomic_read(&mnt->__mnt_writers) < 0) && - !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { - WARN(1, KERN_DEBUG "leak detected on mount(%p) writers " - "count: %d\n", - mnt, atomic_read(&mnt->__mnt_writers)); - /* use the flag to keep the dmesg spam down */ - mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; - } - spin_unlock(&vfsmount_lock); - unlock_mnt_writers(); -} - /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access @@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt) */ void mnt_drop_write(struct vfsmount *mnt) { - int must_check_underflow = 0; - struct mnt_writer *cpu_writer; - - cpu_writer = &get_cpu_var(mnt_writers); - spin_lock(&cpu_writer->lock); - - use_cpu_writer_for_mount(cpu_writer, mnt); - if (cpu_writer->count > 0) { - cpu_writer->count--; - } else { - must_check_underflow = 1; - atomic_dec(&mnt->__mnt_writers); - } - - spin_unlock(&cpu_writer->lock); - /* - * Logically, we could call this each time, - * but the __mnt_writers cacheline tends to - * be cold, and makes this expensive. - */ - if (must_check_underflow) - handle_write_count_underflow(mnt); - /* - * This could be done right after the spinlock - * is taken because the spinlock keeps us on - * the cpu, and disables preemption. However, - * putting it here bounds the amount that - * __mnt_writers can underflow. Without it, - * we could theoretically wrap __mnt_writers. - */ - put_cpu_var(mnt_writers); + preempt_disable(); + dec_mnt_writers(mnt); + preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_drop_write); @@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt) { int ret = 0; - lock_mnt_writers(); + spin_lock(&vfsmount_lock); + mnt->mnt_flags |= MNT_WRITE_HOLD; /* - * With all the locks held, this value is stable + * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * should be visible before we do. */ - if (atomic_read(&mnt->__mnt_writers) > 0) { - ret = -EBUSY; - goto out; - } + smp_mb(); + /* - * nobody can do a successful mnt_want_write() with all - * of the counts in MNT_DENIED_WRITE and the locks held. + * With writers on hold, if this value is zero, then there are + * definitely no active writers (although held writers may subsequently + * increment the count, they'll have to wait, and decrement it after + * seeing MNT_READONLY). + * + * It is OK to have counter incremented on one CPU and decremented on + * another: the sum will add up correctly. The danger would be when we + * sum up each counter, if we read a counter before it is incremented, + * but then read another CPU's count which it has been subsequently + * decremented from -- we would see more decrements than we should. + * MNT_WRITE_HOLD protects against this scenario, because + * mnt_want_write first increments count, then smp_mb, then spins on + * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * we're counting up here. */ - spin_lock(&vfsmount_lock); - if (!ret) + if (count_mnt_writers(mnt) > 0) + ret = -EBUSY; + else mnt->mnt_flags |= MNT_READONLY; + /* + * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * that become unheld will see MNT_READONLY. 
+ */ + smp_wmb(); + mnt->mnt_flags &= ~MNT_WRITE_HOLD; spin_unlock(&vfsmount_lock); -out: - unlock_mnt_writers(); return ret; } @@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt) { kfree(mnt->mnt_devname); mnt_free_id(mnt); +#ifdef CONFIG_SMP + free_percpu(mnt->mnt_writers); +#endif kmem_cache_free(mnt_cache, mnt); } @@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, static inline void __mntput(struct vfsmount *mnt) { - int cpu; struct super_block *sb = mnt->mnt_sb; - /* - * We don't have to hold all of the locks at the - * same time here because we know that we're the - * last reference to mnt and that no new writers - * can come in. - */ - for_each_possible_cpu(cpu) { - struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); - spin_lock(&cpu_writer->lock); - if (cpu_writer->mnt != mnt) { - spin_unlock(&cpu_writer->lock); - continue; - } - atomic_add(cpu_writer->count, &mnt->__mnt_writers); - cpu_writer->count = 0; - /* - * Might as well do this so that no one - * ever sees the pointer and expects - * it to be valid. - */ - cpu_writer->mnt = NULL; - spin_unlock(&cpu_writer->lock); - } /* * This probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this * happens, the filesystem was probably unable * to make r/w->r/o transitions. */ - WARN_ON(atomic_read(&mnt->__mnt_writers)); + /* + * atomic_dec_and_lock() used to deal with ->mnt_count decrements + * provides barriers, so count_mnt_writers() below is safe. AV + */ + WARN_ON(count_mnt_writers(mnt)); dput(mnt->mnt_root); free_vfsmnt(mnt); deactivate_super(sb); diff --git a/include/linux/mount.h b/include/linux/mount.h index 51f55f903af..ac49c1f8e5c 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -30,7 +30,7 @@ struct mnt_namespace; #define MNT_STRICTATIME 0x80 #define MNT_SHRINKABLE 0x100 -#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ +#define MNT_WRITE_HOLD 0x200 #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ @@ -65,13 +65,22 @@ struct vfsmount { int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; int mnt_ghosts; - /* - * This value is not stable unless all of the mnt_writers[] spinlocks - * are held, and all mnt_writer[]s on this mount have 0 as their ->count - */ - atomic_t __mnt_writers; +#ifdef CONFIG_SMP + int *mnt_writers; +#else + int mnt_writers; +#endif }; +static inline int *get_mnt_writers_ptr(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + return mnt->mnt_writers; +#else + return &mnt->mnt_writers; +#endif +} + static inline struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) -- cgit v1.2.3-70-g09d2 From 96029c4e09ccbd73a6d0ed2b29e80bf2586ad7ef Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Sun, 26 Apr 2009 20:25:55 +1000 Subject: fs: introduce mnt_clone_write This patch speeds up lmbench lat_mmap test by about another 2% after the first patch. Before: avg = 462.286 std = 5.46106 After: avg = 453.12 std = 9.58257 (50 runs of each, stddev gives a reasonable confidence) It does this by introducing mnt_clone_write, which avoids some heavyweight operations of mnt_want_write if called on a vfsmount which we know already has a write count; and mnt_want_write_file, which can call mnt_clone_write if the file is open for write. After these two patches, mnt_want_write and mnt_drop_write go from 7% on the profile down to 1.3% (including mnt_clone_write). 
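Caller-side shape, as a sketch (assuming an already-open struct file *file; the helper bodies themselves appear in the diff below):

    /* Sketch: a write path with an open file prefers the file helper. */
    err = mnt_want_write_file(file);  /* cheap clone if opened for write */
    if (err)
            goto out;
    /* ... modify the inode ... */
    mnt_drop_write(file->f_path.mnt);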
[AV: mnt_want_write_file() should take file alone and derive mnt from it; not only all callers have that form, but that's the only mnt about which we know that it's already held for write if file is opened for write] Cc: Dave Hansen Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/file_table.c | 2 +- fs/inode.c | 2 +- fs/namespace.c | 40 ++++++++++++++++++++++++++++++++++++++++ fs/open.c | 4 ++-- fs/xattr.c | 4 ++-- include/linux/mount.h | 4 ++++ 6 files changed, 50 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/file_table.c b/fs/file_table.c index 54018fe4884..3d66dbcebef 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, */ if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { file_take_write(file); - error = mnt_want_write(mnt); + error = mnt_clone_write(mnt); WARN_ON(error); } return error; diff --git a/fs/inode.c b/fs/inode.c index ca337014ae2..a88baebf77c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1422,7 +1422,7 @@ void file_update_time(struct file *file) if (IS_NOCMTIME(inode)) return; - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) return; diff --git a/fs/namespace.c b/fs/namespace.c index 22ae06ad751..120b8a6b99e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -264,6 +264,46 @@ out: } EXPORT_SYMBOL_GPL(mnt_want_write); +/** + * mnt_clone_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This is effectively like mnt_want_write, except + * it must only be used to take an extra write reference + * on a mountpoint that we already know has a write reference + * on it. This allows some optimisation. + * + * After finished, mnt_drop_write must be called as usual to + * drop the reference. 
+ */ +int mnt_clone_write(struct vfsmount *mnt) +{ + /* superblock may be r/o */ + if (__mnt_is_readonly(mnt)) + return -EROFS; + preempt_disable(); + inc_mnt_writers(mnt); + preempt_enable(); + return 0; +} +EXPORT_SYMBOL_GPL(mnt_clone_write); + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + if (!(file->f_mode & FMODE_WRITE)) + return mnt_want_write(file->f_path.mnt); + else + return mnt_clone_write(file->f_path.mnt); +} +EXPORT_SYMBOL_GPL(mnt_want_write_file); + /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access diff --git a/fs/open.c b/fs/open.c index bdfbf03615a..7200e23d925 100644 --- a/fs/open.c +++ b/fs/open.c @@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) audit_inode(NULL, dentry); - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out_putf; mutex_lock(&inode->i_mutex); @@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) if (!file) goto out; - error = mnt_want_write(file->f_path.mnt); + error = mnt_want_write_file(file); if (error) goto out_fput; dentry = file->f_path.dentry; diff --git a/fs/xattr.c b/fs/xattr.c index d51b8f9db92..1c3d0af59dd 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write(f->f_path.mnt); + error = mnt_want_write_file(f); if (!error) { error = setxattr(dentry, name, value, size, flags); mnt_drop_write(f->f_path.mnt); @@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write(f->f_path.mnt); + error = mnt_want_write_file(f); if (!error) { error = removexattr(dentry, name); mnt_drop_write(f->f_path.mnt); diff --git a/include/linux/mount.h b/include/linux/mount.h index ac49c1f8e5c..5d527536486 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -88,7 +88,11 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt) return mnt; } +struct file; /* forward dec */ + extern int mnt_want_write(struct vfsmount *mnt); +extern int mnt_want_write_file(struct file *file); +extern int mnt_clone_write(struct vfsmount *mnt); extern void mnt_drop_write(struct vfsmount *mnt); extern void mntput_no_expire(struct vfsmount *mnt); extern void mnt_pin(struct vfsmount *mnt); -- cgit v1.2.3-70-g09d2 From 876a9f76abbcb775f8d21cbc99fa161f9e5937f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Apr 2009 18:05:55 +0200 Subject: remove s_async_list Remove the unused s_async_list in the superblock, a leftover of the broken async inode deletion code that leaked into mainline. Having this in the middle of the sync/unmount path is not helpful for the following cleanups. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/super.c | 8 -------- include/linux/fs.h | 5 ----- 2 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index c170551c23f..3d9f117dd2a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include "internal.h" @@ -72,7 +71,6 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); - INIT_LIST_HEAD(&s->s_async_list); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -342,11 +340,6 @@ void generic_shutdown_super(struct super_block *sb) lock_super(sb); sb->s_flags &= ~MS_ACTIVE; - /* - * wait for asynchronous fs operations to finish before going further - */ - async_synchronize_full_domain(&sb->s_async_list); - /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); lock_kernel(); @@ -517,7 +510,6 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - async_synchronize_full_domain(&sb->s_async_list); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); up_read(&sb->s_umount); diff --git a/include/linux/fs.h b/include/linux/fs.h index 03fb2102b8f..36bcff7036e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1372,11 +1372,6 @@ struct super_block { * generic_show_options() */ char *s_options; - - /* - * storage for asynchronous operations - */ - struct list_head s_async_list; }; extern struct timespec current_fs_time(struct super_block *sb); -- cgit v1.2.3-70-g09d2 From 429479f031322a0cc5c921ffb2321a51718dc875 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:50 +0200 Subject: vfs: Make __fsync_super() a static function (version 4) __fsync_super() does the same thing as fsync_super(). So change the only caller to use fsync_super() and make __fsync_super() static. This removes unnecessarily duplicated call to sync_blockdev() and prepares ground for the changes to __fsync_super() in the following patches. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/block_dev.c | 2 +- fs/super.c | 7 +++---- include/linux/fs.h | 1 - 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 931f6b8c4b2..fe47f722761 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -241,7 +241,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb->s_frozen = SB_FREEZE_WRITE; smp_wmb(); - __fsync_super(sb); + fsync_super(sb); sb->s_frozen = SB_FREEZE_TRANS; smp_wmb(); diff --git a/fs/super.c b/fs/super.c index fae91ba38e4..8dbe1ead9dd 100644 --- a/fs/super.c +++ b/fs/super.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL(unlock_super); * device. Takes the superblock lock. Requires a second blkdev * flush by the caller to complete the operation. 
*/ -void __fsync_super(struct super_block *sb) +static int __fsync_super(struct super_block *sb) { sync_inodes_sb(sb, 0); vfs_dq_sync(sb); @@ -300,7 +300,7 @@ void __fsync_super(struct super_block *sb) unlock_super(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + return sync_blockdev(sb->s_bdev); } /* @@ -310,8 +310,7 @@ void __fsync_super(struct super_block *sb) */ int fsync_super(struct super_block *sb) { - __fsync_super(sb); - return sync_blockdev(sb->s_bdev); + return __fsync_super(sb); } EXPORT_SYMBOL_GPL(fsync_super); diff --git a/include/linux/fs.h b/include/linux/fs.h index 36bcff7036e..41a9907f342 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2078,7 +2078,6 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); extern void sync_supers(void); extern void sync_filesystems(int wait); -extern void __fsync_super(struct super_block *sb); extern void emergency_sync(void); extern void emergency_remount(void); extern int do_remount_sb(struct super_block *sb, int flags, -- cgit v1.2.3-70-g09d2 From 5cee5815d1564bbbd505fea86f4550f1efdb5cd0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:51 +0200 Subject: vfs: Make sys_sync() use fsync_super() (version 4) It is unnecessarily fragile to have two places (fsync_super() and do_sync()) doing data integrity sync of the filesystem. Alter __fsync_super() to accommodate needs of both callers and use it. So after this patch __fsync_super() is the only place where we gather all the calls needed to properly send all data on a filesystem to disk. Nice bonus is that we get a complete livelock avoidance and write_supers() is now only used for periodic writeback of superblocks. sync_blockdevs() introduced a couple of patches ago is gone now. [build fixes folded] Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/block_dev.c | 15 ++++++---- fs/fs-writeback.c | 49 -------------------------------- fs/internal.h | 16 +++++------ fs/super.c | 72 +++++++++++++++-------------------------------- fs/sync.c | 31 +++++++------------- include/linux/fs.h | 2 +- include/linux/writeback.h | 1 - 7 files changed, 51 insertions(+), 135 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index fe47f722761..4b6a3b9d01e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -176,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, iov, offset, nr_segs, blkdev_get_blocks, NULL); } +int __sync_blockdev(struct block_device *bdev, int wait) +{ + if (!bdev) + return 0; + if (!wait) + return filemap_flush(bdev->bd_inode->i_mapping); + return filemap_write_and_wait(bdev->bd_inode->i_mapping); +} + /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. 
*/ int sync_blockdev(struct block_device *bdev) { - int ret = 0; - - if (bdev) - ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); - return ret; + return __sync_blockdev(bdev, 1); } EXPORT_SYMBOL(sync_blockdev); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 91013ff7dd5..e0fb2e78959 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -678,55 +678,6 @@ void sync_inodes_sb(struct super_block *sb, int wait) sync_sb_inodes(sb, &wbc); } -/** - * sync_inodes - writes all inodes to disk - * @wait: wait for completion - * - * sync_inodes() goes through each super block's dirty inode list, writes the - * inodes out, waits on the writeout and puts the inodes back on the normal - * list. - * - * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle - * part of the sync functions is that the blockdev "superblock" is processed - * last. This is because the write_inode() function of a typical fs will - * perform no I/O, but will mark buffers in the blockdev mapping as dirty. - * What we want to do is to perform all that dirtying first, and then write - * back all those inode blocks via the blockdev mapping in one sweep. So the - * additional (somewhat redundant) sync_blockdev() calls here are to make - * sure that really happens. Because if we call sync_inodes_sb(wait=1) with - * outstanding dirty inodes, the writeback goes block-at-a-time within the - * filesystem's write_inode(). This is extremely slow. - */ -static void __sync_inodes(int wait) -{ - struct super_block *sb; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) { - sync_inodes_sb(sb, wait); - sync_blockdev(sb->s_bdev); - } - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); -} - -void sync_inodes(int wait) -{ - __sync_inodes(0); - - if (wait) - __sync_inodes(1); -} - /** * write_inode_now - write an inode to disk * @inode: inode to write to disk diff --git a/fs/internal.h b/fs/internal.h index 343a537ab80..dbec3cc2833 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) return sb == blockdev_superblock; } +extern int __sync_blockdev(struct block_device *bdev, int wait); + #else static inline void bdev_cache_init(void) { @@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) { return 0; } + +static inline int __sync_blockdev(struct block_device *bdev, int wait) +{ + return 0; +} #endif /* @@ -71,12 +78,3 @@ extern void chroot_fs_refs(struct path *, struct path *); * file_table.c */ extern void mark_files_ro(struct super_block *); - -/* - * super.c - */ -#ifdef CONFIG_BLOCK -extern void sync_blockdevs(void); -#else -static inline void sync_blockdevs(void) { } -#endif diff --git a/fs/super.c b/fs/super.c index 8dbe1ead9dd..c8ce5ed0424 100644 --- a/fs/super.c +++ b/fs/super.c @@ -284,23 +284,23 @@ EXPORT_SYMBOL(lock_super); EXPORT_SYMBOL(unlock_super); /* - * Write out and wait upon all dirty data associated with this - * superblock. Filesystem data as well as the underlying block - * device. Takes the superblock lock. Requires a second blkdev - * flush by the caller to complete the operation. + * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) + * just dirties buffers with inodes so we have to submit IO for these buffers + * via __sync_blockdev(). 
This also speeds up the wait == 1 case since in that + * case write_inode() functions do sync_dirty_buffer() and thus effectively + * write one block at a time. */ -static int __fsync_super(struct super_block *sb) +static int __fsync_super(struct super_block *sb, int wait) { - sync_inodes_sb(sb, 0); vfs_dq_sync(sb); - sync_inodes_sb(sb, 1); + sync_inodes_sb(sb, wait); lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) sb->s_op->write_super(sb); unlock_super(sb); if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - return sync_blockdev(sb->s_bdev); + sb->s_op->sync_fs(sb, wait); + return __sync_blockdev(sb->s_bdev, wait); } /* @@ -310,7 +310,12 @@ static int __fsync_super(struct super_block *sb) */ int fsync_super(struct super_block *sb) { - return __fsync_super(sb); + int ret; + + ret = __fsync_super(sb, 0); + if (ret < 0) + return ret; + return __fsync_super(sb, 1); } EXPORT_SYMBOL_GPL(fsync_super); @@ -469,20 +474,18 @@ restart: } /* - * Call the ->sync_fs super_op against all filesystems which are r/w and - * which implement it. + * Sync all the data for all the filesystems (called by sys_sync() and + * emergency sync) * * This operation is careful to avoid the livelock which could easily happen - * if two or more filesystems are being continuously dirtied. s_need_sync_fs + * if two or more filesystems are being continuously dirtied. s_need_sync * is used only here. We set it against all filesystems and then clear it as * we sync them. So redirtied filesystems are skipped. * * But if process A is currently running sync_filesystems and then process B - * calls sync_filesystems as well, process B will set all the s_need_sync_fs + * calls sync_filesystems as well, process B will set all the s_need_sync * flags again, which will cause process A to resync everything. Fix that with * a local mutex. - * - * (Fabian) Avoid sync_fs with clean fs & wait mode 0 */ void sync_filesystems(int wait) { @@ -492,25 +495,23 @@ void sync_filesystems(int wait) mutex_lock(&mutex); /* Could be down_interruptible */ spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_op->sync_fs) - continue; if (sb->s_flags & MS_RDONLY) continue; - sb->s_need_sync_fs = 1; + sb->s_need_sync = 1; } restart: list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_need_sync_fs) + if (!sb->s_need_sync) continue; - sb->s_need_sync_fs = 0; + sb->s_need_sync = 0; if (sb->s_flags & MS_RDONLY) continue; /* hm. 
Was remounted r/o meanwhile */ sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); if (sb->s_root) - sb->s_op->sync_fs(sb, wait); + __fsync_super(sb, wait); up_read(&sb->s_umount); /* restart only when sb is no longer on the list */ spin_lock(&sb_lock); @@ -521,33 +522,6 @@ restart: mutex_unlock(&mutex); } -#ifdef CONFIG_BLOCK -/* - * Sync all block devices underlying some superblock - */ -void sync_blockdevs(void) -{ - struct super_block *sb; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_bdev) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - sync_blockdev(sb->s_bdev); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); -} -#endif - /** * get_super - get the superblock of a device * @bdev: device to get the superblock for diff --git a/fs/sync.c b/fs/sync.c index 631fd5aece7..be0798cc33d 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -18,35 +18,24 @@ #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) -/* - * sync everything. Start out by waking pdflush, because that writes back - * all queues in parallel. - */ -static void do_sync(unsigned long wait) +SYSCALL_DEFINE0(sync) { - wakeup_pdflush(0); - sync_inodes(0); /* All mappings, inodes and their blockdevs */ - vfs_dq_sync(NULL); - sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ - sync_supers(); /* Write the superblocks */ - sync_filesystems(0); /* Start syncing the filesystems */ - sync_filesystems(wait); /* Waitingly sync the filesystems */ - sync_blockdevs(); - if (!wait) - printk("Emergency Sync complete\n"); + sync_filesystems(0); + sync_filesystems(1); if (unlikely(laptop_mode)) laptop_sync_completion(); -} - -SYSCALL_DEFINE0(sync) -{ - do_sync(1); return 0; } static void do_sync_work(struct work_struct *work) { - do_sync(0); + /* + * Sync twice to reduce the possibility we skipped some inodes / pages + * because they were temporarily locked + */ + sync_filesystems(0); + sync_filesystems(0); + printk("Emergency Sync complete\n"); kfree(work); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 41a9907f342..f00df653cf2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1321,7 +1321,7 @@ struct super_block { struct rw_semaphore s_umount; struct mutex s_lock; int s_count; - int s_need_sync_fs; + int s_need_sync; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 93445477f86..3224820c851 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -79,7 +79,6 @@ struct writeback_control { void writeback_inodes(struct writeback_control *wbc); int inode_wait(void *); void sync_inodes_sb(struct super_block *, int wait); -void sync_inodes(int wait); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) -- cgit v1.2.3-70-g09d2 From c15c54f5f056ee4819da9fde59a5f2cd45445f23 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:52 +0200 Subject: vfs: Move syncing code from super.c to sync.c (version 4) Move sync_filesystems(), __fsync_super(), fsync_super() from super.c to sync.c where it fits better. 
[build fixes folded] Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/super.c | 85 ------------------------------------------------------ fs/sync.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 3 +- 3 files changed, 86 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index c8ce5ed0424..f822c74f25b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -283,42 +283,6 @@ void unlock_super(struct super_block * sb) EXPORT_SYMBOL(lock_super); EXPORT_SYMBOL(unlock_super); -/* - * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) - * just dirties buffers with inodes so we have to submit IO for these buffers - * via __sync_blockdev(). This also speeds up the wait == 1 case since in that - * case write_inode() functions do sync_dirty_buffer() and thus effectively - * write one block at a time. - */ -static int __fsync_super(struct super_block *sb, int wait) -{ - vfs_dq_sync(sb); - sync_inodes_sb(sb, wait); - lock_super(sb); - if (sb->s_dirt && sb->s_op->write_super) - sb->s_op->write_super(sb); - unlock_super(sb); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); -} - -/* - * Write out and wait upon all dirty data associated with this - * superblock. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_super(struct super_block *sb) -{ - int ret; - - ret = __fsync_super(sb, 0); - if (ret < 0) - return ret; - return __fsync_super(sb, 1); -} -EXPORT_SYMBOL_GPL(fsync_super); - /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill @@ -473,55 +437,6 @@ restart: spin_unlock(&sb_lock); } -/* - * Sync all the data for all the filesystems (called by sys_sync() and - * emergency sync) - * - * This operation is careful to avoid the livelock which could easily happen - * if two or more filesystems are being continuously dirtied. s_need_sync - * is used only here. We set it against all filesystems and then clear it as - * we sync them. So redirtied filesystems are skipped. - * - * But if process A is currently running sync_filesystems and then process B - * calls sync_filesystems as well, process B will set all the s_need_sync - * flags again, which will cause process A to resync everything. Fix that with - * a local mutex. - */ -void sync_filesystems(int wait) -{ - struct super_block *sb; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); /* Could be down_interruptible */ - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_flags & MS_RDONLY) - continue; - sb->s_need_sync = 1; - } - -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_need_sync) - continue; - sb->s_need_sync = 0; - if (sb->s_flags & MS_RDONLY) - continue; /* hm. 
Was remounted r/o meanwhile */ - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - __fsync_super(sb, wait); - up_read(&sb->s_umount); - /* restart only when sb is no longer on the list */ - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - mutex_unlock(&mutex); -} - /** * get_super - get the superblock of a device * @bdev: device to get the superblock for diff --git a/fs/sync.c b/fs/sync.c index be0798cc33d..d5fa7b79982 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -18,6 +18,91 @@ #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) +/* + * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) + * just dirties buffers with inodes so we have to submit IO for these buffers + * via __sync_blockdev(). This also speeds up the wait == 1 case since in that + * case write_inode() functions do sync_dirty_buffer() and thus effectively + * write one block at a time. + */ +static int __fsync_super(struct super_block *sb, int wait) +{ + vfs_dq_sync(sb); + sync_inodes_sb(sb, wait); + lock_super(sb); + if (sb->s_dirt && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, wait); + return __sync_blockdev(sb->s_bdev, wait); +} + +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_super(struct super_block *sb) +{ + int ret; + + ret = __fsync_super(sb, 0); + if (ret < 0) + return ret; + return __fsync_super(sb, 1); +} +EXPORT_SYMBOL_GPL(fsync_super); + +/* + * Sync all the data for all the filesystems (called by sys_sync() and + * emergency sync) + * + * This operation is careful to avoid the livelock which could easily happen + * if two or more filesystems are being continuously dirtied. s_need_sync + * is used only here. We set it against all filesystems and then clear it as + * we sync them. So redirtied filesystems are skipped. + * + * But if process A is currently running sync_filesystems and then process B + * calls sync_filesystems as well, process B will set all the s_need_sync + * flags again, which will cause process A to resync everything. Fix that with + * a local mutex. + */ +static void sync_filesystems(int wait) +{ + struct super_block *sb; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); /* Could be down_interruptible */ + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_flags & MS_RDONLY) + continue; + sb->s_need_sync = 1; + } + +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_need_sync) + continue; + sb->s_need_sync = 0; + if (sb->s_flags & MS_RDONLY) + continue; /* hm. 
Was remounted r/o meanwhile */ + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) + __fsync_super(sb, wait); + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + mutex_unlock(&mutex); +} + SYSCALL_DEFINE0(sync) { sync_filesystems(0); diff --git a/include/linux/fs.h b/include/linux/fs.h index f00df653cf2..d3f7159993c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1942,7 +1942,6 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -extern int fsync_super(struct super_block *); extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} @@ -1959,6 +1958,7 @@ static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) return 0; } #endif +extern int fsync_super(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; extern const struct file_operations bad_sock_fops; @@ -2077,7 +2077,6 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); extern void sync_supers(void); -extern void sync_filesystems(int wait); extern void emergency_sync(void); extern void emergency_remount(void); extern int do_remount_sb(struct super_block *sb, int flags, -- cgit v1.2.3-70-g09d2 From 60b0680fa236ac4e17ce31a50048c9d75f9ec831 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:53 +0200 Subject: vfs: Rename fsync_super() to sync_filesystem() (version 4) Rename the function so that it better describe what it really does. Also remove the unnecessary include of buffer_head.h. 
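Under its new name the entry point keeps the two-pass structure introduced earlier in this series (sketch restating that shape, matching the fs/sync.c hunk below):

    int sync_filesystem(struct super_block *sb)
    {
            int ret;

            ret = __sync_filesystem(sb, 0);  /* kick off writeback, don't wait */
            if (ret < 0)
                    return ret;
            return __sync_filesystem(sb, 1); /* second pass, wait for completion */
    }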
Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/block_dev.c | 4 ++-- fs/cachefiles/interface.c | 2 +- fs/super.c | 5 ++--- fs/sync.c | 12 ++++++------ include/linux/fs.h | 2 +- 5 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 4b6a3b9d01e..3a6d4fb2a32 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -204,7 +204,7 @@ int fsync_bdev(struct block_device *bdev) { struct super_block *sb = get_super(bdev); if (sb) { - int res = fsync_super(sb); + int res = sync_filesystem(sb); drop_super(sb); return res; } @@ -246,7 +246,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb->s_frozen = SB_FREEZE_WRITE; smp_wmb(); - fsync_super(sb); + sync_filesystem(sb); sb->s_frozen = SB_FREEZE_TRANS; smp_wmb(); diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 1e962348d11..dafd484d7bd 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -354,7 +354,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) /* make sure all pages pinned by operations on behalf of the netfs are * written to disc */ cachefiles_begin_secure(cache, &saved_cred); - ret = fsync_super(cache->mnt->mnt_sb); + ret = sync_filesystem(cache->mnt->mnt_sb); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) diff --git a/fs/super.c b/fs/super.c index f822c74f25b..721236e6177 100644 --- a/fs/super.c +++ b/fs/super.c @@ -28,7 +28,6 @@ #include #include #include -#include /* for fsync_super() */ #include #include #include @@ -304,7 +303,7 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); - fsync_super(sb); + sync_filesystem(sb); lock_super(sb); sb->s_flags &= ~MS_ACTIVE; @@ -543,7 +542,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); - fsync_super(sb); + sync_filesystem(sb); /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ diff --git a/fs/sync.c b/fs/sync.c index d5fa7b79982..8aa870a4d40 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -25,7 +25,7 @@ * case write_inode() functions do sync_dirty_buffer() and thus effectively * write one block at a time. */ -static int __fsync_super(struct super_block *sb, int wait) +static int __sync_filesystem(struct super_block *sb, int wait) { vfs_dq_sync(sb); sync_inodes_sb(sb, wait); @@ -43,16 +43,16 @@ static int __fsync_super(struct super_block *sb, int wait) * superblock. Filesystem data as well as the underlying block * device. Takes the superblock lock. 
*/ -int fsync_super(struct super_block *sb) +int sync_filesystem(struct super_block *sb) { int ret; - ret = __fsync_super(sb, 0); + ret = __sync_filesystem(sb, 0); if (ret < 0) return ret; - return __fsync_super(sb, 1); + return __sync_filesystem(sb, 1); } -EXPORT_SYMBOL_GPL(fsync_super); +EXPORT_SYMBOL_GPL(sync_filesystem); /* * Sync all the data for all the filesystems (called by sys_sync() and @@ -92,7 +92,7 @@ restart: spin_unlock(&sb_lock); down_read(&sb->s_umount); if (sb->s_root) - __fsync_super(sb, wait); + __sync_filesystem(sb, wait); up_read(&sb->s_umount); /* restart only when sb is no longer on the list */ spin_lock(&sb_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index d3f7159993c..fb1822bed7c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1958,7 +1958,7 @@ static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) return 0; } #endif -extern int fsync_super(struct super_block *); +extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; extern const struct file_operations bad_sock_fops; -- cgit v1.2.3-70-g09d2 From 850b201b087f5525a0a7278551c2bcd0423c3b26 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Apr 2009 16:43:54 +0200 Subject: quota: cleanup dquota sync functions (version 4) Currently the VFS calls vfs_dq_sync to sync out disk quotas for a given superblock. This is a small wrapper around sync_dquots which for the case of a non-NULL superblock is a small wrapper around quota_sync_sb. Just make quota_sync_sb global (rename it to sync_quota_sb) and call it directly. Also call it directly for those cases in quota.c that have a superblock and leave sync_dquots purely an iterator over sync_quota_sb and remove it's superblock argument. To make this nicer move the check for the lack of a quota_sync method from the callers into sync_quota_sb. 
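The resulting callee, condensed (the method check now lives inside, so callers no longer test s_qcop->quota_sync themselves):

    void sync_quota_sb(struct super_block *sb, int type)
    {
            if (!sb->s_qcop->quota_sync)    /* check moved here from callers */
                    return;
            sb->s_qcop->quota_sync(sb, type);
            /* ... then flush and prune the dquot caches, as in the patch ... */
    }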
[folded build fix from Alexander Beregalov ] Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/quota/quota.c | 25 ++++++++++++++----------- fs/sync.c | 2 +- include/linux/quotaops.h | 11 +++-------- 3 files changed, 18 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index b7f5a468f07..95c5b42384b 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -159,10 +159,14 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, return error; } -static void quota_sync_sb(struct super_block *sb, int type) +#ifdef CONFIG_QUOTA +void sync_quota_sb(struct super_block *sb, int type) { int cnt; + if (!sb->s_qcop->quota_sync) + return; + sb->s_qcop->quota_sync(sb, type); if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) @@ -191,17 +195,13 @@ static void quota_sync_sb(struct super_block *sb, int type) } mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); } +#endif -void sync_dquots(struct super_block *sb, int type) +static void sync_dquots(int type) { + struct super_block *sb; int cnt; - if (sb) { - if (sb->s_qcop->quota_sync) - quota_sync_sb(sb, type); - return; - } - spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { @@ -222,8 +222,8 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && sb->s_qcop->quota_sync) - quota_sync_sb(sb, type); + if (sb->s_root) + sync_quota_sb(sb, type); up_read(&sb->s_umount); spin_lock(&sb_lock); if (__put_super_and_need_restart(sb)) @@ -301,7 +301,10 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return sb->s_qcop->set_dqblk(sb, type, id, &idq); } case Q_SYNC: - sync_dquots(sb, type); + if (sb) + sync_quota_sb(sb, type); + else + sync_dquots(type); return 0; case Q_XQUOTAON: diff --git a/fs/sync.c b/fs/sync.c index 8aa870a4d40..d90ab776455 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -27,7 +27,7 @@ */ static int __sync_filesystem(struct super_block *sb, int wait) { - vfs_dq_sync(sb); + sync_quota_sb(sb, -1); sync_inodes_sb(sb, wait); lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 36353d95c8d..047310fa22f 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -20,7 +20,7 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) /* * declaration of quota_function calls in kernel. 
*/ -void sync_dquots(struct super_block *sb, int type); +void sync_quota_sb(struct super_block *sb, int type); int dquot_initialize(struct inode *inode, int type); int dquot_drop(struct inode *inode); @@ -253,12 +253,7 @@ static inline void vfs_dq_free_inode(struct inode *inode) inode->i_sb->dq_op->free_inode(inode, 1); } -/* The following two functions cannot be called inside a transaction */ -static inline void vfs_dq_sync(struct super_block *sb) -{ - sync_dquots(sb, -1); -} - +/* Cannot be called inside a transaction */ static inline int vfs_dq_off(struct super_block *sb, int remount) { int ret = -ENOSYS; @@ -334,7 +329,7 @@ static inline void vfs_dq_free_inode(struct inode *inode) { } -static inline void vfs_dq_sync(struct super_block *sb) +static inline void sync_quota_sb(struct super_block *sb, int type) { } -- cgit v1.2.3-70-g09d2 From c3f8a40c1cd5591b882497d1d00d43d0e5bb4698 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:55 +0200 Subject: quota: Introduce writeout_quota_sb() (version 4) Introduce this function which just writes all the quota structures but avoids all the syncing and cache pruning work to expose quota structures to userspace. Use this function from __sync_filesystem when wait == 0. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/sync.c | 6 +++++- include/linux/quotaops.h | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/sync.c b/fs/sync.c index d90ab776455..4487b5560dc 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -27,7 +27,11 @@ */ static int __sync_filesystem(struct super_block *sb, int wait) { - sync_quota_sb(sb, -1); + /* Avoid doing twice syncing and cache pruning for quota sync */ + if (!wait) + writeout_quota_sb(sb, -1); + else + sync_quota_sb(sb, -1); sync_inodes_sb(sb, wait); lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 047310fa22f..7bc45759368 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -21,6 +21,11 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) * declaration of quota_function calls in kernel. */ void sync_quota_sb(struct super_block *sb, int type); +static inline void writeout_quota_sb(struct super_block *sb, int type) +{ + if (sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, type); +} int dquot_initialize(struct inode *inode, int type); int dquot_drop(struct inode *inode); @@ -333,6 +338,10 @@ static inline void sync_quota_sb(struct super_block *sb, int type) { } +static inline void writeout_quota_sb(struct super_block *sb, int type) +{ +} + static inline int vfs_dq_off(struct super_block *sb, int remount) { return 0; -- cgit v1.2.3-70-g09d2 From f3da392e9ff14b9f388e74319e6d195848991c07 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 03:32:03 +0400 Subject: dcache: extrace and use d_unlinked() d_unlinked() will be used in middle-term to ban checkpointing when opened but unlinked file is detected, and in long term, to detect such situation and special case on it. 
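The helper simply names an idiom that was open-coded at every call site (condensed from the diff below):

    static inline int d_unlinked(struct dentry *dentry)
    {
            return d_unhashed(dentry) && !IS_ROOT(dentry);
    }

    /* so tests like
     *      if (!IS_ROOT(dentry) && d_unhashed(dentry))
     * become
     *      if (d_unlinked(dentry))
     */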
Signed-off-by: Alexey Dobriyan Signed-off-by: Al Viro --- fs/dcache.c | 7 +++---- fs/namespace.c | 8 ++++---- include/linux/dcache.h | 5 +++++ 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 75659a6fd1f..9e5cd3c3a6b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root, spin_lock(&vfsmount_lock); prepend(&end, &buflen, "\0", 1); - if (!IS_ROOT(dentry) && d_unhashed(dentry) && + if (d_unlinked(dentry) && (prepend(&end, &buflen, " (deleted)", 10) != 0)) goto Elong; @@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) spin_lock(&dcache_lock); prepend(&end, &buflen, "\0", 1); - if (!IS_ROOT(dentry) && d_unhashed(dentry) && + if (d_unlinked(dentry) && (prepend(&end, &buflen, "//deleted", 9) != 0)) goto Elong; if (buflen < 1) @@ -2097,9 +2097,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) read_unlock(¤t->fs->lock); error = -ENOENT; - /* Has the current directory has been unlinked? */ spin_lock(&dcache_lock); - if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { + if (!d_unlinked(pwd.dentry)) { unsigned long len; struct path tmp = root; char * cwd; diff --git a/fs/namespace.c b/fs/namespace.c index 120b8a6b99e..7e537f0393b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1384,7 +1384,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) goto out_unlock; err = -ENOENT; - if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry)) + if (!d_unlinked(path->dentry)) err = attach_recursive_mnt(mnt, path, NULL); out_unlock: mutex_unlock(&path->dentry->d_inode->i_mutex); @@ -1566,7 +1566,7 @@ static int do_move_mount(struct path *path, char *old_name) if (IS_DEADDIR(path->dentry->d_inode)) goto out1; - if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) + if (d_unlinked(path->dentry)) goto out1; err = -EINVAL; @@ -2129,9 +2129,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = -ENOENT; if (IS_DEADDIR(new.dentry->d_inode)) goto out2; - if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry)) + if (d_unlinked(new.dentry)) goto out2; - if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry)) + if (d_unlinked(old.dentry)) goto out2; error = -EBUSY; if (new.mnt == root.mnt || diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 72ce2ae8859..30b93b2a01a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -353,6 +353,11 @@ static inline int d_unhashed(struct dentry *dentry) return (dentry->d_flags & DCACHE_UNHASHED); } +static inline int d_unlinked(struct dentry *dentry) +{ + return d_unhashed(dentry) && !IS_ROOT(dentry); +} + static inline struct dentry *dget_parent(struct dentry *dentry) { struct dentry *ret; -- cgit v1.2.3-70-g09d2 From 62c6943b4b1e818aea60c11c5a68a50785b83119 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 May 2009 03:12:29 -0400 Subject: Trim a bit of crap from fs.h do_remount_sb() is fs/internal.h fodder, fsync_no_super() is long gone. 
Signed-off-by: Al Viro --- fs/internal.h | 5 +++++ include/linux/fs.h | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/internal.h b/fs/internal.h index dbec3cc2833..d55ef562f0b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -78,3 +78,8 @@ extern void chroot_fs_refs(struct path *, struct path *); * file_table.c */ extern void mark_files_ro(struct super_block *); + +/* + * super.c + */ +extern int do_remount_sb(struct super_block *, int, void *, int); diff --git a/include/linux/fs.h b/include/linux/fs.h index fb1822bed7c..e7833ef5d1d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1942,7 +1942,6 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } @@ -2079,8 +2078,6 @@ extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); extern void sync_supers(void); extern void emergency_sync(void); extern void emergency_remount(void); -extern int do_remount_sb(struct super_block *sb, int flags, - void *data, int force); #ifdef CONFIG_BLOCK extern sector_t bmap(struct inode *, sector_t); #endif -- cgit v1.2.3-70-g09d2 From 9fd5746fd3d7838bf6ff991d50f1257057d1156f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 21 May 2009 16:01:00 -0400 Subject: fs: Remove i_cindex from struct inode The only user of the i_cindex element in the inode structure is used is by the firewire drivers. As part of an attempt to slim down the inode structure to save memory --- since a typical Linux system will have hundreds of thousands if not millions of inodes cached, a reduction in the size inode has high leverage. The firewire driver does not need i_cindex in any fast path, so it's simple enough to calculate when it is needed, instead of wasting space in the inode structure. 
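The replacement looks up the index from the char-dev map on demand (condensed from the fs/char_dev.c hunk below); kobj_lookup() recovers the same index that cdev_add() registered, so nothing needs caching in the inode:

    int cdev_index(struct inode *inode)
    {
            int idx;
            struct kobject *kobj;

            kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
            if (!kobj)
                    return -1;
            kobject_put(kobj);      /* only the index is wanted */
            return idx;
    }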
Signed-off-by: "Theodore Ts'o" Cc: krh@redhat.com Cc: stefanr@s5r6.in-berlin.de Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Al Viro --- drivers/ieee1394/dv1394.c | 5 +++-- drivers/ieee1394/ieee1394_core.h | 6 +++++- fs/char_dev.c | 14 +++++++++++++- include/linux/cdev.h | 2 ++ include/linux/fs.h | 1 - 5 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ieee1394/dv1394.c b/drivers/ieee1394/dv1394.c index 823a6297a1a..2cd00b5b45b 100644 --- a/drivers/ieee1394/dv1394.c +++ b/drivers/ieee1394/dv1394.c @@ -1789,12 +1789,13 @@ static int dv1394_open(struct inode *inode, struct file *file) } else { /* look up the card by ID */ unsigned long flags; + int idx = ieee1394_file_to_instance(file); spin_lock_irqsave(&dv1394_cards_lock, flags); if (!list_empty(&dv1394_cards)) { struct video_card *p; list_for_each_entry(p, &dv1394_cards, list) { - if ((p->id) == ieee1394_file_to_instance(file)) { + if ((p->id) == idx) { video = p; break; } @@ -1803,7 +1804,7 @@ static int dv1394_open(struct inode *inode, struct file *file) spin_unlock_irqrestore(&dv1394_cards_lock, flags); if (!video) { - debug_printk("dv1394: OHCI card %d not found", ieee1394_file_to_instance(file)); + debug_printk("dv1394: OHCI card %d not found", idx); return -ENODEV; } diff --git a/drivers/ieee1394/ieee1394_core.h b/drivers/ieee1394/ieee1394_core.h index 21d50f73a21..28b9f58bafd 100644 --- a/drivers/ieee1394/ieee1394_core.h +++ b/drivers/ieee1394/ieee1394_core.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "hosts.h" @@ -155,7 +156,10 @@ void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size, */ static inline unsigned char ieee1394_file_to_instance(struct file *file) { - return file->f_path.dentry->d_inode->i_cindex; + int idx = cdev_index(file->f_path.dentry->d_inode); + if (idx < 0) + idx = 0; + return idx; } extern int hpsb_disable_irm; diff --git a/fs/char_dev.c b/fs/char_dev.c index 38f71222a55..b7c9d5187a7 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -375,7 +375,6 @@ static int chrdev_open(struct inode *inode, struct file *filp) p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; - inode->i_cindex = idx; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) @@ -405,6 +404,18 @@ static int chrdev_open(struct inode *inode, struct file *filp) return ret; } +int cdev_index(struct inode *inode) +{ + int idx; + struct kobject *kobj; + + kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); + if (!kobj) + return -1; + kobject_put(kobj); + return idx; +} + void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); @@ -557,6 +568,7 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(cdev_index); EXPORT_SYMBOL(register_chrdev); EXPORT_SYMBOL(unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/include/linux/cdev.h b/include/linux/cdev.h index fb4591977b0..f389e319a45 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -28,6 +28,8 @@ int cdev_add(struct cdev *, dev_t, unsigned); void cdev_del(struct cdev *); +int cdev_index(struct inode *inode); + void cd_forget(struct inode *); extern struct backing_dev_info directly_mappable_cdev_bdi; diff --git a/include/linux/fs.h b/include/linux/fs.h index e7833ef5d1d..bcd63706db8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -751,7 +751,6 @@ struct inode { struct block_device *i_bdev; struct cdev *i_cdev; }; - int i_cindex; __u32 i_generation; -- cgit 
v1.2.3-70-g09d2 From 28ad0c118b0ed98b042d362acfe0017591921138 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 21 May 2009 16:01:02 -0400 Subject: fs: Rearrange inode structure elements to avoid waste due to padding Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bcd63706db8..d883aa1fc2e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -729,8 +729,8 @@ struct inode { struct timespec i_atime; struct timespec i_mtime; struct timespec i_ctime; - unsigned int i_blkbits; blkcnt_t i_blocks; + unsigned int i_blkbits; unsigned short i_bytes; umode_t i_mode; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ -- cgit v1.2.3-70-g09d2 From 8688b8635266cf98f00c6b0350ea2dbe7c42c321 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 May 2009 05:45:04 -0400 Subject: linux/magic.h: move cramfs magic out of cramfs_fs.h Signed-off-by: Mike Frysinger CC: Alexander Viro Signed-off-by: Al Viro --- include/linux/cramfs_fs.h | 3 +-- include/linux/magic.h | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cramfs_fs.h b/include/linux/cramfs_fs.h index 3be4e5a27d8..6fc2bed368b 100644 --- a/include/linux/cramfs_fs.h +++ b/include/linux/cramfs_fs.h @@ -2,9 +2,8 @@ #define __CRAMFS_H #include +#include -#define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ -#define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ #define CRAMFS_SIGNATURE "Compressed ROMFS" /* diff --git a/include/linux/magic.h b/include/linux/magic.h index 927138cf305..1923327b986 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -6,6 +6,8 @@ #define AFS_SUPER_MAGIC 0x5346414F #define AUTOFS_SUPER_MAGIC 0x0187 #define CODA_SUPER_MAGIC 0x73757245 +#define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ +#define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ #define DEBUGFS_MAGIC 0x64626720 #define SYSFS_MAGIC 0x62656572 #define SECURITYFS_MAGIC 0x73636673 -- cgit v1.2.3-70-g09d2 From d5aacad548db1ff547adf35d0a77eb2a8ed4fe14 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 14:56:44 -0400 Subject: New helper - simple_fsync() writes associated buffers, then does sync_inode() to write the inode itself (and to make it clean). Depends on ->write_inode() honouring the second argument. 
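For illustration, a minimal sketch of how a filesystem wires this up (the struct name is made up; the qnx4 conversion below does exactly this for its file and directory operations):

        const struct file_operations example_file_operations = {
                .read   = do_sync_read,
                .write  = do_sync_write,
                /* sync_mapping_buffers(), then sync_inode() on the inode */
                .fsync  = simple_fsync,
        };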
Signed-off-by: Al Viro --- fs/libfs.c | 25 +++++++++++++++++++++++++ include/linux/fs.h | 2 ++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index 80046ddf506..ddfa89948c3 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include @@ -807,6 +809,29 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, } EXPORT_SYMBOL_GPL(generic_fh_to_parent); +int simple_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, /* metadata-only; caller takes care of data */ + }; + struct inode *inode = dentry->d_inode; + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode(inode, &wbc); + if (ret == 0) + ret = err; + return ret; +} +EXPORT_SYMBOL(simple_fsync); + EXPORT_SYMBOL(dcache_dir_close); EXPORT_SYMBOL(dcache_dir_lseek); EXPORT_SYMBOL(dcache_dir_open); diff --git a/include/linux/fs.h b/include/linux/fs.h index d883aa1fc2e..ede84fa7da5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2345,6 +2345,8 @@ extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); +extern int simple_fsync(struct file *, struct dentry *, int); + #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, struct page *, struct page *); -- cgit v1.2.3-70-g09d2 From 79d25767583e4e086f8309bfd1f502660a64fe7f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 09:30:08 -0400 Subject: Sanitize qnx4 fsync handling * have directory operations use mark_buffer_dirty_inode(), so that sync_mapping_buffers() would get those. * make qnx4_write_inode() honour its last argument. * get rid of insane copies of very ancient "walk the indirect blocks" in qnx4/fsync - they never matched the actual fs layout and, fortunately, had never been called.
Again, all this junk is not needed; ->fsync() should just do sync_mapping_buffers + sync_inode (and if we implement block allocation for qnx4, we'll need to use mark_buffer_dirty_inode() for extent blocks) Signed-off-by: Al Viro --- fs/qnx4/Makefile | 2 +- fs/qnx4/dir.c | 2 +- fs/qnx4/file.c | 2 +- fs/qnx4/fsync.c | 169 ------------------------------------------------ fs/qnx4/inode.c | 38 ++++------- fs/qnx4/namei.c | 4 +- include/linux/qnx4_fs.h | 2 - 7 files changed, 17 insertions(+), 202 deletions(-) delete mode 100644 fs/qnx4/fsync.c (limited to 'include/linux') diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile index 502d7fe98ba..e4d408cc547 100644 --- a/fs/qnx4/Makefile +++ b/fs/qnx4/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_QNX4FS_FS) += qnx4.o -qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o +qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index ea9ffefb48a..ff6c1ba6c4e 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -84,7 +84,7 @@ const struct file_operations qnx4_dir_operations = { .read = generic_read_dir, .readdir = qnx4_readdir, - .fsync = file_fsync, + .fsync = simple_fsync, }; const struct inode_operations qnx4_dir_inode_operations = diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c index 867f42b0203..e7033ea10e2 100644 --- a/fs/qnx4/file.c +++ b/fs/qnx4/file.c @@ -29,7 +29,7 @@ const struct file_operations qnx4_file_operations = #ifdef CONFIG_QNX4FS_RW .write = do_sync_write, .aio_write = generic_file_aio_write, - .fsync = qnx4_sync_file, + .fsync = simple_fsync, #endif }; diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c deleted file mode 100644 index aa3b19544be..00000000000 --- a/fs/qnx4/fsync.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * QNX4 file system, Linux implementation. - * - * Version : 0.1 - * - * Using parts of the xiafs filesystem. - * - * History : - * - * 24-03-1998 by Richard Frowijn : first release. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* - * The functions for qnx4 fs file synchronization. 
- */ - -#ifdef CONFIG_QNX4FS_RW - -static int sync_block(struct inode *inode, unsigned short *block, int wait) -{ - struct buffer_head *bh; - unsigned short tmp; - - if (!*block) - return 0; - tmp = *block; - bh = sb_find_get_block(inode->i_sb, *block); - if (!bh) - return 0; - if (*block != tmp) { - brelse(bh); - return 1; - } - if (wait && buffer_req(bh) && !buffer_uptodate(bh)) { - brelse(bh); - return -1; - } - if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) { - brelse(bh); - return 0; - } - ll_rw_block(WRITE, 1, &bh); - atomic_dec(&bh->b_count); - return 0; -} - -#ifdef WTF -static int sync_iblock(struct inode *inode, unsigned short *iblock, - struct buffer_head **bh, int wait) -{ - int rc; - unsigned short tmp; - - *bh = NULL; - tmp = *iblock; - if (!tmp) - return 0; - rc = sync_block(inode, iblock, wait); - if (rc) - return rc; - *bh = sb_bread(inode->i_sb, tmp); - if (tmp != *iblock) { - brelse(*bh); - *bh = NULL; - return 1; - } - if (!*bh) - return -1; - return 0; -} -#endif - -static int sync_direct(struct inode *inode, int wait) -{ - int i; - int rc, err = 0; - - for (i = 0; i < 7; i++) { - rc = sync_block(inode, - (unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait); - if (rc > 0) - break; - if (rc) - err = rc; - } - return err; -} - -#ifdef WTF -static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait) -{ - int i; - struct buffer_head *ind_bh; - int rc, err = 0; - - rc = sync_iblock(inode, iblock, &ind_bh, wait); - if (rc || !ind_bh) - return rc; - - for (i = 0; i < 512; i++) { - rc = sync_block(inode, - ((unsigned short *) ind_bh->b_data) + i, - wait); - if (rc > 0) - break; - if (rc) - err = rc; - } - brelse(ind_bh); - return err; -} - -static int sync_dindirect(struct inode *inode, unsigned short *diblock, - int wait) -{ - int i; - struct buffer_head *dind_bh; - int rc, err = 0; - - rc = sync_iblock(inode, diblock, &dind_bh, wait); - if (rc || !dind_bh) - return rc; - - for (i = 0; i < 512; i++) { - rc = sync_indirect(inode, - ((unsigned short *) dind_bh->b_data) + i, - wait); - if (rc > 0) - break; - if (rc) - err = rc; - } - brelse(dind_bh); - return err; -} -#endif - -int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused) -{ - struct inode *inode = dentry->d_inode; - int wait, err = 0; - - (void) file; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return -EINVAL; - - lock_kernel(); - for (wait = 0; wait <= 1; wait++) { - err |= sync_direct(inode, wait); - } - err |= qnx4_sync_inode(inode); - unlock_kernel(); - return (err < 0) ? 
-EIO : 0; -} - -#endif diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 95c12fc613f..40712867b8a 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -34,31 +35,6 @@ static const struct super_operations qnx4_sops; #ifdef CONFIG_QNX4FS_RW -int qnx4_sync_inode(struct inode *inode) -{ - int err = 0; -# if 0 - struct buffer_head *bh; - - bh = qnx4_update_inode(inode); - if (bh && buffer_dirty(bh)) - { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - { - printk ("IO error syncing qnx4 inode [%s:%08lx]\n", - inode->i_sb->s_id, inode->i_ino); - err = -1; - } - brelse (bh); - } else if (!bh) { - err = -1; - } -# endif - - return err; -} - static void qnx4_delete_inode(struct inode *inode) { QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); @@ -70,7 +46,7 @@ static void qnx4_delete_inode(struct inode *inode) unlock_kernel(); } -static int qnx4_write_inode(struct inode *inode, int unused) +static int qnx4_write_inode(struct inode *inode, int do_sync) { struct qnx4_inode_entry *raw_inode; int block, ino; @@ -107,6 +83,16 @@ static int qnx4_write_inode(struct inode *inode, int unused) raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); mark_buffer_dirty(bh); + if (do_sync) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk("qnx4: IO error syncing inode [%s:%08x]\n", + inode->i_sb->s_id, ino); + brelse(bh); + unlock_kernel(); + return -EIO; + } + } brelse(bh); unlock_kernel(); return 0; diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 775eed3a408..123270c5376 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -187,7 +187,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry) de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; - mark_buffer_dirty(bh); + mark_buffer_dirty_inode(bh, dir); clear_nlink(inode); mark_inode_dirty(inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; @@ -232,7 +232,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry) de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; - mark_buffer_dirty(bh); + mark_buffer_dirty_inode(bh, dir); dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(dir); inode->i_ctime = dir->i_ctime; diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h index 787d19ea9f4..acbaec3524e 100644 --- a/include/linux/qnx4_fs.h +++ b/include/linux/qnx4_fs.h @@ -126,8 +126,6 @@ extern void qnx4_truncate(struct inode *inode); extern void qnx4_free_inode(struct inode *inode); extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); -extern int qnx4_sync_file(struct file *file, struct dentry *dentry, int); -extern int qnx4_sync_inode(struct inode *inode); static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) { -- cgit v1.2.3-70-g09d2 From 964f5369667b342994fe3f384e9ba41d404ee796 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 09:47:13 -0400 Subject: fs/qnx4: sanitize includes fs-internal parts of qnx4_fs.h taken to fs/qnx4/qnx4.h, includes adjusted, qnx4_fs.h doesn't need unifdef anymore. 
Signed-off-by: Al Viro --- fs/qnx4/bitmap.c | 7 +----- fs/qnx4/dir.c | 7 +----- fs/qnx4/file.c | 3 +-- fs/qnx4/inode.c | 11 +++------ fs/qnx4/namei.c | 9 +------- fs/qnx4/qnx4.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++ fs/qnx4/truncate.c | 6 +---- include/linux/Kbuild | 2 +- include/linux/qnx4_fs.h | 59 ------------------------------------------------- 9 files changed, 66 insertions(+), 95 deletions(-) create mode 100644 fs/qnx4/qnx4.h (limited to 'include/linux') diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c index 8425cf6e962..e1cd061a25f 100644 --- a/fs/qnx4/bitmap.c +++ b/fs/qnx4/bitmap.c @@ -13,14 +13,9 @@ * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . */ -#include -#include -#include -#include -#include -#include #include #include +#include "qnx4.h" #if 0 int qnx4_new_block(struct super_block *sb) diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index ff6c1ba6c4e..003c68f3238 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -11,14 +11,9 @@ * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. */ -#include -#include -#include -#include -#include #include #include - +#include "qnx4.h" static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) { diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c index e7033ea10e2..09b170ac936 100644 --- a/fs/qnx4/file.c +++ b/fs/qnx4/file.c @@ -12,8 +12,7 @@ * 27-06-1998 by Frank Denis : file overwriting. */ -#include -#include +#include "qnx4.h" /* * We have mostly NULL's here: the current defaults are ok for diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 40712867b8a..681df5fcd16 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -13,20 +13,15 @@ */ #include -#include -#include -#include -#include -#include -#include #include +#include #include #include #include #include #include -#include -#include +#include +#include "qnx4.h" #define QNX4_VERSION 4 #define QNX4_BMNAME ".bitmap" diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 123270c5376..5972ed21493 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -12,16 +12,9 @@ * 04-07-1998 by Frank Denis : first step for rmdir/unlink. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include #include #include +#include "qnx4.h" /* diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h new file mode 100644 index 00000000000..9efc089454f --- /dev/null +++ b/fs/qnx4/qnx4.h @@ -0,0 +1,57 @@ +#include +#include + +#define QNX4_DEBUG 0 + +#if QNX4_DEBUG +#define QNX4DEBUG(X) printk X +#else +#define QNX4DEBUG(X) (void) 0 +#endif + +struct qnx4_sb_info { + struct buffer_head *sb_buf; /* superblock buffer */ + struct qnx4_super_block *sb; /* our superblock */ + unsigned int Version; /* may be useful */ + struct qnx4_inode_entry *BitMap; /* useful */ +}; + +struct qnx4_inode_info { + struct qnx4_inode_entry raw; + loff_t mmu_private; + struct inode vfs_inode; +}; + +extern struct inode *qnx4_iget(struct super_block *, unsigned long); +extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); +extern unsigned long qnx4_count_free_blocks(struct super_block *sb); +extern unsigned long qnx4_block_map(struct inode *inode, long iblock); + +extern struct buffer_head *qnx4_bread(struct inode *, int, int); + +extern const struct inode_operations qnx4_file_inode_operations; +extern const struct inode_operations qnx4_dir_inode_operations; +extern const struct file_operations qnx4_file_operations; +extern const struct file_operations qnx4_dir_operations; +extern int qnx4_is_free(struct super_block *sb, long block); +extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); +extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); +extern void qnx4_truncate(struct inode *inode); +extern void qnx4_free_inode(struct inode *inode); +extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); +extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); + +static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) +{ + return container_of(inode, struct qnx4_inode_info, vfs_inode); +} + +static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) +{ + return &qnx4_i(inode)->raw; +} diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c index 6437c1c3d1d..d94d9ee241f 100644 --- a/fs/qnx4/truncate.c +++ b/fs/qnx4/truncate.c @@ -10,12 +10,8 @@ * 30-06-1998 by Frank DENIS : ugly filler. 
*/ -#include -#include -#include -#include #include -#include +#include "qnx4.h" #ifdef CONFIG_QNX4FS_RW diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 3f0eaa397ef..b3afd2219ad 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -135,6 +135,7 @@ header-y += posix_types.h header-y += ppdev.h header-y += prctl.h header-y += qnxtypes.h +header-y += qnx4_fs.h header-y += radeonfb.h header-y += raw.h header-y += resource.h @@ -308,7 +309,6 @@ unifdef-y += poll.h unifdef-y += ppp_defs.h unifdef-y += ppp-comp.h unifdef-y += ptrace.h -unifdef-y += qnx4_fs.h unifdef-y += quota.h unifdef-y += random.h unifdef-y += irqnr.h diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h index acbaec3524e..8b9aee1a9ce 100644 --- a/include/linux/qnx4_fs.h +++ b/include/linux/qnx4_fs.h @@ -85,63 +85,4 @@ struct qnx4_super_block { struct qnx4_inode_entry AltBoot; }; -#ifdef __KERNEL__ - -#define QNX4_DEBUG 0 - -#if QNX4_DEBUG -#define QNX4DEBUG(X) printk X -#else -#define QNX4DEBUG(X) (void) 0 -#endif - -struct qnx4_sb_info { - struct buffer_head *sb_buf; /* superblock buffer */ - struct qnx4_super_block *sb; /* our superblock */ - unsigned int Version; /* may be useful */ - struct qnx4_inode_entry *BitMap; /* useful */ -}; - -struct qnx4_inode_info { - struct qnx4_inode_entry raw; - loff_t mmu_private; - struct inode vfs_inode; -}; - -extern struct inode *qnx4_iget(struct super_block *, unsigned long); -extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); -extern unsigned long qnx4_count_free_blocks(struct super_block *sb); -extern unsigned long qnx4_block_map(struct inode *inode, long iblock); - -extern struct buffer_head *qnx4_bread(struct inode *, int, int); - -extern const struct inode_operations qnx4_file_inode_operations; -extern const struct inode_operations qnx4_dir_inode_operations; -extern const struct file_operations qnx4_file_operations; -extern const struct file_operations qnx4_dir_operations; -extern int qnx4_is_free(struct super_block *sb, long block); -extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); -extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); -extern void qnx4_truncate(struct inode *inode); -extern void qnx4_free_inode(struct inode *inode); -extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); -extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); - -static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) -{ - return container_of(inode, struct qnx4_inode_info, vfs_inode); -} - -static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) -{ - return &qnx4_i(inode)->raw; -} - -#endif /* __KERNEL__ */ - #endif -- cgit v1.2.3-70-g09d2 From ca371c0d7e23d0d0afae65fc83a0e91cf7399573 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 12 Jun 2009 10:33:53 +0300 Subject: memcg: fix page_cgroup fatal error in FLATMEM SLAB is now configured at a very early stage and can be used from init routines. But replacing alloc_bootmem() in the FLAT/DISCONTIGMEM page_cgroup() initialization now breaks the allocation. (The SPARSEMEM case works well: it supports MEMORY_HOTPLUG, and the size of page_cgroup stays reasonable, i.e. < 1 << MAX_ORDER.) This patch revives FLATMEM + memory cgroup by using alloc_bootmem.
In the future, we may stop supporting FLATMEM (if it has no users) or rewrite the flatmem code completely, but that would add more messy code and overhead. Reported-by: Li Zefan Tested-by: Li Zefan Tested-by: Ingo Molnar Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Pekka Enberg --- include/linux/page_cgroup.h | 18 +++++++++++++++++- init/main.c | 5 +++++ mm/page_cgroup.c | 29 ++++++++++------------------- 3 files changed, 32 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 7339c7bf733..13f126c89ae 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -18,7 +18,19 @@ struct page_cgroup { }; void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); -void __init page_cgroup_init(void); + +#ifdef CONFIG_SPARSEMEM +static inline void __init page_cgroup_init_flatmem(void) +{ +} +extern void __init page_cgroup_init(void); +#else +void __init page_cgroup_init_flatmem(void); +static inline void __init page_cgroup_init(void) +{ +} +#endif + struct page_cgroup *lookup_page_cgroup(struct page *page); enum { @@ -87,6 +99,10 @@ static inline void page_cgroup_init(void) { } +static inline void __init page_cgroup_init_flatmem(void) +{ +} + #endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP diff --git a/init/main.c b/init/main.c index 5616661eac0..b3e8f14c568 100644 --- a/init/main.c +++ b/init/main.c @@ -539,6 +539,11 @@ void __init __weak thread_info_cache_init(void) */ static void __init mm_init(void) { + /* + * page_cgroup requires countinous pages as memmap + * and it's bigger than MAX_ORDER unless SPARSEMEM. + */ + page_cgroup_init_flatmem(); mem_init(); kmem_cache_init(); vmalloc_init(); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3dd4a909a1d..11a8a10a390 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -47,8 +47,6 @@ static int __init alloc_node_page_cgroup(int nid) struct page_cgroup *base, *pc; unsigned long table_size; unsigned long start_pfn, nr_pages, index; - struct page *page; - unsigned int order; start_pfn = NODE_DATA(nid)->node_start_pfn; nr_pages = NODE_DATA(nid)->node_spanned_pages; @@ -57,13 +55,11 @@ static int __init alloc_node_page_cgroup(int nid) return 0; table_size = sizeof(struct page_cgroup) * nr_pages; - order = get_order(table_size); - page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order); - if (!page) - page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order); - if (!page) + + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!base) return -ENOMEM; - base = page_address(page); for (index = 0; index < nr_pages; index++) { pc = base + index; __init_page_cgroup(pc, start_pfn + index); @@ -73,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid) return 0; } -void __init page_cgroup_init(void) +void __init page_cgroup_init_flatmem(void) { int nid, fail; @@ -117,16 +113,11 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) if (!section->page_cgroup) { nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - if (slab_is_available()) { - base = kmalloc_node(table_size, - GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), - table_size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); - } + VM_BUG_ON(!slab_is_available()); + base = kmalloc_node(table_size, + GFP_KERNEL | __GFP_NOWARN, nid); + if (!base) + base = vmalloc_node(table_size, nid);
} else { /* * We don't have to allocate page_cgroup again, but -- cgit v1.2.3-70-g09d2 From 9a71af2c3627b379b7c31917a7f6ee0d29bc559b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:46:53 -0600 Subject: module_param: invbool should take a 'bool', not an 'int' It takes an 'int' for historical reasons, and there are only two users: simply switch it over to bool. The other user (uvesafb.c) will get a (harmless-on-x86) warning until the next patch is applied. Cc: Brad Douglas Cc: Michal Januszewski Signed-off-by: Rusty Russell --- drivers/video/aty/aty128fb.c | 2 +- include/linux/moduleparam.h | 2 +- kernel/params.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c index 35e8eb02b9e..e4e4d433b00 100644 --- a/drivers/video/aty/aty128fb.c +++ b/drivers/video/aty/aty128fb.c @@ -354,7 +354,7 @@ static int default_crt_on __devinitdata = 0; static int default_lcd_on __devinitdata = 1; #ifdef CONFIG_MTRR -static int mtrr = 1; +static bool mtrr = true; #endif #ifdef CONFIG_PMAC_BACKLIGHT diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index a4f0b931846..9bbca8e8c19 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -192,7 +192,7 @@ extern int param_get_bool(char *buffer, struct kernel_param *kp); extern int param_set_invbool(const char *val, struct kernel_param *kp); extern int param_get_invbool(char *buffer, struct kernel_param *kp); -#define param_check_invbool(name, p) __param_check(name, p, int) +#define param_check_invbool(name, p) __param_check(name, p, bool) /* Comma-separated array: *nump is set to number they actually specified. */ #define module_param_array_named(name, array, type, nump, perm) \ diff --git a/kernel/params.c b/kernel/params.c index de273ec85bd..023abbf5f89 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -272,13 +272,13 @@ int param_set_invbool(const char *val, struct kernel_param *kp) dummy.arg = &boolval; ret = param_set_bool(val, &dummy); if (ret == 0) - *(int *)kp->arg = !boolval; + *(bool *)kp->arg = !boolval; return ret; } int param_get_invbool(char *buffer, struct kernel_param *kp) { - return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); + return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); } /* We break the rule and mangle the string. */ -- cgit v1.2.3-70-g09d2 From 45fcc70c0b6ee0c508e1fdb5fef735c3546803f4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:46:56 -0600 Subject: module_param: split perm field into flags and perm Impact: cleanup Rather than hack KPARAM_KMALLOCED into the perm field, separate it out. Since the perm field was 32 bits and only needs 16, we don't add bloat. Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 8 ++++++-- kernel/params.c | 9 +++------ 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 9bbca8e8c19..009a5f76876 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -36,9 +36,13 @@ typedef int (*param_set_fn)(const char *val, struct kernel_param *kp); /* Returns length written or -errno. Buffer is 4k (ie. be short!) 
*/ typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp); +/* Flag bits for kernel_param.flags */ +#define KPARAM_KMALLOCED 1 + struct kernel_param { const char *name; - unsigned int perm; + u16 perm; + u16 flags; param_set_fn set; param_get_fn get; union { @@ -88,7 +92,7 @@ struct kparam_array static struct kernel_param __moduleparam_const __param_##name \ __used \ __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ - = { __param_str_##name, perm, set, get, { arg } } + = { __param_str_##name, perm, 0, set, get, { arg } } #define module_param_call(name, set, get, arg, perm) \ __module_param_call(MODULE_PARAM_PREFIX, name, set, get, arg, perm) diff --git a/kernel/params.c b/kernel/params.c index 023abbf5f89..b4660dc13db 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,9 +24,6 @@ #include #include -/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */ -#define KPARAM_KMALLOCED 0x80000000 - #if 0 #define DEBUGP printk #else @@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->perm & KPARAM_KMALLOCED) + if (kp->flags & KPARAM_KMALLOCED) kfree(*(char **)kp->arg); /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - kp->perm |= KPARAM_KMALLOCED; + kp->flags |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); if (!kp->arg) return -ENOMEM; @@ -591,7 +588,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) unsigned int i; for (i = 0; i < num; i++) - if (params[i].perm & KPARAM_KMALLOCED) + if (params[i].flags & KPARAM_KMALLOCED) kfree(*(char **)params[i].arg); } -- cgit v1.2.3-70-g09d2 From d2c123c27db841c6c11a63de9c144823d2b1ba76 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:46:56 -0600 Subject: module_param: add __same_type convenience wrapper for __builtin_types_compatible_p Impact: new API __builtin_types_compatible_p() is a little awkward to use: it takes two types rather than types or variables, and it's just damn long. (typeof(type) == type, so this works on types as well as vars). Signed-off-by: Rusty Russell --- include/linux/compiler.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 37bcb50a4d7..04fb5135b4e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -261,6 +261,11 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # define __section(S) __attribute__ ((__section__(#S))) #endif +/* Are two types/vars the same type (ignoring qualifiers)? */ +#ifndef __same_type +# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +#endif + /* * Prevent the compiler from merging or refetching accesses. The compiler * is also forbidden from reordering successive instances of ACCESS_ONCE(), -- cgit v1.2.3-70-g09d2 From fddd520122953550ec2c8b60e7ca0d0f0d115d97 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:46:57 -0600 Subject: module_param: allow 'bool' module_params to be bool, not just int. Impact: API cleanup For historical reasons, 'bool' parameters must be an int, not a bool. But there are around 600 users, so a conversion seems like useless churn. So we use __same_type() to distinguish, and handle both cases. 
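For illustration, a minimal sketch of the two styles coexisting after this change (the parameter names are made up); __same_type() routes both declarations through the same set/get functions, with KPARAM_ISBOOL recording which one it is:

        #include <linux/moduleparam.h>

        static bool debug;              /* new style: a real bool */
        module_param(debug, bool, 0644);

        static int verbose = 1;         /* historical int style still builds */
        module_param(verbose, bool, 0444);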
Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 32 +++++++++++++++++++++++--------- kernel/params.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 009a5f76876..6547c3cdbc4 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -38,6 +38,7 @@ typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp); /* Flag bits for kernel_param.flags */ #define KPARAM_KMALLOCED 1 +#define KPARAM_ISBOOL 2 struct kernel_param { const char *name; @@ -83,7 +84,7 @@ struct kparam_array parameters. perm sets the visibility in sysfs: 000 means it's not there, read bits mean it's readable, write bits mean it's writable. */ -#define __module_param_call(prefix, name, set, get, arg, perm) \ +#define __module_param_call(prefix, name, set, get, arg, isbool, perm) \ /* Default value instead of permissions? */ \ static int __param_perm_check_##name __attribute__((unused)) = \ BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2)) \ @@ -92,10 +93,13 @@ struct kparam_array static struct kernel_param __moduleparam_const __param_##name \ __used \ __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ - = { __param_str_##name, perm, 0, set, get, { arg } } + = { __param_str_##name, perm, isbool ? KPARAM_ISBOOL : 0, \ + set, get, { arg } } #define module_param_call(name, set, get, arg, perm) \ - __module_param_call(MODULE_PARAM_PREFIX, name, set, get, arg, perm) + __module_param_call(MODULE_PARAM_PREFIX, \ + name, set, get, arg, \ + __same_type(*(arg), bool), perm) /* Helper functions: type is byte, short, ushort, int, uint, long, ulong, charp, bool or invbool, or XXX if you define param_get_XXX, @@ -124,15 +128,16 @@ struct kparam_array #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ __module_param_call("", name, param_set_##type, param_get_##type, \ - &var, perm) + &var, __same_type(var, bool), perm) #endif /* !MODULE */ /* Actually copy string: maxlen param is usually sizeof(string). */ #define module_param_string(name, string, len, perm) \ static const struct kparam_string __param_string_##name \ = { len, string }; \ - module_param_call(name, param_set_copystring, param_get_string, \ - .str = &__param_string_##name, perm); \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + param_set_copystring, param_get_string, \ + .str = &__param_string_##name, 0, perm); \ __MODULE_PARM_TYPE(name, "string") /* Called on module insert or kernel boot */ @@ -190,9 +195,16 @@ extern int param_set_charp(const char *val, struct kernel_param *kp); extern int param_get_charp(char *buffer, struct kernel_param *kp); #define param_check_charp(name, p) __param_check(name, p, char *) +/* For historical reasons "bool" parameters can be (unsigned) "int". 
*/ extern int param_set_bool(const char *val, struct kernel_param *kp); extern int param_get_bool(char *buffer, struct kernel_param *kp); -#define param_check_bool(name, p) __param_check(name, p, int) +#define param_check_bool(name, p) \ + static inline void __check_##name(void) \ + { \ + BUILD_BUG_ON(!__same_type(*(p), bool) && \ + !__same_type(*(p), unsigned int) && \ + !__same_type(*(p), int)); \ + } extern int param_set_invbool(const char *val, struct kernel_param *kp); extern int param_get_invbool(char *buffer, struct kernel_param *kp); @@ -203,8 +215,10 @@ extern int param_get_invbool(char *buffer, struct kernel_param *kp); static const struct kparam_array __param_arr_##name \ = { ARRAY_SIZE(array), nump, param_set_##type, param_get_##type,\ sizeof(array[0]), array }; \ - module_param_call(name, param_array_set, param_array_get, \ - .arr = &__param_arr_##name, perm); \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + param_array_set, param_array_get, \ + .arr = &__param_arr_##name, \ + __same_type(array[0], bool), perm); \ __MODULE_PARM_TYPE(name, "array of " #type) #define module_param_array(name, type, nump, perm) \ diff --git a/kernel/params.c b/kernel/params.c index b4660dc13db..7f6912ced2b 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -238,35 +238,54 @@ int param_get_charp(char *buffer, struct kernel_param *kp) return sprintf(buffer, "%s", *((char **)kp->arg)); } +/* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, struct kernel_param *kp) { + bool v; + /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ switch (val[0]) { case 'y': case 'Y': case '1': - *(int *)kp->arg = 1; - return 0; + v = true; + break; case 'n': case 'N': case '0': - *(int *)kp->arg = 0; - return 0; + v = false; + break; + default: + return -EINVAL; } - return -EINVAL; + + if (kp->flags & KPARAM_ISBOOL) + *(bool *)kp->arg = v; + else + *(int *)kp->arg = v; + return 0; } int param_get_bool(char *buffer, struct kernel_param *kp) { + bool val; + if (kp->flags & KPARAM_ISBOOL) + val = *(bool *)kp->arg; + else + val = *(int *)kp->arg; + /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); + return sprintf(buffer, "%c", val ? 'Y' : 'N'); } +/* This one must be bool. */ int param_set_invbool(const char *val, struct kernel_param *kp) { - int boolval, ret; + int ret; + bool boolval; struct kernel_param dummy; dummy.arg = &boolval; + dummy.flags = KPARAM_ISBOOL; ret = param_set_bool(val, &dummy); if (ret == 0) *(bool *)kp->arg = !boolval; -- cgit v1.2.3-70-g09d2 From ad6561dffa17f17bb68d7207d422c26c381c4313 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:47:03 -0600 Subject: module: trim exception table on init free. It's theoretically possible that there are exception table entries which point into the (freed) init text of modules. These could cause future problems if other modules get loaded into that memory and cause an exception as we'd see the wrong fixup. The only case I know of is kvm-intel.ko (when CONFIG_CC_OPTIMIZE_FOR_SIZE=n). Amerigo fixed this long-standing FIXME in the x86 version, but this patch is more general. This implements trim_init_extable(); most archs are simple since they use the standard lib/extable.c sort code. Alpha and IA64 use relative addresses in their fixups, so their trimming is a slight variation.
Sparc32 is unique; it doesn't seem to define ARCH_HAS_SORT_EXTABLE, yet it defines its own sort_extable() which overrides the one in lib. It doesn't sort, so we have to mark deleted entries instead of actually trimming them. Inspired-by: Amerigo Wang Signed-off-by: Rusty Russell Cc: linux-alpha@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-ia64@vger.kernel.org --- arch/alpha/mm/extable.c | 21 +++++++++++++++++++++ arch/ia64/mm/extable.c | 26 ++++++++++++++++++++++++++ arch/sparc/include/asm/uaccess_32.h | 3 +++ arch/sparc/mm/extable.c | 29 +++++++++++++++++++++++++++++ include/linux/module.h | 1 + kernel/module.c | 1 + lib/extable.c | 21 ++++++++++++++++++++- 7 files changed, 101 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/extable.c b/arch/alpha/mm/extable.c index 62dc379d301..813c9b63c0e 100644 --- a/arch/alpha/mm/extable.c +++ b/arch/alpha/mm/extable.c @@ -48,6 +48,27 @@ void sort_extable(struct exception_table_entry *start, cmp_ex, swap_ex); } +#ifdef CONFIG_MODULES +/* + * Any entry referring to the module init will be at the beginning or + * the end. + */ +void trim_init_extable(struct module *m) +{ + /*trim the beginning*/ + while (m->num_exentries && + within_module_init(ex_to_addr(&m->extable[0]), m)) { + m->extable++; + m->num_exentries--; + } + /*trim the end*/ + while (m->num_exentries && + within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]), + m)) + m->num_exentries--; +} +#endif /* CONFIG_MODULES */ + const struct exception_table_entry * search_extable(const struct exception_table_entry *first, const struct exception_table_entry *last, diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c index 71c50dd8f87..e95d5ad9285 100644 --- a/arch/ia64/mm/extable.c +++ b/arch/ia64/mm/extable.c @@ -53,6 +53,32 @@ void sort_extable (struct exception_table_entry *start, cmp_ex, swap_ex); } +static inline unsigned long ex_to_addr(const struct exception_table_entry *x) +{ + return (unsigned long)&x->insn + x->insn; +} + +#ifdef CONFIG_MODULES +/* + * Any entry referring to the module init will be at the beginning or + * the end. + */ +void trim_init_extable(struct module *m) +{ + /*trim the beginning*/ + while (m->num_exentries && + within_module_init(ex_to_addr(&m->extable[0]), m)) { + m->extable++; + m->num_exentries--; + } + /*trim the end*/ + while (m->num_exentries && + within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]), + m)) + m->num_exentries--; +} +#endif /* CONFIG_MODULES */ + const struct exception_table_entry * search_extable (const struct exception_table_entry *first, const struct exception_table_entry *last, diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index 47d5619d43f..8303ac48103 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -17,6 +17,9 @@ #ifndef __ASSEMBLY__ +#define ARCH_HAS_SORT_EXTABLE +#define ARCH_HAS_SEARCH_EXTABLE + /* Sparc is not segmented, however we need to be able to fool access_ok() * when doing system calls from kernel mode legitimately. * diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c index 16cc28935e3..a61c349448e 100644 --- a/arch/sparc/mm/extable.c +++ b/arch/sparc/mm/extable.c @@ -28,6 +28,10 @@ search_extable(const struct exception_table_entry *start, * word 3: last insn address + 4 bytes * word 4: fixup code address * + * Deleted entries are encoded as: + * word 1: unused + * word 2: -1 + * * See asm/uaccess.h for more details. 
*/ @@ -39,6 +43,10 @@ search_extable(const struct exception_table_entry *start, continue; } + /* A deleted entry; see trim_init_extable */ + if (walk->fixup == -1) + continue; + if (walk->insn == value) return walk; } @@ -57,6 +65,27 @@ search_extable(const struct exception_table_entry *start, return NULL; } +#ifdef CONFIG_MODULES +/* We could memmove them around; easier to mark the trimmed ones. */ +void trim_init_extable(struct module *m) +{ + unsigned int i; + bool range; + + for (i = 0; i < m->num_exentries; i += range ? 2 : 1) { + range = m->extable[i].fixup == 0; + + if (within_module_init(m->extable[i].insn, m)) { + m->extable[i].fixup = -1; + if (range) + m->extable[i+1].fixup = -1; + } + if (range) + i++; + } +} +#endif /* CONFIG_MODULES */ + /* Special extable search, which handles ranges. Returns fixup */ unsigned long search_extables_range(unsigned long addr, unsigned long *g2) { diff --git a/include/linux/module.h b/include/linux/module.h index a8f2c0aa4c3..a7bc6e7b43a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -77,6 +77,7 @@ search_extable(const struct exception_table_entry *first, void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish); void sort_main_extable(void); +void trim_init_extable(struct module *m); #ifdef MODULE #define MODULE_GENERIC_TABLE(gtype,name) \ diff --git a/kernel/module.c b/kernel/module.c index 35f7de00bf0..e4ab36ce767 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2455,6 +2455,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); + trim_init_extable(mod); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; diff --git a/lib/extable.c b/lib/extable.c index 179c0874559..4cac81ec225 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -39,7 +39,26 @@ void sort_extable(struct exception_table_entry *start, sort(start, finish - start, sizeof(struct exception_table_entry), cmp_ex, NULL); } -#endif + +#ifdef CONFIG_MODULES +/* + * If the exception table is sorted, any referring to the module init + * will be at the beginning or the end. + */ +void trim_init_extable(struct module *m) +{ + /*trim the beginning*/ + while (m->num_exentries && within_module_init(m->extable[0].insn, m)) { + m->extable++; + m->num_exentries--; + } + /*trim the end*/ + while (m->num_exentries && + within_module_init(m->extable[m->num_exentries-1].insn, m)) + m->num_exentries--; +} +#endif /* CONFIG_MODULES */ +#endif /* !ARCH_HAS_SORT_EXTABLE */ #ifndef ARCH_HAS_SEARCH_EXTABLE /* -- cgit v1.2.3-70-g09d2 From f1a3c979059b2033d0b1cc4f9ee5c90bf92b5f94 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 17:56:09 +0200 Subject: perf_counter: PERF_TYPE_HW_CACHE is a hardware counter too is_software_counter() was missing the new HW_CACHE category. ( This could have caused some counter scheduling artifacts with mixed sw and hw counters and counter groups. 
) Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6e133954e2e..7c4f32f6ae1 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -621,7 +621,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, int nmi, static inline int is_software_counter(struct perf_counter *counter) { return (counter->attr.type != PERF_TYPE_RAW) && - (counter->attr.type != PERF_TYPE_HARDWARE); + (counter->attr.type != PERF_TYPE_HARDWARE) && + (counter->attr.type != PERF_TYPE_HW_CACHE); } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); -- cgit v1.2.3-70-g09d2 From 974802eaa1afdc87e00821df7020a2b3c6fee623 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 Jun 2009 12:46:55 +0200 Subject: perf_counter: Add forward/backward attribute ABI compatibility Provide a means of extending the perf_counter_attr in a 'natural' way. We allow growing the structure by appending fields at the end by specifying the full structure size inside it. When a new kernel sees a smaller (old) structure, it will zero-pad the tail. When an old kernel sees a larger (new) structure, it will verify the tail consists of 0s, otherwise fail. If we fail due to a size-mismatch, we return -E2BIG and write the kernel's native attribute size back into the provided structure. Furthermore, add some attribute verification, so that we'll fail counter creation when unknown bits are present (PERF_SAMPLE, PERF_FORMAT, or in the __reserved fields). (This ABI detail is introduced while keeping the existing syscall ABI.) Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 19 ++++++++-- include/linux/syscalls.h | 2 +- kernel/perf_counter.c | 89 ++++++++++++++++++++++++++++++++++++++++++-- tools/perf/perf.h | 5 ++- 4 files changed, 105 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7c4f32f6ae1..1b3118a1023 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -120,6 +120,8 @@ enum perf_counter_sample_format { PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, + + PERF_SAMPLE_MAX = 1U << 9, /* non-ABI */ }; /* @@ -131,17 +133,26 @@ enum perf_counter_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, + + PERF_FORMAT_MAX = 1U << 3, /* non-ABI */ }; +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ + /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_attr { + /* * Major type: hardware/software/tracepoint/etc. */ __u32 type; - __u32 __reserved_1; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; /* * Type specific configuration information. 
@@ -168,12 +179,12 @@ struct perf_counter_attr { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_2 : 53; + __reserved_1 : 53; __u32 wakeup_events; /* wakeup every n events */ - __u32 __reserved_3; + __u32 __reserved_2; - __u64 __reserved_4; + __u64 __reserved_3; }; /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c6c84ad8bd7..418d90f5eff 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( - const struct perf_counter_attr __user *attr_uptr, + struct perf_counter_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 663bbe01505..29b685f551a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3584,6 +3584,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, case PERF_TYPE_TRACEPOINT: pmu = tp_perf_counter_init(counter); break; + + default: + break; } done: err = 0; @@ -3610,6 +3613,85 @@ done: return counter; } +static int perf_copy_attr(struct perf_counter_attr __user *uattr, + struct perf_counter_attr *attr) +{ + int ret; + u32 size; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0. + */ + if (size > sizeof(*attr)) { + unsigned long val; + unsigned long __user *addr; + unsigned long __user *end; + + addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), + sizeof(unsigned long)); + end = PTR_ALIGN((void __user *)uattr + size, + sizeof(unsigned long)); + + for (; addr < end; addr += sizeof(unsigned long)) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. 
+ */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * @@ -3619,7 +3701,7 @@ done: * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_attr __user *, attr_uptr, + struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; @@ -3635,8 +3717,9 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) - return -EFAULT; + ret = perf_copy_attr(attr_uptr, &attr); + if (ret) + return ret; if (!attr.exclude_kernel) { if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index af0a5046d74..87a1aca4a42 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -53,11 +53,12 @@ static inline unsigned long long rdclock(void) _min1 < _min2 ? _min1 : _min2; }) static inline int -sys_perf_counter_open(struct perf_counter_attr *attr_uptr, +sys_perf_counter_open(struct perf_counter_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { - return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu, + attr->size = sizeof(*attr); + return syscall(__NR_perf_counter_open, attr, pid, cpu, group_fd, flags); } -- cgit v1.2.3-70-g09d2 From 20f77f5654042cf484d8964b618faf9d620f639b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:16:33 -0600 Subject: virtio: fix obsolete documentation on probe function Signed-off-by: Rusty Russell --- include/linux/virtio.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 06005fa9e98..9410394bbf9 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -99,8 +99,7 @@ void unregister_virtio_device(struct virtio_device *dev); * @id_table: the ids serviced by this driver. * @feature_table: an array of feature numbers supported by this device. * @feature_table_size: number of entries in the feature table array. - * @probe: the function to call when a device is found. Returns a token for - * remove, or PTR_ERR(). + * @probe: the function to call when a device is found. Returns 0 or -errno. * @remove: the function when a device is removed. * @config_changed: optional function to call when the device configuration * changes; may be called in interrupt context. -- cgit v1.2.3-70-g09d2 From 9499f5e7ed5224c40706f0cec6542a9916bc7606 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:16:35 -0600 Subject: virtio: add names to virtqueue struct, mapping from devices to queues. Add a linked list of all virtqueues for a virtio device: this helps with debugging and is also needed for an upcoming interface change. Also, add a "name" field for clearer debug messages. 
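For illustration, the calling convention this introduces, taken from the virtio_blk hunk below ("requests" is the queue's debug name):

        vblk->vq = vdev->config->find_vq(vdev, 0, blk_done, "requests");
        if (IS_ERR(vblk->vq))
                err = PTR_ERR(vblk->vq);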
Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 2 +- drivers/char/hw_random/virtio-rng.c | 2 +- drivers/char/virtio_console.c | 4 ++-- drivers/lguest/lguest_device.c | 5 +++-- drivers/net/virtio_net.c | 6 +++--- drivers/s390/kvm/kvm_virtio.c | 7 ++++--- drivers/virtio/virtio.c | 2 ++ drivers/virtio/virtio_balloon.c | 4 ++-- drivers/virtio/virtio_pci.c | 5 +++-- drivers/virtio/virtio_ring.c | 27 ++++++++++++++++++++------- include/linux/virtio.h | 12 ++++++++---- include/linux/virtio_config.h | 6 ++++-- include/linux/virtio_ring.h | 3 ++- net/9p/trans_virtio.c | 2 +- 14 files changed, 56 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c0facaa55cf..db55a50d9f6 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -288,7 +288,7 @@ static int virtblk_probe(struct virtio_device *vdev) sg_init_table(vblk->sg, vblk->sg_elems); /* We expect one virtqueue, for output. */ - vblk->vq = vdev->config->find_vq(vdev, 0, blk_done); + vblk->vq = vdev->config->find_vq(vdev, 0, blk_done, "requests"); if (IS_ERR(vblk->vq)) { err = PTR_ERR(vblk->vq); goto out_free_vblk; diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 86e83f88313..2aeafcea95f 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -94,7 +94,7 @@ static int virtrng_probe(struct virtio_device *vdev) int err; /* We expect a single virtqueue. */ - vq = vdev->config->find_vq(vdev, 0, random_recv_done); + vq = vdev->config->find_vq(vdev, 0, random_recv_done, "input"); if (IS_ERR(vq)) return PTR_ERR(vq); diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index ff6f5a4b58f..58684e4a081 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -202,13 +202,13 @@ static int __devinit virtcons_probe(struct virtio_device *dev) /* Find the input queue. */ /* FIXME: This is why we want to wean off hvc: we do nothing * when input comes in. */ - in_vq = vdev->config->find_vq(vdev, 0, hvc_handle_input); + in_vq = vdev->config->find_vq(vdev, 0, hvc_handle_input, "input"); if (IS_ERR(in_vq)) { err = PTR_ERR(in_vq); goto free; } - out_vq = vdev->config->find_vq(vdev, 1, NULL); + out_vq = vdev->config->find_vq(vdev, 1, NULL, "output"); if (IS_ERR(out_vq)) { err = PTR_ERR(out_vq); goto free_in_vq; diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index df44d962626..4babed899d5 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -228,7 +228,8 @@ extern void lguest_setup_irq(unsigned int irq); * function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, - void (*callback)(struct virtqueue *vq)) + void (*callback)(struct virtqueue *vq), + const char *name) { struct lguest_device *ldev = to_lgdev(vdev); struct lguest_vq_info *lvq; @@ -263,7 +264,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* OK, tell virtio_ring.c to set up a virtqueue now we know its size * and we've got a pointer to its pages. 
*/ vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, - vdev, lvq->pages, lg_notify, callback); + vdev, lvq->pages, lg_notify, callback, name); if (!vq) { err = -ENOMEM; goto unmap; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4d1d47953fc..be3b734ff5a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -906,20 +906,20 @@ static int virtnet_probe(struct virtio_device *vdev) vi->mergeable_rx_bufs = true; /* We expect two virtqueues, receive then send. */ - vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done); + vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done, "input"); if (IS_ERR(vi->rvq)) { err = PTR_ERR(vi->rvq); goto free; } - vi->svq = vdev->config->find_vq(vdev, 1, skb_xmit_done); + vi->svq = vdev->config->find_vq(vdev, 1, skb_xmit_done, "output"); if (IS_ERR(vi->svq)) { err = PTR_ERR(vi->svq); goto free_recv; } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { - vi->cvq = vdev->config->find_vq(vdev, 2, NULL); + vi->cvq = vdev->config->find_vq(vdev, 2, NULL, "control"); if (IS_ERR(vi->cvq)) { err = PTR_ERR(vi->svq); goto free_send; diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index cbc8566fab7..ba8995fbf04 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -173,8 +173,9 @@ static void kvm_notify(struct virtqueue *vq) * this device and sets it up. */ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, - unsigned index, - void (*callback)(struct virtqueue *vq)) + unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name) { struct kvm_device *kdev = to_kvmdev(vdev); struct kvm_vqconfig *config; @@ -194,7 +195,7 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, vq = vring_new_virtqueue(config->num, KVM_S390_VIRTIO_RING_ALIGN, vdev, (void *) config->address, - kvm_notify, callback); + kvm_notify, callback, name); if (!vq) { err = -ENOMEM; goto unmap; diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 6b681036486..3f52c767dfe 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -186,6 +186,8 @@ int register_virtio_device(struct virtio_device *dev) /* Acknowledge that we've seen the device. */ add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + INIT_LIST_HEAD(&dev->vqs); + /* device_register() causes the bus infrastructure to look for a * matching driver. */ err = device_register(&dev->dev); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 9c76a061a04..0fa73b4d18b 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -218,13 +218,13 @@ static int virtballoon_probe(struct virtio_device *vdev) vb->vdev = vdev; /* We expect two virtqueues. 
*/ - vb->inflate_vq = vdev->config->find_vq(vdev, 0, balloon_ack); + vb->inflate_vq = vdev->config->find_vq(vdev, 0, balloon_ack, "inflate"); if (IS_ERR(vb->inflate_vq)) { err = PTR_ERR(vb->inflate_vq); goto out_free_vb; } - vb->deflate_vq = vdev->config->find_vq(vdev, 1, balloon_ack); + vb->deflate_vq = vdev->config->find_vq(vdev, 1, balloon_ack, "deflate"); if (IS_ERR(vb->deflate_vq)) { err = PTR_ERR(vb->deflate_vq); goto out_del_inflate_vq; diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 330aacbdec1..be4047abd5b 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -208,7 +208,8 @@ static irqreturn_t vp_interrupt(int irq, void *opaque) /* the config->find_vq() implementation */ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, - void (*callback)(struct virtqueue *vq)) + void (*callback)(struct virtqueue *vq), + const char *name) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; @@ -247,7 +248,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, /* create the vring */ vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN, - vdev, info->queue, vp_notify, callback); + vdev, info->queue, vp_notify, callback, name); if (!vq) { err = -ENOMEM; goto out_activate_queue; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 5c52369ab9b..579fa693d5d 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -23,21 +23,30 @@ #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ -#define BAD_RING(_vq, fmt...) \ - do { dev_err(&(_vq)->vq.vdev->dev, fmt); BUG(); } while(0) +#define BAD_RING(_vq, fmt, args...) \ + do { \ + dev_err(&(_vq)->vq.vdev->dev, \ + "%s:"fmt, (_vq)->vq.name, ##args); \ + BUG(); \ + } while (0) /* Caller is supposed to guarantee no reentry. */ #define START_USE(_vq) \ do { \ if ((_vq)->in_use) \ - panic("in_use = %i\n", (_vq)->in_use); \ + panic("%s:in_use = %i\n", \ + (_vq)->vq.name, (_vq)->in_use); \ (_vq)->in_use = __LINE__; \ mb(); \ - } while(0) + } while (0) #define END_USE(_vq) \ do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; mb(); } while(0) #else -#define BAD_RING(_vq, fmt...) \ - do { dev_err(&_vq->vq.vdev->dev, fmt); (_vq)->broken = true; } while(0) +#define BAD_RING(_vq, fmt, args...) 
\ + do { \ + dev_err(&_vq->vq.vdev->dev, \ + "%s:"fmt, (_vq)->vq.name, ##args); \ + (_vq)->broken = true; \ + } while (0) #define START_USE(vq) #define END_USE(vq) #endif @@ -284,7 +293,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *), - void (*callback)(struct virtqueue *)) + void (*callback)(struct virtqueue *), + const char *name) { struct vring_virtqueue *vq; unsigned int i; @@ -303,10 +313,12 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.vq_ops = &vring_vq_ops; + vq->vq.name = name; vq->notify = notify; vq->broken = false; vq->last_used_idx = 0; vq->num_added = 0; + list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG vq->in_use = false; #endif @@ -327,6 +339,7 @@ EXPORT_SYMBOL_GPL(vring_new_virtqueue); void vring_del_virtqueue(struct virtqueue *vq) { + list_del(&vq->list); kfree(to_vvq(vq)); } EXPORT_SYMBOL_GPL(vring_del_virtqueue); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 9410394bbf9..4fca4f5440b 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -10,14 +10,17 @@ /** * virtqueue - a queue to register buffers for sending or receiving. + * @list: the chain of virtqueues for this device * @callback: the function to call when buffers are consumed (can be NULL). + * @name: the name of this virtqueue (mainly for debugging) * @vdev: the virtio device this queue was created for. * @vq_ops: the operations for this virtqueue (see below). * @priv: a pointer for the virtqueue implementation to use. */ -struct virtqueue -{ +struct virtqueue { + struct list_head list; void (*callback)(struct virtqueue *vq); + const char *name; struct virtio_device *vdev; struct virtqueue_ops *vq_ops; void *priv; @@ -76,15 +79,16 @@ struct virtqueue_ops { * @dev: underlying device. * @id: the device type identification (used to match it with a driver). * @config: the configuration ops for this device. + * @vqs: the list of virtqueues for this device. * @features: the features supported by both driver and device. * @priv: private pointer for the driver's use. */ -struct virtio_device -{ +struct virtio_device { int index; struct device dev; struct virtio_device_id id; struct virtio_config_ops *config; + struct list_head vqs; /* Note that this is a Linux set_bit-style bitmap. */ unsigned long features[1]; void *priv; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index bf8ec283b23..9fae274751e 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -55,7 +55,8 @@ * @find_vq: find a virtqueue and instantiate it. * vdev: the virtio_device * index: the 0-based virtqueue number in case there's more than one. - * callback: the virqtueue callback + * callback: the virtqueue callback + * name: the virtqueue name (mainly for debugging) * Returns the new virtqueue or ERR_PTR() (eg. -ENOENT). * @del_vq: free a virtqueue found by find_vq(). * @get_features: get the array of feature bits for this device. 
@@ -77,7 +78,8 @@ struct virtio_config_ops void (*reset)(struct virtio_device *vdev); struct virtqueue *(*find_vq)(struct virtio_device *vdev, unsigned index, - void (*callback)(struct virtqueue *)); + void (*callback)(struct virtqueue *), + const char *name); void (*del_vq)(struct virtqueue *vq); u32 (*get_features)(struct virtio_device *vdev); void (*finalize_features)(struct virtio_device *vdev); diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 71e03722fb5..166c519689d 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -119,7 +119,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *vq), - void (*callback)(struct virtqueue *vq)); + void (*callback)(struct virtqueue *vq), + const char *name); void vring_del_virtqueue(struct virtqueue *vq); /* Filter out transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index bb8579a141a..ab8791f9aba 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -246,7 +246,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->vdev = vdev; /* We expect one virtqueue, for requests. */ - chan->vq = vdev->config->find_vq(vdev, 0, req_done); + chan->vq = vdev->config->find_vq(vdev, 0, req_done, "requests"); if (IS_ERR(chan->vq)) { err = PTR_ERR(chan->vq); goto out_free_vq; -- cgit v1.2.3-70-g09d2 From d2a7ddda9ffb1c8961abff6714b0f1eb925c120f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 12 Jun 2009 22:16:36 -0600 Subject: virtio: find_vqs/del_vqs virtio operations This replaces find_vq/del_vq with find_vqs/del_vqs virtio operations, and updates all drivers. This is needed for MSI support, because MSI needs to know the total number of vectors upfront. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell (+ lguest/9p compile fixes) --- drivers/block/virtio_blk.c | 6 ++--- drivers/char/hw_random/virtio-rng.c | 6 ++--- drivers/char/virtio_console.c | 26 +++++++++----------- drivers/lguest/lguest_device.c | 36 ++++++++++++++++++++++++++-- drivers/net/virtio_net.c | 45 ++++++++++++++--------------------- drivers/s390/kvm/kvm_virtio.c | 36 ++++++++++++++++++++++++++-- drivers/virtio/virtio_balloon.c | 27 +++++++++------------ drivers/virtio/virtio_pci.c | 37 +++++++++++++++++++++++------ include/linux/virtio_config.h | 47 ++++++++++++++++++++++++++++--------- net/9p/trans_virtio.c | 6 ++--- 10 files changed, 183 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index db55a50d9f6..07d8e595e51 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -288,7 +288,7 @@ static int virtblk_probe(struct virtio_device *vdev) sg_init_table(vblk->sg, vblk->sg_elems); /* We expect one virtqueue, for output. 
*/ - vblk->vq = vdev->config->find_vq(vdev, 0, blk_done, "requests"); + vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); if (IS_ERR(vblk->vq)) { err = PTR_ERR(vblk->vq); goto out_free_vblk; @@ -388,7 +388,7 @@ out_put_disk: out_mempool: mempool_destroy(vblk->pool); out_free_vq: - vdev->config->del_vq(vblk->vq); + vdev->config->del_vqs(vdev); out_free_vblk: kfree(vblk); out: @@ -409,7 +409,7 @@ static void virtblk_remove(struct virtio_device *vdev) blk_cleanup_queue(vblk->disk->queue); put_disk(vblk->disk); mempool_destroy(vblk->pool); - vdev->config->del_vq(vblk->vq); + vdev->config->del_vqs(vdev); kfree(vblk); } diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2aeafcea95f..f2041fede82 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -94,13 +94,13 @@ static int virtrng_probe(struct virtio_device *vdev) int err; /* We expect a single virtqueue. */ - vq = vdev->config->find_vq(vdev, 0, random_recv_done, "input"); + vq = virtio_find_single_vq(vdev, random_recv_done, "input"); if (IS_ERR(vq)) return PTR_ERR(vq); err = hwrng_register(&virtio_hwrng); if (err) { - vdev->config->del_vq(vq); + vdev->config->del_vqs(vdev); return err; } @@ -112,7 +112,7 @@ static void virtrng_remove(struct virtio_device *vdev) { vdev->config->reset(vdev); hwrng_unregister(&virtio_hwrng); - vdev->config->del_vq(vq); + vdev->config->del_vqs(vdev); } static struct virtio_device_id id_table[] = { diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 58684e4a081..c74dacfa679 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -188,6 +188,9 @@ static void hvc_handle_input(struct virtqueue *vq) * Finally we put our input buffer in the input queue, ready to receive. */ static int __devinit virtcons_probe(struct virtio_device *dev) { + vq_callback_t *callbacks[] = { hvc_handle_input, NULL}; + const char *names[] = { "input", "output" }; + struct virtqueue *vqs[2]; int err; vdev = dev; @@ -199,20 +202,15 @@ static int __devinit virtcons_probe(struct virtio_device *dev) goto fail; } - /* Find the input queue. */ + /* Find the queues. */ /* FIXME: This is why we want to wean off hvc: we do nothing * when input comes in. */ - in_vq = vdev->config->find_vq(vdev, 0, hvc_handle_input, "input"); - if (IS_ERR(in_vq)) { - err = PTR_ERR(in_vq); + err = vdev->config->find_vqs(vdev, 2, vqs, callbacks, names); + if (err) goto free; - } - out_vq = vdev->config->find_vq(vdev, 1, NULL, "output"); - if (IS_ERR(out_vq)) { - err = PTR_ERR(out_vq); - goto free_in_vq; - } + in_vq = vqs[0]; + out_vq = vqs[1]; /* Start using the new console output. */ virtio_cons.get_chars = get_chars; @@ -233,17 +231,15 @@ static int __devinit virtcons_probe(struct virtio_device *dev) hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE); if (IS_ERR(hvc)) { err = PTR_ERR(hvc); - goto free_out_vq; + goto free_vqs; } /* Register the input buffer the first time. 
*/ add_inbuf(); return 0; -free_out_vq: - vdev->config->del_vq(out_vq); -free_in_vq: - vdev->config->del_vq(in_vq); +free_vqs: + vdev->config->del_vqs(vdev); free: kfree(inbuf); fail: diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 4babed899d5..e082cdac88b 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -313,6 +313,38 @@ static void lg_del_vq(struct virtqueue *vq) kfree(lvq); } +static void lg_del_vqs(struct virtio_device *vdev) +{ + struct virtqueue *vq, *n; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) + lg_del_vq(vq); +} + +static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + struct lguest_device *ldev = to_lgdev(vdev); + int i; + + /* We must have this many virtqueues. */ + if (nvqs > ldev->desc->num_vq) + return -ENOENT; + + for (i = 0; i < nvqs; ++i) { + vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]); + if (IS_ERR(vqs[i])) + goto error; + } + return 0; + +error: + lg_del_vqs(vdev); + return PTR_ERR(vqs[i]); +} + /* The ops structure which hooks everything together. */ static struct virtio_config_ops lguest_config_ops = { .get_features = lg_get_features, @@ -322,8 +354,8 @@ static struct virtio_config_ops lguest_config_ops = { .get_status = lg_get_status, .set_status = lg_set_status, .reset = lg_reset, - .find_vq = lg_find_vq, - .del_vq = lg_del_vq, + .find_vqs = lg_find_vqs, + .del_vqs = lg_del_vqs, }; /* The root device for the lguest virtio devices. This makes them appear as diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index be3b734ff5a..7fa620ddeb2 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -845,6 +845,10 @@ static int virtnet_probe(struct virtio_device *vdev) int err; struct net_device *dev; struct virtnet_info *vi; + struct virtqueue *vqs[3]; + vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL}; + const char *names[] = { "input", "output", "control" }; + int nvqs; /* Allocate ourselves a network device with room for our info */ dev = alloc_etherdev(sizeof(struct virtnet_info)); @@ -905,25 +909,19 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) vi->mergeable_rx_bufs = true; - /* We expect two virtqueues, receive then send. */ - vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done, "input"); - if (IS_ERR(vi->rvq)) { - err = PTR_ERR(vi->rvq); + /* We expect two virtqueues, receive then send, + * and optionally control. */ + nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2; + + err = vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names); + if (err) goto free; - } - vi->svq = vdev->config->find_vq(vdev, 1, skb_xmit_done, "output"); - if (IS_ERR(vi->svq)) { - err = PTR_ERR(vi->svq); - goto free_recv; - } + vi->rvq = vqs[0]; + vi->svq = vqs[1]; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { - vi->cvq = vdev->config->find_vq(vdev, 2, NULL, "control"); - if (IS_ERR(vi->cvq)) { - err = PTR_ERR(vi->svq); - goto free_send; - } + vi->cvq = vqs[2]; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) dev->features |= NETIF_F_HW_VLAN_FILTER; @@ -941,7 +939,7 @@ static int virtnet_probe(struct virtio_device *vdev) err = register_netdev(dev); if (err) { pr_debug("virtio_net: registering device failed\n"); - goto free_ctrl; + goto free_vqs; } /* Last of all, set up some receive buffers. 
*/ @@ -962,13 +960,8 @@ static int virtnet_probe(struct virtio_device *vdev) unregister: unregister_netdev(dev); -free_ctrl: - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) - vdev->config->del_vq(vi->cvq); -free_send: - vdev->config->del_vq(vi->svq); -free_recv: - vdev->config->del_vq(vi->rvq); +free_vqs: + vdev->config->del_vqs(vdev); free: free_netdev(dev); return err; @@ -994,12 +987,10 @@ static void virtnet_remove(struct virtio_device *vdev) BUG_ON(vi->num != 0); - vdev->config->del_vq(vi->svq); - vdev->config->del_vq(vi->rvq); - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) - vdev->config->del_vq(vi->cvq); unregister_netdev(vi->dev); + vdev->config->del_vqs(vi->vdev); + while (vi->pages) __free_pages(get_a_page(vi, GFP_KERNEL), 0); diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index ba8995fbf04..e38e5d306fa 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -227,6 +227,38 @@ static void kvm_del_vq(struct virtqueue *vq) KVM_S390_VIRTIO_RING_ALIGN)); } +static void kvm_del_vqs(struct virtio_device *vdev) +{ + struct virtqueue *vq, *n; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) + kvm_del_vq(vq); +} + +static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + struct kvm_device *kdev = to_kvmdev(vdev); + int i; + + /* We must have this many virtqueues. */ + if (nvqs > kdev->desc->num_vq) + return -ENOENT; + + for (i = 0; i < nvqs; ++i) { + vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]); + if (IS_ERR(vqs[i])) + goto error; + } + return 0; + +error: + kvm_del_vqs(vdev); + return PTR_ERR(vqs[i]); +} + /* * The config ops structure as defined by virtio config */ @@ -238,8 +270,8 @@ static struct virtio_config_ops kvm_vq_configspace_ops = { .get_status = kvm_get_status, .set_status = kvm_set_status, .reset = kvm_reset, - .find_vq = kvm_find_vq, - .del_vq = kvm_del_vq, + .find_vqs = kvm_find_vqs, + .del_vqs = kvm_del_vqs, }; /* diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 0fa73b4d18b..26b27826479 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -204,6 +204,9 @@ static int balloon(void *_vballoon) static int virtballoon_probe(struct virtio_device *vdev) { struct virtio_balloon *vb; + struct virtqueue *vqs[2]; + vq_callback_t *callbacks[] = { balloon_ack, balloon_ack }; + const char *names[] = { "inflate", "deflate" }; int err; vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); @@ -218,22 +221,17 @@ static int virtballoon_probe(struct virtio_device *vdev) vb->vdev = vdev; /* We expect two virtqueues. 
*/ - vb->inflate_vq = vdev->config->find_vq(vdev, 0, balloon_ack, "inflate"); - if (IS_ERR(vb->inflate_vq)) { - err = PTR_ERR(vb->inflate_vq); + err = vdev->config->find_vqs(vdev, 2, vqs, callbacks, names); + if (err) goto out_free_vb; - } - vb->deflate_vq = vdev->config->find_vq(vdev, 1, balloon_ack, "deflate"); - if (IS_ERR(vb->deflate_vq)) { - err = PTR_ERR(vb->deflate_vq); - goto out_del_inflate_vq; - } + vb->inflate_vq = vqs[0]; + vb->deflate_vq = vqs[1]; vb->thread = kthread_run(balloon, vb, "vballoon"); if (IS_ERR(vb->thread)) { err = PTR_ERR(vb->thread); - goto out_del_deflate_vq; + goto out_del_vqs; } vb->tell_host_first @@ -241,10 +239,8 @@ static int virtballoon_probe(struct virtio_device *vdev) return 0; -out_del_deflate_vq: - vdev->config->del_vq(vb->deflate_vq); -out_del_inflate_vq: - vdev->config->del_vq(vb->inflate_vq); +out_del_vqs: + vdev->config->del_vqs(vdev); out_free_vb: kfree(vb); out: @@ -264,8 +260,7 @@ static void virtballoon_remove(struct virtio_device *vdev) /* Now we reset the device so we can clean up the queues. */ vdev->config->reset(vdev); - vdev->config->del_vq(vb->deflate_vq); - vdev->config->del_vq(vb->inflate_vq); + vdev->config->del_vqs(vdev); kfree(vb); } diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index be4047abd5b..027f13fbe49 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -276,11 +276,7 @@ static void vp_del_vq(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_vq_info *info = vq->priv; - unsigned long flags, size; - - spin_lock_irqsave(&vp_dev->lock, flags); - list_del(&info->node); - spin_unlock_irqrestore(&vp_dev->lock, flags); + unsigned long size; vring_del_virtqueue(vq); @@ -293,14 +289,41 @@ static void vp_del_vq(struct virtqueue *vq) kfree(info); } +static void vp_del_vqs(struct virtio_device *vdev) +{ + struct virtqueue *vq, *n; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) + vp_del_vq(vq); +} + +static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + int i; + + for (i = 0; i < nvqs; ++i) { + vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]); + if (IS_ERR(vqs[i])) + goto error; + } + return 0; + +error: + vp_del_vqs(vdev); + return PTR_ERR(vqs[i]); +} + static struct virtio_config_ops virtio_pci_config_ops = { .get = vp_get, .set = vp_set, .get_status = vp_get_status, .set_status = vp_set_status, .reset = vp_reset, - .find_vq = vp_find_vq, - .del_vq = vp_del_vq, + .find_vqs = vp_find_vqs, + .del_vqs = vp_del_vqs, .get_features = vp_get_features, .finalize_features = vp_finalize_features, }; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 9fae274751e..4cd290c06a8 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -29,6 +29,7 @@ #define VIRTIO_F_NOTIFY_ON_EMPTY 24 #ifdef __KERNEL__ +#include #include /** @@ -49,16 +50,26 @@ * @set_status: write the status byte * vdev: the virtio_device * status: the new status byte + * @request_vqs: request the specified number of virtqueues + * vdev: the virtio_device + * max_vqs: the max number of virtqueues we want + * If supplied, must call before any virtqueues are instantiated. + * To modify the max number of virtqueues after request_vqs has been + * called, call free_vqs and then request_vqs with a new value. 
+ * @free_vqs: cleanup resources allocated by request_vqs + * vdev: the virtio_device + * If supplied, must call after all virtqueues have been deleted. * @reset: reset the device * vdev: the virtio device * After this, status and feature negotiation must be done again - * @find_vq: find a virtqueue and instantiate it. + * @find_vqs: find virtqueues and instantiate them. * vdev: the virtio_device - * index: the 0-based virtqueue number in case there's more than one. - * callback: the virtqueue callback - * name: the virtqueue name (mainly for debugging) - * Returns the new virtqueue or ERR_PTR() (eg. -ENOENT). - * @del_vq: free a virtqueue found by find_vq(). + * nvqs: the number of virtqueues to find + * vqs: on success, includes new virtqueues + * callbacks: array of callbacks, for each virtqueue + * names: array of virtqueue names (mainly for debugging) + * Returns 0 on success or error status + * @del_vqs: free virtqueues found by find_vqs(). * @get_features: get the array of feature bits for this device. * vdev: the virtio_device * Returns the first 32 feature bits (all we currently need). @@ -67,6 +78,7 @@ * This gives the final feature bits for the device: it can change * the dev->feature bits if it wants. */ +typedef void vq_callback_t(struct virtqueue *); struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, @@ -76,11 +88,11 @@ struct virtio_config_ops u8 (*get_status)(struct virtio_device *vdev); void (*set_status)(struct virtio_device *vdev, u8 status); void (*reset)(struct virtio_device *vdev); - struct virtqueue *(*find_vq)(struct virtio_device *vdev, - unsigned index, - void (*callback)(struct virtqueue *), - const char *name); - void (*del_vq)(struct virtqueue *vq); + int (*find_vqs)(struct virtio_device *, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]); + void (*del_vqs)(struct virtio_device *); u32 (*get_features)(struct virtio_device *vdev); void (*finalize_features)(struct virtio_device *vdev); }; @@ -128,5 +140,18 @@ static inline int virtio_config_buf(struct virtio_device *vdev, vdev->config->get(vdev, offset, buf, len); return 0; } + +static inline +struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, + vq_callback_t *c, const char *n) +{ + vq_callback_t *callbacks[] = { c }; + const char *names[] = { n }; + struct virtqueue *vq; + int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names); + if (err < 0) + return ERR_PTR(err); + return vq; +} #endif /* __KERNEL__ */ #endif /* _LINUX_VIRTIO_CONFIG_H */ diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index ab8791f9aba..a49484e67e1 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -246,7 +246,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->vdev = vdev; /* We expect one virtqueue, for requests. 
*/ - chan->vq = vdev->config->find_vq(vdev, 0, req_done, "requests"); + chan->vq = virtio_find_single_vq(vdev, req_done, "requests"); if (IS_ERR(chan->vq)) { err = PTR_ERR(chan->vq); goto out_free_vq; @@ -261,7 +261,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) return 0; out_free_vq: - vdev->config->del_vq(chan->vq); + vdev->config->del_vqs(vdev); fail: mutex_lock(&virtio_9p_lock); chan_index--; @@ -332,7 +332,7 @@ static void p9_virtio_remove(struct virtio_device *vdev) BUG_ON(chan->inuse); if (chan->initialized) { - vdev->config->del_vq(chan->vq); + vdev->config->del_vqs(vdev); chan->initialized = false; } } -- cgit v1.2.3-70-g09d2 From 82af8ce84ed65d2fb6d8c017d3f2bbbf161061fb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 14 May 2009 13:55:41 +0300 Subject: virtio_pci: optional MSI-X support This implements optional MSI-X support in virtio_pci. MSI-X is used whenever the host supports at least 2 MSI-X vectors: 1 for configuration changes and 1 for virtqueues. Per-virtqueue vectors are allocated if enough vectors are available. Signed-off-by: Michael S. Tsirkin Acked-by: Anthony Liguori Signed-off-by: Rusty Russell (+ whitespace, style) --- drivers/virtio/virtio_pci.c | 228 ++++++++++++++++++++++++++++++++++++++++---- include/linux/virtio_pci.h | 10 +- 2 files changed, 218 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 951e673e50a..193c8f0e5cc 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -42,6 +42,26 @@ struct virtio_pci_device /* a list of queues so we can dispatch IRQs */ spinlock_t lock; struct list_head virtqueues; + + /* MSI-X support */ + int msix_enabled; + int intx_enabled; + struct msix_entry *msix_entries; + /* Name strings for interrupts. This size should be enough, + * and I'm too lazy to allocate each name separately. */ + char (*msix_names)[256]; + /* Number of available vectors */ + unsigned msix_vectors; + /* Vectors allocated */ + unsigned msix_used_vectors; +}; + +/* Constants for MSI-X */ +/* Use first vector for configuration changes, second and the rest for + * virtqueues. Thus, we need at least 2 vectors for MSI. */ +enum { + VP_MSIX_CONFIG_VECTOR = 0, + VP_MSIX_VQ_VECTOR = 1, }; struct virtio_pci_vq_info @@ -60,6 +80,9 @@ struct virtio_pci_vq_info /* the list node for the virtqueues list */ struct list_head node; + + /* MSI-X vector (or none) */ + unsigned vector; }; /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF.
*/ @@ -109,7 +132,8 @@ static void vp_get(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - void __iomem *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset; + void __iomem *ioaddr = vp_dev->ioaddr + + VIRTIO_PCI_CONFIG(vp_dev) + offset; u8 *ptr = buf; int i; @@ -123,7 +147,8 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, const void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - void __iomem *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset; + void __iomem *ioaddr = vp_dev->ioaddr + + VIRTIO_PCI_CONFIG(vp_dev) + offset; const u8 *ptr = buf; int i; @@ -221,7 +246,122 @@ static irqreturn_t vp_interrupt(int irq, void *opaque) return vp_vring_interrupt(irq, opaque); } -/* the config->find_vq() implementation */ +static void vp_free_vectors(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + int i; + + if (vp_dev->intx_enabled) { + free_irq(vp_dev->pci_dev->irq, vp_dev); + vp_dev->intx_enabled = 0; + } + + for (i = 0; i < vp_dev->msix_used_vectors; ++i) + free_irq(vp_dev->msix_entries[i].vector, vp_dev); + vp_dev->msix_used_vectors = 0; + + if (vp_dev->msix_enabled) { + /* Disable the vector used for configuration */ + iowrite16(VIRTIO_MSI_NO_VECTOR, + vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Flush the write out to device */ + ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + + vp_dev->msix_enabled = 0; + pci_disable_msix(vp_dev->pci_dev); + } +} + +static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, + int *options, int noptions) +{ + int i; + for (i = 0; i < noptions; ++i) + if (!pci_enable_msix(dev, entries, options[i])) + return options[i]; + return -EBUSY; +} + +static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + const char *name = dev_name(&vp_dev->vdev.dev); + unsigned i, v; + int err = -ENOMEM; + /* We want at most one vector per queue and one for config changes. + * Fallback to separate vectors for config and a shared for queues. + * Finally fall back to regular interrupts. 
*/ + int options[] = { max_vqs + 1, 2 }; + int nvectors = max(options[0], options[1]); + + vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, + GFP_KERNEL); + if (!vp_dev->msix_entries) + goto error_entries; + vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, + GFP_KERNEL); + if (!vp_dev->msix_names) + goto error_names; + + for (i = 0; i < nvectors; ++i) + vp_dev->msix_entries[i].entry = i; + + err = vp_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, + options, ARRAY_SIZE(options)); + if (err < 0) { + /* Can't allocate enough MSI-X vectors, use regular interrupt */ + vp_dev->msix_vectors = 0; + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, + IRQF_SHARED, name, vp_dev); + if (err) + goto error_irq; + vp_dev->intx_enabled = 1; + } else { + vp_dev->msix_vectors = err; + vp_dev->msix_enabled = 1; + + /* Set the vector used for configuration */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-config", name); + err = request_irq(vp_dev->msix_entries[v].vector, + vp_config_changed, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error_irq; + ++vp_dev->msix_used_vectors; + + iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Verify we had enough resources to assign the vector */ + v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + if (v == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto error_irq; + } + } + + if (vp_dev->msix_vectors && vp_dev->msix_vectors != max_vqs + 1) { + /* Shared vector for all VQs */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-virtqueues", name); + err = request_irq(vp_dev->msix_entries[v].vector, + vp_vring_interrupt, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error_irq; + ++vp_dev->msix_used_vectors; + } + return 0; +error_irq: + vp_free_vectors(vdev); + kfree(vp_dev->msix_names); +error_names: + kfree(vp_dev->msix_entries); +error_entries: + return err; +} + static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name) @@ -230,7 +370,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, struct virtio_pci_vq_info *info; struct virtqueue *vq; unsigned long flags, size; - u16 num; + u16 num, vector; int err; /* Select the queue we're interested in */ @@ -249,6 +389,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, info->queue_index = index; info->num = num; + info->vector = VIRTIO_MSI_NO_VECTOR; size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); @@ -272,12 +413,43 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, vq->priv = info; info->vq = vq; + /* allocate per-vq vector if available and necessary */ + if (callback && vp_dev->msix_used_vectors < vp_dev->msix_vectors) { + vector = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, + "%s-%s", dev_name(&vp_dev->vdev.dev), name); + err = request_irq(vp_dev->msix_entries[vector].vector, + vring_interrupt, 0, + vp_dev->msix_names[vector], vq); + if (err) + goto out_request_irq; + info->vector = vector; + ++vp_dev->msix_used_vectors; + } else + vector = VP_MSIX_VQ_VECTOR; + + if (callback && vp_dev->msix_enabled) { + iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + if (vector 
== VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto out_assign; + } + } + spin_lock_irqsave(&vp_dev->lock, flags); list_add(&info->node, &vp_dev->virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); return vq; +out_assign: + if (info->vector != VIRTIO_MSI_NO_VECTOR) { + free_irq(vp_dev->msix_entries[info->vector].vector, vq); + --vp_dev->msix_used_vectors; + } +out_request_irq: + vring_del_virtqueue(vq); out_activate_queue: iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); free_pages_exact(info->queue, size); @@ -286,17 +458,27 @@ out_info: return ERR_PTR(err); } -/* the config->del_vq() implementation */ static void vp_del_vq(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_vq_info *info = vq->priv; unsigned long size; + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + + if (info->vector != VIRTIO_MSI_NO_VECTOR) + free_irq(vp_dev->msix_entries[info->vector].vector, vq); + + if (vp_dev->msix_enabled) { + iowrite16(VIRTIO_MSI_NO_VECTOR, + vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + /* Flush the write out to device */ + ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR); + } + vring_del_virtqueue(vq); /* Select and deactivate the queue */ - iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN)); @@ -304,30 +486,46 @@ static void vp_del_vq(struct virtqueue *vq) kfree(info); } +/* the config->del_vqs() implementation */ static void vp_del_vqs(struct virtio_device *vdev) { struct virtqueue *vq, *n; list_for_each_entry_safe(vq, n, &vdev->vqs, list) vp_del_vq(vq); + + vp_free_vectors(vdev); } +/* the config->find_vqs() implementation */ static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char *names[]) { - int i; + int vectors = 0; + int i, err; + + /* How many vectors would we like? 
*/ + for (i = 0; i < nvqs; ++i) + if (callbacks[i]) + ++vectors; + + err = vp_request_vectors(vdev, vectors); + if (err) + goto error_request; for (i = 0; i < nvqs; ++i) { vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]); if (IS_ERR(vqs[i])) - goto error; + goto error_find; } return 0; -error_find: vp_del_vqs(vdev); + +error_request: return PTR_ERR(vqs[i]); } @@ -349,7 +547,7 @@ static void virtio_pci_release_dev(struct device *_d) struct virtio_pci_device *vp_dev = to_vp_device(dev); struct pci_dev *pci_dev = vp_dev->pci_dev; - free_irq(pci_dev->irq, vp_dev); + vp_del_vqs(dev); pci_set_drvdata(pci_dev, NULL); pci_iounmap(pci_dev, vp_dev->ioaddr); pci_release_regions(pci_dev); @@ -408,21 +606,13 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev, vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; vp_dev->vdev.id.device = pci_dev->subsystem_device; - /* register a handler for the queue with the PCI device's interrupt */ - err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, - dev_name(&vp_dev->vdev.dev), vp_dev); - if (err) - goto out_set_drvdata; - /* finally register the virtio device */ err = register_virtio_device(&vp_dev->vdev); if (err) - goto out_req_irq; + goto out_set_drvdata; return 0; -out_req_irq: - free_irq(pci_dev->irq, vp_dev); out_set_drvdata: pci_set_drvdata(pci_dev, NULL); pci_iounmap(pci_dev, vp_dev->ioaddr); diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h index cd0fd5d181a..9a3d7c48c62 100644 --- a/include/linux/virtio_pci.h +++ b/include/linux/virtio_pci.h @@ -47,9 +47,17 @@ /* The bit of the ISR which indicates a device configuration change. */ #define VIRTIO_PCI_ISR_CONFIG 0x2 +/* MSI-X registers: only enabled if MSI-X is enabled. */ +/* A 16-bit vector for configuration changes. */ +#define VIRTIO_MSI_CONFIG_VECTOR 20 +/* A 16-bit vector for selected queue notifications. */ +#define VIRTIO_MSI_QUEUE_VECTOR 22 +/* Vector value used to disable MSI for queue */ +#define VIRTIO_MSI_NO_VECTOR 0xffff + /* The remaining space is defined by each driver as the per-driver * configuration space */ -#define VIRTIO_PCI_CONFIG 20 +#define VIRTIO_PCI_CONFIG(dev) ((dev)->msix_enabled ? 24 : 20) /* Virtio ABI version, this must match exactly */ #define VIRTIO_PCI_ABI_VERSION 0 -- cgit v1.2.3-70-g09d2 From ee006b353f1ca8c9a8470b72b462beb011d62e32 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 11 May 2009 18:11:44 +0100 Subject: virtio: teach virtio_has_feature() about transport features Drivers don't add transport features to their table, so we shouldn't check these with virtio_check_driver_offered_feature(). We could perhaps add an ->offered_feature() virtio_config_op, but perhaps that would be overkill for a consistency check like this.
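As a minimal sketch of what this enables (illustration only, not part of this patch; VIRTIO_RING_F_INDIRECT_DESC is the transport bit the following patch defines at bit 28, at or above VIRTIO_TRANSPORT_F_START), ring code can now test a transport feature directly, since no driver lists such bits in its feature_table and routing them through the old check would have tripped it:

	/* Transport bits now bypass virtio_check_driver_offered_feature(). */
	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
		pr_debug("host offers indirect descriptors\n");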
Signed-off-by: Mark McLoughlin Signed-off-by: Rusty Russell --- include/linux/virtio_config.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 4cd290c06a8..99f514575f6 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -113,7 +113,9 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, if (__builtin_constant_p(fbit)) BUILD_BUG_ON(fbit >= 32); - virtio_check_driver_offered_feature(vdev, fbit); + if (fbit < VIRTIO_TRANSPORT_F_START) + virtio_check_driver_offered_feature(vdev, fbit); + return test_bit(fbit, vdev->features); } -- cgit v1.2.3-70-g09d2 From 9fa29b9df32ba4db055f3977933cd0c1b8fe67cd Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 11 May 2009 18:11:45 +0100 Subject: virtio: indirect ring entries (VIRTIO_RING_F_INDIRECT_DESC) Add a new feature flag for indirect ring entries. These are ring entries which point to a table of buffer descriptors. The idea here is to increase the ring capacity by allowing a larger effective ring size whereby the ring size dictates the number of requests that may be outstanding, rather than the size of those requests. This should be most effective in the case of block I/O where we can potentially benefit by concurrently dispatching a large number of large requests. Even in the simple case of single segment block requests, this results in a threefold increase in ring capacity. Signed-off-by: Mark McLoughlin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 75 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/virtio_ring.h | 5 +++ 2 files changed, 78 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 579fa693d5d..a882f260651 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -61,6 +61,9 @@ struct vring_virtqueue /* Other side has made a mess, don't try any more. */ bool broken; + /* Host supports indirect buffers */ + bool indirect; + /* Number of free buffers */ unsigned int num_free; /* Head of free buffer list. */ @@ -85,6 +88,55 @@ struct vring_virtqueue #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) +/* Set up an indirect table of descriptors and add it to the queue. */ +static int vring_add_indirect(struct vring_virtqueue *vq, + struct scatterlist sg[], + unsigned int out, + unsigned int in) +{ + struct vring_desc *desc; + unsigned head; + int i; + + desc = kmalloc((out + in) * sizeof(struct vring_desc), GFP_ATOMIC); + if (!desc) + return vq->vring.num; + + /* Transfer entries from the sg list into the indirect page */ + for (i = 0; i < out; i++) { + desc[i].flags = VRING_DESC_F_NEXT; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + sg++; + } + for (; i < (out + in); i++) { + desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + sg++; + } + + /* Last one doesn't continue. 
*/ + desc[i-1].flags &= ~VRING_DESC_F_NEXT; + desc[i-1].next = 0; + + /* We're about to use a buffer */ + vq->num_free--; + + /* Use a single buffer which doesn't continue */ + head = vq->free_head; + vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT; + vq->vring.desc[head].addr = virt_to_phys(desc); + vq->vring.desc[head].len = i * sizeof(struct vring_desc); + + /* Update free pointer */ + vq->free_head = vq->vring.desc[head].next; + + return head; +} + static int vring_add_buf(struct virtqueue *_vq, struct scatterlist sg[], unsigned int out, @@ -94,12 +146,21 @@ static int vring_add_buf(struct virtqueue *_vq, struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i, avail, head, uninitialized_var(prev); + START_USE(vq); + BUG_ON(data == NULL); + + /* If the host supports indirect descriptor tables, and we have multiple + * buffers, then go indirect. FIXME: tune this threshold */ + if (vq->indirect && (out + in) > 1 && vq->num_free) { + head = vring_add_indirect(vq, sg, out, in); + if (head != vq->vring.num) + goto add_head; + } + BUG_ON(out + in > vq->vring.num); BUG_ON(out + in == 0); - START_USE(vq); - if (vq->num_free < out + in) { pr_debug("Can't add buf len %i - avail = %i\n", out + in, vq->num_free); @@ -136,6 +197,7 @@ static int vring_add_buf(struct virtqueue *_vq, /* Update free pointer */ vq->free_head = i; +add_head: /* Set token. */ vq->data[head] = data; @@ -179,6 +241,11 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) /* Put back on free list: find end */ i = head; + + /* Free the indirect table */ + if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT) + kfree(phys_to_virt(vq->vring.desc[i].addr)); + while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) { i = vq->vring.desc[i].next; vq->num_free++; @@ -323,6 +390,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, vq->in_use = false; #endif + vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); + /* No callback? Tell other side not to bother us. */ if (!callback) vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; @@ -351,6 +420,8 @@ void vring_transport_features(struct virtio_device *vdev) for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) { switch (i) { + case VIRTIO_RING_F_INDIRECT_DESC: + break; default: /* We don't understand this bit. */ clear_bit(i, vdev->features); diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 166c519689d..693e0ec5afa 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -14,6 +14,8 @@ #define VRING_DESC_F_NEXT 1 /* This marks a buffer as write-only (otherwise read-only). */ #define VRING_DESC_F_WRITE 2 +/* This means the buffer contains a list of buffer descriptors. */ +#define VRING_DESC_F_INDIRECT 4 /* The Host uses this in used->flags to advise the Guest: don't kick me when * you add a buffer. It's unreliable, so it's simply an optimization. Guest @@ -24,6 +26,9 @@ * optimization. */ #define VRING_AVAIL_F_NO_INTERRUPT 1 +/* We support indirect buffer descriptors */ +#define VIRTIO_RING_F_INDIRECT_DESC 28 + /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ struct vring_desc { -- cgit v1.2.3-70-g09d2 From a32a8813d0173163ba44d8f9556e0d89fdc4fb46 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:02 -0600 Subject: lguest: improve interrupt handling, speed up stream networking lguest never checked for pending interrupts when enabling interrupts, and things still worked. 
However, it makes a significant difference to TCP performance, so it's time we fixed it by introducing a pending_irq flag and checking it on irq_restore and irq_enable. These two routines are now too big to patch into the 8/10 bytes patch space, so we drop that code. Note: The high latency on interrupt delivery had a very curious effect: once everything else was optimized, networking without GSO was faster than networking with GSO, since more interrupts were sent and hence a greater chance of one getting through to the Guest! Note2: (Almost) Closing the same loophole for iret doesn't have any measurable effect, so I'm leaving that patch for the moment. Before: 1GB tcpblast Guest->Host: 30.7 seconds 1GB tcpblast Guest->Host (no GSO): 76.0 seconds After: 1GB tcpblast Guest->Host: 6.8 seconds 1GB tcpblast Guest->Host (no GSO): 27.8 seconds Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 1 + arch/x86/lguest/boot.c | 21 +++++++++++++++------ arch/x86/lguest/i386_head.S | 2 -- drivers/lguest/core.c | 7 ++++--- drivers/lguest/hypercalls.c | 4 ++++ drivers/lguest/interrupts_and_traps.c | 16 +++++++++++++--- drivers/lguest/lg.h | 4 ++-- include/linux/lguest.h | 4 ++++ 8 files changed, 43 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index faae1996487..f9a9f781124 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -17,6 +17,7 @@ #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 +#define LHCALL_SEND_INTERRUPTS 19 #define LGUEST_TRAP_ENTRY 0x1F diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 2392a7a171c..37b8c1d3e02 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -205,6 +205,12 @@ PV_CALLEE_SAVE_REGS_THUNK(save_fl); static void restore_fl(unsigned long flags) { lguest_data.irq_enabled = flags; + mb(); + /* Null hcall forces interrupt delivery now, if irq_pending is + * set to X86_EFLAGS_IF (ie. an interrupt is pending, and flags + * enables interrupts. */ + if (flags & lguest_data.irq_pending) + kvm_hypercall0(LHCALL_SEND_INTERRUPTS); } PV_CALLEE_SAVE_REGS_THUNK(restore_fl); @@ -219,6 +225,11 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); static void irq_enable(void) { lguest_data.irq_enabled = X86_EFLAGS_IF; + mb(); + /* Null hcall forces interrupt delivery now. */ + if (lguest_data.irq_pending) + kvm_hypercall0(LHCALL_SEND_INTERRUPTS); + } PV_CALLEE_SAVE_REGS_THUNK(irq_enable); @@ -972,10 +983,10 @@ static void lguest_restart(char *reason) * * Our current solution is to allow the paravirt back end to optionally patch * over the indirect calls to replace them with something more efficient. We - * patch the four most commonly called functions: disable interrupts, enable - * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 - * bytes to patch into: the Guest versions of these operations are small enough - * that we can fit comfortably. + * patch two of the simplest of the most commonly called functions: disable + * interrupts and save interrupts. We usually have 6 or 10 bytes to patch + * into: the Guest versions of these operations are small enough that we can + * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, * and these are in i386_head.S. 
*/ @@ -986,8 +997,6 @@ static const struct lguest_insns const char *start, *end; } lguest_insns[] = { [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, - [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index f7954198947..3e0c5545d59 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -46,8 +46,6 @@ ENTRY(lguest_entry) .globl lgstart_##name; .globl lgend_##name LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) /*:*/ diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 8ca1def5b14..03fbc88c002 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -189,6 +189,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* We stop running once the Guest is dead. */ while (!cpu->lg->dead) { unsigned int irq; + bool more; /* First we run any hypercalls the Guest wants done. */ if (cpu->hcall) @@ -213,9 +214,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next * run the Guest. */ - irq = interrupt_pending(cpu); + irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) - try_deliver_interrupt(cpu, irq); + try_deliver_interrupt(cpu, irq, more); /* All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, @@ -233,7 +234,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) set_current_state(TASK_INTERRUPTIBLE); /* Just before we sleep, make sure nothing snuck in * which we should be doing. */ - if (interrupt_pending(cpu) < LGUEST_IRQS + if (interrupt_pending(cpu, &more) < LGUEST_IRQS || cpu->break_out) set_current_state(TASK_RUNNING); else diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 54d66f05fef..f252b71ae79 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* This call does nothing, except by breaking out of the Guest * it makes us process all the asynchronous hypercalls. */ break; + case LHCALL_SEND_INTERRUPTS: + /* This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. */ + break; case LHCALL_LGUEST_INIT: /* You can't get here unless you're already initialized. Don't * do that. */ diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index a8c966fee1e..5a10754b479 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -131,7 +131,7 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, * interrupt_pending() returns the first pending interrupt which isn't blocked * by the Guest. It is called before every entry to the Guest, and just before * we go to sleep when the Guest has halted itself. 
*/ -unsigned int interrupt_pending(struct lg_cpu *cpu) +unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; DECLARE_BITMAP(blk, LGUEST_IRQS); @@ -149,13 +149,14 @@ unsigned int interrupt_pending(struct lg_cpu *cpu) /* Find the first interrupt. */ irq = find_first_bit(blk, LGUEST_IRQS); + *more = find_next_bit(blk, LGUEST_IRQS, irq+1); return irq; } /* This actually diverts the Guest to running an interrupt handler, once an * interrupt has been identified by interrupt_pending(). */ -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) { struct desc_struct *idt; @@ -178,8 +179,12 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) u32 irq_enabled; if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) irq_enabled = 0; - if (!irq_enabled) + if (!irq_enabled) { + /* Make sure they know an IRQ is pending. */ + put_user(X86_EFLAGS_IF, + &cpu->lg->lguest_data->irq_pending); return; + } } /* Look at the IDT entry the Guest gave us for this interrupt. The @@ -202,6 +207,11 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) * here is a compromise which means at least it gets updated every * timer interrupt. */ write_timestamp(cpu); + + /* If there are no other interrupts we want to deliver, clear + * the pending flag. */ + if (!more) + put_user(0, &cpu->lg->lguest_data->irq_pending); } /*:*/ diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 6743cf147d9..573896533ac 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -139,8 +139,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ -unsigned int interrupt_pending(struct lg_cpu *cpu); -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq); +unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); bool deliver_trap(struct lg_cpu *cpu, unsigned int num); void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, u32 low, u32 hi); diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 175e63f4a8c..7bc1440fc47 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -30,6 +30,10 @@ struct lguest_data /* Wallclock time set by the Host. */ struct timespec time; + /* Interrupt pending set by the Host. The Guest should do a hypercall + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ + int irq_pending; + /* Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. * This batching can be quite efficient. */ -- cgit v1.2.3-70-g09d2 From df60aeef4f4fe0645d9a195a7689005520422de5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:09 -0600 Subject: lguest: use eventfds for device notification Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with an address: the main Launcher process returns with this address, and figures out what device to run. A far nicer model is to let processes bind an eventfd to an address: if we find one, we simply signal the eventfd. 
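A hypothetical Launcher-side binding might look like the sketch below (lguest_fd, the open /dev/lguest descriptor, and addr, a virtqueue's notification address, are assumptions and do not appear in this patch):

	#include <err.h>
	#include <unistd.h>
	#include <sys/eventfd.h>
	#include <linux/lguest_launcher.h>

	/* Sketch: ask the kernel to signal this eventfd for NOTIFYs on addr,
	 * instead of bouncing the address out through the Launcher's read(). */
	static int bind_eventfd(int lguest_fd, unsigned long addr)
	{
		int fd = eventfd(0, 0);
		unsigned long args[] = { LHREQ_EVENTFD, addr, (unsigned long)fd };

		if (fd < 0 || write(lguest_fd, args, sizeof(args)) < 0)
			err(1, "binding eventfd to %#lx", addr);
		return fd;
	}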
Signed-off-by: Rusty Russell Cc: Davide Libenzi --- drivers/lguest/Kconfig | 2 +- drivers/lguest/core.c | 8 ++-- drivers/lguest/lg.h | 13 ++++++ drivers/lguest/lguest_user.c | 98 ++++++++++++++++++++++++++++++++++++++++- include/linux/lguest_launcher.h | 1 + 5 files changed, 116 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 8f63845db83..0aaa0597a62 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -1,6 +1,6 @@ config LGUEST tristate "Linux hypervisor example code" - depends on X86_32 && EXPERIMENTAL && FUTEX + depends on X86_32 && EXPERIMENTAL && EVENTFD select HVC_DRIVER ---help--- This is a very simple module which allows you to run diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index d0298dc45d9..508569c9571 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* It's possible the Guest did a NOTIFY hypercall to the * Launcher, in which case we return from the read() now. */ if (cpu->pending_notify) { - if (put_user(cpu->pending_notify, user)) - return -EFAULT; - return sizeof(cpu->pending_notify); + if (!send_notify_to_eventfd(cpu)) { + if (put_user(cpu->pending_notify, user)) + return -EFAULT; + return sizeof(cpu->pending_notify); + } } /* Check for signals */ diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 040cb70780e..32fefdc6ad3 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -82,6 +82,16 @@ struct lg_cpu { struct lg_cpu_arch arch; }; +struct lg_eventfd { + unsigned long addr; + struct file *event; +}; + +struct lg_eventfd_map { + unsigned int num; + struct lg_eventfd map[]; +}; + /* The private info the thread maintains about the guest. */ struct lguest { @@ -102,6 +112,8 @@ struct lguest unsigned int stack_pages; u32 tsc_khz; + struct lg_eventfd_map *eventfds; + /* Dead? 
*/ const char *dead; }; @@ -154,6 +166,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state, void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def); void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); +bool send_notify_to_eventfd(struct lg_cpu *cpu); void init_clockdev(struct lg_cpu *cpu); bool check_syscall_vector(struct lguest *lg); int init_interrupts(void); diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 1982b45bd93..f6bf255f183 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include "lg.h" /*L:055 When something happens, the Waker process needs a way to stop the @@ -35,6 +37,81 @@ static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) } } +bool send_notify_to_eventfd(struct lg_cpu *cpu) +{ + unsigned int i; + struct lg_eventfd_map *map; + + /* lg->eventfds is RCU-protected */ + rcu_read_lock(); + map = rcu_dereference(cpu->lg->eventfds); + for (i = 0; i < map->num; i++) { + if (map->map[i].addr == cpu->pending_notify) { + eventfd_signal(map->map[i].event, 1); + cpu->pending_notify = 0; + break; + } + } + rcu_read_unlock(); + return cpu->pending_notify == 0; } + +static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) +{ + struct lg_eventfd_map *new, *old = lg->eventfds; + + if (!addr) + return -EINVAL; + + /* Replace the old array with the new one, carefully: others can + * be accessing it at the same time */ + new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), + GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* First make identical copy. */ + memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); + new->num = old->num; + + /* Now append new entry. */ + new->map[new->num].addr = addr; + new->map[new->num].event = eventfd_fget(fd); + if (IS_ERR(new->map[new->num].event)) { + kfree(new); + return PTR_ERR(new->map[new->num].event); + } + new->num++; + + /* Now put new one in place. */ + rcu_assign_pointer(lg->eventfds, new); + + /* We're not in a big hurry. Wait until no one's looking at the old + * version, then delete it. */ + synchronize_rcu(); + kfree(old); + + return 0; +} + +static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) +{ + unsigned long addr, fd; + int err; + + if (get_user(addr, input) != 0) + return -EFAULT; + input++; + if (get_user(fd, input) != 0) + return -EFAULT; + + mutex_lock(&lguest_lock); + err = add_eventfd(lg, addr, fd); + mutex_unlock(&lguest_lock); + + return err; +} + /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt * number to /dev/lguest. */ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) @@ -184,6 +261,13 @@ static int initialize(struct file *file, const unsigned long __user *input) goto unlock; } + lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); + if (!lg->eventfds) { + err = -ENOMEM; + goto free_lg; + } + lg->eventfds->num = 0; + /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; @@ -191,7 +275,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* This is the first cpu (cpu 0) and it will start booting at args[2] */ err = lg_cpu_start(&lg->cpus[0], 0, args[2]); if (err) - goto release_guest; + goto free_eventfds; /* Initialize the Guest's shadow page tables, using the toplevel * address the Launcher gave us.
This allocates memory, so can fail. */ @@ -210,7 +294,9 @@ static int initialize(struct file *file, const unsigned long __user *input) free_regs: /* FIXME: This should be in free_vcpu */ free_page(lg->cpus[0].regs_page); -release_guest: +free_eventfds: + kfree(lg->eventfds); +free_lg: kfree(lg); unlock: mutex_unlock(&lguest_lock); @@ -260,6 +346,8 @@ static ssize_t write(struct file *file, const char __user *in, return user_send_irq(cpu, input); case LHREQ_BREAK: return break_guest_out(cpu, input); + case LHREQ_EVENTFD: + return attach_eventfd(lg, input); default: return -EINVAL; } @@ -297,6 +385,12 @@ static int close(struct inode *inode, struct file *file) * the Launcher's memory management structure. */ mmput(lg->cpus[i].mm); } + + /* Release any eventfds they registered. */ + for (i = 0; i < lg->eventfds->num; i++) + fput(lg->eventfds->map[i].event); + kfree(lg->eventfds); + /* If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). */ if (!IS_ERR(lg->dead)) diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index a53407a4165..9de964b9058 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -58,6 +58,7 @@ enum lguest_req LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ + LHREQ_EVENTFD, /* + address, fd. */ }; /* The alignment to use between consumer and producer parts of vring. -- cgit v1.2.3-70-g09d2 From 5dac051bc6030963181b69faddd9e0ad04f85fa8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:10 -0600 Subject: lguest: remove obsolete LHREQ_BREAK call We no longer need an efficient mechanism to force the Guest back into host userspace, as each device is serviced without bothering the main Guest process (aka. the Launcher). Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 11 +++-------- drivers/lguest/lg.h | 4 +--- drivers/lguest/lguest_user.c | 31 ------------------------------- include/linux/lguest_launcher.h | 2 +- 4 files changed, 5 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 508569c9571..a6974e9b8eb 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -209,10 +209,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (signal_pending(current)) return -ERESTARTSYS; - /* If Waker set break_out, return to Launcher. */ - if (cpu->break_out) - return -EAGAIN; - /* Check if there are any interrupts which can be delivered now: * if so, this sets up the handler to be executed when we next * run the Guest. */ @@ -231,13 +227,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) break; /* If the Guest asked to be stopped, we sleep. The Guest's - * clock timer or LHREQ_BREAK from the Waker will wake us. */ + * clock timer will wake us. */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - /* Just before we sleep, make sure nothing snuck in + /* Just before we sleep, make sure no interrupt snuck in * which we should be doing.
*/ - if (interrupt_pending(cpu, &more) < LGUEST_IRQS - || cpu->break_out) + if (interrupt_pending(cpu, &more) < LGUEST_IRQS) set_current_state(TASK_RUNNING); else schedule(); diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 32fefdc6ad3..d4e8979735c 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -71,9 +71,7 @@ struct lg_cpu { /* Virtual clock device */ struct hrtimer hrt; - /* Do we need to stop what we're doing and return to userspace? */ - int break_out; - wait_queue_head_t break_wq; + /* Did the Guest tell us to halt? */ int halted; /* Pending virtual interrupts */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index f6bf255f183..32e29712105 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -11,32 +11,6 @@ #include #include "lg.h" -/*L:055 When something happens, the Waker process needs a way to stop the - * kernel running the Guest and return to the Launcher. So the Waker writes - * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher - * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release - * the Waker. */ -static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) -{ - unsigned long on; - - /* Fetch whether they're turning break on or off. */ - if (get_user(on, input) != 0) - return -EFAULT; - - if (on) { - cpu->break_out = 1; - if (!wake_up_process(cpu->tsk)) - kick_process(cpu->tsk); - /* Wait for them to reset it */ - return wait_event_interruptible(cpu->break_wq, !cpu->break_out); - } else { - cpu->break_out = 0; - wake_up(&cpu->break_wq); - return 0; - } -} - bool send_notify_to_eventfd(struct lg_cpu *cpu) { unsigned int i; @@ -202,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) * address. */ lguest_arch_setup_regs(cpu, start_ip); - /* Initialize the queue for the Waker to wait on */ - init_waitqueue_head(&cpu->break_wq); - /* We keep a pointer to the Launcher task (ie. current task) for when * other Guests want to wake this one (eg. console input). */ cpu->tsk = current; @@ -344,8 +315,6 @@ static ssize_t write(struct file *file, const char __user *in, return initialize(file, input); case LHREQ_IRQ: return user_send_irq(cpu, input); - case LHREQ_BREAK: - return break_guest_out(cpu, input); case LHREQ_EVENTFD: return attach_eventfd(lg, input); default: diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 9de964b9058..bfefbdf7498 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -57,7 +57,7 @@ enum lguest_req LHREQ_INITIALIZE, /* + base, pfnlimit, start */ LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ - LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ + LHREQ_BREAK, /* No longer used */ LHREQ_EVENTFD, /* + address, fd. */ }; -- cgit v1.2.3-70-g09d2 From 7e85ee0c1d15ca5f8bff0f514f158eba1742dd87 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 12 Jun 2009 14:03:06 +0300 Subject: slab,slub: don't enable interrupts during early boot As explained by Benjamin Herrenschmidt: Oh and btw, your patch alone doesn't fix powerpc, because it's missing a whole bunch of GFP_KERNEL's in the arch code... You would have to grep the entire kernel for things that check slab_is_available() and even then you'll be missing some. 
For example, slab_is_available() didn't always exist, and so in the early days on powerpc, we used a mem_init_done global that is set from mem_init() (not perfect but works in practice). And we still have code using that to do the test. Therefore, mask out __GFP_WAIT, __GFP_IO, and __GFP_FS in the slab allocators in early boot code to avoid enabling interrupts. Signed-off-by: Pekka Enberg --- include/linux/gfp.h | 3 +++ include/linux/slab.h | 2 ++ include/linux/slob_def.h | 5 +++++ include/linux/slub_def.h | 2 ++ init/main.c | 1 + mm/slab.c | 18 ++++++++++++++++++ mm/slub.c | 16 ++++++++++++++++ 7 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0bbc15f5453..3760e7c5de0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -85,6 +85,9 @@ struct vm_area_struct; __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_NOMEMALLOC) +/* Control slab gfp mask during early boot */ +#define SLAB_GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS) + /* Control allocation constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) diff --git a/include/linux/slab.h b/include/linux/slab.h index 48803064ced..219b8fb4651 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -319,4 +319,6 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) return kmalloc_node(size, flags | __GFP_ZERO, node); } +void __init kmem_cache_init_late(void); + #endif /* _LINUX_SLAB_H */ diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h index 0ec00b39d00..bb5368df4be 100644 --- a/include/linux/slob_def.h +++ b/include/linux/slob_def.h @@ -34,4 +34,9 @@ static __always_inline void *__kmalloc(size_t size, gfp_t flags) return kmalloc(size, flags); } +static inline void kmem_cache_init_late(void) +{ + /* Nothing to do */ +} + #endif /* __LINUX_SLOB_DEF_H */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index be5d40c43bd..4dcbc2c7149 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -302,4 +302,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) } #endif +void __init kmem_cache_init_late(void); + #endif /* _LINUX_SLUB_DEF_H */ diff --git a/init/main.c b/init/main.c index b3e8f14c568..f6204f712e7 100644 --- a/init/main.c +++ b/init/main.c @@ -640,6 +640,7 @@ asmlinkage void __init start_kernel(void) "enabled early\n"); early_boot_irqs_on(); local_irq_enable(); + kmem_cache_init_late(); /* * HACK ALERT! This is early. We're enabling the console before diff --git a/mm/slab.c b/mm/slab.c index cd76964b53b..453efcb1c98 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -303,6 +303,12 @@ struct kmem_list3 { int free_touched; /* updated without locking */ }; +/* + * The slab allocator is initialized with interrupts disabled. Therefore, make + * sure early boot allocations don't accidentally enable interrupts. + */ +static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; + /* * Need this for bootstrapping a per node allocator. */ @@ -1654,6 +1660,14 @@ void __init kmem_cache_init(void) */ } +void __init kmem_cache_init_late(void) +{ + /* + * Interrupts are enabled now so all GFP allocations are safe.
+ */ + slab_gfp_mask = __GFP_BITS_MASK; +} + static int __init cpucache_init(void) { int cpu; @@ -3354,6 +3368,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; + flags &= slab_gfp_mask; + lockdep_trace_alloc(flags); if (slab_should_failslab(cachep, flags)) @@ -3434,6 +3450,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; + flags &= slab_gfp_mask; + lockdep_trace_alloc(flags); if (slab_should_failslab(cachep, flags)) diff --git a/mm/slub.c b/mm/slub.c index 3964d3ce4c1..30354bfeb43 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -178,6 +178,12 @@ static enum { SYSFS /* Sysfs up */ } slab_state = DOWN; +/* + * The slab allocator is initialized with interrupts disabled. Therefore, make + * sure early boot allocations don't accidentally enable interrupts. + */ +static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; + /* A list of all slab caches on the system */ static DECLARE_RWSEM(slub_lock); static LIST_HEAD(slab_caches); @@ -1595,6 +1601,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; + gfpflags &= slab_gfp_mask; + lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); @@ -3104,6 +3112,14 @@ void __init kmem_cache_init(void) nr_cpu_ids, nr_node_ids); } +void __init kmem_cache_init_late(void) +{ + /* + * Interrupts are enabled now so all GFP allocations are safe. + */ + slab_gfp_mask = __GFP_BITS_MASK; +} + /* * Find a mergeable slab cache */ -- cgit v1.2.3-70-g09d2 From 7ea2ac9b6632038377cb488c7d1cb60b88164d4d Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 14 Apr 2009 23:14:17 -0300 Subject: Trivial: fix typo s/balence/balance/ Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Jiri Kosina --- include/linux/ultrasound.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ultrasound.h b/include/linux/ultrasound.h index 6b7703e75ce..71339dc531c 100644 --- a/include/linux/ultrasound.h +++ b/include/linux/ultrasound.h @@ -34,7 +34,7 @@ * _GUS_VOICEOFF - Stops voice (no parameters) * _GUS_VOICEFADE - Stops the voice smoothly. * _GUS_VOICEMODE - Alters the voice mode, don't start or stop voice (P1=voice mode) - * _GUS_VOICEBALA - Sets voice balence (P1, 0=left, 7=middle and 15=right, default 7) + * _GUS_VOICEBALA - Sets voice balance (P1, 0=left, 7=middle and 15=right, default 7) * _GUS_VOICEFREQ - Sets voice (sample) playback frequency (P1=Hz) * _GUS_VOICEVOL - Sets voice volume (P1=volume, 0xfff=max, 0xeff=half, 0x000=off) * _GUS_VOICEVOL2 - Sets voice volume (P1=volume, 0xfff=max, 0xeff=half, 0x000=off) -- cgit v1.2.3-70-g09d2 From e39a71ef80877f4e30d808af9acceec80f4d2f7c Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 15 May 2009 00:53:26 +0200 Subject: PM: Rename device_power_down/up() Rename the functions performing "_noirq" dev_pm_ops operations from device_power_down() and device_power_up() to device_suspend_noirq() and device_resume_noirq(). The new function names are chosen to show that the functions are responsible for calling the _noirq() versions to finalize the suspend/resume operation. The current function names do not perform power down/up anymore so the names may be misleading. 
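To illustrate the new names at a call site, a hedged sketch of a full suspend/resume sequence (the caller itself is hypothetical; the ordering mirrors the apm_32.c changes below):

/* Hypothetical caller using the renamed entry points. */
static int example_suspend_sequence(void)
{
	int err;

	device_suspend(PMSG_SUSPEND);		/* ordinary suspend callbacks */
	device_suspend_noirq(PMSG_SUSPEND);	/* formerly device_power_down() */

	local_irq_disable();
	err = sysdev_suspend(PMSG_SUSPEND);
	if (!err) {
		/* ... platform enters the low-power state here ... */
		sysdev_resume();
	}
	local_irq_enable();

	device_resume_noirq(PMSG_RESUME);	/* formerly device_power_up() */
	device_resume(PMSG_RESUME);
	return err;
}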
Global function renames: - device_power_down() -> device_suspend_noirq() - device_power_up() -> device_resume_noirq() Static function renames: - suspend_device_noirq() -> __device_suspend_noirq() - resume_device_noirq() -> __device_resume_noirq() Signed-off-by: Magnus Damm Acked-by: Greg Kroah-Hartman Acked-by: Len Brown Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/apm_32.c | 8 ++++---- drivers/base/power/main.c | 26 +++++++++++++------------- drivers/xen/manage.c | 10 +++++----- include/linux/pm.h | 4 ++-- kernel/kexec.c | 8 ++++---- kernel/power/disk.c | 16 ++++++++-------- kernel/power/main.c | 4 ++-- 7 files changed, 38 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 49e0939bac4..31ae547da15 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1235,7 +1235,7 @@ static int suspend(int vetoable) device_suspend(PMSG_SUSPEND); - device_power_down(PMSG_SUSPEND); + device_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1259,7 +1259,7 @@ static int suspend(int vetoable) sysdev_resume(); local_irq_enable(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); device_resume(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); @@ -1277,7 +1277,7 @@ static void standby(void) { int err; - device_power_down(PMSG_SUSPEND); + device_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1291,7 +1291,7 @@ static void standby(void) sysdev_resume(); local_irq_enable(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); } static apm_event_t get_event(void) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 3e4bc699bc0..c5a35bc9d63 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -315,13 +315,13 @@ static void pm_dev_err(struct device *dev, pm_message_t state, char *info, /*------------------------- Resume routines -------------------------*/ /** - * resume_device_noirq - Power on one device (early resume). + * __device_resume_noirq - Power on one device (early resume). * @dev: Device. * @state: PM transition of the system being carried out. * * Must be called with interrupts disabled. */ -static int resume_device_noirq(struct device *dev, pm_message_t state) +static int __device_resume_noirq(struct device *dev, pm_message_t state) { int error = 0; @@ -363,7 +363,7 @@ static void dpm_power_up(pm_message_t state) int error; dev->power.status = DPM_OFF; - error = resume_device_noirq(dev, state); + error = __device_resume_noirq(dev, state); if (error) pm_dev_err(dev, state, " early", error); } @@ -371,18 +371,18 @@ static void dpm_power_up(pm_message_t state) } /** - * device_power_up - Turn on all devices that need special attention. + * device_resume_noirq - Turn on all devices that need special attention. * @state: PM transition of the system being carried out. * * Call the "early" resume handlers and enable device drivers to receive * interrupts. */ -void device_power_up(pm_message_t state) +void device_resume_noirq(pm_message_t state) { dpm_power_up(state); resume_device_irqs(); } -EXPORT_SYMBOL_GPL(device_power_up); +EXPORT_SYMBOL_GPL(device_resume_noirq); /** * resume_device - Restore state for one device. @@ -577,13 +577,13 @@ static pm_message_t resume_event(pm_message_t sleep_state) } /** - * suspend_device_noirq - Shut down one device (late suspend). + * __device_suspend_noirq - Shut down one device (late suspend). * @dev: Device. 
* @state: PM transition of the system being carried out. * * This is called with interrupts off and only a single CPU running. */ -static int suspend_device_noirq(struct device *dev, pm_message_t state) +static int __device_suspend_noirq(struct device *dev, pm_message_t state) { int error = 0; @@ -602,7 +602,7 @@ static int suspend_device_noirq(struct device *dev, pm_message_t state) } /** - * device_power_down - Shut down special devices. + * device_suspend_noirq - Shut down special devices. * @state: PM transition of the system being carried out. * * Prevent device drivers from receiving interrupts and call the "late" @@ -610,7 +610,7 @@ static int suspend_device_noirq(struct device *dev, pm_message_t state) * * Must be called under dpm_list_mtx. */ -int device_power_down(pm_message_t state) +int device_suspend_noirq(pm_message_t state) { struct device *dev; int error = 0; @@ -618,7 +618,7 @@ int device_power_down(pm_message_t state) suspend_device_irqs(); mutex_lock(&dpm_list_mtx); list_for_each_entry_reverse(dev, &dpm_list, power.entry) { - error = suspend_device_noirq(dev, state); + error = __device_suspend_noirq(dev, state); if (error) { pm_dev_err(dev, state, " late", error); break; @@ -627,10 +627,10 @@ int device_power_down(pm_message_t state) } mutex_unlock(&dpm_list_mtx); if (error) - device_power_up(resume_event(state)); + device_resume_noirq(resume_event(state)); return error; } -EXPORT_SYMBOL_GPL(device_power_down); +EXPORT_SYMBOL_GPL(device_suspend_noirq); /** * suspend_device - Save state of one device. diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index fddc2025dec..d5b327ac403 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -43,7 +43,7 @@ static int xen_suspend(void *data) if (err) { printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", err); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); return err; } @@ -69,7 +69,7 @@ static int xen_suspend(void *data) } sysdev_resume(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); return 0; } @@ -101,9 +101,9 @@ static void do_suspend(void) printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = device_power_down(PMSG_SUSPEND); + err = device_suspend_noirq(PMSG_SUSPEND); if (err) { - printk(KERN_ERR "device_power_down failed: %d\n", err); + printk(KERN_ERR "device_suspend_noirq failed: %d\n", err); goto resume_devices; } @@ -119,7 +119,7 @@ static void do_suspend(void) } else xs_suspend_cancel(); - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); resume_devices: device_resume(PMSG_RESUME); diff --git a/include/linux/pm.h b/include/linux/pm.h index 1d4e2d28982..2170252074f 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -382,12 +382,12 @@ struct dev_pm_info { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); extern int sysdev_resume(void); -extern void device_power_up(pm_message_t state); +extern void device_resume_noirq(pm_message_t state); extern void device_resume(pm_message_t state); extern void device_pm_unlock(void); extern int sysdev_suspend(pm_message_t state); -extern int device_power_down(pm_message_t state); +extern int device_suspend_noirq(pm_message_t state); extern int device_suspend(pm_message_t state); extern int device_prepare_suspend(pm_message_t state); diff --git a/kernel/kexec.c b/kernel/kexec.c index e4983770913..5a3da87adae 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1452,13 +1452,13 @@ int kernel_kexec(void) if (error) goto Resume_console; /* At this point, device_suspend() has 
been called, - * but *not* device_power_down(). We *must* - * device_power_down() now. Otherwise, drivers for + * but *not* device_suspend_noirq(). We *must* call + * device_suspend_noirq() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = device_power_down(PMSG_FREEZE); + error = device_suspend_noirq(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1486,7 +1486,7 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - device_power_up(PMSG_RESTORE); + device_resume_noirq(PMSG_RESTORE); Resume_devices: device_resume(PMSG_RESTORE); Resume_console: diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 5cb080e7eeb..1c18bc894a2 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -216,12 +216,12 @@ static int create_image(int platform_mode) return error; /* At this point, device_suspend() has been called, but *not* - * device_power_down(). We *must* call device_power_down() now. + * device_suspend_noirq(). We *must* call device_suspend_noirq() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) * become desynchronized with the actual state of the hardware * at resume time, and evil weirdness ensues. */ - error = device_power_down(PMSG_FREEZE); + error = device_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -262,7 +262,7 @@ static int create_image(int platform_mode) Power_up: sysdev_resume(); - /* NOTE: device_power_up() is just a resume() for devices + /* NOTE: device_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ @@ -275,7 +275,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - device_power_up(in_suspend ? + device_resume_noirq(in_suspend ? (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode) { int error; - error = device_power_down(PMSG_QUIESCE); + error = device_suspend_noirq(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - device_power_up(PMSG_RECOVER); + device_resume_noirq(PMSG_RECOVER); return error; } @@ -454,7 +454,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = device_power_down(PMSG_HIBERNATE); + error = device_suspend_noirq(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -479,7 +479,7 @@ int hibernation_platform_enter(void) Platofrm_finish: hibernation_ops->finish(); - device_power_up(PMSG_RESTORE); + device_suspend_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index 868028280d1..2f6638ee03c 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -295,7 +295,7 @@ static int suspend_enter(suspend_state_t state) return error; } - error = device_power_down(PMSG_SUSPEND); + error = device_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platfrom_finish; @@ -335,7 +335,7 @@ static int suspend_enter(suspend_state_t state) suspend_ops->wake(); Power_up_devices: - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); Platfrom_finish: if (suspend_ops->finish) -- cgit v1.2.3-70-g09d2 From d161630297a20802d01c55847bfcba85d2118a9f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sun, 24 May 2009 22:05:42 +0200 Subject: PM core: rename suspend and resume functions This patch (as1241) renames a bunch of functions in the PM core. Rather than go through a boring list of name changes, suffice it to say that in the end we have a bunch of pairs of functions: device_resume_noirq dpm_resume_noirq device_resume dpm_resume device_complete dpm_complete device_suspend_noirq dpm_suspend_noirq device_suspend dpm_suspend device_prepare dpm_prepare in which device_X does the X operation on a single device and dpm_X invokes device_X for all devices in the dpm_list. In addition, the old dpm_power_up and device_resume_noirq have been combined into a single function (dpm_resume_noirq). Lastly, dpm_suspend_start and dpm_resume_end are the renamed versions of the former top-level device_suspend and device_resume routines. Signed-off-by: Alan Stern Acked-by: Magnus Damm Signed-off-by: Rafael J. 
Wysocki --- arch/x86/kernel/apm_32.c | 14 ++++----- drivers/base/power/main.c | 80 ++++++++++++++++++++--------------------------- drivers/xen/manage.c | 16 +++++----- include/linux/pm.h | 11 +++---- kernel/kexec.c | 14 ++++----- kernel/power/disk.c | 30 +++++++++--------- kernel/power/main.c | 8 ++--- 7 files changed, 80 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 31ae547da15..79302e9a33a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1233,9 +1233,9 @@ static int suspend(int vetoable) int err; struct apm_user *as; - device_suspend(PMSG_SUSPEND); + dpm_suspend_start(PMSG_SUSPEND); - device_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1259,9 +1259,9 @@ static int suspend(int vetoable) sysdev_resume(); local_irq_enable(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1277,7 @@ static void standby(void) { int err; - device_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); @@ -1291,7 +1291,7 @@ static void standby(void) sysdev_resume(); local_irq_enable(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); } static apm_event_t get_event(void) @@ -1376,7 +1376,7 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); queue_event(event, NULL); } ignore_normal_resume = 0; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index c5a35bc9d63..1f3d82260db 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -315,13 +315,13 @@ static void pm_dev_err(struct device *dev, pm_message_t state, char *info, /*------------------------- Resume routines -------------------------*/ /** - * __device_resume_noirq - Power on one device (early resume). + * device_resume_noirq - Power on one device (early resume). * @dev: Device. * @state: PM transition of the system being carried out. * * Must be called with interrupts disabled. */ -static int __device_resume_noirq(struct device *dev, pm_message_t state) +static int device_resume_noirq(struct device *dev, pm_message_t state) { int error = 0; @@ -344,16 +344,16 @@ static int __device_resume_noirq(struct device *dev, pm_message_t state) } /** - * dpm_power_up - Power on all regular (non-sysdev) devices. + * dpm_resume_noirq - Power on all regular (non-sysdev) devices. * @state: PM transition of the system being carried out. * - * Execute the appropriate "noirq resume" callback for all devices marked - * as DPM_OFF_IRQ. + * Call the "noirq" resume handlers for all devices marked as + * DPM_OFF_IRQ and enable device drivers to receive interrupts. * * Must be called under dpm_list_mtx. Device drivers should not receive * interrupts while it's being executed. 
*/ -static void dpm_power_up(pm_message_t state) +void dpm_resume_noirq(pm_message_t state) { struct device *dev; @@ -363,33 +363,21 @@ static void dpm_power_up(pm_message_t state) int error; dev->power.status = DPM_OFF; - error = __device_resume_noirq(dev, state); + error = device_resume_noirq(dev, state); if (error) pm_dev_err(dev, state, " early", error); } mutex_unlock(&dpm_list_mtx); -} - -/** - * device_resume_noirq - Turn on all devices that need special attention. - * @state: PM transition of the system being carried out. - * - * Call the "early" resume handlers and enable device drivers to receive - * interrupts. - */ -void device_resume_noirq(pm_message_t state) -{ - dpm_power_up(state); resume_device_irqs(); } -EXPORT_SYMBOL_GPL(device_resume_noirq); +EXPORT_SYMBOL_GPL(dpm_resume_noirq); /** - * resume_device - Restore state for one device. + * device_resume - Restore state for one device. * @dev: Device. * @state: PM transition of the system being carried out. */ -static int resume_device(struct device *dev, pm_message_t state) +static int device_resume(struct device *dev, pm_message_t state) { int error = 0; @@ -462,7 +450,7 @@ static void dpm_resume(pm_message_t state) dev->power.status = DPM_RESUMING; mutex_unlock(&dpm_list_mtx); - error = resume_device(dev, state); + error = device_resume(dev, state); mutex_lock(&dpm_list_mtx); if (error) @@ -480,11 +468,11 @@ static void dpm_resume(pm_message_t state) } /** - * complete_device - Complete a PM transition for given device + * device_complete - Complete a PM transition for given device * @dev: Device. * @state: PM transition of the system being carried out. */ -static void complete_device(struct device *dev, pm_message_t state) +static void device_complete(struct device *dev, pm_message_t state) { down(&dev->sem); @@ -527,7 +515,7 @@ static void dpm_complete(pm_message_t state) dev->power.status = DPM_ON; mutex_unlock(&dpm_list_mtx); - complete_device(dev, state); + device_complete(dev, state); mutex_lock(&dpm_list_mtx); } @@ -540,19 +528,19 @@ static void dpm_complete(pm_message_t state) } /** - * device_resume - Restore state of each device in system. + * dpm_resume_end - Restore state of each device in system. * @state: PM transition of the system being carried out. * * Resume all the devices, unlock them all, and allow new * devices to be registered once again. */ -void device_resume(pm_message_t state) +void dpm_resume_end(pm_message_t state) { might_sleep(); dpm_resume(state); dpm_complete(state); } -EXPORT_SYMBOL_GPL(device_resume); +EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ @@ -577,13 +565,13 @@ static pm_message_t resume_event(pm_message_t sleep_state) } /** - * __device_suspend_noirq - Shut down one device (late suspend). + * device_suspend_noirq - Shut down one device (late suspend). * @dev: Device. * @state: PM transition of the system being carried out. * * This is called with interrupts off and only a single CPU running. */ -static int __device_suspend_noirq(struct device *dev, pm_message_t state) +static int device_suspend_noirq(struct device *dev, pm_message_t state) { int error = 0; @@ -602,15 +590,15 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state) } /** - * device_suspend_noirq - Shut down special devices. + * dpm_suspend_noirq - Power down all regular (non-sysdev) devices. * @state: PM transition of the system being carried out. 
* - * Prevent device drivers from receiving interrupts and call the "late" + * Prevent device drivers from receiving interrupts and call the "noirq" * suspend handlers. * * Must be called under dpm_list_mtx. */ -int device_suspend_noirq(pm_message_t state) +int dpm_suspend_noirq(pm_message_t state) { struct device *dev; int error = 0; @@ -618,7 +606,7 @@ int device_suspend_noirq(pm_message_t state) suspend_device_irqs(); mutex_lock(&dpm_list_mtx); list_for_each_entry_reverse(dev, &dpm_list, power.entry) { - error = __device_suspend_noirq(dev, state); + error = device_suspend_noirq(dev, state); if (error) { pm_dev_err(dev, state, " late", error); break; @@ -627,17 +615,17 @@ int device_suspend_noirq(pm_message_t state) } mutex_unlock(&dpm_list_mtx); if (error) - device_resume_noirq(resume_event(state)); + dpm_resume_noirq(resume_event(state)); return error; } -EXPORT_SYMBOL_GPL(device_suspend_noirq); +EXPORT_SYMBOL_GPL(dpm_suspend_noirq); /** - * suspend_device - Save state of one device. + * device_suspend - Save state of one device. * @dev: Device. * @state: PM transition of the system being carried out. */ -static int suspend_device(struct device *dev, pm_message_t state) +static int device_suspend(struct device *dev, pm_message_t state) { int error = 0; @@ -704,7 +692,7 @@ static int dpm_suspend(pm_message_t state) get_device(dev); mutex_unlock(&dpm_list_mtx); - error = suspend_device(dev, state); + error = device_suspend(dev, state); mutex_lock(&dpm_list_mtx); if (error) { @@ -723,11 +711,11 @@ static int dpm_suspend(pm_message_t state) } /** - * prepare_device - Execute the ->prepare() callback(s) for given device. + * device_prepare - Execute the ->prepare() callback(s) for given device. * @dev: Device. * @state: PM transition of the system being carried out. */ -static int prepare_device(struct device *dev, pm_message_t state) +static int device_prepare(struct device *dev, pm_message_t state) { int error = 0; @@ -781,7 +769,7 @@ static int dpm_prepare(pm_message_t state) dev->power.status = DPM_PREPARING; mutex_unlock(&dpm_list_mtx); - error = prepare_device(dev, state); + error = device_prepare(dev, state); mutex_lock(&dpm_list_mtx); if (error) { @@ -807,12 +795,12 @@ static int dpm_prepare(pm_message_t state) } /** - * device_suspend - Save state and stop all devices in system. + * dpm_suspend_start - Save state and stop all devices in system. * @state: PM transition of the system being carried out. * * Prepare and suspend all devices. 
*/ -int device_suspend(pm_message_t state) +int dpm_suspend_start(pm_message_t state) { int error; @@ -822,7 +810,7 @@ int device_suspend(pm_message_t state) error = dpm_suspend(state); return error; } -EXPORT_SYMBOL_GPL(device_suspend); +EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, void *fn, int ret) { diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index d5b327ac403..10d03d7931c 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -43,7 +43,7 @@ static int xen_suspend(void *data) if (err) { printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", err); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); return err; } @@ -69,7 +69,7 @@ static int xen_suspend(void *data) } sysdev_resume(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); return 0; } @@ -92,18 +92,18 @@ static void do_suspend(void) } #endif - err = device_suspend(PMSG_SUSPEND); + err = dpm_suspend_start(PMSG_SUSPEND); if (err) { - printk(KERN_ERR "xen suspend: device_suspend %d\n", err); + printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); goto out; } printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = device_suspend_noirq(PMSG_SUSPEND); + err = dpm_suspend_noirq(PMSG_SUSPEND); if (err) { - printk(KERN_ERR "device_suspend_noirq failed: %d\n", err); + printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); goto resume_devices; } @@ -119,10 +119,10 @@ static void do_suspend(void) } else xs_suspend_cancel(); - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); resume_devices: - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); /* Make sure timer events get retriggered on all CPUs */ clock_was_set(); diff --git a/include/linux/pm.h b/include/linux/pm.h index 2170252074f..b3f74764a58 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -382,14 +382,13 @@ struct dev_pm_info { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); extern int sysdev_resume(void); -extern void device_resume_noirq(pm_message_t state); -extern void device_resume(pm_message_t state); +extern void dpm_resume_noirq(pm_message_t state); +extern void dpm_resume_end(pm_message_t state); extern void device_pm_unlock(void); extern int sysdev_suspend(pm_message_t state); -extern int device_suspend_noirq(pm_message_t state); -extern int device_suspend(pm_message_t state); -extern int device_prepare_suspend(pm_message_t state); +extern int dpm_suspend_noirq(pm_message_t state); +extern int dpm_suspend_start(pm_message_t state); extern void __suspend_report_result(const char *function, void *fn, int ret); @@ -403,7 +402,7 @@ extern void __suspend_report_result(const char *function, void *fn, int ret); #define device_pm_lock() do {} while (0) #define device_pm_unlock() do {} while (0) -static inline int device_suspend(pm_message_t state) +static inline int dpm_suspend_start(pm_message_t state) { return 0; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 5a3da87adae..ae1c35201cc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1448,17 +1448,17 @@ int kernel_kexec(void) goto Restore_console; } suspend_console(); - error = device_suspend(PMSG_FREEZE); + error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; - /* At this point, device_suspend() has been called, - * but *not* device_suspend_noirq(). We *must* call - * device_suspend_noirq() now. Otherwise, drivers for + /* At this point, dpm_suspend_start() has been called, + * but *not* dpm_suspend_noirq(). 
We *must* call + * dpm_suspend_noirq() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = device_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_noirq(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1486,9 +1486,9 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - device_resume_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: - device_resume(PMSG_RESTORE); + dpm_resume_end(PMSG_RESTORE); Resume_console: resume_console(); thaw_processes(); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 1c18bc894a2..a9beba68b6c 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -215,13 +215,13 @@ static int create_image(int platform_mode) if (error) return error; - /* At this point, device_suspend() has been called, but *not* - * device_suspend_noirq(). We *must* call device_suspend_noirq() now. + /* At this point, dpm_suspend_start() has been called, but *not* + * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) * become desynchronized with the actual state of the hardware * at resume time, and evil weirdness ensues. */ - error = device_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -262,7 +262,7 @@ static int create_image(int platform_mode) Power_up: sysdev_resume(); - /* NOTE: device_resume_noirq() is just a resume() for devices + /* NOTE: dpm_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ @@ -275,7 +275,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - device_resume_noirq(in_suspend ? + dpm_resume_noirq(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -304,7 +304,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - error = device_suspend(PMSG_FREEZE); + error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -315,7 +315,7 @@ int hibernation_snapshot(int platform_mode) /* Control returns here after successful restore */ Resume_devices: - device_resume(in_suspend ? + dpm_resume_end(in_suspend ? (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); resume_console(); Close: @@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode) { int error; - error = device_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_noirq(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - device_resume_noirq(PMSG_RECOVER); + dpm_resume_noirq(PMSG_RECOVER); return error; } @@ -414,10 +414,10 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_QUIESCE); + error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - device_resume(PMSG_RECOVER); + dpm_resume_end(PMSG_RECOVER); } resume_console(); pm_restore_console(); @@ -447,14 +447,14 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - error = device_suspend(PMSG_HIBERNATE); + error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) hibernation_ops->recover(); goto Resume_devices; } - error = device_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_noirq(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -479,11 +479,11 @@ int hibernation_platform_enter(void) Platofrm_finish: hibernation_ops->finish(); - device_suspend_noirq(PMSG_RESTORE); + dpm_suspend_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; - device_resume(PMSG_RESTORE); + dpm_resume_end(PMSG_RESTORE); resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 2f6638ee03c..46386b9f8dd 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -295,7 +295,7 @@ static int suspend_enter(suspend_state_t state) return error; } - error = device_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platfrom_finish; @@ -335,7 +335,7 @@ static int suspend_enter(suspend_state_t state) suspend_ops->wake(); Power_up_devices: - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); Platfrom_finish: if (suspend_ops->finish) @@ -363,7 +363,7 @@ int suspend_devices_and_enter(suspend_state_t state) } suspend_console(); suspend_test_start(); - error = device_suspend(PMSG_SUSPEND); + error = dpm_suspend_start(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); goto Recover_platform; @@ -376,7 +376,7 @@ int suspend_devices_and_enter(suspend_state_t state) Resume_devices: suspend_test_start(); - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); resume_console(); Close: -- cgit v1.2.3-70-g09d2 From e240b58c79144708530138e05f17c6d0d8d744a8 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Sun, 24 May 2009 22:05:54 +0200 Subject: PM: Remove bus_type suspend_late()/resume_early() V2 Remove the ->suspend_late() and ->resume_early() callbacks from struct bus_type V2. These callbacks are legacy stuff at this point and since there seem to be no in-tree users we may as well remove them. New users should use dev_pm_ops. Signed-off-by: Magnus Damm Acked-by: Pavel Machek Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J. 
Wysocki --- Documentation/power/devices.txt | 34 +++++----------------------------- drivers/base/power/main.c | 7 ------- include/linux/device.h | 2 -- 3 files changed, 5 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 421e7d00ffd..c9abbd86bc1 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -75,9 +75,6 @@ may need to apply in domain-specific ways to their devices: struct bus_type { ... int (*suspend)(struct device *dev, pm_message_t state); - int (*suspend_late)(struct device *dev, pm_message_t state); - - int (*resume_early)(struct device *dev); int (*resume)(struct device *dev); }; @@ -226,20 +223,7 @@ The phases are seen by driver notifications issued in this order: This call should handle parts of device suspend logic that require sleeping. It probably does work to quiesce the device which hasn't - been abstracted into class.suspend() or bus.suspend_late(). - - 3 bus.suspend_late(dev, message) is called with IRQs disabled, and - with only one CPU active. Until the bus.resume_early() phase - completes (see later), IRQs are not enabled again. This method - won't be exposed by all busses; for message based busses like USB, - I2C, or SPI, device interactions normally require IRQs. This bus - call may be morphed into a driver call with bus-specific parameters. - - This call might save low level hardware state that might otherwise - be lost in the upcoming low power state, and actually put the - device into a low power state ... so that in some cases the device - may stay partly usable until this late. This "late" call may also - help when coping with hardware that behaves badly. + been abstracted into class.suspend(). The pm_message_t parameter is currently used to refine those semantics (described later). @@ -351,19 +335,11 @@ devices processing each phase's calls before the next phase begins. The phases are seen by driver notifications issued in this order: - 1 bus.resume_early(dev) is called with IRQs disabled, and with - only one CPU active. As with bus.suspend_late(), this method - won't be supported on busses that require IRQs in order to - interact with devices. - - This reverses the effects of bus.suspend_late(). - - 2 bus.resume(dev) is called next. This may be morphed into a device - driver call with bus-specific parameters; implementations may sleep. - - This reverses the effects of bus.suspend(). + 1 bus.resume(dev) reverses the effects of bus.suspend(). This may + be morphed into a device driver call with bus-specific parameters; + implementations may sleep. - 3 class.resume(dev) is called for devices associated with a class + 2 class.resume(dev) is called for devices associated with a class that has such a method. Implementations may sleep. 
This reverses the effects of class.suspend(), and would usually diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 1f3d82260db..68f9f3cecf7 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -334,9 +334,6 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) if (dev->bus->pm) { pm_dev_dbg(dev, state, "EARLY "); error = pm_noirq_op(dev, dev->bus->pm, state); - } else if (dev->bus->resume_early) { - pm_dev_dbg(dev, state, "legacy EARLY "); - error = dev->bus->resume_early(dev); } End: TRACE_RESUME(error); @@ -581,10 +578,6 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) if (dev->bus->pm) { pm_dev_dbg(dev, state, "LATE "); error = pm_noirq_op(dev, dev->bus->pm, state); - } else if (dev->bus->suspend_late) { - pm_dev_dbg(dev, state, "legacy LATE "); - error = dev->bus->suspend_late(dev, state); - suspend_report_result(dev->bus->suspend_late, error); } return error; } diff --git a/include/linux/device.h b/include/linux/device.h index 5d5c197bad4..84d79cde9f7 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -62,8 +62,6 @@ struct bus_type { void (*shutdown)(struct device *dev); int (*suspend)(struct device *dev, pm_message_t state); - int (*suspend_late)(struct device *dev, pm_message_t state); - int (*resume_early)(struct device *dev); int (*resume)(struct device *dev); struct dev_pm_ops *pm; -- cgit v1.2.3-70-g09d2 From 00725787511e20dbd1fdc1fb233606120ae5c8cf Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 4 Jun 2009 22:13:25 +0200 Subject: PM: Remove device_type suspend()/resume() This patch removes the legacy callbacks ->suspend() and ->resume() from struct device_type. These callbacks seem unused, and new code should instead make use of struct dev_pm_ops. Signed-off-by: Magnus Damm Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 7 ------- include/linux/device.h | 3 --- 2 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 68f9f3cecf7..fae72545898 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -399,9 +399,6 @@ static int device_resume(struct device *dev, pm_message_t state) if (dev->type->pm) { pm_dev_dbg(dev, state, "type "); error = pm_op(dev, dev->type->pm, state); - } else if (dev->type->resume) { - pm_dev_dbg(dev, state, "legacy type "); - error = dev->type->resume(dev); } if (error) goto End; @@ -641,10 +638,6 @@ static int device_suspend(struct device *dev, pm_message_t state) if (dev->type->pm) { pm_dev_dbg(dev, state, "type "); error = pm_op(dev, dev->type->pm, state); - } else if (dev->type->suspend) { - pm_dev_dbg(dev, state, "legacy type "); - error = dev->type->suspend(dev, state); - suspend_report_result(dev->type->suspend, error); } if (error) goto End; diff --git a/include/linux/device.h b/include/linux/device.h index 84d79cde9f7..a4a7b10aaa4 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -289,9 +289,6 @@ struct device_type { int (*uevent)(struct device *dev, struct kobj_uevent_env *env); void (*release)(struct device *dev); - int (*suspend)(struct device *dev, pm_message_t state); - int (*resume)(struct device *dev); - struct dev_pm_ops *pm; }; -- cgit v1.2.3-70-g09d2 From fce2b111fae9151a53dabb36513b398d03337a19 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 10 Jun 2009 01:28:19 +0200 Subject: PM/Hibernate: Move NVS routines into a separate file (v2).
The *_nvs_* routines in swsusp.c make use of the io*map() functions, which are only provided for HAS_IOMEM, thus breaking compilation if HAS_IOMEM is not set. Fix this by moving the *_nvs_* routines into hibernate_nvs.c, which is only compiled if HAS_IOMEM is set. [rjw: Change the name of the new file to hibernate_nvs.c, add the license line to the header comment.] Signed-off-by: Cornelia Huck Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 18 +++--- kernel/power/Kconfig | 4 ++ kernel/power/Makefile | 1 + kernel/power/hibernate_nvs.c | 135 +++++++++++++++++++++++++++++++++++++++++++ kernel/power/swsusp.c | 122 -------------------------------------- 5 files changed, 151 insertions(+), 129 deletions(-) create mode 100644 kernel/power/hibernate_nvs.c (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 795032edfc4..cd15df6c63c 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -245,11 +245,6 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern void hibernation_set_ops(struct platform_hibernation_ops *ops); extern int hibernate(void); -extern int hibernate_nvs_register(unsigned long start, unsigned long size); -extern int hibernate_nvs_alloc(void); -extern void hibernate_nvs_free(void); -extern void hibernate_nvs_save(void); -extern void hibernate_nvs_restore(void); extern bool system_entering_hibernation(void); #else /* CONFIG_HIBERNATION */ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } @@ -258,6 +253,16 @@ static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } +static inline bool system_entering_hibernation(void) { return false; } +#endif /* CONFIG_HIBERNATION */ + +#ifdef CONFIG_HIBERNATION_NVS +extern int hibernate_nvs_register(unsigned long start, unsigned long size); +extern int hibernate_nvs_alloc(void); +extern void hibernate_nvs_free(void); +extern void hibernate_nvs_save(void); +extern void hibernate_nvs_restore(void); +#else /* CONFIG_HIBERNATION_NVS */ static inline int hibernate_nvs_register(unsigned long a, unsigned long b) { return 0; @@ -266,8 +271,7 @@ static inline int hibernate_nvs_alloc(void) { return 0; } static inline void hibernate_nvs_free(void) {} static inline void hibernate_nvs_save(void) {} static inline void hibernate_nvs_restore(void) {} -static inline bool system_entering_hibernation(void) { return false; } -#endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_HIBERNATION_NVS */ #ifdef CONFIG_PM_SLEEP void save_processor_state(void); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 23bd4daeb96..72067cbdb37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -116,9 +116,13 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config HIBERNATION_NVS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATION_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. 
STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index eadb17fc8f5..c3b81c30e5d 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -9,5 +9,6 @@ obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c new file mode 100644 index 00000000000..39ac698ef83 --- /dev/null +++ b/kernel/power/hibernate_nvs.c @@ -0,0 +1,135 @@ +/* + * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory + * + * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. 
+ */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 87b901cb392..6a07f4dbf2f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -186,125 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10); } - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * hibernate_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int hibernate_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * hibernate_nvs_free - free data pages allocated for saving NVS regions - */ -void hibernate_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int hibernate_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - hibernate_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * hibernate_nvs_save - save NVS memory regions - */ -void hibernate_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * hibernate_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. 
- */ -void hibernate_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} -- cgit v1.2.3-70-g09d2 From 5818a6e2519b34cd6d0220d89f5729ab2725e1bf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 11 Jun 2009 21:59:21 +0200 Subject: PM: Add empty suspend/resume device irq functions git commit 0a0c5168 "PM: Introduce functions for suspending and resuming device interrupts" introduced some helper functions. However these functions are only available for architectures which support GENERIC_HARDIRQS. Other architectures will see this build error: drivers/built-in.o: In function `sysdev_suspend': (.text+0x15138): undefined reference to `check_wakeup_irqs' drivers/built-in.o: In function `device_power_up': (.text+0x1cb66): undefined reference to `resume_device_irqs' drivers/built-in.o: In function `device_power_down': (.text+0x1cb92): undefined reference to `suspend_device_irqs' To fix this add some empty inline functions for !GENERIC_HARDIRQS. Signed-off-by: Heiko Carstens Signed-off-by: Rafael J. Wysocki --- include/linux/interrupt.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index ff374ceface..c41e812e9d5 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -183,6 +183,7 @@ extern void disable_irq(unsigned int irq); extern void enable_irq(unsigned int irq); /* The following three functions are for the core kernel use only. */ +#ifdef CONFIG_GENERIC_HARDIRQS extern void suspend_device_irqs(void); extern void resume_device_irqs(void); #ifdef CONFIG_PM_SLEEP @@ -190,6 +191,11 @@ extern int check_wakeup_irqs(void); #else static inline int check_wakeup_irqs(void) { return 0; } #endif +#else +static inline void suspend_device_irqs(void) { }; +static inline void resume_device_irqs(void) { }; +static inline int check_wakeup_irqs(void) { return 0; } +#endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -- cgit v1.2.3-70-g09d2 From acc6be5405b90c9f0fb0eb8a74ec4d4b7b5bf48f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 20 May 2008 11:15:43 +0200 Subject: x86: add save_stack_trace_bp() for tracing from a specific stack frame This will help kmemcheck (and possibly other debugging tools) since we can now simply pass regs->bp to the stack tracer instead of specifying the number of stack frames to skip, which is unreliable if gcc decides to inline functions, etc. Note that this makes the API incomplete for other architectures, but I expect that those can be updated lazily, e.g. when they need it. 
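To make the intended use concrete, here is a minimal sketch (an illustration, not part of the patch; the buffer name and size are made up) of how a debugging tool holding the faulting pt_regs could start the stack walk at the frame pointer it already knows, instead of guessing a skip count:

	static unsigned long example_entries[32];

	static void example_report(struct pt_regs *regs)
	{
		struct stack_trace trace = {
			.nr_entries  = 0,
			.entries     = example_entries,
			.max_entries = ARRAY_SIZE(example_entries),
			.skip        = 0,
		};

		/* Walk from the faulting frame, not from this call site. */
		save_stack_trace_bp(&trace, regs->bp);
		print_stack_trace(&trace, 0);
	}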
Cc: Arjan van de Ven Signed-off-by: Vegard Nossum --- arch/x86/kernel/stacktrace.c | 7 +++++++ include/linux/stacktrace.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4aaf7e48394..c3eb207181f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace); +void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +{ + dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 1a8cecc4f38..551f6c7d504 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -11,6 +11,7 @@ struct stack_trace { }; extern void save_stack_trace(struct stack_trace *trace); +extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp); extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); -- cgit v1.2.3-70-g09d2 From b618ad31bb2020db6a36929122e5554e33210d47 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 13 Jun 2008 15:31:11 +0200 Subject: stacktrace: add forward-declaration struct task_struct This is needed if the header is to be free-standing. Signed-off-by: Vegard Nossum --- include/linux/stacktrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 551f6c7d504..51efbef38fb 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -4,6 +4,8 @@ struct task_struct; #ifdef CONFIG_STACKTRACE +struct task_struct; + struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; -- cgit v1.2.3-70-g09d2 From c4bf2f372db09ef8d16a25a60d523bfa1c50f7b5 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 11 Jun 2009 23:53:55 -0400 Subject: ACPI, PCI, x86: move MCFG parsing routine from ACPI to PCI file Move arch/x86/kernel/acpi/boot.c: acpi_parse_mcfg() to arch/x86/pci/mmconfig-shared.c: pci_parse_mcfg() where it is used, and make it static. Move associated globals and helper routine with it. No functional change. This code move is in preparation for SFI support, which will allow the PCI code to find the MCFG table on systems which do not support ACPI. Signed-off-by: Len Brown Acked-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 3 ++ arch/x86/kernel/acpi/boot.c | 66 ------------------------------------------ arch/x86/pci/mmconfig-shared.c | 65 ++++++++++++++++++++++++++++++++++++++++- include/linux/acpi.h | 3 -- 4 files changed, 67 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index e60fd3e14bd..b399988eee3 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -121,6 +121,9 @@ extern int __init pcibios_init(void); extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); +extern struct acpi_mcfg_allocation *pci_mmcfg_config; +extern int pci_mmcfg_config_num; + /* * AMD Fam10h CPUs are buggy, and cannot access MMIO config space * on their northbrige except through the * %eax register. 
As such, you MUST diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 817d6a5e115..f54e0e557cd 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -117,72 +117,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size) early_iounmap(map, size); } -#ifdef CONFIG_PCI_MMCONFIG - -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; - -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -struct acpi_mcfg_allocation *pci_mmcfg_config; -int pci_mmcfg_config_num; - -static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) -{ - if (!strcmp(mcfg->header.oem_id, "SGI")) - acpi_mcfg_64bit_base_addr = TRUE; - - return 0; -} - -int __init acpi_parse_mcfg(struct acpi_table_header *header) -{ - struct acpi_table_mcfg *mcfg; - unsigned long i; - int config_size; - - if (!header) - return -EINVAL; - - mcfg = (struct acpi_table_mcfg *)header; - - /* how many config structures do we have */ - pci_mmcfg_config_num = 0; - i = header->length - sizeof(struct acpi_table_mcfg); - while (i >= sizeof(struct acpi_mcfg_allocation)) { - ++pci_mmcfg_config_num; - i -= sizeof(struct acpi_mcfg_allocation); - }; - if (pci_mmcfg_config_num == 0) { - printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); - return -ENODEV; - } - - config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); - pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); - if (!pci_mmcfg_config) { - printk(KERN_WARNING PREFIX - "No memory for MCFG config tables\n"); - return -ENOMEM; - } - - memcpy(pci_mmcfg_config, &mcfg[1], config_size); - - acpi_mcfg_oem_check(mcfg); - - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && - !acpi_mcfg_64bit_base_addr) { - printk(KERN_ERR PREFIX - "MMCONFIG not in low 4GB of memory\n"); - kfree(pci_mmcfg_config); - pci_mmcfg_config_num = 0; - return -ENODEV; - } - } - - return 0; -} -#endif /* CONFIG_PCI_MMCONFIG */ - #ifdef CONFIG_X86_LOCAL_APIC static int __init acpi_parse_madt(struct acpi_table_header *table) { diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 8766b0e216c..712443ec6d4 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -523,6 +523,69 @@ reject: static int __initdata known_bridge; +static int acpi_mcfg_64bit_base_addr __initdata = FALSE; + +/* The physical address of the MMCONFIG aperture. Set from ACPI tables. 
*/ +struct acpi_mcfg_allocation *pci_mmcfg_config; +int pci_mmcfg_config_num; + +static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) +{ + if (!strcmp(mcfg->header.oem_id, "SGI")) + acpi_mcfg_64bit_base_addr = TRUE; + + return 0; +} + +static int __init pci_parse_mcfg(struct acpi_table_header *header) +{ + struct acpi_table_mcfg *mcfg; + unsigned long i; + int config_size; + + if (!header) + return -EINVAL; + + mcfg = (struct acpi_table_mcfg *)header; + + /* how many config structures do we have */ + pci_mmcfg_config_num = 0; + i = header->length - sizeof(struct acpi_table_mcfg); + while (i >= sizeof(struct acpi_mcfg_allocation)) { + ++pci_mmcfg_config_num; + i -= sizeof(struct acpi_mcfg_allocation); + }; + if (pci_mmcfg_config_num == 0) { + printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); + return -ENODEV; + } + + config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); + pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); + if (!pci_mmcfg_config) { + printk(KERN_WARNING PREFIX + "No memory for MCFG config tables\n"); + return -ENOMEM; + } + + memcpy(pci_mmcfg_config, &mcfg[1], config_size); + + acpi_mcfg_oem_check(mcfg); + + for (i = 0; i < pci_mmcfg_config_num; ++i) { + if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && + !acpi_mcfg_64bit_base_addr) { + printk(KERN_ERR PREFIX + "MMCONFIG not in low 4GB of memory\n"); + kfree(pci_mmcfg_config); + pci_mmcfg_config_num = 0; + return -ENODEV; + } + } + + return 0; +} + static void __init __pci_mmcfg_init(int early) { /* MMCONFIG disabled */ @@ -543,7 +606,7 @@ static void __init __pci_mmcfg_init(int early) } if (!known_bridge) - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); + acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); pci_mmcfg_reject_broken(early); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 88be890ee3c..73cb141150d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -113,9 +113,6 @@ void acpi_irq_stats_init(void); extern u32 acpi_irq_handled; extern u32 acpi_irq_not_handled; -extern struct acpi_mcfg_allocation *pci_mmcfg_config; -extern int pci_mmcfg_config_num; - extern int sbf_port; extern unsigned long acpi_realmode_flags; -- cgit v1.2.3-70-g09d2 From 4a7a16dc061e4c57bf288150f51bd4c2ace33723 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 12 Jun 2009 20:42:08 -0400 Subject: ACPI: move declaration acpi_early_init() to acpi.h Signed-off-by: Len Brown --- include/linux/acpi.h | 3 +++ init/main.c | 6 +----- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 73cb141150d..bf17681cb06 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -290,7 +290,10 @@ void __init acpi_s4_no_nvs(void); OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL) extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags); +extern void acpi_early_init(void); + #else /* CONFIG_ACPI */ +static inline void acpi_early_init(void) { } static inline int early_acpi_boot_init(void) { diff --git a/init/main.c b/init/main.c index d721dad05dd..f1b9f0fdb1b 100644 --- a/init/main.c +++ b/init/main.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -86,11 +87,6 @@ extern void sbus_init(void); extern void prio_tree_init(void); extern void radix_tree_init(void); extern void free_initmem(void); -#ifdef CONFIG_ACPI -extern void acpi_early_init(void); -#else -static inline void acpi_early_init(void) { } -#endif #ifndef CONFIG_DEBUG_RODATA static inline void mark_rodata_ro(void) { 
} #endif -- cgit v1.2.3-70-g09d2
From 8eae985f08138758e06503588f5f1196269bc415 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 9 May 2008 20:32:44 +0200 Subject: slab: move struct kmem_cache to headers Move the SLAB struct kmem_cache definition to <linux/slab_def.h> like with SLUB so kmemcheck can access ->ctor and ->flags. Cc: Ingo Molnar Cc: Christoph Lameter Cc: Andrew Morton Signed-off-by: Pekka Enberg [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- include/linux/slab_def.h | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ mm/slab.c | 81 ------------------------------------------------ 2 files changed, 81 insertions(+), 81 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 713f841ecaa..850d057500d 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -16,6 +16,87 @@ #include #include +/* + * struct kmem_cache + * + * manages a cache. + */ + +struct kmem_cache { +/* 1) per-cpu data, touched during every alloc/free */ + struct array_cache *array[NR_CPUS]; +/* 2) Cache tunables. Protected by cache_chain_mutex */ + unsigned int batchcount; + unsigned int limit; + unsigned int shared; + + unsigned int buffer_size; + u32 reciprocal_buffer_size; +/* 3) touched by every alloc & free from the backend */ + + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + +/* 4) cache_grow/shrink */ + /* order of pgs per slab (2^n) */ + unsigned int gfporder; + + /* force GFP flags, e.g. GFP_DMA */ + gfp_t gfpflags; + + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + struct kmem_cache *slabp_cache; + unsigned int slab_size; + unsigned int dflags; /* dynamic flags */ + + /* constructor func */ + void (*ctor)(void *obj); + +/* 5) cache creation/removal */ + const char *name; + struct list_head next; + +/* 6) statistics */ +#ifdef CONFIG_DEBUG_SLAB + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long grown; + unsigned long reaped; + unsigned long errors; + unsigned long max_freeable; + unsigned long node_allocs; + unsigned long node_frees; + unsigned long node_overflow; + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; + + /* + * If debugging is enabled, then the allocator can add additional + * fields and/or padding to every object. buffer_size contains the total + * object size including these internal fields, the following two + * variables contain the offset to the user object and its size. + */ + int obj_offset; + int obj_size; +#endif /* CONFIG_DEBUG_SLAB */ + + /* + * We put nodelists[] at the end of kmem_cache, because we want to size + * this array to nr_node_ids slots instead of MAX_NUMNODES + * (see kmem_cache_init()) + * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of nodes. + */ + struct kmem_list3 *nodelists[MAX_NUMNODES]; + /* + * Do not add fields after nodelists[] + */ +}; + /* Size description struct for general caches. */ struct cache_sizes { size_t cs_size;
diff --git a/mm/slab.c b/mm/slab.c index f46b65d124e..bf0c3af143f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -374,87 +374,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -/* - * struct kmem_cache - * - * manages a cache.
- */ - -struct kmem_cache { -/* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; -/* 2) Cache tunables. Protected by cache_chain_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int buffer_size; - u32 reciprocal_buffer_size; -/* 3) touched by every alloc & free from the backend */ - - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 4) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. GFP_DMA */ - gfp_t gfpflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - struct kmem_cache *slabp_cache; - unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ - - /* constructor func */ - void (*ctor)(void *obj); - -/* 5) cache creation/removal */ - const char *name; - struct list_head next; - -/* 6) statistics */ -#if STATS - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; -#endif -#if DEBUG - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. buffer_size contains the total - * object size including these internal fields, the following two - * variables contain the offset to the user object and its size. - */ - int obj_offset; - int obj_size; -#endif - /* - * We put nodelists[] at the end of kmem_cache, because we want to size - * this array to nr_node_ids slots instead of MAX_NUMNODES - * (see kmem_cache_init()) - * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache - * is statically defined, so we reserve the max number of nodes. - */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; - /* - * Do not add fields after nodelists[] - */ -}; - #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) -- cgit v1.2.3-70-g09d2
From 7c692cbade8b8884f1c20500393bcc7cd6d24ef8 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 21 May 2008 22:53:13 +0200 Subject: tasklets: new tasklet scheduling function Rationale: kmemcheck needs to be able to schedule a tasklet without touching any dynamically allocated memory _at_ _all_ (since that would lead to a recursive page fault). This tasklet is used for writing the error reports to the kernel log. The new scheduling function avoids touching any other tasklets by inserting the new tasklet as the head of the "tasklet_hi" list instead of on the tail. Also don't wake up the softirq thread lest the scheduler access some tracked memory and we go down with a recursive page fault. In this case, we'd better just wait for the maximum time of 1/HZ for the message to appear.
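As a usage sketch (an illustration, not part of the patch; the names are made up): both the tasklet and its handler live in static storage, so scheduling it from a trap handler touches no dynamically allocated memory:

	static void report_func(unsigned long data)
	{
		printk(KERN_INFO "deferred error report\n");
	}

	static DECLARE_TASKLET(report_tasklet, &report_func, 0);

	static void from_trap_handler(void)
	{
		/* Goes to the front of tasklet_hi_vec; no softirq thread wakeup. */
		tasklet_hi_schedule_first(&report_tasklet);
	}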
Signed-off-by: Vegard Nossum --- include/linux/interrupt.h | 14 ++++++++++++++ kernel/softirq.c | 11 +++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux')
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index ff374ceface..dd574d51bca 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -466,6 +466,20 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t) __tasklet_hi_schedule(t); } +extern void __tasklet_hi_schedule_first(struct tasklet_struct *t); + +/* + * This version avoids touching any other tasklets. Needed for kmemcheck + * in order not to take any page faults while enqueueing this tasklet; + * consider VERY carefully whether you really need this or + * tasklet_hi_schedule()... + */ +static inline void tasklet_hi_schedule_first(struct tasklet_struct *t) +{ + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + __tasklet_hi_schedule_first(t); +} + static inline void tasklet_disable_nosync(struct tasklet_struct *t) {
diff --git a/kernel/softirq.c b/kernel/softirq.c index 258885a543d..b41fb710e11 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -382,6 +382,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) EXPORT_SYMBOL(__tasklet_hi_schedule); +void __tasklet_hi_schedule_first(struct tasklet_struct *t) +{ + BUG_ON(!irqs_disabled()); + + t->next = __get_cpu_var(tasklet_hi_vec).head; + __get_cpu_var(tasklet_hi_vec).head = t; + __raise_softirq_irqoff(HI_SOFTIRQ); +} + +EXPORT_SYMBOL(__tasklet_hi_schedule_first); + static void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; -- cgit v1.2.3-70-g09d2
From dd14be4c274fc484eccace03ae9726e516630331 Mon Sep 17 00:00:00 2001 From: Richard Röjfors Date: Fri, 5 Jun 2009 15:40:32 +0200 Subject: i2c-ocores: Can add I2C devices to the bus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is sometimes a need for the ocores driver to add devices to the bus when installed. i2c_register_board_info can not always be used, because the I2C devices are not known at an early stage, they could for instance be connected on an I2C bus on a PCI device which has the Open Cores IP. i2c_new_device can not be used in all cases either since the resulting bus number might be unknown. The solution is to pass a list of I2C devices in the platform data to the Open Cores driver. This is useful for MFD drivers. Signed-off-by: Richard Röjfors Signed-off-by: Ben Dooks --- Documentation/i2c/busses/i2c-ocores | 17 +++++++++++++++++ drivers/i2c/busses/i2c-ocores.c | 5 +++++ include/linux/i2c-ocores.h | 2 ++ 3 files changed, 24 insertions(+) (limited to 'include/linux')
diff --git a/Documentation/i2c/busses/i2c-ocores b/Documentation/i2c/busses/i2c-ocores index cfcebb10d14..c269aaa2f26 100644 --- a/Documentation/i2c/busses/i2c-ocores +++ b/Documentation/i2c/busses/i2c-ocores @@ -20,6 +20,8 @@ platform_device with the base address and interrupt number. The dev.platform_data of the device should also point to a struct ocores_i2c_platform_data (see linux/i2c-ocores.h) describing the distance between registers and the input clock speed. +There is also a possibility to attach a list of i2c_board_info which +the i2c-ocores driver will add to the bus upon creation. E.G.
something like: @@ -36,9 +38,24 @@ static struct resource ocores_resources[] = { }, }; +/* optional board info */ +struct i2c_board_info ocores_i2c_board_info[] = { + { + I2C_BOARD_INFO("tsc2003", 0x48), + .platform_data = &tsc2003_platform_data, + .irq = TSC_IRQ + }, + { + I2C_BOARD_INFO("adv7180", 0x42 >> 1), + .irq = ADV_IRQ + } +}; + static struct ocores_i2c_platform_data myi2c_data = { .regstep = 2, /* two bytes between registers */ .clock_khz = 50000, /* input clock of 50MHz */ + .devices = ocores_i2c_board_info, /* optional table of devices */ + .num_devices = ARRAY_SIZE(ocores_i2c_board_info), /* table size */ }; static struct platform_device myi2c = {
diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index e5193bf7548..3542c6ba98f 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -216,6 +216,7 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev) struct ocores_i2c_platform_data *pdata; struct resource *res, *res2; int ret; + int i; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) @@ -271,6 +272,10 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev) goto add_adapter_failed; } + /* add in known devices to the bus */ + for (i = 0; i < pdata->num_devices; i++) + i2c_new_device(&i2c->adap, pdata->devices + i); + return 0; add_adapter_failed:
diff --git a/include/linux/i2c-ocores.h b/include/linux/i2c-ocores.h index 8ed591b0887..4d5e57ff661 100644 --- a/include/linux/i2c-ocores.h +++ b/include/linux/i2c-ocores.h @@ -14,6 +14,8 @@ struct ocores_i2c_platform_data { u32 regstep; /* distance between registers */ u32 clock_khz; /* input clock in kHz */ + u8 num_devices; /* number of devices in the devices list */ + struct i2c_board_info const *devices; /* devices connected to the bus */ }; #endif /* _LINUX_I2C_OCORES_H */ -- cgit v1.2.3-70-g09d2
From cd6d95d8449b7c9f415f26041e9ae173d387b6bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Jun 2009 11:29:27 +0200 Subject: clocksource: prevent selection of low resolution clocksource also for nohz=on commit 3f68535adad (clocksource: sanity check sysfs clocksource changes) prevents selection of non high resolution capable clocksources when high resolution mode is active, but did not take into account that the same rules apply for highres=off nohz=on. Check the tick device mode instead of hrtimer_hres_active() to verify whether the system needs to be protected from a switch to jiffies or other non highres capable clock sources.
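A minimal sketch of the guard this changelog describes (an illustrative helper, not from the patch): reject any clocksource that is not flagged highres-capable whenever the tick device runs in oneshot mode, which covers both highres=on and highres=off nohz=on:

	static bool clocksource_switch_allowed(struct clocksource *cs)
	{
		/* Oneshot tick mode requires a highres-capable clocksource. */
		if (tick_oneshot_mode_active() &&
		    !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
			return false;

		return true;
	}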
Reported-by: Luming Yu Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- include/linux/tick.h | 3 +++ kernel/hrtimer.c | 4 ++-- kernel/time/clocksource.c | 18 ++++++++++-------- kernel/time/tick-oneshot.c | 17 +++++++++++++++++ 5 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 58021b0c396..0d2f7c8a33d 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -305,7 +305,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); -extern int hrtimer_hres_active(void); + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/include/linux/tick.h b/include/linux/tick.h index 469b82d88b3..0482229c07d 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -97,10 +97,12 @@ extern void tick_clock_notify(void); extern int tick_check_oneshot_change(int allow_nohz); extern struct tick_sched *tick_get_tick_sched(int cpu); extern void tick_check_idle(int cpu); +extern int tick_oneshot_mode_active(void); # else static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } static inline void tick_check_idle(int cpu) { } +static inline int tick_oneshot_mode_active(void) { return 0; } # endif #else /* CONFIG_GENERIC_CLOCKEVENTS */ @@ -109,6 +111,7 @@ static inline void tick_cancel_sched_timer(int cpu) { } static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } static inline void tick_check_idle(int cpu) { } +static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1a70c18cdff..cb8a15c1958 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -476,7 +476,7 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ? */ -int hrtimer_hres_active(void) +static inline int hrtimer_hres_active(void) { return __get_cpu_var(hrtimer_bases).hres_active; } @@ -704,7 +704,7 @@ static int hrtimer_switch_to_hres(void) #else -int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 18b9f5da4ee..592bf584d1d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -30,7 +30,6 @@ #include #include /* for spin_unlock_irq() using preempt_count() m68k */ #include -#include void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -511,13 +510,13 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, } /* - * Check to make sure we don't switch to a non-HRT usable - * clocksource if HRT is enabled and running + * Check to make sure we don't switch to a non-highres capable + * clocksource if the tick code is in oneshot mode (highres or nohz) */ - if (hrtimer_hres_active() && + if (tick_oneshot_mode_active() && !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { printk(KERN_WARNING "%s clocksource is not HRT compatible. 
" - "Cannot switch while in HRT mode\n", ovr->name); + "Cannot switch while in HRT/NOHZ mode\n", ovr->name); ovr = NULL; override_name[0] = 0; } @@ -550,9 +549,12 @@ sysfs_show_available_clocksources(struct sys_device *dev, spin_lock_irq(&clocksource_lock); list_for_each_entry(src, &clocksource_list, list) { - /* Don't show non-HRES clocksource if HRES is enabled */ - if (!hrtimer_hres_active() || - (src->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + /* + * Don't show non-HRES clocksource if the tick code is + * in one shot mode (highres=on or nohz=on) + */ + if (!tick_oneshot_mode_active() || + (src->flags & CLOCK_SOURCE_VALID_FOR_HRES)) count += snprintf(buf + count, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "%s ", src->name); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2e8de678e76..a96c0e2b89c 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) return 0; } +/** + * tick_check_oneshot_mode - check whether the system is in oneshot mode + * + * returns 1 when either nohz or highres are enabled. otherwise 0. + */ +int tick_oneshot_mode_active(void) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; + local_irq_restore(flags); + + return ret; +} + #ifdef CONFIG_HIGH_RES_TIMERS /** * tick_init_highres - switch to high resolution mode -- cgit v1.2.3-70-g09d2 From d219dce76c64f2c883dad0537fa09a56d5ff0a10 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 13 Jun 2009 12:28:57 +0200 Subject: list_nulls: add hlist_nulls_add_head and hlist_nulls_del This patch adds the hlist_nulls_add_head() function which is based on hlist_nulls_add_head_rcu() but without the use of rcu_assign_pointer(). It also adds hlist_nulls_del which is exactly the same like hlist_nulls_del_rcu(). Signed-off-by: Pablo Neira Ayuso Acked-by: Eric Dumazet Signed-off-by: Patrick McHardy --- include/linux/list_nulls.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 93150ecf3ea..5d10ae364b5 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -56,6 +56,18 @@ static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) return is_a_nulls(h->first); } +static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *first = h->first; + + n->next = first; + n->pprev = &h->first; + h->first = n; + if (!is_a_nulls(first)) + first->pprev = &n->next; +} + static inline void __hlist_nulls_del(struct hlist_nulls_node *n) { struct hlist_nulls_node *next = n->next; @@ -65,6 +77,12 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n) next->pprev = pprev; } +static inline void hlist_nulls_del(struct hlist_nulls_node *n) +{ + __hlist_nulls_del(n); + n->pprev = LIST_POISON2; +} + /** * hlist_nulls_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. -- cgit v1.2.3-70-g09d2 From dfec072ecd35ba6ecad2d51dde325253ac9a2936 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 4 Apr 2008 00:51:41 +0200 Subject: kmemcheck: add the kmemcheck core General description: kmemcheck is a patch to the linux kernel that detects use of uninitialized memory. It does this by trapping every read and write to memory that was allocated dynamically (e.g. using kmalloc()). 
If a memory address is read that has not previously been written to, a message is printed to the kernel log. Thanks to Andi Kleen for the set_memory_4k() solution. Andrew Morton suggested documenting the shadow member of struct page. Signed-off-by: Vegard Nossum Signed-off-by: Pekka Enberg [export kmemcheck_mark_initialized] [build fix for setup_max_cpus] Signed-off-by: Ingo Molnar [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- arch/x86/Makefile | 5 + arch/x86/include/asm/kmemcheck.h | 42 +++ arch/x86/include/asm/pgtable.h | 9 + arch/x86/include/asm/pgtable_types.h | 4 +- arch/x86/mm/Makefile | 2 + arch/x86/mm/kmemcheck/Makefile | 1 + arch/x86/mm/kmemcheck/error.c | 229 ++++++++++++ arch/x86/mm/kmemcheck/error.h | 15 + arch/x86/mm/kmemcheck/kmemcheck.c | 650 +++++++++++++++++++++++++++++++++++ arch/x86/mm/kmemcheck/opcode.c | 101 ++++++ arch/x86/mm/kmemcheck/opcode.h | 9 + arch/x86/mm/kmemcheck/pte.c | 22 ++ arch/x86/mm/kmemcheck/pte.h | 10 + arch/x86/mm/kmemcheck/shadow.c | 153 +++++++++ arch/x86/mm/kmemcheck/shadow.h | 16 + include/linux/kmemcheck.h | 17 + include/linux/mm_types.h | 8 + init/main.c | 1 + kernel/sysctl.c | 12 + 19 files changed, 1304 insertions(+), 2 deletions(-) create mode 100644 arch/x86/include/asm/kmemcheck.h create mode 100644 arch/x86/mm/kmemcheck/Makefile create mode 100644 arch/x86/mm/kmemcheck/error.c create mode 100644 arch/x86/mm/kmemcheck/error.h create mode 100644 arch/x86/mm/kmemcheck/kmemcheck.c create mode 100644 arch/x86/mm/kmemcheck/opcode.c create mode 100644 arch/x86/mm/kmemcheck/opcode.h create mode 100644 arch/x86/mm/kmemcheck/pte.c create mode 100644 arch/x86/mm/kmemcheck/pte.h create mode 100644 arch/x86/mm/kmemcheck/shadow.c create mode 100644 arch/x86/mm/kmemcheck/shadow.h create mode 100644 include/linux/kmemcheck.h (limited to 'include/linux') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index edbd0ca6206..1b68659c41b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -81,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR endif endif +# Don't unroll struct assignments with kmemcheck enabled +ifeq ($(CONFIG_KMEMCHECK),y) + KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) +endif + # Stackpointer is addressed different for 32 bit and 64 bit x86 sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h new file mode 100644 index 00000000000..ed01518f297 --- /dev/null +++ b/arch/x86/include/asm/kmemcheck.h @@ -0,0 +1,42 @@ +#ifndef ASM_X86_KMEMCHECK_H +#define ASM_X86_KMEMCHECK_H + +#include +#include + +#ifdef CONFIG_KMEMCHECK +bool kmemcheck_active(struct pt_regs *regs); + +void kmemcheck_show(struct pt_regs *regs); +void kmemcheck_hide(struct pt_regs *regs); + +bool kmemcheck_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code); +bool kmemcheck_trap(struct pt_regs *regs); +#else +static inline bool kmemcheck_active(struct pt_regs *regs) +{ + return false; +} + +static inline void kmemcheck_show(struct pt_regs *regs) +{ +} + +static inline void kmemcheck_hide(struct pt_regs *regs) +{ +} + +static inline bool kmemcheck_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) +{ + return false; +} + +static inline bool kmemcheck_trap(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_KMEMCHECK */ + +#endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 18ef7ebf263..c5a08079ad5 100644 --- a/arch/x86/include/asm/pgtable.h +++ 
b/arch/x86/include/asm/pgtable.h @@ -317,6 +317,15 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } +static inline int pte_hidden(pte_t x) +{ +#ifdef CONFIG_KMEMCHECK + return pte_flags(x) & _PAGE_HIDDEN; +#else + return 0; +#endif +} + static inline int pmd_present(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_PRESENT; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4d258ad76a0..9b5c92140aa 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -18,7 +18,7 @@ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ -#define _PAGE_BIT_UNUSED3 11 +#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 @@ -41,7 +41,7 @@ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) -#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) +#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fdd30d08ab5..eefdeee8a87 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck/ + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile new file mode 100644 index 00000000000..4666b7a778b --- /dev/null +++ b/arch/x86/mm/kmemcheck/Makefile @@ -0,0 +1 @@ +obj-y := error.o kmemcheck.o opcode.o pte.o shadow.o diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c new file mode 100644 index 00000000000..5ec9f5a93f4 --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.c @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "error.h" +#include "shadow.h" + +enum kmemcheck_error_type { + KMEMCHECK_ERROR_INVALID_ACCESS, + KMEMCHECK_ERROR_BUG, +}; + +#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) + +struct kmemcheck_error { + enum kmemcheck_error_type type; + + union { + /* KMEMCHECK_ERROR_INVALID_ACCESS */ + struct { + /* Kind of access that caused the error */ + enum kmemcheck_shadow state; + /* Address and size of the erroneous read */ + unsigned long address; + unsigned int size; + }; + }; + + struct pt_regs regs; + struct stack_trace trace; + unsigned long trace_entries[32]; + + /* We compress it to a char. */ + unsigned char shadow_copy[SHADOW_COPY_SIZE]; + unsigned char memory_copy[SHADOW_COPY_SIZE]; +}; + +/* + * Create a ring queue of errors to output. We can't call printk() directly + * from the kmemcheck traps, since this may call the console drivers and + * result in a recursive fault. 
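+ * Instead, the reports are queued in this FIFO and printed later from
+ * a tasklet (see do_wakeup() below).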
+ */ +static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; +static unsigned int error_count; +static unsigned int error_rd; +static unsigned int error_wr; +static unsigned int error_missed_count; + +static struct kmemcheck_error *error_next_wr(void) +{ + struct kmemcheck_error *e; + + if (error_count == ARRAY_SIZE(error_fifo)) { + ++error_missed_count; + return NULL; + } + + e = &error_fifo[error_wr]; + if (++error_wr == ARRAY_SIZE(error_fifo)) + error_wr = 0; + ++error_count; + return e; +} + +static struct kmemcheck_error *error_next_rd(void) +{ + struct kmemcheck_error *e; + + if (error_count == 0) + return NULL; + + e = &error_fifo[error_rd]; + if (++error_rd == ARRAY_SIZE(error_fifo)) + error_rd = 0; + --error_count; + return e; +} + +static void do_wakeup(unsigned long); +static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); + +/* + * Save the context of an error report. + */ +void kmemcheck_error_save(enum kmemcheck_shadow state, + unsigned long address, unsigned int size, struct pt_regs *regs) +{ + static unsigned long prev_ip; + + struct kmemcheck_error *e; + void *shadow_copy; + void *memory_copy; + + /* Don't report several adjacent errors from the same EIP. */ + if (regs->ip == prev_ip) + return; + prev_ip = regs->ip; + + e = error_next_wr(); + if (!e) + return; + + e->type = KMEMCHECK_ERROR_INVALID_ACCESS; + + e->state = state; + e->address = address; + e->size = size; + + /* Save regs */ + memcpy(&e->regs, regs, sizeof(*regs)); + + /* Save stack trace */ + e->trace.nr_entries = 0; + e->trace.entries = e->trace_entries; + e->trace.max_entries = ARRAY_SIZE(e->trace_entries); + e->trace.skip = 0; + save_stack_trace_bp(&e->trace, regs->bp); + + /* Round address down to nearest 16 bytes */ + shadow_copy = kmemcheck_shadow_lookup(address + & ~(SHADOW_COPY_SIZE - 1)); + BUG_ON(!shadow_copy); + + memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); + + kmemcheck_show_addr(address); + memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); + memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); + kmemcheck_hide_addr(address); + + tasklet_hi_schedule_first(&kmemcheck_tasklet); +} + +/* + * Save the context of a kmemcheck bug. + */ +void kmemcheck_error_save_bug(struct pt_regs *regs) +{ + struct kmemcheck_error *e; + + e = error_next_wr(); + if (!e) + return; + + e->type = KMEMCHECK_ERROR_BUG; + + memcpy(&e->regs, regs, sizeof(*regs)); + + e->trace.nr_entries = 0; + e->trace.entries = e->trace_entries; + e->trace.max_entries = ARRAY_SIZE(e->trace_entries); + e->trace.skip = 1; + save_stack_trace(&e->trace); + + tasklet_hi_schedule_first(&kmemcheck_tasklet); +} + +void kmemcheck_error_recall(void) +{ + static const char *desc[] = { + [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", + [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", + [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", + [KMEMCHECK_SHADOW_FREED] = "freed", + }; + + static const char short_desc[] = { + [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', + [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', + [KMEMCHECK_SHADOW_INITIALIZED] = 'i', + [KMEMCHECK_SHADOW_FREED] = 'f', + }; + + struct kmemcheck_error *e; + unsigned int i; + + e = error_next_rd(); + if (!e) + return; + + switch (e->type) { + case KMEMCHECK_ERROR_INVALID_ACCESS: + printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " + "from %s memory (%p)\n", + 8 * e->size, e->state < ARRAY_SIZE(desc) ? 
+ desc[e->state] : "(invalid shadow state)", + (void *) e->address); + + printk(KERN_INFO); + for (i = 0; i < SHADOW_COPY_SIZE; ++i) + printk("%02x", e->memory_copy[i]); + printk("\n"); + + printk(KERN_INFO); + for (i = 0; i < SHADOW_COPY_SIZE; ++i) { + if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) + printk(" %c", short_desc[e->shadow_copy[i]]); + else + printk(" ?"); + } + printk("\n"); + printk(KERN_INFO "%*c\n", 2 + 2 + * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); + break; + case KMEMCHECK_ERROR_BUG: + printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); + break; + } + + __show_regs(&e->regs, 1); + print_stack_trace(&e->trace, 0); +} + +static void do_wakeup(unsigned long data) +{ + while (error_count > 0) + kmemcheck_error_recall(); + + if (error_missed_count > 0) { + printk(KERN_WARNING "kmemcheck: Lost %d error reports because " + "the queue was too small\n", error_missed_count); + error_missed_count = 0; + } +} diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h new file mode 100644 index 00000000000..0efc2e8d0a2 --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.h @@ -0,0 +1,15 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H +#define ARCH__X86__MM__KMEMCHECK__ERROR_H + +#include + +#include "shadow.h" + +void kmemcheck_error_save(enum kmemcheck_shadow state, + unsigned long address, unsigned int size, struct pt_regs *regs); + +void kmemcheck_error_save_bug(struct pt_regs *regs); + +void kmemcheck_error_recall(void); + +#endif diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c new file mode 100644 index 00000000000..9de7d8f6b6e --- /dev/null +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -0,0 +1,650 @@ +/** + * kmemcheck - a heavyweight memory checker for the linux kernel + * Copyright (C) 2007, 2008 Vegard Nossum + * (With a lot of help from Ingo Molnar and Pekka Enberg.) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2) as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "error.h" +#include "opcode.h" +#include "pte.h" +#include "shadow.h" + +#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT +# define KMEMCHECK_ENABLED 0 +#endif + +#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT +# define KMEMCHECK_ENABLED 1 +#endif + +#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT +# define KMEMCHECK_ENABLED 2 +#endif + +int kmemcheck_enabled = KMEMCHECK_ENABLED; + +int __init kmemcheck_init(void) +{ + printk(KERN_INFO "kmemcheck: \"Bugs, beware!\"\n"); + +#ifdef CONFIG_SMP + /* + * Limit SMP to use a single CPU. We rely on the fact that this code + * runs before SMP is set up. + */ + if (setup_max_cpus > 1) { + printk(KERN_INFO + "kmemcheck: Limiting number of CPUs to 1.\n"); + setup_max_cpus = 1; + } +#endif + + return 0; +} + +early_initcall(kmemcheck_init); + +#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT +int kmemcheck_enabled = 0; +#endif + +#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT +int kmemcheck_enabled = 1; +#endif + +#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT +int kmemcheck_enabled = 2; +#endif + +/* + * We need to parse the kmemcheck= option before any memory is allocated. 
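+ * (Hence the early_param() registration right below.)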
+ */ +static int __init param_kmemcheck(char *str) +{ + if (!str) + return -EINVAL; + + sscanf(str, "%d", &kmemcheck_enabled); + return 0; +} + +early_param("kmemcheck", param_kmemcheck); + +int kmemcheck_show_addr(unsigned long address) +{ + pte_t *pte; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return 0; + + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + __flush_tlb_one(address); + return 1; +} + +int kmemcheck_hide_addr(unsigned long address) +{ + pte_t *pte; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return 0; + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + __flush_tlb_one(address); + return 1; +} + +struct kmemcheck_context { + bool busy; + int balance; + + /* + * There can be at most two memory operands to an instruction, but + * each address can cross a page boundary -- so we may need up to + * four addresses that must be hidden/revealed for each fault. + */ + unsigned long addr[4]; + unsigned long n_addrs; + unsigned long flags; + + /* Data size of the instruction that caused a fault. */ + unsigned int size; +}; + +static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); + +bool kmemcheck_active(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + return data->balance > 0; +} + +/* Save an address that needs to be shown/hidden */ +static void kmemcheck_save_addr(unsigned long addr) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); + data->addr[data->n_addrs++] = addr; +} + +static unsigned int kmemcheck_show_all(void) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + unsigned int i; + unsigned int n; + + n = 0; + for (i = 0; i < data->n_addrs; ++i) + n += kmemcheck_show_addr(data->addr[i]); + + return n; +} + +static unsigned int kmemcheck_hide_all(void) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + unsigned int i; + unsigned int n; + + n = 0; + for (i = 0; i < data->n_addrs; ++i) + n += kmemcheck_hide_addr(data->addr[i]); + + return n; +} + +/* + * Called from the #PF handler. + */ +void kmemcheck_show(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + BUG_ON(!irqs_disabled()); + + if (unlikely(data->balance != 0)) { + kmemcheck_show_all(); + kmemcheck_error_save_bug(regs); + data->balance = 0; + return; + } + + /* + * None of the addresses actually belonged to kmemcheck. Note that + * this is not an error. + */ + if (kmemcheck_show_all() == 0) + return; + + ++data->balance; + + /* + * The IF needs to be cleared as well, so that the faulting + * instruction can run "uninterrupted". Otherwise, we might take + * an interrupt and start executing that before we've had a chance + * to hide the page again. + * + * NOTE: In the rare case of multiple faults, we must not override + * the original flags: + */ + if (!(regs->flags & X86_EFLAGS_TF)) + data->flags = regs->flags; + + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; +} + +/* + * Called from the #DB handler. 
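+ * (The debug trap fires after the faulting instruction was single-stepped
+ * with X86_EFLAGS_TF set by kmemcheck_show() above.)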
+ */ +void kmemcheck_hide(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + int n; + + BUG_ON(!irqs_disabled()); + + if (data->balance == 0) + return; + + if (unlikely(data->balance != 1)) { + kmemcheck_show_all(); + kmemcheck_error_save_bug(regs); + data->n_addrs = 0; + data->balance = 0; + + if (!(data->flags & X86_EFLAGS_TF)) + regs->flags &= ~X86_EFLAGS_TF; + if (data->flags & X86_EFLAGS_IF) + regs->flags |= X86_EFLAGS_IF; + return; + } + + if (kmemcheck_enabled) + n = kmemcheck_hide_all(); + else + n = kmemcheck_show_all(); + + if (n == 0) + return; + + --data->balance; + + data->n_addrs = 0; + + if (!(data->flags & X86_EFLAGS_TF)) + regs->flags &= ~X86_EFLAGS_TF; + if (data->flags & X86_EFLAGS_IF) + regs->flags |= X86_EFLAGS_IF; +} + +void kmemcheck_show_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) { + unsigned long address; + pte_t *pte; + unsigned int level; + + address = (unsigned long) page_address(&p[i]); + pte = lookup_address(address, &level); + BUG_ON(!pte); + BUG_ON(level != PG_LEVEL_4K); + + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); + __flush_tlb_one(address); + } +} + +bool kmemcheck_page_is_tracked(struct page *p) +{ + /* This will also check the "hidden" flag of the PTE. */ + return kmemcheck_pte_lookup((unsigned long) page_address(p)); +} + +void kmemcheck_hide_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) { + unsigned long address; + pte_t *pte; + unsigned int level; + + address = (unsigned long) page_address(&p[i]); + pte = lookup_address(address, &level); + BUG_ON(!pte); + BUG_ON(level != PG_LEVEL_4K); + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); + __flush_tlb_one(address); + } +} + +/* Access may NOT cross page boundary */ +static void kmemcheck_read_strict(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + void *shadow; + enum kmemcheck_shadow status; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return; + + kmemcheck_save_addr(addr); + status = kmemcheck_shadow_test(shadow, size); + if (status == KMEMCHECK_SHADOW_INITIALIZED) + return; + + if (kmemcheck_enabled) + kmemcheck_error_save(status, addr, size, regs); + + if (kmemcheck_enabled == 2) + kmemcheck_enabled = 0; + + /* Don't warn about it again. */ + kmemcheck_shadow_set(shadow, size); +} + +/* Access may cross page boundary */ +static void kmemcheck_read(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + unsigned long page = addr & PAGE_MASK; + unsigned long next_addr = addr + size - 1; + unsigned long next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + kmemcheck_read_strict(regs, addr, size); + return; + } + + /* + * What we do is basically to split the access across the + * two pages and handle each part separately. Yes, this means + * that we may now see reads that are 3 + 5 bytes, for + * example (and if both are uninitialized, there will be two + * reports), but it makes the code a lot simpler. 
+ */ + kmemcheck_read_strict(regs, addr, next_page - addr); + kmemcheck_read_strict(regs, next_page, next_addr - next_page); +} + +static void kmemcheck_write_strict(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + void *shadow; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return; + + kmemcheck_save_addr(addr); + kmemcheck_shadow_set(shadow, size); +} + +static void kmemcheck_write(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + unsigned long page = addr & PAGE_MASK; + unsigned long next_addr = addr + size - 1; + unsigned long next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + kmemcheck_write_strict(regs, addr, size); + return; + } + + /* See comment in kmemcheck_read(). */ + kmemcheck_write_strict(regs, addr, next_page - addr); + kmemcheck_write_strict(regs, next_page, next_addr - next_page); +} + +/* + * Copying is hard. We have two addresses, each of which may be split across + * a page (and each page will have different shadow addresses). + */ +static void kmemcheck_copy(struct pt_regs *regs, + unsigned long src_addr, unsigned long dst_addr, unsigned int size) +{ + uint8_t shadow[8]; + enum kmemcheck_shadow status; + + unsigned long page; + unsigned long next_addr; + unsigned long next_page; + + uint8_t *x; + unsigned int i; + unsigned int n; + + BUG_ON(size > sizeof(shadow)); + + page = src_addr & PAGE_MASK; + next_addr = src_addr + size - 1; + next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + /* Same page */ + x = kmemcheck_shadow_lookup(src_addr); + if (x) { + kmemcheck_save_addr(src_addr); + for (i = 0; i < size; ++i) + shadow[i] = x[i]; + } else { + for (i = 0; i < size; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } else { + n = next_page - src_addr; + BUG_ON(n > sizeof(shadow)); + + /* First page */ + x = kmemcheck_shadow_lookup(src_addr); + if (x) { + kmemcheck_save_addr(src_addr); + for (i = 0; i < n; ++i) + shadow[i] = x[i]; + } else { + /* Not tracked */ + for (i = 0; i < n; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + + /* Second page */ + x = kmemcheck_shadow_lookup(next_page); + if (x) { + kmemcheck_save_addr(next_page); + for (i = n; i < size; ++i) + shadow[i] = x[i - n]; + } else { + /* Not tracked */ + for (i = n; i < size; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + + page = dst_addr & PAGE_MASK; + next_addr = dst_addr + size - 1; + next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + /* Same page */ + x = kmemcheck_shadow_lookup(dst_addr); + if (x) { + kmemcheck_save_addr(dst_addr); + for (i = 0; i < size; ++i) { + x[i] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + } else { + n = next_page - dst_addr; + BUG_ON(n > sizeof(shadow)); + + /* First page */ + x = kmemcheck_shadow_lookup(dst_addr); + if (x) { + kmemcheck_save_addr(dst_addr); + for (i = 0; i < n; ++i) { + x[i] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + + /* Second page */ + x = kmemcheck_shadow_lookup(next_page); + if (x) { + kmemcheck_save_addr(next_page); + for (i = n; i < size; ++i) { + x[i - n] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + } + + status = kmemcheck_shadow_test(shadow, size); + if (status == KMEMCHECK_SHADOW_INITIALIZED) + return; + + if (kmemcheck_enabled) + kmemcheck_error_save(status, src_addr, size, regs); + + if (kmemcheck_enabled == 2) + kmemcheck_enabled = 0; +} + +enum kmemcheck_method { + KMEMCHECK_READ, + KMEMCHECK_WRITE, +}; + +static void 
kmemcheck_access(struct pt_regs *regs, + unsigned long fallback_address, enum kmemcheck_method fallback_method) +{ + const uint8_t *insn; + const uint8_t *insn_primary; + unsigned int size; + + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + /* Recursive fault -- ouch. */ + if (data->busy) { + kmemcheck_show_addr(fallback_address); + kmemcheck_error_save_bug(regs); + return; + } + + data->busy = true; + + insn = (const uint8_t *) regs->ip; + insn_primary = kmemcheck_opcode_get_primary(insn); + + kmemcheck_opcode_decode(insn, &size); + + switch (insn_primary[0]) { +#ifdef CONFIG_KMEMCHECK_BITOPS_OK + /* AND, OR, XOR */ + /* + * Unfortunately, these instructions have to be excluded from + * our regular checking since they access only some (and not + * all) bits. This clears out "bogus" bitfield-access warnings. + */ + case 0x80: + case 0x81: + case 0x82: + case 0x83: + switch ((insn_primary[1] >> 3) & 7) { + /* OR */ + case 1: + /* AND */ + case 4: + /* XOR */ + case 6: + kmemcheck_write(regs, fallback_address, size); + goto out; + + /* ADD */ + case 0: + /* ADC */ + case 2: + /* SBB */ + case 3: + /* SUB */ + case 5: + /* CMP */ + case 7: + break; + } + break; +#endif + + /* MOVS, MOVSB, MOVSW, MOVSD */ + case 0xa4: + case 0xa5: + /* + * These instructions are special because they take two + * addresses, but we only get one page fault. + */ + kmemcheck_copy(regs, regs->si, regs->di, size); + goto out; + + /* CMPS, CMPSB, CMPSW, CMPSD */ + case 0xa6: + case 0xa7: + kmemcheck_read(regs, regs->si, size); + kmemcheck_read(regs, regs->di, size); + goto out; + } + + /* + * If the opcode isn't special in any way, we use the data from the + * page fault handler to determine the address and type of memory + * access. + */ + switch (fallback_method) { + case KMEMCHECK_READ: + kmemcheck_read(regs, fallback_address, size); + goto out; + case KMEMCHECK_WRITE: + kmemcheck_write(regs, fallback_address, size); + goto out; + } + +out: + data->busy = false; +} + +bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + pte_t *pte; + unsigned int level; + + /* + * XXX: Is it safe to assume that memory accesses from virtual 86 + * mode or non-kernel code segments will _never_ access kernel + * memory (e.g. tracked pages)? For now, we need this to avoid + * invoking kmemcheck for PnP BIOS calls. + */ + if (regs->flags & X86_VM_MASK) + return false; + if (regs->cs != __KERNEL_CS) + return false; + + pte = lookup_address(address, &level); + if (!pte) + return false; + if (level != PG_LEVEL_4K) + return false; + if (!pte_hidden(*pte)) + return false; + + if (error_code & 2) + kmemcheck_access(regs, address, KMEMCHECK_WRITE); + else + kmemcheck_access(regs, address, KMEMCHECK_READ); + + kmemcheck_show(regs); + return true; +} + +bool kmemcheck_trap(struct pt_regs *regs) +{ + if (!kmemcheck_active(regs)) + return false; + + /* We're done. 
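The single-step trap means the instruction that touched the tracked page has now executed, so the pages can be hidden again; kmemcheck_hide() also restores the saved TF/IF flags.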
*/ + kmemcheck_hide(regs); + return true; +} diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c new file mode 100644 index 00000000000..a4100b6e783 --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -0,0 +1,101 @@ +#include + +#include "opcode.h" + +static bool opcode_is_prefix(uint8_t b) +{ + return + /* Group 1 */ + b == 0xf0 || b == 0xf2 || b == 0xf3 + /* Group 2 */ + || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 + || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e + /* Group 3 */ + || b == 0x66 + /* Group 4 */ + || b == 0x67; +} + +static bool opcode_is_rex_prefix(uint8_t b) +{ + return (b & 0xf0) == 0x40; +} + +#define REX_W (1 << 3) + +/* + * This is a VERY crude opcode decoder. We only need to find the size of the + * load/store that caused our #PF and this should work for all the opcodes + * that we care about. Moreover, the ones who invented this instruction set + * should be shot. + */ +void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) +{ + /* Default operand size */ + int operand_size_override = 4; + + /* prefixes */ + for (; opcode_is_prefix(*op); ++op) { + if (*op == 0x66) + operand_size_override = 2; + } + +#ifdef CONFIG_X86_64 + /* REX prefix */ + if (opcode_is_rex_prefix(*op)) { + uint8_t rex = *op; + + ++op; + if (rex & REX_W) { + switch (*op) { + case 0x63: + *size = 4; + return; + case 0x0f: + ++op; + + switch (*op) { + case 0xb6: + case 0xbe: + *size = 1; + return; + case 0xb7: + case 0xbf: + *size = 2; + return; + } + + break; + } + + *size = 8; + return; + } + } +#endif + + /* escape opcode */ + if (*op == 0x0f) { + ++op; + + /* + * This is move with zero-extend and sign-extend, respectively; + * we don't have to think about 0xb6/0xbe, because this is + * already handled in the conditional below. + */ + if (*op == 0xb7 || *op == 0xbf) + operand_size_override = 2; + } + + *size = (*op & 1) ? 
operand_size_override : 1; +} + +const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) +{ + /* skip prefixes */ + while (opcode_is_prefix(*op)) + ++op; + if (opcode_is_rex_prefix(*op)) + ++op; + return op; +} diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h new file mode 100644 index 00000000000..6956aad66b5 --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.h @@ -0,0 +1,9 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H +#define ARCH__X86__MM__KMEMCHECK__OPCODE_H + +#include + +void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); +const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); + +#endif diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c new file mode 100644 index 00000000000..4ead26eeaf9 --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.c @@ -0,0 +1,22 @@ +#include + +#include + +#include "pte.h" + +pte_t *kmemcheck_pte_lookup(unsigned long address) +{ + pte_t *pte; + unsigned int level; + + pte = lookup_address(address, &level); + if (!pte) + return NULL; + if (level != PG_LEVEL_4K) + return NULL; + if (!pte_hidden(*pte)) + return NULL; + + return pte; +} + diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h new file mode 100644 index 00000000000..9f596645649 --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.h @@ -0,0 +1,10 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H +#define ARCH__X86__MM__KMEMCHECK__PTE_H + +#include + +#include + +pte_t *kmemcheck_pte_lookup(unsigned long address); + +#endif diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c new file mode 100644 index 00000000000..5544d360087 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -0,0 +1,153 @@ +#include +#include +#include + +#include +#include + +#include "pte.h" +#include "shadow.h" + +/* + * Return the shadow address for the given address. Returns NULL if the + * address is not tracked. + * + * We need to be extremely careful not to follow any invalid pointers, + * because this function can be called for *any* possible address. + */ +void *kmemcheck_shadow_lookup(unsigned long address) +{ + pte_t *pte; + struct page *page; + + if (!virt_addr_valid(address)) + return NULL; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return NULL; + + page = virt_to_page(address); + if (!page->shadow) + return NULL; + return page->shadow + (address & (PAGE_SIZE - 1)); +} + +static void mark_shadow(void *address, unsigned int n, + enum kmemcheck_shadow status) +{ + unsigned long addr = (unsigned long) address; + unsigned long last_addr = addr + n - 1; + unsigned long page = addr & PAGE_MASK; + unsigned long last_page = last_addr & PAGE_MASK; + unsigned int first_n; + void *shadow; + + /* If the memory range crosses a page boundary, stop there. */ + if (page == last_page) + first_n = n; + else + first_n = page + PAGE_SIZE - addr; + + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, first_n); + + addr += first_n; + n -= first_n; + + /* Do full-page memset()s. */ + while (n >= PAGE_SIZE) { + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, PAGE_SIZE); + + addr += PAGE_SIZE; + n -= PAGE_SIZE; + } + + /* Do the remaining page, if any. 
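Fewer than PAGE_SIZE bytes remain at this point, so a single partial memset() finishes the range.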
*/ + if (n > 0) { + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, n); + } +} + +void kmemcheck_mark_unallocated(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); +} + +void kmemcheck_mark_uninitialized(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); +} + +/* + * Fill the shadow memory of the given address such that the memory at that + * address is marked as being initialized. + */ +void kmemcheck_mark_initialized(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); +} +EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); + +void kmemcheck_mark_freed(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); +} + +void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); +} + +void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); +} + +enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + +#ifdef CONFIG_KMEMCHECK_PARTIAL_OK + /* + * Make sure _some_ bytes are initialized. Gcc frequently generates + * code to access neighboring bytes. + */ + for (i = 0; i < size; ++i) { + if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) + return x[i]; + } +#else + /* All bytes must be initialized. */ + for (i = 0; i < size; ++i) { + if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) + return x[i]; + } +#endif + + return x[0]; +} + +void kmemcheck_shadow_set(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + for (i = 0; i < size; ++i) + x[i] = KMEMCHECK_SHADOW_INITIALIZED; +} diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h new file mode 100644 index 00000000000..af46d9ab9d8 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -0,0 +1,16 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H +#define ARCH__X86__MM__KMEMCHECK__SHADOW_H + +enum kmemcheck_shadow { + KMEMCHECK_SHADOW_UNALLOCATED, + KMEMCHECK_SHADOW_UNINITIALIZED, + KMEMCHECK_SHADOW_INITIALIZED, + KMEMCHECK_SHADOW_FREED, +}; + +void *kmemcheck_shadow_lookup(unsigned long address); + +enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); +void kmemcheck_shadow_set(void *shadow, unsigned int size); + +#endif diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h new file mode 100644 index 00000000000..39480c91b2f --- /dev/null +++ b/include/linux/kmemcheck.h @@ -0,0 +1,17 @@ +#ifndef LINUX_KMEMCHECK_H +#define LINUX_KMEMCHECK_H + +#include +#include + +#ifdef CONFIG_KMEMCHECK +extern int kmemcheck_enabled; + +int kmemcheck_show_addr(unsigned long address); +int kmemcheck_hide_addr(unsigned long address); +#else +#define kmemcheck_enabled 0 + +#endif /* CONFIG_KMEMCHECK */ + +#endif /* LINUX_KMEMCHECK_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e80e26ecf2..0042090a4d7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -98,6 +98,14 @@ struct page { #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS unsigned long debug_flags; /* Use atomic bitops on this */ #endif + +#ifdef CONFIG_KMEMCHECK + /* + * kmemcheck wants to track the status of each byte in a page; this + * is a pointer to such a status block. NULL if not tracked. 
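+ * Each shadow byte holds one of the KMEMCHECK_SHADOW_* states from + * arch/x86/mm/kmemcheck/shadow.h; kmemcheck_shadow_lookup() maps an + * address to its shadow byte as page->shadow + (address & (PAGE_SIZE - 1)).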
+ */ + void *shadow; +#endif }; /* diff --git a/init/main.c b/init/main.c index 5616661eac0..e3c335e47cd 100644 --- a/init/main.c +++ b/init/main.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ce664f98e3f..9ef80bba350 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -959,6 +960,17 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_KMEMCHECK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "kmemcheck", + .data = &kmemcheck_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt From 5f5bac8272be791b67c7b7b411e7c8c5847e598a Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Fri, 22 May 2009 20:33:59 +0200 Subject: mmc: Driver for CB710/720 memory card reader (MMC part) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code is divided into two parts. There is a virtual 'bus' driver that handles the PCI device and registers three new devices, one per card reader type. The other driver handles the SD/MMC part of the reader. Signed-off-by: Michał Mirosław Signed-off-by: Pierre Ossman --- MAINTAINERS | 9 + drivers/misc/Kconfig | 1 + drivers/misc/Makefile | 1 + drivers/misc/cb710/Kconfig | 21 ++ drivers/misc/cb710/Makefile | 4 + drivers/misc/cb710/core.c | 357 +++++++++++++++++++ drivers/misc/cb710/debug.c | 119 +++++++ drivers/misc/cb710/sgbuf2.c | 150 ++++++++ drivers/mmc/host/Kconfig | 13 + drivers/mmc/host/Makefile | 1 + drivers/mmc/host/cb710-mmc.c | 804 +++++++++++++++++++++++++++++++++++++++++++ drivers/mmc/host/cb710-mmc.h | 104 ++++++ include/linux/cb710.h | 238 +++++++++++++ include/linux/pci_ids.h | 1 + 14 files changed, 1823 insertions(+) create mode 100644 drivers/misc/cb710/Kconfig create mode 100644 drivers/misc/cb710/Makefile create mode 100644 drivers/misc/cb710/core.c create mode 100644 drivers/misc/cb710/debug.c create mode 100644 drivers/misc/cb710/sgbuf2.c create mode 100644 drivers/mmc/host/cb710-mmc.c create mode 100644 drivers/mmc/host/cb710-mmc.h create mode 100644 include/linux/cb710.h diff --git a/MAINTAINERS b/MAINTAINERS index 90f81283b72..b5cebddd284 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2105,6 +2105,15 @@ W: http://sourceforge.net/projects/lpfcxxxx S: Supported F: drivers/scsi/lpfc/ +ENE CB710 FLASH CARD READER DRIVER +P: Michał Mirosław +M: mirq-linux@rere.qmqm.pl +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/misc/cb710/ +F: drivers/mmc/host/cb710-mmc.* +F: include/linux/cb710.h + EPSON 1355 FRAMEBUFFER DRIVER P: Christopher Hoover M: ch@murgatroid.com diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 6d1ac180f6e..68ab39d7cb3 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -235,5 +235,6 @@ config ISL29003 source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" +source "drivers/misc/cb710/Kconfig" endif # MISC_DEVICES diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 7871f05dcb9..36f733cd60e 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -21,3 +21,4 @@ obj-$(CONFIG_HP_ILO) += hpilo.o obj-$(CONFIG_ISL29003) += isl29003.o obj-$(CONFIG_C2PORT) += c2port/ obj-y += eeprom/ +obj-y += cb710/ diff --git
a/drivers/misc/cb710/Kconfig b/drivers/misc/cb710/Kconfig new file mode 100644 index 00000000000..0b55079774c --- /dev/null +++ b/drivers/misc/cb710/Kconfig @@ -0,0 +1,21 @@ +config CB710_CORE + tristate "ENE CB710/720 Flash memory card reader support" + depends on PCI + help + This option enables support for the PCI ENE CB710/720 Flash memory card + reader found in some laptops (e.g. some versions of HP Compaq nx9500). + + You will also have to select some flash card format drivers (MMC/SD, + MemoryStick). + + This driver can also be built as a module. If so, the module + will be called cb710. + +config CB710_DEBUG + bool "Enable driver debugging" + depends on CB710_CORE != n + default n + help + This is an option for use by developers; most people should + say N here. This adds a lot of debugging output to dmesg. + diff --git a/drivers/misc/cb710/Makefile b/drivers/misc/cb710/Makefile new file mode 100644 index 00000000000..62d51acfeca --- /dev/null +++ b/drivers/misc/cb710/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_CB710_CORE) += cb710.o + +cb710-y := core.o sgbuf2.o +cb710-$(CONFIG_CB710_DEBUG) += debug.o diff --git a/drivers/misc/cb710/core.c b/drivers/misc/cb710/core.c new file mode 100644 index 00000000000..b14eab0f2ba --- /dev/null +++ b/drivers/misc/cb710/core.c @@ -0,0 +1,357 @@ +/* + * cb710/core.c + * + * Copyright by Michał Mirosław, 2008-2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_IDA(cb710_ida); +static DEFINE_SPINLOCK(cb710_ida_lock); + +void cb710_pci_update_config_reg(struct pci_dev *pdev, + int reg, uint32_t mask, uint32_t xor) +{ + u32 rval; + + pci_read_config_dword(pdev, reg, &rval); + rval = (rval & mask) ^ xor; + pci_write_config_dword(pdev, reg, rval); +} +EXPORT_SYMBOL_GPL(cb710_pci_update_config_reg); + +/* Some magic writes based on Windows driver init code */ +static int __devinit cb710_pci_configure(struct pci_dev *pdev) +{ + unsigned int devfn = PCI_DEVFN(PCI_SLOT(pdev->devfn), 0); + struct pci_dev *pdev0 = pci_get_slot(pdev->bus, devfn); + u32 val; + + cb710_pci_update_config_reg(pdev, 0x48, + ~0x000000FF, 0x0000003F); + + pci_read_config_dword(pdev, 0x48, &val); + if (val & 0x80000000) { + /* already configured; drop the reference taken by pci_get_slot() */ + pci_dev_put(pdev0); + return 0; + } + + if (!pdev0) + return -ENODEV; + + if (pdev0->vendor == PCI_VENDOR_ID_ENE + && pdev0->device == PCI_DEVICE_ID_ENE_720) { + cb710_pci_update_config_reg(pdev0, 0x8C, + ~0x00F00000, 0x00100000); + cb710_pci_update_config_reg(pdev0, 0xB0, + ~0x08000000, 0x08000000); + } + + cb710_pci_update_config_reg(pdev0, 0x8C, + ~0x00000F00, 0x00000200); + cb710_pci_update_config_reg(pdev0, 0x90, + ~0x00060000, 0x00040000); + + pci_dev_put(pdev0); + + return 0; +} + +static irqreturn_t cb710_irq_handler(int irq, void *data) +{ + struct cb710_chip *chip = data; + struct cb710_slot *slot = &chip->slot[0]; + irqreturn_t handled = IRQ_NONE; + unsigned nr; + + spin_lock(&chip->irq_lock); /* incl.
smp_rmb() */ + + for (nr = chip->slots; nr; ++slot, --nr) { + cb710_irq_handler_t handler_func = slot->irq_handler; + if (handler_func && handler_func(slot)) + handled = IRQ_HANDLED; + } + + spin_unlock(&chip->irq_lock); + + return handled; +} + +static void cb710_release_slot(struct device *dev) +{ +#ifdef CONFIG_CB710_DEBUG_ASSUMPTIONS + struct cb710_slot *slot = cb710_pdev_to_slot(to_platform_device(dev)); + struct cb710_chip *chip = cb710_slot_to_chip(slot); + + /* slot struct can be freed now */ + atomic_dec(&chip->slot_refs_count); +#endif +} + +static int __devinit cb710_register_slot(struct cb710_chip *chip, + unsigned slot_mask, unsigned io_offset, const char *name) +{ + int nr = chip->slots; + struct cb710_slot *slot = &chip->slot[nr]; + int err; + + dev_dbg(cb710_chip_dev(chip), + "register: %s.%d; slot %d; mask %d; IO offset: 0x%02X\n", + name, chip->platform_id, nr, slot_mask, io_offset); + + /* slot->irq_handler == NULL here; this needs to be + * seen before platform_device_register() */ + ++chip->slots; + smp_wmb(); + + slot->iobase = chip->iobase + io_offset; + slot->pdev.name = name; + slot->pdev.id = chip->platform_id; + slot->pdev.dev.parent = &chip->pdev->dev; + slot->pdev.dev.release = cb710_release_slot; + + err = platform_device_register(&slot->pdev); + +#ifdef CONFIG_CB710_DEBUG_ASSUMPTIONS + atomic_inc(&chip->slot_refs_count); +#endif + + if (err) { + /* device_initialize() called from platform_device_register() + * wants this on error path */ + platform_device_put(&slot->pdev); + + /* slot->irq_handler == NULL here anyway, so no lock needed */ + --chip->slots; + return err; + } + + chip->slot_mask |= slot_mask; + + return 0; +} + +static void cb710_unregister_slot(struct cb710_chip *chip, + unsigned slot_mask) +{ + int nr = chip->slots - 1; + + if (!(chip->slot_mask & slot_mask)) + return; + + platform_device_unregister(&chip->slot[nr].pdev); + + /* complementary to spin_unlock() in cb710_set_irq_handler() */ + smp_rmb(); + BUG_ON(chip->slot[nr].irq_handler != NULL); + + /* slot->irq_handler == NULL here, so no lock needed */ + --chip->slots; + chip->slot_mask &= ~slot_mask; +} + +void cb710_set_irq_handler(struct cb710_slot *slot, + cb710_irq_handler_t handler) +{ + struct cb710_chip *chip = cb710_slot_to_chip(slot); + unsigned long flags; + + spin_lock_irqsave(&chip->irq_lock, flags); + slot->irq_handler = handler; + spin_unlock_irqrestore(&chip->irq_lock, flags); +} +EXPORT_SYMBOL_GPL(cb710_set_irq_handler); + +#ifdef CONFIG_PM + +static int cb710_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct cb710_chip *chip = pci_get_drvdata(pdev); + + free_irq(pdev->irq, chip); + pci_save_state(pdev); + pci_disable_device(pdev); + if (state.event & PM_EVENT_SLEEP) + pci_set_power_state(pdev, PCI_D3cold); + return 0; +} + +static int cb710_resume(struct pci_dev *pdev) +{ + struct cb710_chip *chip = pci_get_drvdata(pdev); + int err; + + pci_set_power_state(pdev, PCI_D0); + pci_restore_state(pdev); + err = pcim_enable_device(pdev); + if (err) + return err; + + return devm_request_irq(&pdev->dev, pdev->irq, + cb710_irq_handler, IRQF_SHARED, KBUILD_MODNAME, chip); +} + +#endif /* CONFIG_PM */ + +static int __devinit cb710_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct cb710_chip *chip; + unsigned long flags; + u32 val; + int err; + int n = 0; + + err = cb710_pci_configure(pdev); + if (err) + return err; + + /* this is actually magic... 
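Judging from the decoding below, bits 28..30 of PCI config dword 0x48 seem to report which of the three card slots are present.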
*/ + pci_read_config_dword(pdev, 0x48, &val); + if (!(val & 0x80000000)) { + pci_write_config_dword(pdev, 0x48, val|0x71000000); + pci_read_config_dword(pdev, 0x48, &val); + } + + dev_dbg(&pdev->dev, "PCI config[0x48] = 0x%08X\n", val); + if (!(val & 0x70000000)) + return -ENODEV; + val = (val >> 28) & 7; + if (val & CB710_SLOT_MMC) + ++n; + if (val & CB710_SLOT_MS) + ++n; + if (val & CB710_SLOT_SM) + ++n; + + chip = devm_kzalloc(&pdev->dev, + sizeof(*chip) + n * sizeof(*chip->slot), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + err = pcim_enable_device(pdev); + if (err) + return err; + + err = pcim_iomap_regions(pdev, 0x0001, KBUILD_MODNAME); + if (err) + return err; + + chip->pdev = pdev; + chip->iobase = pcim_iomap_table(pdev)[0]; + + pci_set_drvdata(pdev, chip); + + err = devm_request_irq(&pdev->dev, pdev->irq, + cb710_irq_handler, IRQF_SHARED, KBUILD_MODNAME, chip); + if (err) + return err; + + do { + if (!ida_pre_get(&cb710_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock_irqsave(&cb710_ida_lock, flags); + err = ida_get_new(&cb710_ida, &chip->platform_id); + spin_unlock_irqrestore(&cb710_ida_lock, flags); + + if (err && err != -EAGAIN) + return err; + } while (err); + + + dev_info(&pdev->dev, "id %d, IO 0x%p, IRQ %d\n", + chip->platform_id, chip->iobase, pdev->irq); + + if (val & CB710_SLOT_MMC) { /* MMC/SD slot */ + err = cb710_register_slot(chip, + CB710_SLOT_MMC, 0x00, "cb710-mmc"); + if (err) + return err; + } + + if (val & CB710_SLOT_MS) { /* MemoryStick slot */ + err = cb710_register_slot(chip, + CB710_SLOT_MS, 0x40, "cb710-ms"); + if (err) + goto unreg_mmc; + } + + if (val & CB710_SLOT_SM) { /* SmartMedia slot */ + err = cb710_register_slot(chip, + CB710_SLOT_SM, 0x60, "cb710-sm"); + if (err) + goto unreg_ms; + } + + return 0; +unreg_ms: + cb710_unregister_slot(chip, CB710_SLOT_MS); +unreg_mmc: + cb710_unregister_slot(chip, CB710_SLOT_MMC); + +#ifdef CONFIG_CB710_DEBUG_ASSUMPTIONS + BUG_ON(atomic_read(&chip->slot_refs_count) != 0); +#endif + return err; +} + +static void __devexit cb710_remove_one(struct pci_dev *pdev) +{ + struct cb710_chip *chip = pci_get_drvdata(pdev); + unsigned long flags; + + cb710_unregister_slot(chip, CB710_SLOT_SM); + cb710_unregister_slot(chip, CB710_SLOT_MS); + cb710_unregister_slot(chip, CB710_SLOT_MMC); +#ifdef CONFIG_CB710_DEBUG_ASSUMPTIONS + BUG_ON(atomic_read(&chip->slot_refs_count) != 0); +#endif + + spin_lock_irqsave(&cb710_ida_lock, flags); + ida_remove(&cb710_ida, chip->platform_id); + spin_unlock_irqrestore(&cb710_ida_lock, flags); +} + +static const struct pci_device_id cb710_pci_tbl[] = { + { PCI_VENDOR_ID_ENE, PCI_DEVICE_ID_ENE_CB710_FLASH, + PCI_ANY_ID, PCI_ANY_ID, }, + { 0, } +}; + +static struct pci_driver cb710_driver = { + .name = KBUILD_MODNAME, + .id_table = cb710_pci_tbl, + .probe = cb710_probe, + .remove = __devexit_p(cb710_remove_one), +#ifdef CONFIG_PM + .suspend = cb710_suspend, + .resume = cb710_resume, +#endif +}; + +static int __init cb710_init_module(void) +{ + return pci_register_driver(&cb710_driver); +} + +static void __exit cb710_cleanup_module(void) +{ + pci_unregister_driver(&cb710_driver); + ida_destroy(&cb710_ida); +} + +module_init(cb710_init_module); +module_exit(cb710_cleanup_module); + +MODULE_AUTHOR("Michał Mirosław "); +MODULE_DESCRIPTION("ENE CB710 memory card reader driver"); +MODULE_LICENSE("GPL"); +MODULE_DEVICE_TABLE(pci, cb710_pci_tbl); diff --git a/drivers/misc/cb710/debug.c b/drivers/misc/cb710/debug.c new file mode 100644 index 00000000000..9da11bc620a --- /dev/null +++ 
b/drivers/misc/cb710/debug.c @@ -0,0 +1,119 @@ +/* + * cb710/debug.c + * + * Copyright by Michał Mirosław, 2008-2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include + +#define CB710_REG_COUNT 0x80 + +static const u16 allow[CB710_REG_COUNT/16] = { + 0xFFF0, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFF0, 0xFFFF, 0xFFFF, 0xFFFF, +}; +static const char *const prefix[ARRAY_SIZE(allow)] = { + "MMC", "MMC", "MMC", "MMC", + "MS?", "MS?", "SM?", "SM?" +}; + +static inline int allow_reg_read(unsigned block, unsigned offset, unsigned bits) +{ + unsigned mask = (1 << bits/8) - 1; + offset *= bits/8; + return ((allow[block] >> offset) & mask) == mask; +} + +#define CB710_READ_REGS_TEMPLATE(t) \ +static void cb710_read_regs_##t(void __iomem *iobase, \ + u##t *reg, unsigned select) \ +{ \ + unsigned i, j; \ + \ + for (i = 0; i < ARRAY_SIZE(allow); ++i, reg += 16/(t/8)) { \ + if (!(select & (1 << i))) \ + continue; \ + \ + for (j = 0; j < 0x10/(t/8); ++j) { \ + if (!allow_reg_read(i, j, t)) \ + continue; \ + reg[j] = ioread##t(iobase \ + + (i << 4) + (j * (t/8))); \ + } \ + } \ +} + +static const char cb710_regf_8[] = "%02X"; +static const char cb710_regf_16[] = "%04X"; +static const char cb710_regf_32[] = "%08X"; +static const char cb710_xes[] = "xxxxxxxx"; + +#define CB710_DUMP_REGS_TEMPLATE(t) \ +static void cb710_dump_regs_##t(struct device *dev, \ + const u##t *reg, unsigned select) \ +{ \ + const char *const xp = &cb710_xes[8 - t/4]; \ + const char *const format = cb710_regf_##t; \ + \ + char msg[100], *p; \ + unsigned i, j; \ + \ + for (i = 0; i < ARRAY_SIZE(allow); ++i, reg += 16/(t/8)) { \ + if (!(select & (1 << i))) \ + continue; \ + p = msg; \ + for (j = 0; j < 0x10/(t/8); ++j) { \ + *p++ = ' '; \ + if (j == 8/(t/8)) \ + *p++ = ' '; \ + if (allow_reg_read(i, j, t)) \ + p += sprintf(p, format, reg[j]); \ + else \ + p += sprintf(p, "%s", xp); \ + } \ + dev_dbg(dev, "%s 0x%02X %s\n", prefix[i], i << 4, msg); \ + } \ +} + +#define CB710_READ_AND_DUMP_REGS_TEMPLATE(t) \ +static void cb710_read_and_dump_regs_##t(struct cb710_chip *chip, \ + unsigned select) \ +{ \ + u##t regs[CB710_REG_COUNT/sizeof(u##t)]; \ + \ + memset(&regs, 0, sizeof(regs)); \ + cb710_read_regs_##t(chip->iobase, regs, select); \ + cb710_dump_regs_##t(cb710_chip_dev(chip), regs, select); \ +} + +#define CB710_REG_ACCESS_TEMPLATES(t) \ + CB710_READ_REGS_TEMPLATE(t) \ + CB710_DUMP_REGS_TEMPLATE(t) \ + CB710_READ_AND_DUMP_REGS_TEMPLATE(t) + +CB710_REG_ACCESS_TEMPLATES(8) +CB710_REG_ACCESS_TEMPLATES(16) +CB710_REG_ACCESS_TEMPLATES(32) + +void cb710_dump_regs(struct cb710_chip *chip, unsigned select) +{ + if (!(select & CB710_DUMP_REGS_MASK)) + select = CB710_DUMP_REGS_ALL; + if (!(select & CB710_DUMP_ACCESS_MASK)) + select |= CB710_DUMP_ACCESS_8; + + if (select & CB710_DUMP_ACCESS_32) + cb710_read_and_dump_regs_32(chip, select); + if (select & CB710_DUMP_ACCESS_16) + cb710_read_and_dump_regs_16(chip, select); + if (select & CB710_DUMP_ACCESS_8) + cb710_read_and_dump_regs_8(chip, select); +} +EXPORT_SYMBOL_GPL(cb710_dump_regs); + diff --git a/drivers/misc/cb710/sgbuf2.c b/drivers/misc/cb710/sgbuf2.c new file mode 100644 index 00000000000..d38a7acdb6e --- /dev/null +++ b/drivers/misc/cb710/sgbuf2.c @@ -0,0 +1,150 @@ +/* + * cb710/sgbuf2.c + * + * Copyright by Michał Mirosław, 2008-2009 + * + * This program is free software; you can redistribute it and/or modify + *
it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include + +static bool sg_dwiter_next(struct sg_mapping_iter *miter) +{ + if (sg_miter_next(miter)) { + miter->consumed = 0; + return true; + } else + return false; +} + +static bool sg_dwiter_is_at_end(struct sg_mapping_iter *miter) +{ + return miter->length == miter->consumed && !sg_dwiter_next(miter); +} + +static uint32_t sg_dwiter_read_buffer(struct sg_mapping_iter *miter) +{ + size_t len, left = 4; + uint32_t data; + void *addr = &data; + + do { + len = min(miter->length - miter->consumed, left); + memcpy(addr, miter->addr + miter->consumed, len); + miter->consumed += len; + left -= len; + if (!left) + return data; + addr += len; + } while (sg_dwiter_next(miter)); + + memset(addr, 0, left); + return data; +} + +static inline bool needs_unaligned_copy(const void *ptr) +{ +#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS + return false; +#else + return ((ptr - NULL) & 3) != 0; +#endif +} + +static bool sg_dwiter_get_next_block(struct sg_mapping_iter *miter, uint32_t **ptr) +{ + size_t len; + + if (sg_dwiter_is_at_end(miter)) + return true; + + len = miter->length - miter->consumed; + + if (likely(len >= 4 && !needs_unaligned_copy( + miter->addr + miter->consumed))) { + *ptr = miter->addr + miter->consumed; + miter->consumed += 4; + return true; + } + + return false; +} + +/** + * cb710_sg_dwiter_read_next_block() - get next 32-bit word from sg buffer + * @miter: sg mapping iterator used for reading + * + * Description: + * Returns 32-bit word starting at byte pointed to by @miter@ + * handling any alignment issues. Bytes past the buffer's end + * are not accessed (read) but are returned as zeroes. @miter@ + * is advanced by 4 bytes or to the end of buffer whichever is + * closer. + * + * Context: + * Same requirements as in sg_miter_next(). + * + * Returns: + * 32-bit word just read. + */ +uint32_t cb710_sg_dwiter_read_next_block(struct sg_mapping_iter *miter) +{ + uint32_t *ptr = NULL; + + if (likely(sg_dwiter_get_next_block(miter, &ptr))) + return ptr ? *ptr : 0; + + return sg_dwiter_read_buffer(miter); +} +EXPORT_SYMBOL_GPL(cb710_sg_dwiter_read_next_block); + +static void sg_dwiter_write_slow(struct sg_mapping_iter *miter, uint32_t data) +{ + size_t len, left = 4; + void *addr = &data; + + do { + len = min(miter->length - miter->consumed, left); + /* honour bytes already consumed, as in sg_dwiter_read_buffer() */ + memcpy(miter->addr + miter->consumed, addr, len); + miter->consumed += len; + left -= len; + if (!left) + return; + addr += len; + flush_kernel_dcache_page(miter->page); + } while (sg_dwiter_next(miter)); +} + +/** + * cb710_sg_dwiter_write_next_block() - write next 32-bit word to sg buffer + * @miter: sg mapping iterator used for writing + * + * Description: + * Writes 32-bit word starting at byte pointed to by @miter@ + * handling any alignment issues. Bytes which would be written + * past the buffer's end are silently discarded. @miter@ is + * advanced by 4 bytes or to the end of buffer whichever is closer. + * + * Context: + * Same requirements as in sg_miter_next().
+ */ +void cb710_sg_dwiter_write_next_block(struct sg_mapping_iter *miter, uint32_t data) +{ + uint32_t *ptr = NULL; + + if (likely(sg_dwiter_get_next_block(miter, &ptr))) { + if (ptr) + *ptr = data; + else + return; + } else + sg_dwiter_write_slow(miter, data); + + if (miter->length == miter->consumed) + flush_kernel_dcache_page(miter->page); +} +EXPORT_SYMBOL_GPL(cb710_sg_dwiter_write_next_block); + diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index b4cf691f3f6..75337e442da 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -241,3 +241,16 @@ config MMC_TMIO help This provides support for the SD/MMC cell found in TC6393XB, T7L66XB and also ipaq ASIC3 + +config MMC_CB710 + tristate "ENE CB710 MMC/SD Interface support" + depends on PCI + select CB710_CORE + help + This option enables support for the MMC/SD part of the ENE CB710/720 Flash + memory card reader found in some laptops (e.g. some versions of + HP Compaq nx9500). + + This driver can also be built as a module. If so, the module + will be called cb710-mmc. + diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 970a997620e..ce9c8b6f0f4 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -29,4 +29,5 @@ endif obj-$(CONFIG_MMC_S3C) += s3cmci.o obj-$(CONFIG_MMC_SDRICOH_CS) += sdricoh_cs.o obj-$(CONFIG_MMC_TMIO) += tmio_mmc.o +obj-$(CONFIG_MMC_CB710) += cb710-mmc.o diff --git a/drivers/mmc/host/cb710-mmc.c b/drivers/mmc/host/cb710-mmc.c new file mode 100644 index 00000000000..bc048aae720 --- /dev/null +++ b/drivers/mmc/host/cb710-mmc.c @@ -0,0 +1,804 @@ +/* + * cb710/mmc.c + * + * Copyright by Michał Mirosław, 2008-2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ */ +#include +#include +#include +#include +#include +#include "cb710-mmc.h" + +static const u8 cb710_clock_divider_log2[8] = { +/* 1, 2, 4, 8, 16, 32, 128, 512 */ + 0, 1, 2, 3, 4, 5, 7, 9 +}; +#define CB710_MAX_DIVIDER_IDX \ + (ARRAY_SIZE(cb710_clock_divider_log2) - 1) + +static const u8 cb710_src_freq_mhz[16] = { + 33, 10, 20, 25, 30, 35, 40, 45, + 50, 55, 60, 65, 70, 75, 80, 85 +}; + +static void cb710_mmc_set_clock(struct mmc_host *mmc, int hz) +{ + struct cb710_slot *slot = cb710_mmc_to_slot(mmc); + struct pci_dev *pdev = cb710_slot_to_chip(slot)->pdev; + u32 src_freq_idx; + u32 divider_idx; + int src_hz; + + /* this is magic, unverifiable for me, unless I get + * MMC card with cables connected to bus signals */ + pci_read_config_dword(pdev, 0x48, &src_freq_idx); + src_freq_idx = (src_freq_idx >> 16) & 0xF; + src_hz = cb710_src_freq_mhz[src_freq_idx] * 1000000; + + for (divider_idx = 0; divider_idx < CB710_MAX_DIVIDER_IDX; ++divider_idx) { + if (hz >= src_hz >> cb710_clock_divider_log2[divider_idx]) + break; + } + + if (src_freq_idx) + divider_idx |= 0x8; + + cb710_pci_update_config_reg(pdev, 0x40, ~0xF0000000, divider_idx << 28); + + dev_dbg(cb710_slot_dev(slot), + "clock set to %d Hz, wanted %d Hz; flag = %d\n", + src_hz >> cb710_clock_divider_log2[divider_idx & 7], + hz, (divider_idx & 8) != 0); +} + +static void __cb710_mmc_enable_irq(struct cb710_slot *slot, + unsigned short enable, unsigned short mask) +{ + /* clear global IE + * - it gets set later if any interrupt sources are enabled */ + mask |= CB710_MMC_IE_IRQ_ENABLE; + + /* look like interrupt is fired whenever + * WORD[0x0C] & WORD[0x10] != 0; + * -> bit 15 port 0x0C seems to be global interrupt enable + */ + + enable = (cb710_read_port_16(slot, CB710_MMC_IRQ_ENABLE_PORT) + & ~mask) | enable; + + if (enable) + enable |= CB710_MMC_IE_IRQ_ENABLE; + + cb710_write_port_16(slot, CB710_MMC_IRQ_ENABLE_PORT, enable); +} + +static void cb710_mmc_enable_irq(struct cb710_slot *slot, + unsigned short enable, unsigned short mask) +{ + struct cb710_mmc_reader *reader = mmc_priv(cb710_slot_to_mmc(slot)); + unsigned long flags; + + spin_lock_irqsave(&reader->irq_lock, flags); + /* this is the only thing irq_lock protects */ + __cb710_mmc_enable_irq(slot, enable, mask); + spin_unlock_irqrestore(&reader->irq_lock, flags); +} + +static void cb710_mmc_reset_events(struct cb710_slot *slot) +{ + cb710_write_port_8(slot, CB710_MMC_STATUS0_PORT, 0xFF); + cb710_write_port_8(slot, CB710_MMC_STATUS1_PORT, 0xFF); + cb710_write_port_8(slot, CB710_MMC_STATUS2_PORT, 0xFF); +} + +static int cb710_mmc_is_card_inserted(struct cb710_slot *slot) +{ + return cb710_read_port_8(slot, CB710_MMC_STATUS3_PORT) + & CB710_MMC_S3_CARD_DETECTED; +} + +static void cb710_mmc_enable_4bit_data(struct cb710_slot *slot, int enable) +{ + dev_dbg(cb710_slot_dev(slot), "configuring %d-data-line%s mode\n", + enable ? 4 : 1, enable ? 
"s" : ""); + if (enable) + cb710_modify_port_8(slot, CB710_MMC_CONFIG1_PORT, + CB710_MMC_C1_4BIT_DATA_BUS, 0); + else + cb710_modify_port_8(slot, CB710_MMC_CONFIG1_PORT, + 0, CB710_MMC_C1_4BIT_DATA_BUS); +} + +static int cb710_check_event(struct cb710_slot *slot, u8 what) +{ + u16 status; + + status = cb710_read_port_16(slot, CB710_MMC_STATUS_PORT); + + if (status & CB710_MMC_S0_FIFO_UNDERFLOW) { + /* it is just a guess, so log it */ + dev_dbg(cb710_slot_dev(slot), + "CHECK : ignoring bit 6 in status %04X\n", status); + cb710_write_port_8(slot, CB710_MMC_STATUS0_PORT, + CB710_MMC_S0_FIFO_UNDERFLOW); + status &= ~CB710_MMC_S0_FIFO_UNDERFLOW; + } + + if (status & CB710_MMC_STATUS_ERROR_EVENTS) { + dev_dbg(cb710_slot_dev(slot), + "CHECK : returning EIO on status %04X\n", status); + cb710_write_port_8(slot, CB710_MMC_STATUS0_PORT, status & 0xFF); + cb710_write_port_8(slot, CB710_MMC_STATUS1_PORT, + CB710_MMC_S1_RESET); + return -EIO; + } + + /* 'what' is a bit in MMC_STATUS1 */ + if ((status >> 8) & what) { + cb710_write_port_8(slot, CB710_MMC_STATUS1_PORT, what); + return 1; + } + + return 0; +} + +static int cb710_wait_for_event(struct cb710_slot *slot, u8 what) +{ + int err = 0; + unsigned limit = 2000000; /* FIXME: real timeout */ + +#ifdef CONFIG_CB710_DEBUG + u32 e, x; + e = cb710_read_port_32(slot, CB710_MMC_STATUS_PORT); +#endif + + while (!(err = cb710_check_event(slot, what))) { + if (!--limit) { + cb710_dump_regs(cb710_slot_to_chip(slot), + CB710_DUMP_REGS_MMC); + err = -ETIMEDOUT; + break; + } + udelay(1); + } + +#ifdef CONFIG_CB710_DEBUG + x = cb710_read_port_32(slot, CB710_MMC_STATUS_PORT); + + limit = 2000000 - limit; + if (limit > 100) + dev_dbg(cb710_slot_dev(slot), + "WAIT10: waited %d loops, what %d, entry val %08X, exit val %08X\n", + limit, what, e, x); +#endif + return err < 0 ? err : 0; +} + + +static int cb710_wait_while_busy(struct cb710_slot *slot, uint8_t mask) +{ + unsigned limit = 500000; /* FIXME: real timeout */ + int err = 0; + +#ifdef CONFIG_CB710_DEBUG + u32 e, x; + e = cb710_read_port_32(slot, CB710_MMC_STATUS_PORT); +#endif + + while (cb710_read_port_8(slot, CB710_MMC_STATUS2_PORT) & mask) { + if (!--limit) { + cb710_dump_regs(cb710_slot_to_chip(slot), + CB710_DUMP_REGS_MMC); + err = -ETIMEDOUT; + break; + } + udelay(1); + } + +#ifdef CONFIG_CB710_DEBUG + x = cb710_read_port_32(slot, CB710_MMC_STATUS_PORT); + + limit = 500000 - limit; + if (limit > 100) + dev_dbg(cb710_slot_dev(slot), + "WAIT12: waited %d loops, mask %02X, entry val %08X, exit val %08X\n", + limit, mask, e, x); +#endif + return 0; +} + +static void cb710_mmc_set_transfer_size(struct cb710_slot *slot, + size_t count, size_t blocksize) +{ + cb710_wait_while_busy(slot, CB710_MMC_S2_BUSY_20); + cb710_write_port_32(slot, CB710_MMC_TRANSFER_SIZE_PORT, + ((count - 1) << 16)|(blocksize - 1)); + + dev_vdbg(cb710_slot_dev(slot), "set up for %d block%s of %d bytes\n", + count, count == 1 ? "" : "s", blocksize); +} + +static void cb710_mmc_fifo_hack(struct cb710_slot *slot) +{ + /* without this, received data is prepended with 8-bytes of zeroes */ + u32 r1, r2; + int ok = 0; + + r1 = cb710_read_port_32(slot, CB710_MMC_DATA_PORT); + r2 = cb710_read_port_32(slot, CB710_MMC_DATA_PORT); + if (cb710_read_port_8(slot, CB710_MMC_STATUS0_PORT) + & CB710_MMC_S0_FIFO_UNDERFLOW) { + cb710_write_port_8(slot, CB710_MMC_STATUS0_PORT, + CB710_MMC_S0_FIFO_UNDERFLOW); + ok = 1; + } + + dev_dbg(cb710_slot_dev(slot), + "FIFO-read-hack: expected STATUS0 bit was %s\n", + ok ? "set." 
: "NOT SET!"); + dev_dbg(cb710_slot_dev(slot), + "FIFO-read-hack: dwords ignored: %08X %08X - %s\n", + r1, r2, (r1|r2) ? "BAD (NOT ZERO)!" : "ok"); +} + +static int cb710_mmc_receive_pio(struct cb710_slot *slot, + struct sg_mapping_iter *miter, size_t dw_count) +{ + if (!(cb710_read_port_8(slot, CB710_MMC_STATUS2_PORT) & CB710_MMC_S2_FIFO_READY)) { + int err = cb710_wait_for_event(slot, + CB710_MMC_S1_PIO_TRANSFER_DONE); + if (err) + return err; + } + + cb710_sg_dwiter_write_from_io(miter, + slot->iobase + CB710_MMC_DATA_PORT, dw_count); + + return 0; +} + +static bool cb710_is_transfer_size_supported(struct mmc_data *data) +{ + return !(data->blksz & 15 && (data->blocks != 1 || data->blksz != 8)); +} + +static int cb710_mmc_receive(struct cb710_slot *slot, struct mmc_data *data) +{ + struct sg_mapping_iter miter; + size_t len, blocks = data->blocks; + int err = 0; + + /* TODO: I don't know how/if the hardware handles non-16B-boundary blocks + * except single 8B block */ + if (unlikely(data->blksz & 15 && (data->blocks != 1 || data->blksz != 8))) + return -EINVAL; + + sg_miter_start(&miter, data->sg, data->sg_len, 0); + + cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, + 15, CB710_MMC_C2_READ_PIO_SIZE_MASK); + + cb710_mmc_fifo_hack(slot); + + while (blocks-- > 0) { + len = data->blksz; + + while (len >= 16) { + err = cb710_mmc_receive_pio(slot, &miter, 4); + if (err) + goto out; + len -= 16; + } + + if (!len) + continue; + + cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, + len - 1, CB710_MMC_C2_READ_PIO_SIZE_MASK); + + len = (len >= 8) ? 4 : 2; + err = cb710_mmc_receive_pio(slot, &miter, len); + if (err) + goto out; + } +out: + cb710_sg_miter_stop_writing(&miter); + return err; +} + +static int cb710_mmc_send(struct cb710_slot *slot, struct mmc_data *data) +{ + struct sg_mapping_iter miter; + size_t len, blocks = data->blocks; + int err = 0; + + /* TODO: I don't know how/if the hardware handles multiple + * non-16B-boundary blocks */ + if (unlikely(data->blocks > 1 && data->blksz & 15)) + return -EINVAL; + + sg_miter_start(&miter, data->sg, data->sg_len, 0); + + cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, + 0, CB710_MMC_C2_READ_PIO_SIZE_MASK); + + while (blocks-- > 0) { + len = (data->blksz + 15) >> 4; + do { + if (!(cb710_read_port_8(slot, CB710_MMC_STATUS2_PORT) + & CB710_MMC_S2_FIFO_EMPTY)) { + err = cb710_wait_for_event(slot, + CB710_MMC_S1_PIO_TRANSFER_DONE); + if (err) + goto out; + } + cb710_sg_dwiter_read_to_io(&miter, + slot->iobase + CB710_MMC_DATA_PORT, 4); + } while (--len); + } +out: + sg_miter_stop(&miter); + return err; +} + +static u16 cb710_encode_cmd_flags(struct cb710_mmc_reader *reader, + struct mmc_command *cmd) +{ + unsigned int flags = cmd->flags; + u16 cb_flags = 0; + + /* Windows driver returned 0 for commands for which no response + * is expected. It happened that there were only two such commands + * used: MMC_GO_IDLE_STATE and MMC_GO_INACTIVE_STATE so it might + * as well be a bug in that driver. + * + * Original driver set bit 14 for MMC/SD application + * commands. There's no difference 'on the wire' and + * it apparently works without it anyway. 
+ */ + + switch (flags & MMC_CMD_MASK) { + case MMC_CMD_AC: cb_flags = CB710_MMC_CMD_AC; break; + case MMC_CMD_ADTC: cb_flags = CB710_MMC_CMD_ADTC; break; + case MMC_CMD_BC: cb_flags = CB710_MMC_CMD_BC; break; + case MMC_CMD_BCR: cb_flags = CB710_MMC_CMD_BCR; break; + } + + if (flags & MMC_RSP_BUSY) + cb_flags |= CB710_MMC_RSP_BUSY; + + cb_flags |= cmd->opcode << CB710_MMC_CMD_CODE_SHIFT; + + if (cmd->data && (cmd->data->flags & MMC_DATA_READ)) + cb_flags |= CB710_MMC_DATA_READ; + + if (flags & MMC_RSP_PRESENT) { + /* Windows driver set 01 at bits 4,3 except for + * MMC_SET_BLOCKLEN where it set 10. Maybe the + * hardware can do something special about this + * command? The original driver looks buggy/incomplete + * anyway so we ignore this for now. + * + * I assume that 00 here means no response is expected. + */ + cb_flags |= CB710_MMC_RSP_PRESENT; + + if (flags & MMC_RSP_136) + cb_flags |= CB710_MMC_RSP_136; + if (!(flags & MMC_RSP_CRC)) + cb_flags |= CB710_MMC_RSP_NO_CRC; + } + + return cb_flags; +} + +static void cb710_receive_response(struct cb710_slot *slot, + struct mmc_command *cmd) +{ + unsigned rsp_opcode, wanted_opcode; + + /* Looks like final byte with CRC is always stripped (same as SDHCI) */ + if (cmd->flags & MMC_RSP_136) { + u32 resp[4]; + + resp[0] = cb710_read_port_32(slot, CB710_MMC_RESPONSE3_PORT); + resp[1] = cb710_read_port_32(slot, CB710_MMC_RESPONSE2_PORT); + resp[2] = cb710_read_port_32(slot, CB710_MMC_RESPONSE1_PORT); + resp[3] = cb710_read_port_32(slot, CB710_MMC_RESPONSE0_PORT); + rsp_opcode = resp[0] >> 24; + + cmd->resp[0] = (resp[0] << 8)|(resp[1] >> 24); + cmd->resp[1] = (resp[1] << 8)|(resp[2] >> 24); + cmd->resp[2] = (resp[2] << 8)|(resp[3] >> 24); + cmd->resp[3] = (resp[3] << 8); + } else { + rsp_opcode = cb710_read_port_32(slot, CB710_MMC_RESPONSE1_PORT) & 0x3F; + cmd->resp[0] = cb710_read_port_32(slot, CB710_MMC_RESPONSE0_PORT); + } + + wanted_opcode = (cmd->flags & MMC_RSP_OPCODE) ? 
cmd->opcode : 0x3F; + if (rsp_opcode != wanted_opcode) + cmd->error = -EILSEQ; +} + +static int cb710_mmc_transfer_data(struct cb710_slot *slot, + struct mmc_data *data) +{ + int error, to; + + if (data->flags & MMC_DATA_READ) + error = cb710_mmc_receive(slot, data); + else + error = cb710_mmc_send(slot, data); + + to = cb710_wait_for_event(slot, CB710_MMC_S1_DATA_TRANSFER_DONE); + if (!error) + error = to; + + if (!error) + data->bytes_xfered = data->blksz * data->blocks; + return error; +} + +static int cb710_mmc_command(struct mmc_host *mmc, struct mmc_command *cmd) +{ + struct cb710_slot *slot = cb710_mmc_to_slot(mmc); + struct cb710_mmc_reader *reader = mmc_priv(mmc); + struct mmc_data *data = cmd->data; + + u16 cb_cmd = cb710_encode_cmd_flags(reader, cmd); + dev_dbg(cb710_slot_dev(slot), "cmd request: 0x%04X\n", cb_cmd); + + if (data) { + if (!cb710_is_transfer_size_supported(data)) { + data->error = -EINVAL; + return -1; + } + cb710_mmc_set_transfer_size(slot, data->blocks, data->blksz); + } + + cb710_wait_while_busy(slot, CB710_MMC_S2_BUSY_20|CB710_MMC_S2_BUSY_10); + cb710_write_port_16(slot, CB710_MMC_CMD_TYPE_PORT, cb_cmd); + cb710_wait_while_busy(slot, CB710_MMC_S2_BUSY_20); + cb710_write_port_32(slot, CB710_MMC_CMD_PARAM_PORT, cmd->arg); + cb710_mmc_reset_events(slot); + cb710_wait_while_busy(slot, CB710_MMC_S2_BUSY_20); + cb710_modify_port_8(slot, CB710_MMC_CONFIG0_PORT, 0x01, 0); + + cmd->error = cb710_wait_for_event(slot, CB710_MMC_S1_COMMAND_SENT); + if (cmd->error) + return -1; + + if (cmd->flags & MMC_RSP_PRESENT) { + cb710_receive_response(slot, cmd); + if (cmd->error) + return -1; + } + + if (data) + data->error = cb710_mmc_transfer_data(slot, data); + return 0; +} + +static void cb710_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) +{ + struct cb710_slot *slot = cb710_mmc_to_slot(mmc); + struct cb710_mmc_reader *reader = mmc_priv(mmc); + + WARN_ON(reader->mrq != NULL); + + reader->mrq = mrq; + cb710_mmc_enable_irq(slot, CB710_MMC_IE_TEST_MASK, 0); + + if (cb710_mmc_is_card_inserted(slot)) { + if (!cb710_mmc_command(mmc, mrq->cmd) && mrq->stop) + cb710_mmc_command(mmc, mrq->stop); + mdelay(1); + } else { + mrq->cmd->error = -ENOMEDIUM; + } + + tasklet_schedule(&reader->finish_req_tasklet); +} + +static int cb710_mmc_powerup(struct cb710_slot *slot) +{ +#ifdef CONFIG_CB710_DEBUG + struct cb710_chip *chip = cb710_slot_to_chip(slot); +#endif + int err; + + /* a lot of magic; see comment in cb710_mmc_set_clock() */ + dev_dbg(cb710_slot_dev(slot), "bus powerup\n"); + cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + err = cb710_wait_while_busy(slot, CB710_MMC_S2_BUSY_20); + if (unlikely(err)) + return err; + cb710_modify_port_8(slot, CB710_MMC_CONFIG1_PORT, 0x80, 0); + cb710_modify_port_8(slot, CB710_MMC_CONFIG3_PORT, 0x80, 0); + cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + mdelay(1); + dev_dbg(cb710_slot_dev(slot), "after delay 1\n"); + cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + err = cb710_wait_while_busy(slot, CB710_MMC_S2_BUSY_20); + if (unlikely(err)) + return err; + cb710_modify_port_8(slot, CB710_MMC_CONFIG1_PORT, 0x09, 0); + cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + mdelay(1); + dev_dbg(cb710_slot_dev(slot), "after delay 2\n"); + cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + err = cb710_wait_while_busy(slot, CB710_MMC_S2_BUSY_20); + if (unlikely(err)) + return err; + cb710_modify_port_8(slot, CB710_MMC_CONFIG1_PORT, 0, 0x08); + cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + mdelay(2); + dev_dbg(cb710_slot_dev(slot), "after delay 3\n"); + 
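+ /* final batch of magic configuration writes; like the rest of this sequence, presumably mirrored from the Windows driver's init */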
cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + cb710_modify_port_8(slot, CB710_MMC_CONFIG0_PORT, 0x06, 0); + cb710_modify_port_8(slot, CB710_MMC_CONFIG1_PORT, 0x70, 0); + cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, 0x80, 0); + cb710_modify_port_8(slot, CB710_MMC_CONFIG3_PORT, 0x03, 0); + cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + err = cb710_wait_while_busy(slot, CB710_MMC_S2_BUSY_20); + if (unlikely(err)) + return err; + /* This port behaves weird: quick byte reads of 0x08,0x09 return + * 0xFF,0x00 after writing 0xFFFF to 0x08; it works correctly when + * read/written from userspace... What am I missing here? + * (it doesn't depend on write-to-read delay) */ + cb710_write_port_16(slot, CB710_MMC_CONFIGB_PORT, 0xFFFF); + cb710_modify_port_8(slot, CB710_MMC_CONFIG0_PORT, 0x06, 0); + cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + dev_dbg(cb710_slot_dev(slot), "bus powerup finished\n"); + + return cb710_check_event(slot, 0); +} + +static void cb710_mmc_powerdown(struct cb710_slot *slot) +{ + cb710_modify_port_8(slot, CB710_MMC_CONFIG1_PORT, 0, 0x81); + cb710_modify_port_8(slot, CB710_MMC_CONFIG3_PORT, 0, 0x80); +} + +static void cb710_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) +{ + struct cb710_slot *slot = cb710_mmc_to_slot(mmc); + struct cb710_mmc_reader *reader = mmc_priv(mmc); + int err; + + cb710_mmc_set_clock(mmc, ios->clock); + + if (!cb710_mmc_is_card_inserted(slot)) { + dev_dbg(cb710_slot_dev(slot), + "no card inserted - ignoring bus powerup request\n"); + ios->power_mode = MMC_POWER_OFF; + } + + if (ios->power_mode != reader->last_power_mode) + switch (ios->power_mode) { + case MMC_POWER_ON: + err = cb710_mmc_powerup(slot); + if (err) { + dev_warn(cb710_slot_dev(slot), + "powerup failed (%d)- retrying\n", err); + cb710_mmc_powerdown(slot); + udelay(1); + err = cb710_mmc_powerup(slot); + if (err) + dev_warn(cb710_slot_dev(slot), + "powerup retry failed (%d) - expect errors\n", + err); + } + reader->last_power_mode = MMC_POWER_ON; + break; + case MMC_POWER_OFF: + cb710_mmc_powerdown(slot); + reader->last_power_mode = MMC_POWER_OFF; + break; + case MMC_POWER_UP: + default: + /* ignore */; + } + + cb710_mmc_enable_4bit_data(slot, ios->bus_width != MMC_BUS_WIDTH_1); + + cb710_mmc_enable_irq(slot, CB710_MMC_IE_TEST_MASK, 0); +} + +static int cb710_mmc_get_ro(struct mmc_host *mmc) +{ + struct cb710_slot *slot = cb710_mmc_to_slot(mmc); + + return cb710_read_port_8(slot, CB710_MMC_STATUS3_PORT) + & CB710_MMC_S3_WRITE_PROTECTED; +} + +static int cb710_mmc_irq_handler(struct cb710_slot *slot) +{ + struct mmc_host *mmc = cb710_slot_to_mmc(slot); + struct cb710_mmc_reader *reader = mmc_priv(mmc); + u32 status, config1, config2, irqen; + + status = cb710_read_port_32(slot, CB710_MMC_STATUS_PORT); + irqen = cb710_read_port_32(slot, CB710_MMC_IRQ_ENABLE_PORT); + config2 = cb710_read_port_32(slot, CB710_MMC_CONFIGB_PORT); + config1 = cb710_read_port_32(slot, CB710_MMC_CONFIG_PORT); + + dev_dbg(cb710_slot_dev(slot), "interrupt; status: %08X, " + "ie: %08X, c2: %08X, c1: %08X\n", + status, irqen, config2, config1); + + if (status & (CB710_MMC_S1_CARD_CHANGED << 8)) { + /* ack the event */ + cb710_write_port_8(slot, CB710_MMC_STATUS1_PORT, + CB710_MMC_S1_CARD_CHANGED); + if ((irqen & CB710_MMC_IE_CISTATUS_MASK) + == CB710_MMC_IE_CISTATUS_MASK) + mmc_detect_change(mmc, HZ/5); + } else { + dev_dbg(cb710_slot_dev(slot), "unknown interrupt (test)\n"); + spin_lock(&reader->irq_lock); + __cb710_mmc_enable_irq(slot, 0, CB710_MMC_IE_TEST_MASK); + spin_unlock(&reader->irq_lock); + } + + 
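+ /* non-zero return value makes cb710_irq_handler() in core.c report IRQ_HANDLED */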
return 1; +} + +static void cb710_mmc_finish_request_tasklet(unsigned long data) +{ + struct mmc_host *mmc = (void *)data; + struct cb710_mmc_reader *reader = mmc_priv(mmc); + struct mmc_request *mrq = reader->mrq; + + reader->mrq = NULL; + mmc_request_done(mmc, mrq); +} + +static const struct mmc_host_ops cb710_mmc_host = { + .request = cb710_mmc_request, + .set_ios = cb710_mmc_set_ios, + .get_ro = cb710_mmc_get_ro +}; + +#ifdef CONFIG_PM + +static int cb710_mmc_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct cb710_slot *slot = cb710_pdev_to_slot(pdev); + struct mmc_host *mmc = cb710_slot_to_mmc(slot); + int err; + + err = mmc_suspend_host(mmc, state); + if (err) + return err; + + cb710_mmc_enable_irq(slot, 0, ~0); + return 0; +} + +static int cb710_mmc_resume(struct platform_device *pdev) +{ + struct cb710_slot *slot = cb710_pdev_to_slot(pdev); + struct mmc_host *mmc = cb710_slot_to_mmc(slot); + + cb710_mmc_enable_irq(slot, 0, ~0); + + return mmc_resume_host(mmc); +} + +#endif /* CONFIG_PM */ + +static int __devinit cb710_mmc_init(struct platform_device *pdev) +{ + struct cb710_slot *slot = cb710_pdev_to_slot(pdev); + struct cb710_chip *chip = cb710_slot_to_chip(slot); + struct mmc_host *mmc; + struct cb710_mmc_reader *reader; + int err; + u32 val; + + mmc = mmc_alloc_host(sizeof(*reader), cb710_slot_dev(slot)); + if (!mmc) + return -ENOMEM; + + dev_set_drvdata(&pdev->dev, mmc); + + /* harmless (maybe) magic */ + pci_read_config_dword(chip->pdev, 0x48, &val); + val = cb710_src_freq_mhz[(val >> 16) & 0xF]; + dev_dbg(cb710_slot_dev(slot), "source frequency: %dMHz\n", val); + val *= 1000000; + + mmc->ops = &cb710_mmc_host; + mmc->f_max = val; + mmc->f_min = val >> cb710_clock_divider_log2[CB710_MAX_DIVIDER_IDX]; + mmc->ocr_avail = MMC_VDD_32_33|MMC_VDD_33_34; + mmc->caps = MMC_CAP_4_BIT_DATA; + + reader = mmc_priv(mmc); + + tasklet_init(&reader->finish_req_tasklet, + cb710_mmc_finish_request_tasklet, (unsigned long)mmc); + spin_lock_init(&reader->irq_lock); + cb710_dump_regs(chip, CB710_DUMP_REGS_MMC); + + cb710_mmc_enable_irq(slot, 0, ~0); + cb710_set_irq_handler(slot, cb710_mmc_irq_handler); + + err = mmc_add_host(mmc); + if (unlikely(err)) + goto err_free_mmc; + + dev_dbg(cb710_slot_dev(slot), "mmc_hostname is %s\n", + mmc_hostname(mmc)); + + cb710_mmc_enable_irq(slot, CB710_MMC_IE_CARD_INSERTION_STATUS, 0); + + return 0; + +err_free_mmc: + dev_dbg(cb710_slot_dev(slot), "mmc_add_host() failed: %d\n", err); + + mmc_free_host(mmc); + return err; +} + +static int __devexit cb710_mmc_exit(struct platform_device *pdev) +{ + struct cb710_slot *slot = cb710_pdev_to_slot(pdev); + struct mmc_host *mmc = cb710_slot_to_mmc(slot); + struct cb710_mmc_reader *reader = mmc_priv(mmc); + + cb710_mmc_enable_irq(slot, 0, CB710_MMC_IE_CARD_INSERTION_STATUS); + + mmc_remove_host(mmc); + + /* IRQs should be disabled now, but let's stay on the safe side */ + cb710_mmc_enable_irq(slot, 0, ~0); + cb710_set_irq_handler(slot, NULL); + + /* clear config ports - just in case */ + cb710_write_port_32(slot, CB710_MMC_CONFIG_PORT, 0); + cb710_write_port_16(slot, CB710_MMC_CONFIGB_PORT, 0); + + tasklet_kill(&reader->finish_req_tasklet); + + mmc_free_host(mmc); + return 0; +} + +static struct platform_driver cb710_mmc_driver = { + .driver.name = "cb710-mmc", + .probe = cb710_mmc_init, + .remove = __devexit_p(cb710_mmc_exit), +#ifdef CONFIG_PM + .suspend = cb710_mmc_suspend, + .resume = cb710_mmc_resume, +#endif +}; + +static int __init cb710_mmc_init_module(void) +{ + return 
platform_driver_register(&cb710_mmc_driver); +} + +static void __exit cb710_mmc_cleanup_module(void) +{ + platform_driver_unregister(&cb710_mmc_driver); +} + +module_init(cb710_mmc_init_module); +module_exit(cb710_mmc_cleanup_module); + +MODULE_AUTHOR("Michał Mirosław "); +MODULE_DESCRIPTION("ENE CB710 memory card reader driver - MMC/SD part"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:cb710-mmc"); diff --git a/drivers/mmc/host/cb710-mmc.h b/drivers/mmc/host/cb710-mmc.h new file mode 100644 index 00000000000..e845c776bdd --- /dev/null +++ b/drivers/mmc/host/cb710-mmc.h @@ -0,0 +1,104 @@ +/* + * cb710/cb710-mmc.h + * + * Copyright by Michał Mirosław, 2008-2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef LINUX_CB710_MMC_H +#define LINUX_CB710_MMC_H + +#include + +/* per-MMC-reader structure */ +struct cb710_mmc_reader { + struct tasklet_struct finish_req_tasklet; + struct mmc_request *mrq; + spinlock_t irq_lock; + unsigned char last_power_mode; +}; + +/* some device struct walking */ + +static inline struct mmc_host *cb710_slot_to_mmc(struct cb710_slot *slot) +{ + return dev_get_drvdata(&slot->pdev.dev); +} + +static inline struct cb710_slot *cb710_mmc_to_slot(struct mmc_host *mmc) +{ + struct platform_device *pdev = container_of(mmc_dev(mmc), + struct platform_device, dev); + return cb710_pdev_to_slot(pdev); +} + +/* registers (this might be all wrong ;) */ + +#define CB710_MMC_DATA_PORT 0x00 + +#define CB710_MMC_CONFIG_PORT 0x04 +#define CB710_MMC_CONFIG0_PORT 0x04 +#define CB710_MMC_CONFIG1_PORT 0x05 +#define CB710_MMC_C1_4BIT_DATA_BUS 0x40 +#define CB710_MMC_CONFIG2_PORT 0x06 +#define CB710_MMC_C2_READ_PIO_SIZE_MASK 0x0F /* N-1 */ +#define CB710_MMC_CONFIG3_PORT 0x07 + +#define CB710_MMC_CONFIGB_PORT 0x08 + +#define CB710_MMC_IRQ_ENABLE_PORT 0x0C +#define CB710_MMC_IE_TEST_MASK 0x00BF +#define CB710_MMC_IE_CARD_INSERTION_STATUS 0x1000 +#define CB710_MMC_IE_IRQ_ENABLE 0x8000 +#define CB710_MMC_IE_CISTATUS_MASK \ + (CB710_MMC_IE_CARD_INSERTION_STATUS|CB710_MMC_IE_IRQ_ENABLE) + +#define CB710_MMC_STATUS_PORT 0x10 +#define CB710_MMC_STATUS_ERROR_EVENTS 0x60FF +#define CB710_MMC_STATUS0_PORT 0x10 +#define CB710_MMC_S0_FIFO_UNDERFLOW 0x40 +#define CB710_MMC_STATUS1_PORT 0x11 +#define CB710_MMC_S1_COMMAND_SENT 0x01 +#define CB710_MMC_S1_DATA_TRANSFER_DONE 0x02 +#define CB710_MMC_S1_PIO_TRANSFER_DONE 0x04 +#define CB710_MMC_S1_CARD_CHANGED 0x10 +#define CB710_MMC_S1_RESET 0x20 +#define CB710_MMC_STATUS2_PORT 0x12 +#define CB710_MMC_S2_FIFO_READY 0x01 +#define CB710_MMC_S2_FIFO_EMPTY 0x02 +#define CB710_MMC_S2_BUSY_10 0x10 +#define CB710_MMC_S2_BUSY_20 0x20 +#define CB710_MMC_STATUS3_PORT 0x13 +#define CB710_MMC_S3_CARD_DETECTED 0x02 +#define CB710_MMC_S3_WRITE_PROTECTED 0x04 + +#define CB710_MMC_CMD_TYPE_PORT 0x14 +#define CB710_MMC_RSP_TYPE_MASK 0x0007 +#define CB710_MMC_RSP_R1 (0) +#define CB710_MMC_RSP_136 (5) +#define CB710_MMC_RSP_NO_CRC (2) +#define CB710_MMC_RSP_PRESENT_MASK 0x0018 +#define CB710_MMC_RSP_NONE (0 << 3) +#define CB710_MMC_RSP_PRESENT (1 << 3) +#define CB710_MMC_RSP_PRESENT_X (2 << 3) +#define CB710_MMC_CMD_TYPE_MASK 0x0060 +#define CB710_MMC_CMD_BC (0 << 5) +#define CB710_MMC_CMD_BCR (1 << 5) +#define CB710_MMC_CMD_AC (2 << 5) +#define CB710_MMC_CMD_ADTC (3 << 5) +#define CB710_MMC_DATA_READ 0x0080 +#define CB710_MMC_CMD_CODE_MASK 0x3F00 +#define CB710_MMC_CMD_CODE_SHIFT 8 +#define CB710_MMC_IS_APP_CMD 
0x4000 +#define CB710_MMC_RSP_BUSY 0x8000 + +#define CB710_MMC_CMD_PARAM_PORT 0x18 +#define CB710_MMC_TRANSFER_SIZE_PORT 0x1C +#define CB710_MMC_RESPONSE0_PORT 0x20 +#define CB710_MMC_RESPONSE1_PORT 0x24 +#define CB710_MMC_RESPONSE2_PORT 0x28 +#define CB710_MMC_RESPONSE3_PORT 0x2C + +#endif /* LINUX_CB710_MMC_H */ diff --git a/include/linux/cb710.h b/include/linux/cb710.h new file mode 100644 index 00000000000..c3819e1ce1c --- /dev/null +++ b/include/linux/cb710.h @@ -0,0 +1,238 @@ +/* + * cb710/cb710.h + * + * Copyright by Michał Mirosław, 2008-2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef LINUX_CB710_DRIVER_H +#define LINUX_CB710_DRIVER_H + +#ifdef CONFIG_CB710_DEBUG +#define DEBUG +#endif + +/* verify assumptions on platform_device framework */ +#define CONFIG_CB710_DEBUG_ASSUMPTIONS + +#include +#include +#include +#include +#include +#include + +struct cb710_slot; + +typedef int (*cb710_irq_handler_t)(struct cb710_slot *); + +/* per-virtual-slot structure */ +struct cb710_slot { + struct platform_device pdev; + void __iomem *iobase; + cb710_irq_handler_t irq_handler; +}; + +/* per-device structure */ +struct cb710_chip { + struct pci_dev *pdev; + void __iomem *iobase; + unsigned platform_id; +#ifdef CONFIG_CB710_DEBUG_ASSUMPTIONS + atomic_t slot_refs_count; +#endif + unsigned slot_mask; + unsigned slots; + spinlock_t irq_lock; + struct cb710_slot slot[0]; +}; + +/* NOTE: cb710_chip.slots is modified only during device init/exit and + * they are all serialized wrt themselves */ + +/* cb710_chip.slot_mask values */ +#define CB710_SLOT_MMC 1 +#define CB710_SLOT_MS 2 +#define CB710_SLOT_SM 4 + +/* slot port accessors - so the logic is more clear in the code */ +#define CB710_PORT_ACCESSORS(t) \ +static inline void cb710_write_port_##t(struct cb710_slot *slot, \ + unsigned port, u##t value) \ +{ \ + iowrite##t(value, slot->iobase + port); \ +} \ + \ +static inline u##t cb710_read_port_##t(struct cb710_slot *slot, \ + unsigned port) \ +{ \ + return ioread##t(slot->iobase + port); \ +} \ + \ +static inline void cb710_modify_port_##t(struct cb710_slot *slot, \ + unsigned port, u##t set, u##t clear) \ +{ \ + iowrite##t( \ + (ioread##t(slot->iobase + port) & ~clear)|set, \ + slot->iobase + port); \ +} + +CB710_PORT_ACCESSORS(8) +CB710_PORT_ACCESSORS(16) +CB710_PORT_ACCESSORS(32) + +void cb710_pci_update_config_reg(struct pci_dev *pdev, + int reg, uint32_t and, uint32_t xor); +void cb710_set_irq_handler(struct cb710_slot *slot, + cb710_irq_handler_t handler); + +/* some device struct walking */ + +static inline struct cb710_slot *cb710_pdev_to_slot( + struct platform_device *pdev) +{ + return container_of(pdev, struct cb710_slot, pdev); +} + +static inline struct cb710_chip *cb710_slot_to_chip(struct cb710_slot *slot) +{ + return dev_get_drvdata(slot->pdev.dev.parent); +} + +static inline struct device *cb710_slot_dev(struct cb710_slot *slot) +{ + return &slot->pdev.dev; +} + +static inline struct device *cb710_chip_dev(struct cb710_chip *chip) +{ + return &chip->pdev->dev; +} + +/* debugging aids */ + +#ifdef CONFIG_CB710_DEBUG +void cb710_dump_regs(struct cb710_chip *chip, unsigned dump); +#else +#define cb710_dump_regs(c, d) do {} while (0) +#endif + +#define CB710_DUMP_REGS_MMC 0x0F +#define CB710_DUMP_REGS_MS 0x30 +#define CB710_DUMP_REGS_SM 0xC0 +#define CB710_DUMP_REGS_ALL 0xFF +#define CB710_DUMP_REGS_MASK 0xFF + +#define 
CB710_DUMP_ACCESS_8 0x100
+#define CB710_DUMP_ACCESS_16 0x200
+#define CB710_DUMP_ACCESS_32 0x400
+#define CB710_DUMP_ACCESS_ALL 0x700
+#define CB710_DUMP_ACCESS_MASK 0x700
+
+#endif /* LINUX_CB710_DRIVER_H */
+/*
+ * cb710/sgbuf2.h
+ *
+ * Copyright by Michał Mirosław, 2008-2009
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef LINUX_CB710_SG_H
+#define LINUX_CB710_SG_H
+
+#include
+#include
+
+/**
+ * cb710_sg_miter_stop_writing - stop mapping iteration after writing
+ * @miter: sg mapping iter to be stopped
+ *
+ * Description:
+ *   Stops mapping iterator @miter. @miter should have been started
+ *   using sg_miter_start(). A stopped iteration can be resumed by
+ *   calling sg_miter_next() on it. This is useful when resources (kmap)
+ *   need to be released during iteration.
+ *
+ *   This is a convenience wrapper that will be optimized out for arches
+ *   that don't need flush_kernel_dcache_page().
+ *
+ * Context:
+ *   IRQ disabled if SG_MITER_ATOMIC is set. Don't care otherwise.
+ */
+static inline void cb710_sg_miter_stop_writing(struct sg_mapping_iter *miter)
+{
+	if (miter->page)
+		flush_kernel_dcache_page(miter->page);
+	sg_miter_stop(miter);
+}
+
+/*
+ * 32-bit PIO mapping sg iterator
+ *
+ * Hides scatterlist access issues - fragment boundaries, alignment, page
+ * mapping - for drivers using 32-bit-word-at-a-time PIO (i.e. PCI devices
+ * without DMA support).
+ *
+ * Best-case reading (transfer from device):
+ *   sg_miter_start();
+ *   cb710_sg_dwiter_write_from_io();
+ *   cb710_sg_miter_stop_writing();
+ *
+ * Best-case writing (transfer to device):
+ *   sg_miter_start();
+ *   cb710_sg_dwiter_read_to_io();
+ *   sg_miter_stop();
+ */
+
+uint32_t cb710_sg_dwiter_read_next_block(struct sg_mapping_iter *miter);
+void cb710_sg_dwiter_write_next_block(struct sg_mapping_iter *miter, uint32_t data);
+
+/**
+ * cb710_sg_dwiter_write_from_io - transfer data to mapped buffer from 32-bit IO port
+ * @miter: sg mapping iter
+ * @port: PIO port - IO or MMIO address
+ * @count: number of 32-bit words to transfer
+ *
+ * Description:
+ *   Reads @count 32-bit words from register @port and stores them in
+ *   the buffer iterated by @miter. Data that would overflow the buffer
+ *   is silently ignored. The iterator is advanced by 4*@count bytes
+ *   or to the buffer's end, whichever is closer.
+ *
+ * Context:
+ *   IRQ disabled if SG_MITER_ATOMIC is set. Don't care otherwise.
+ */
+static inline void cb710_sg_dwiter_write_from_io(struct sg_mapping_iter *miter,
+	void __iomem *port, size_t count)
+{
+	while (count-- > 0)
+		cb710_sg_dwiter_write_next_block(miter, ioread32(port));
+}
+
+/**
+ * cb710_sg_dwiter_read_to_io - transfer data to 32-bit IO port from mapped buffer
+ * @miter: sg mapping iter
+ * @port: PIO port - IO or MMIO address
+ * @count: number of 32-bit words to transfer
+ *
+ * Description:
+ *   Writes @count 32-bit words to register @port from the buffer iterated
+ *   through @miter. If the buffer ends before @count words are written,
+ *   the missing data is replaced by zeroes. @miter is advanced by 4*@count
+ *   bytes or to the buffer's end, whichever is closer.
+ *
+ * Context:
+ *   IRQ disabled if SG_MITER_ATOMIC is set. Don't care otherwise.
+ */ +static inline void cb710_sg_dwiter_read_to_io(struct sg_mapping_iter *miter, + void __iomem *port, size_t count) +{ + while (count-- > 0) + iowrite32(cb710_sg_dwiter_read_next_block(miter), port); +} + +#endif /* LINUX_CB710_SG_H */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 19f8e6d1a4d..9f36e1cdbf0 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2127,6 +2127,7 @@ #define PCI_VENDOR_ID_MAINPINE 0x1522 #define PCI_DEVICE_ID_MAINPINE_PBRIDGE 0x0100 #define PCI_VENDOR_ID_ENE 0x1524 +#define PCI_DEVICE_ID_ENE_CB710_FLASH 0x0510 #define PCI_DEVICE_ID_ENE_CB712_SD 0x0550 #define PCI_DEVICE_ID_ENE_CB712_SD_2 0x0551 #define PCI_DEVICE_ID_ENE_CB714_SD 0x0750 -- cgit v1.2.3-70-g09d2 From 9bf69a26ad9ccdc49469402275204271b3336ab6 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Thu, 4 Jun 2009 08:00:40 +0200 Subject: cb710: handle DEBUG define in Makefile Signed-off-by: Pierre Ossman --- drivers/misc/cb710/Makefile | 4 ++++ include/linux/cb710.h | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cb710/Makefile b/drivers/misc/cb710/Makefile index 62d51acfeca..7b80cbf1a60 100644 --- a/drivers/misc/cb710/Makefile +++ b/drivers/misc/cb710/Makefile @@ -1,3 +1,7 @@ +ifeq ($(CONFIG_CB710_DEBUG),y) + EXTRA_CFLAGS += -DDEBUG +endif + obj-$(CONFIG_CB710_CORE) += cb710.o cb710-y := core.o sgbuf2.o diff --git a/include/linux/cb710.h b/include/linux/cb710.h index c3819e1ce1c..a09213b5261 100644 --- a/include/linux/cb710.h +++ b/include/linux/cb710.h @@ -10,10 +10,6 @@ #ifndef LINUX_CB710_DRIVER_H #define LINUX_CB710_DRIVER_H -#ifdef CONFIG_CB710_DEBUG -#define DEBUG -#endif - /* verify assumptions on platform_device framework */ #define CONFIG_CB710_DEBUG_ASSUMPTIONS -- cgit v1.2.3-70-g09d2 From c54f6bc67a4398243682f7438a2129906e127d21 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Sat, 13 Jun 2009 12:37:59 +0200 Subject: cb710: more cleanup for the DEBUG case. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław Signed-off-by: Pierre Ossman --- drivers/misc/cb710/Kconfig | 4 ++++ drivers/mmc/host/Makefile | 3 +++ include/linux/cb710.h | 3 --- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cb710/Kconfig b/drivers/misc/cb710/Kconfig index 0b55079774c..22429b8b106 100644 --- a/drivers/misc/cb710/Kconfig +++ b/drivers/misc/cb710/Kconfig @@ -19,3 +19,7 @@ config CB710_DEBUG This is an option for use by developers; most people should say N here. This adds a lot of debugging output to dmesg. 
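For a sense of how the cb710_sg_dwiter_*() helpers declared in include/linux/cb710.h above are meant to be used, here is a minimal sketch of the documented "best-case reading" sequence. The scatterlist, its entry count, the MMIO port, and the example_pio_read() name are illustrative placeholders supplied by a calling driver; only the three iterator calls are part of the interface.

/* Sketch: PIO-read "count" 32-bit words from "port" into a scatterlist. */
static void example_pio_read(struct scatterlist *sg, unsigned int nents,
			     void __iomem *port, size_t count)
{
	struct sg_mapping_iter miter;

	/* flags == 0: process context, so SG_MITER_ATOMIC is not needed */
	sg_miter_start(&miter, sg, nents, 0);
	cb710_sg_dwiter_write_from_io(&miter, port, count);
	cb710_sg_miter_stop_writing(&miter);	/* flushes dcache where needed */
}

The write direction mirrors this: cb710_sg_dwiter_read_to_io() followed by a plain sg_miter_stop(), since no pages were dirtied.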
+config CB710_DEBUG_ASSUMPTIONS + bool + depends on CB710_CORE != n + default y diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 254e441291d..79da397c5fe 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -32,3 +32,6 @@ obj-$(CONFIG_MMC_SDRICOH_CS) += sdricoh_cs.o obj-$(CONFIG_MMC_TMIO) += tmio_mmc.o obj-$(CONFIG_MMC_CB710) += cb710-mmc.o +ifeq ($(CONFIG_CB710_DEBUG),y) + CFLAGS-cb710-mmc += -DDEBUG +endif diff --git a/include/linux/cb710.h b/include/linux/cb710.h index a09213b5261..63bc9a4d292 100644 --- a/include/linux/cb710.h +++ b/include/linux/cb710.h @@ -10,9 +10,6 @@ #ifndef LINUX_CB710_DRIVER_H #define LINUX_CB710_DRIVER_H -/* verify assumptions on platform_device framework */ -#define CONFIG_CB710_DEBUG_ASSUMPTIONS - #include #include #include -- cgit v1.2.3-70-g09d2 From f0e46cc4971f6be96010d9248e0fc076b229d989 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 4 Jun 2009 20:12:31 +0200 Subject: MFD,mmc: tmio_mmc: make HCLK configurable The Toshiba parts all have a 24 MHz HCLK, but HTC ASIC3 has a 24.576 MHz HCLK and AMD Imageon w228x's HCLK is 80 MHz. With this patch, the MFD driver provides the HCLK frequency to tmio_mmc via mfd_cell->driver_data. Signed-off-by: Philipp Zabel Acked-by: Ian Molton Acked-by: Samuel Ortiz Signed-off-by: Pierre Ossman --- drivers/mfd/t7l66xb.c | 5 +++++ drivers/mfd/tc6387xb.c | 5 +++++ drivers/mfd/tc6393xb.c | 5 +++++ drivers/mmc/host/tmio_mmc.c | 24 +++++++++++------------- include/linux/mfd/tmio.h | 7 +++++++ 5 files changed, 33 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/t7l66xb.c b/drivers/mfd/t7l66xb.c index e9f4323dd2c..875f7a87573 100644 --- a/drivers/mfd/t7l66xb.c +++ b/drivers/mfd/t7l66xb.c @@ -108,6 +108,10 @@ static int t7l66xb_mmc_disable(struct platform_device *mmc) /*--------------------------------------------------------------------------*/ +static const struct tmio_mmc_data t7166xb_mmc_data = { + .hclk = 24000000, +}; + static const struct resource t7l66xb_mmc_resources[] = { { .start = 0x800, @@ -149,6 +153,7 @@ static struct mfd_cell t7l66xb_cells[] = { .name = "tmio-mmc", .enable = t7l66xb_mmc_enable, .disable = t7l66xb_mmc_disable, + .driver_data = &t7166xb_mmc_data, .num_resources = ARRAY_SIZE(t7l66xb_mmc_resources), .resources = t7l66xb_mmc_resources, }, diff --git a/drivers/mfd/tc6387xb.c b/drivers/mfd/tc6387xb.c index 43222c12fec..c3993ac2054 100644 --- a/drivers/mfd/tc6387xb.c +++ b/drivers/mfd/tc6387xb.c @@ -75,6 +75,10 @@ static int tc6387xb_mmc_disable(struct platform_device *mmc) /*--------------------------------------------------------------------------*/ +const static struct tmio_mmc_data tc6387xb_mmc_data = { + .hclk = 24000000, +}; + static struct resource tc6387xb_mmc_resources[] = { { .start = 0x800, @@ -98,6 +102,7 @@ static struct mfd_cell tc6387xb_cells[] = { .name = "tmio-mmc", .enable = tc6387xb_mmc_enable, .disable = tc6387xb_mmc_disable, + .driver_data = &tc6387xb_mmc_data, .num_resources = ARRAY_SIZE(tc6387xb_mmc_resources), .resources = tc6387xb_mmc_resources, }, diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c index 77a12fc8045..9d2abb5d6e2 100644 --- a/drivers/mfd/tc6393xb.c +++ b/drivers/mfd/tc6393xb.c @@ -136,6 +136,10 @@ static int tc6393xb_nand_enable(struct platform_device *nand) return 0; } +const static struct tmio_mmc_data tc6393xb_mmc_data = { + .hclk = 24000000, +}; + static struct resource __devinitdata tc6393xb_nand_resources[] = { { .start = 0x1000, @@ -351,6 +355,7 @@ static struct 
mfd_cell __devinitdata tc6393xb_cells[] = { }, [TC6393XB_CELL_MMC] = { .name = "tmio-mmc", + .driver_data = &tc6393xb_mmc_data, .num_resources = ARRAY_SIZE(tc6393xb_mmc_resources), .resources = tc6393xb_mmc_resources, }, diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index 63fbd5b7d31..49df71e6be1 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -35,23 +35,14 @@ #include "tmio_mmc.h" -/* - * Fixme - documentation conflicts on what the clock values are for the - * various dividers. - * One document I have says that its a divisor of a 24MHz clock, another 33. - * This probably depends on HCLK for a given platform, so we may need to - * require HCLK be passed to us from the MFD core. - * - */ - static void tmio_mmc_set_clock(struct tmio_mmc_host *host, int new_clock) { void __iomem *cnf = host->cnf; void __iomem *ctl = host->ctl; - u32 clk = 0, clock; + u32 clk = 0, clock, f_min = host->mmc->f_min; if (new_clock) { - for (clock = 46875, clk = 0x100; new_clock >= (clock<<1); ) { + for (clock = f_min, clk = 0x100; new_clock >= (clock<<1); ) { clock <<= 1; clk >>= 1; } @@ -545,6 +536,7 @@ out: static int __devinit tmio_mmc_probe(struct platform_device *dev) { struct mfd_cell *cell = (struct mfd_cell *)dev->dev.platform_data; + struct tmio_mmc_data *pdata; struct resource *res_ctl, *res_cnf; struct tmio_mmc_host *host; struct mmc_host *mmc; @@ -560,6 +552,12 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) goto out; } + pdata = cell->driver_data; + if (!pdata || !pdata->hclk) { + ret = -EINVAL; + goto out; + } + mmc = mmc_alloc_host(sizeof(struct tmio_mmc_host), &dev->dev); if (!mmc) goto out; @@ -578,8 +576,8 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) mmc->ops = &tmio_mmc_ops; mmc->caps = MMC_CAP_4_BIT_DATA; - mmc->f_min = 46875; /* 24000000 / 512 */ - mmc->f_max = 24000000; + mmc->f_max = pdata->hclk; + mmc->f_min = mmc->f_max / 512; mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; /* Enable the MMC/SD Control registers */ diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 516d955ab8a..c377118884e 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -18,6 +18,13 @@ writew((val) >> 16, (addr) + 2); \ } while (0) +/* + * data for the MMC controller + */ +struct tmio_mmc_data { + unsigned int hclk; +}; + /* * data for the NAND controller */ -- cgit v1.2.3-70-g09d2 From ab33dcff40d7a9a28587e4425621e4cbc4089e03 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 13 Jun 2009 20:01:00 -0700 Subject: genirq, irq.h: Fix kernel-doc warnings Fix kernel-doc warnings in linux/irq.h: Warning(include/linux/irq.h:201): No description found for parameter 'node' Warning(include/linux/irq.h:201): Excess struct/union/enum/typedef member 'cpu' description in 'irq_desc' Warning(include/linux/irq.h:434): No description found for parameter 'node' Warning(include/linux/irq.h:434): Excess function parameter 'cpu' description in 'alloc_desc_masks' Signed-off-by: Randy Dunlap LKML-Reference: <4A3467EC.50006@oracle.com> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index b7cbeed972e..0c001c15752 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -157,7 +157,7 @@ struct irq_2_iommu; * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP * @affinity: IRQ affinity on SMP - * @cpu: cpu index useful for 
balancing + * @node: node index useful for balancing * @pending_mask: pending rebalanced interrupts * @threads_active: number of irqaction threads currently running * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers @@ -426,7 +426,7 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); /** * init_alloc_desc_masks - allocate cpumasks for irq_desc * @desc: pointer to irq_desc struct - * @cpu: cpu which will be handling the cpumasks + * @node: node which will be handling the cpumasks * @boot: true if need bootmem * * Allocates affinity and pending_mask cpumask if required. -- cgit v1.2.3-70-g09d2 From 837ec787d85fda8d73193a399ebcea0288e4765b Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Tue, 9 Jun 2009 23:56:55 +0200 Subject: firewire: core: don't update Broadcast_Channel if RFC 2734 conditions aren't met This extra check will avoid Broadcast_Channel register related traffic to many IIDC, SBP-2, and AV/C devices which aren't IRMC or have a max_rec < 8 (i.e. support < 512 bytes async payload). This avoids a little bit of traffic after bus reset and is even more careful with devices which don't implement this CSR. The assumption is that no other protocol than IP over 1394 uses the broadcast channel for streams. Signed-off-by: Stefan Richter --- drivers/firewire/core-device.c | 18 +++++++++++++++++- include/linux/firewire.h | 2 ++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index d6e54a5173f..97e656af2d2 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -580,7 +580,9 @@ static int read_bus_info_block(struct fw_device *device, int generation) kfree(old_rom); ret = 0; - device->cmc = rom[2] >> 30 & 1; + device->max_rec = rom[2] >> 12 & 0xf; + device->cmc = rom[2] >> 30 & 1; + device->irmc = rom[2] >> 31 & 1; out: kfree(rom); @@ -841,6 +843,20 @@ static void set_broadcast_channel(struct fw_device *device, int generation) if (!card->broadcast_channel_allocated) return; + /* + * The Broadcast_Channel Valid bit is required by nodes which want to + * transmit on this channel. Such transmissions are practically + * exclusive to IP over 1394 (RFC 2734). IP capable nodes are required + * to be IRM capable and have a max_rec of 8 or more. We use this fact + * to narrow down to which nodes we send Broadcast_Channel updates. + */ + if (!device->irmc || device->max_rec < 8) + return; + + /* + * Some 1394-1995 nodes crash if this 1394a-2000 register is written. + * Perform a read test first. + */ if (device->bc_implemented == BC_UNKNOWN) { rcode = fw_run_transaction(card, TCODE_READ_QUADLET_REQUEST, device->node_id, generation, device->max_speed, diff --git a/include/linux/firewire.h b/include/linux/firewire.h index a69aea0394e..610eade8abb 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -191,7 +191,9 @@ struct fw_device { size_t config_rom_length; int config_rom_retries; unsigned is_local:1; + unsigned max_rec:4; unsigned cmc:1; + unsigned irmc:1; unsigned bc_implemented:2; struct delayed_work work; -- cgit v1.2.3-70-g09d2 From 1e626fdcef61460dc75fe7377f38bb019722b848 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 14 Jun 2009 13:23:58 +0200 Subject: firewire: core: use more outbound tlabels Tlabel is a 6 bits wide datum. Wrap it after 63 rather than 31 for more safety against transaction label exhaustion and potential responders' transaction layer bugs. 
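In essence the allocation in fw_send_request() becomes the following (a condensed sketch of the hunk below; alloc_tlabel() and the -EBUSY return are invented for illustration, while current_tlabel and tlabel_mask are the real fw_card fields - in the actual code a failed allocation invokes the completion callback with RCODE_SEND_ERROR instead of returning an error):

/* Sketch: allocate one of 64 outbound transaction labels. */
static int alloc_tlabel(struct fw_card *card)
{
	int tlabel = card->current_tlabel;

	if (card->tlabel_mask & (1ULL << tlabel))
		return -EBUSY;				/* label still in flight */
	card->current_tlabel = (tlabel + 1) & 0x3f;	/* wrap after 63, not 31 */
	card->tlabel_mask |= 1ULL << tlabel;		/* hence the 64-bit mask */
	return tlabel;
}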
(As noted by Guus Sliepen, this change requires an expansion of tlabel_mask to 64 bits.) Signed-off-by: Stefan Richter --- drivers/firewire/core-transaction.c | 8 ++++---- include/linux/firewire.h | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 9a6ce9ab2a6..479b22f5a1e 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -82,7 +82,7 @@ static int close_transaction(struct fw_transaction *transaction, list_for_each_entry(t, &card->transaction_list, link) { if (t == transaction) { list_del(&t->link); - card->tlabel_mask &= ~(1 << t->tlabel); + card->tlabel_mask &= ~(1ULL << t->tlabel); break; } } @@ -288,14 +288,14 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, spin_lock_irqsave(&card->lock, flags); tlabel = card->current_tlabel; - if (card->tlabel_mask & (1 << tlabel)) { + if (card->tlabel_mask & (1ULL << tlabel)) { spin_unlock_irqrestore(&card->lock, flags); callback(card, RCODE_SEND_ERROR, NULL, 0, callback_data); return; } - card->current_tlabel = (card->current_tlabel + 1) & 0x1f; - card->tlabel_mask |= (1 << tlabel); + card->current_tlabel = (card->current_tlabel + 1) & 0x3f; + card->tlabel_mask |= (1ULL << tlabel); t->node_id = destination_id; t->tlabel = tlabel; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 610eade8abb..e584b7215e8 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -98,7 +98,8 @@ struct fw_card { int node_id; int generation; - int current_tlabel, tlabel_mask; + int current_tlabel; + u64 tlabel_mask; struct list_head transaction_list; struct timer_list flush_timer; unsigned long reset_jiffies; -- cgit v1.2.3-70-g09d2 From c76acec6d55107b652a37c90b36c00bc8b04dabb Mon Sep 17 00:00:00 2001 From: Jay Fenlason Date: Mon, 18 May 2009 13:08:06 -0400 Subject: firewire: add IPv4 support Implement IPv4 over IEEE 1394 as per RFC 2734 for the newer firewire stack. This feature has only been present in the older ieee1394 stack via the eth1394 driver. 
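The wire format follows RFC 2734: unicast datagrams are sent as block write requests to a per-node pseudo FIFO address, while broadcast datagrams travel as GASP stream packets on the broadcast channel (31). As a rough sketch, the two-quadlet GASP header preceding each broadcast datagram can be built as below; put_gasp_header() and the two-element buffer are illustrative, the constants are the IPV4_GASP_* values defined in fw-ipv4.c further down, and the quadlet layout assumed here is the standard GASP one (source node ID plus the high 16 bits of the IANA OUI, then the low 8 bits of the OUI plus the version):

/* Sketch: build the RFC 2734 GASP header for a broadcast datagram. */
static void put_gasp_header(__be32 buf[2], u16 source_node_id)
{
	buf[0] = cpu_to_be32(source_node_id << 16
			     | IPV4_GASP_SPECIFIER_ID >> 8);
	buf[1] = cpu_to_be32((IPV4_GASP_SPECIFIER_ID & 0xff) << 24
			     | IPV4_GASP_VERSION);
}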
Still to do: - fix ipv4_priv and ipv4_node lifetime logic - fix determination of speeds and max payloads - fix bus reset handling - fix unaligned memory accesses - fix coding style - further testing/ improvement of fragment reassembly - perhaps multicast support Signed-off-by: Jay Fenlason Signed-off-by: Stefan Richter (rebased, copyright note, changelog) --- drivers/firewire/Makefile | 2 + drivers/firewire/core-card.c | 4 + drivers/firewire/core-iso.c | 7 + drivers/firewire/core.h | 87 -- drivers/firewire/fw-ipv4.c | 1819 ++++++++++++++++++++++++++++++++++++++++++ include/linux/firewire.h | 94 +++ 6 files changed, 1926 insertions(+), 87 deletions(-) create mode 100644 drivers/firewire/fw-ipv4.c (limited to 'include/linux') diff --git a/drivers/firewire/Makefile b/drivers/firewire/Makefile index bc3b9bf822b..31edf30c558 100644 --- a/drivers/firewire/Makefile +++ b/drivers/firewire/Makefile @@ -6,7 +6,9 @@ firewire-core-y += core-card.o core-cdev.o core-device.o \ core-iso.o core-topology.o core-transaction.o firewire-ohci-y += ohci.o firewire-sbp2-y += sbp2.o +firewire-ipv4-y += fw-ipv4.o obj-$(CONFIG_FIREWIRE) += firewire-core.o obj-$(CONFIG_FIREWIRE_OHCI) += firewire-ohci.o obj-$(CONFIG_FIREWIRE_SBP2) += firewire-sbp2.o +obj-$(CONFIG_FIREWIRE_IPV4) += firewire-ipv4.o diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 4c1be64fddd..cdab32b2067 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -176,6 +176,7 @@ int fw_core_add_descriptor(struct fw_descriptor *desc) return 0; } +EXPORT_SYMBOL(fw_core_add_descriptor); void fw_core_remove_descriptor(struct fw_descriptor *desc) { @@ -189,6 +190,7 @@ void fw_core_remove_descriptor(struct fw_descriptor *desc) mutex_unlock(&card_mutex); } +EXPORT_SYMBOL(fw_core_remove_descriptor); static void allocate_broadcast_channel(struct fw_card *card, int generation) { @@ -427,6 +429,8 @@ void fw_card_initialize(struct fw_card *card, card->local_node = NULL; INIT_DELAYED_WORK(&card->work, fw_card_bm_work); + card->netdev = NULL; + INIT_LIST_HEAD(&card->ipv4_nodes); } EXPORT_SYMBOL(fw_card_initialize); diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c index 28076c892d7..448ddd7d887 100644 --- a/drivers/firewire/core-iso.c +++ b/drivers/firewire/core-iso.c @@ -80,6 +80,7 @@ int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card, return -ENOMEM; } +EXPORT_SYMBOL(fw_iso_buffer_init); int fw_iso_buffer_map(struct fw_iso_buffer *buffer, struct vm_area_struct *vma) { @@ -114,6 +115,7 @@ void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, kfree(buffer->pages); buffer->pages = NULL; } +EXPORT_SYMBOL(fw_iso_buffer_destroy); struct fw_iso_context *fw_iso_context_create(struct fw_card *card, int type, int channel, int speed, size_t header_size, @@ -136,6 +138,7 @@ struct fw_iso_context *fw_iso_context_create(struct fw_card *card, return ctx; } +EXPORT_SYMBOL(fw_iso_context_create); void fw_iso_context_destroy(struct fw_iso_context *ctx) { @@ -143,12 +146,14 @@ void fw_iso_context_destroy(struct fw_iso_context *ctx) card->driver->free_iso_context(ctx); } +EXPORT_SYMBOL(fw_iso_context_destroy); int fw_iso_context_start(struct fw_iso_context *ctx, int cycle, int sync, int tags) { return ctx->card->driver->start_iso(ctx, cycle, sync, tags); } +EXPORT_SYMBOL(fw_iso_context_start); int fw_iso_context_queue(struct fw_iso_context *ctx, struct fw_iso_packet *packet, @@ -159,11 +164,13 @@ int fw_iso_context_queue(struct fw_iso_context *ctx, return card->driver->queue_iso(ctx, 
packet, buffer, payload); } +EXPORT_SYMBOL(fw_iso_context_queue); int fw_iso_context_stop(struct fw_iso_context *ctx) { return ctx->card->driver->stop_iso(ctx); } +EXPORT_SYMBOL(fw_iso_context_stop); /* * Isochronous bus resource management (channels, bandwidth), client side diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 0a25a7b38a8..c3cfc647e5e 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -1,7 +1,6 @@ #ifndef _FIREWIRE_CORE_H #define _FIREWIRE_CORE_H -#include #include #include #include @@ -97,17 +96,6 @@ int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset); int fw_compute_block_crc(u32 *block); void fw_schedule_bm_work(struct fw_card *card, unsigned long delay); -struct fw_descriptor { - struct list_head link; - size_t length; - u32 immediate; - u32 key; - const u32 *data; -}; - -int fw_core_add_descriptor(struct fw_descriptor *desc); -void fw_core_remove_descriptor(struct fw_descriptor *desc); - /* -cdev */ @@ -130,77 +118,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event); /* -iso */ -/* - * The iso packet format allows for an immediate header/payload part - * stored in 'header' immediately after the packet info plus an - * indirect payload part that is pointer to by the 'payload' field. - * Applications can use one or the other or both to implement simple - * low-bandwidth streaming (e.g. audio) or more advanced - * scatter-gather streaming (e.g. assembling video frame automatically). - */ -struct fw_iso_packet { - u16 payload_length; /* Length of indirect payload. */ - u32 interrupt:1; /* Generate interrupt on this packet */ - u32 skip:1; /* Set to not send packet at all. */ - u32 tag:2; - u32 sy:4; - u32 header_length:8; /* Length of immediate header. */ - u32 header[0]; -}; - -#define FW_ISO_CONTEXT_TRANSMIT 0 -#define FW_ISO_CONTEXT_RECEIVE 1 - -#define FW_ISO_CONTEXT_MATCH_TAG0 1 -#define FW_ISO_CONTEXT_MATCH_TAG1 2 -#define FW_ISO_CONTEXT_MATCH_TAG2 4 -#define FW_ISO_CONTEXT_MATCH_TAG3 8 -#define FW_ISO_CONTEXT_MATCH_ALL_TAGS 15 - -/* - * An iso buffer is just a set of pages mapped for DMA in the - * specified direction. Since the pages are to be used for DMA, they - * are not mapped into the kernel virtual address space. We store the - * DMA address in the page private. The helper function - * fw_iso_buffer_map() will map the pages into a given vma. 
- */
-struct fw_iso_buffer {
-	enum dma_data_direction direction;
-	struct page **pages;
-	int page_count;
-};
-
-typedef void (*fw_iso_callback_t)(struct fw_iso_context *context,
-	u32 cycle, size_t header_length,
-	void *header, void *data);
-
-struct fw_iso_context {
-	struct fw_card *card;
-	int type;
-	int channel;
-	int speed;
-	size_t header_size;
-	fw_iso_callback_t callback;
-	void *callback_data;
-};
-
-int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card,
-	int page_count, enum dma_data_direction direction);
 int fw_iso_buffer_map(struct fw_iso_buffer *buffer, struct vm_area_struct *vma);
-void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, struct fw_card *card);
-
-struct fw_iso_context *fw_iso_context_create(struct fw_card *card,
-	int type, int channel, int speed, size_t header_size,
-	fw_iso_callback_t callback, void *callback_data);
-int fw_iso_context_queue(struct fw_iso_context *ctx,
-	struct fw_iso_packet *packet,
-	struct fw_iso_buffer *buffer,
-	unsigned long payload);
-int fw_iso_context_start(struct fw_iso_context *ctx,
-	int cycle, int sync, int tags);
-int fw_iso_context_stop(struct fw_iso_context *ctx);
-void fw_iso_context_destroy(struct fw_iso_context *ctx);
-
 void fw_iso_resource_manage(struct fw_card *card, int generation,
 	u64 channels_mask, int *channel, int *bandwidth,
 	bool allocate);
@@ -285,9 +203,4 @@
 void fw_flush_transactions(struct fw_card *card);
 void fw_send_phy_config(struct fw_card *card,
 	int node_id, int generation, int gap_count);
-static inline int fw_stream_packet_destination_id(int tag, int channel, int sy)
-{
-	return tag << 14 | channel << 8 | sy;
-}
-
 #endif /* _FIREWIRE_CORE_H */
diff --git a/drivers/firewire/fw-ipv4.c b/drivers/firewire/fw-ipv4.c
new file mode 100644
index 00000000000..4de6dbb95f0
--- /dev/null
+++ b/drivers/firewire/fw-ipv4.c
@@ -0,0 +1,1819 @@
+/*
+ * IPv4 over IEEE 1394, per RFC 2734
+ *
+ * Copyright (C) 2009 Jay Fenlason
+ *
+ * based on eth1394 by Ben Collins et al
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+/* Things to potentially make runtime configurable */
+/* must be at least as large as our maximum receive size */
+#define FIFO_SIZE 4096
+/* Network timeout in glibbles */
+#define IPV4_TIMEOUT 100000
+
+/* Runtime configurable parameters */
+static int ipv4_mpd = 25;
+static int ipv4_max_xmt = 0;
+/* 16k for receiving arp and broadcast packets. Enough? */
+static int ipv4_iso_page_count = 4;
+
+MODULE_AUTHOR("Jay Fenlason (fenlason@redhat.com)");
+MODULE_DESCRIPTION("Firewire IPv4 Driver (IPv4-over-IEEE1394 as per RFC 2734)");
+MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(ieee1394, ipv4_id_table);
+module_param_named(max_partial_datagrams, ipv4_mpd, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_partial_datagrams, "Maximum number of received"
+	" incomplete fragmented datagrams (default = 25).");
+
+/* Max xmt is useful for forcing fragmentation, which makes testing easier. */
+module_param_named(max_transmit, ipv4_max_xmt, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_transmit, "Maximum datagram size to transmit"
+	" (larger datagrams will be fragmented; default = 0: use hardware defaults).");
+
+/* iso page count controls how many pages will be used for receiving broadcast packets.
*/
+module_param_named(iso_pages, ipv4_iso_page_count, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(iso_pages, "Number of pages to use for receiving broadcast packets"
+	" (default = 4).");
+
+/* Debug output is compiled in by default; to silence it, comment out this define... */
+#define fw_debug(s, args...) printk(KERN_DEBUG KBUILD_MODNAME ": " s, ## args)
+
+/* ...and uncomment these stubs instead. */
+/* #undef fw_debug */
+/* #define fw_debug(s...) */
+/* #define print_hex_dump(l...) */
+
+/* Define a fake hardware header format for the networking core. Note that
+ * header size cannot exceed 16 bytes as that is the size of the header cache.
+ * Also, we do not need the source address in the header so we omit it and
+ * keep the header to under 16 bytes */
+#define IPV4_ALEN (8)
+/* This must equal sizeof(struct ipv4_ether_hdr) */
+#define IPV4_HLEN (10)
+
+/* FIXME: what's a good size for this? */
+#define INVALID_FIFO_ADDR (u64)~0ULL
+
+/* Things specified by standards */
+#define BROADCAST_CHANNEL 31
+
+#define S100_BUFFER_SIZE 512
+#define MAX_BUFFER_SIZE 4096
+
+#define IPV4_GASP_SPECIFIER_ID 0x00005EU
+#define IPV4_GASP_VERSION 0x00000001U
+
+#define IPV4_GASP_OVERHEAD (2 * sizeof(u32)) /* for GASP header */
+
+#define IPV4_UNFRAG_HDR_SIZE sizeof(u32)
+#define IPV4_FRAG_HDR_SIZE (2 * sizeof(u32))
+#define IPV4_FRAG_OVERHEAD sizeof(u32)
+
+#define ALL_NODES (0xffc0 | 0x003f)
+
+#define IPV4_HDR_UNFRAG 0	/* unfragmented */
+#define IPV4_HDR_FIRSTFRAG 1	/* first fragment */
+#define IPV4_HDR_LASTFRAG 2	/* last fragment */
+#define IPV4_HDR_INTFRAG 3	/* interior fragment */
+
+/* Our arp packet (ARPHRD_IEEE1394) */
+/* FIXME: note that this is probably bogus on weird-endian machines */
+struct ipv4_arp {
+	u16 hw_type;		/* 0x0018 */
+	u16 proto_type;		/* 0x0806 */
+	u8 hw_addr_len;		/* 16 */
+	u8 ip_addr_len;		/* 4 */
+	u16 opcode;		/* ARP Opcode */
+	/* Above is exactly the same format as struct arphdr */
+
+	u64 s_uniq_id;		/* Sender's 64bit EUI */
+	u8 max_rec;		/* Sender's max packet size */
+	u8 sspd;		/* Sender's max speed */
+	u16 fifo_hi;		/* hi 16bits of sender's FIFO addr */
+	u32 fifo_lo;		/* lo 32bits of sender's FIFO addr */
+	u32 sip;		/* Sender's IP Address */
+	u32 tip;		/* IP Address of requested hw addr */
+} __attribute__((packed));
+
+struct ipv4_ether_hdr {
+	unsigned char h_dest[IPV4_ALEN];	/* destination address */
+	unsigned short h_proto;			/* packet type ID field */
+} __attribute__((packed));
+
+static inline struct ipv4_ether_hdr *ipv4_ether_hdr(const struct sk_buff *skb)
+{
+	return (struct ipv4_ether_hdr *)skb_mac_header(skb);
+}
+
+enum ipv4_tx_type {
+	IPV4_UNKNOWN = 0,
+	IPV4_GASP = 1,
+	IPV4_WRREQ = 2,
+};
+
+enum ipv4_broadcast_state {
+	IPV4_BROADCAST_ERROR,
+	IPV4_BROADCAST_RUNNING,
+	IPV4_BROADCAST_STOPPED,
+};
+
+#define ipv4_get_hdr_lf(h)		(((h)->w0&0xC0000000)>>30)
+#define ipv4_get_hdr_ether_type(h)	(((h)->w0&0x0000FFFF)    )
+#define ipv4_get_hdr_dg_size(h)		(((h)->w0&0x0FFF0000)>>16)
+#define ipv4_get_hdr_fg_off(h)		(((h)->w0&0x00000FFF)    )
+#define ipv4_get_hdr_dgl(h)		(((h)->w1&0xFFFF0000)>>16)
+
+#define ipv4_set_hdr_lf(lf)		(( lf)<<30)
+#define ipv4_set_hdr_ether_type(et)	(( et)    )
+#define ipv4_set_hdr_dg_size(dgs)	((dgs)<<16)
+#define ipv4_set_hdr_fg_off(fgo)	((fgo)    )
+
+#define ipv4_set_hdr_dgl(dgl)		((dgl)<<16)
+
+struct ipv4_hdr {
+	u32 w0;
+	u32 w1;
+};
+
+static inline void ipv4_make_uf_hdr( struct ipv4_hdr *hdr, unsigned ether_type) {
+	hdr->w0 = ipv4_set_hdr_lf(IPV4_HDR_UNFRAG)
+		|ipv4_set_hdr_ether_type(ether_type);
+	fw_debug ( "Setting unfragmented header %p to %x\n", hdr, hdr->w0 );
+}
+
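+/*
+ * Worked example of the quadlets built by the ipv4_make_*_hdr() helpers
+ * above and below (values chosen for illustration, not taken from a trace):
+ *
+ * - unfragmented datagram, ether_type 0x0800 (IPv4), single quadlet:
+ *     w0 = (IPV4_HDR_UNFRAG << 30) | 0x0800 = 0x00000800
+ *
+ * - first fragment of a 2048-byte datagram with datagram label 5:
+ *     w0 = (IPV4_HDR_FIRSTFRAG << 30) | (2048 << 16) | 0x0800 = 0x48000800
+ *     w1 = 5 << 16 = 0x00050000
+ *
+ * - interior fragment of the same datagram at fragment offset 1024:
+ *     w0 = (IPV4_HDR_INTFRAG << 30) | (2048 << 16) | 1024 = 0xC8000400
+ *     w1 = 5 << 16 = 0x00050000
+ */
+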
+static inline void ipv4_make_ff_hdr ( struct ipv4_hdr *hdr, unsigned ether_type, unsigned dg_size, unsigned dgl ) { + hdr->w0 = ipv4_set_hdr_lf(IPV4_HDR_FIRSTFRAG) + |ipv4_set_hdr_dg_size(dg_size) + |ipv4_set_hdr_ether_type(ether_type); + hdr->w1 = ipv4_set_hdr_dgl(dgl); + fw_debug ( "Setting fragmented header %p to first_frag %x,%x (et %x, dgs %x, dgl %x)\n", hdr, hdr->w0, hdr->w1, + ether_type, dg_size, dgl ); +} + +static inline void ipv4_make_sf_hdr ( struct ipv4_hdr *hdr, unsigned lf, unsigned dg_size, unsigned fg_off, unsigned dgl) { + hdr->w0 = ipv4_set_hdr_lf(lf) + |ipv4_set_hdr_dg_size(dg_size) + |ipv4_set_hdr_fg_off(fg_off); + hdr->w1 = ipv4_set_hdr_dgl(dgl); + fw_debug ( "Setting fragmented header %p to %x,%x (lf %x, dgs %x, fo %x dgl %x)\n", + hdr, hdr->w0, hdr->w1, + lf, dg_size, fg_off, dgl ); +} + +/* End of IP1394 headers */ + +/* Fragment types */ +#define ETH1394_HDR_LF_UF 0 /* unfragmented */ +#define ETH1394_HDR_LF_FF 1 /* first fragment */ +#define ETH1394_HDR_LF_LF 2 /* last fragment */ +#define ETH1394_HDR_LF_IF 3 /* interior fragment */ + +#define IP1394_HW_ADDR_LEN 16 /* As per RFC */ + +/* This list keeps track of what parts of the datagram have been filled in */ +struct ipv4_fragment_info { + struct list_head fragment_info; + u16 offset; + u16 len; +}; + +struct ipv4_partial_datagram { + struct list_head pdg_list; + struct list_head fragment_info; + struct sk_buff *skb; + /* FIXME Why not use skb->data? */ + char *pbuf; + u16 datagram_label; + u16 ether_type; + u16 datagram_size; +}; + +/* + * We keep one of these for each IPv4 capable device attached to a fw_card. + * The list of them is stored in the fw_card structure rather than in the + * ipv4_priv because the remote IPv4 nodes may be probed before the card is, + * so we need a place to store them before the ipv4_priv structure is + * allocated. + */ +struct ipv4_node { + struct list_head ipv4_nodes; + /* guid of the remote node */ + u64 guid; + /* FIFO address to transmit datagrams to, or INVALID_FIFO_ADDR */ + u64 fifo; + + spinlock_t pdg_lock; /* partial datagram lock */ + /* List of partial datagrams received from this node */ + struct list_head pdg_list; + /* Number of entries in pdg_list at the moment */ + unsigned pdg_size; + + /* max payload to transmit to this remote node */ + /* This already includes the IPV4_FRAG_HDR_SIZE overhead */ + u16 max_payload; + /* outgoing datagram label */ + u16 datagram_label; + /* Current node_id of the remote node */ + u16 nodeid; + /* current generation of the remote node */ + u8 generation; + /* max speed that this node can receive at */ + u8 xmt_speed; +}; + +struct ipv4_priv { + spinlock_t lock; + + enum ipv4_broadcast_state broadcast_state; + struct fw_iso_context *broadcast_rcv_context; + struct fw_iso_buffer broadcast_rcv_buffer; + void **broadcast_rcv_buffer_ptrs; + unsigned broadcast_rcv_next_ptr; + unsigned num_broadcast_rcv_ptrs; + unsigned rcv_buffer_size; + /* + * This value is the maximum unfragmented datagram size that can be + * sent by the hardware. It already has the GASP overhead and the + * unfragmented datagram header overhead calculated into it. + */ + unsigned broadcast_xmt_max_payload; + u16 broadcast_xmt_datagramlabel; + + /* + * The csr address that remote nodes must send datagrams to for us to + * receive them. + */ + struct fw_address_handler handler; + u64 local_fifo; + + /* Wake up to xmt */ + /* struct work_struct wake;*/ + /* List of packets to be sent */ + struct list_head packet_list; + /* + * List of packets that were broadcasted. 
When we get an ISO interrupt + * one of them has been sent + */ + struct list_head broadcasted_list; + /* List of packets that have been sent but not yet acked */ + struct list_head sent_list; + + struct fw_card *card; +}; + +/* This is our task struct. It's used for the packet complete callback. */ +struct ipv4_packet_task { + /* + * ptask can actually be on priv->packet_list, priv->broadcasted_list, + * or priv->sent_list depending on its current state. + */ + struct list_head packet_list; + struct fw_transaction transaction; + struct ipv4_hdr hdr; + struct sk_buff *skb; + struct ipv4_priv *priv; + enum ipv4_tx_type tx_type; + int outstanding_pkts; + unsigned max_payload; + u64 fifo_addr; + u16 dest_node; + u8 generation; + u8 speed; +}; + +static struct kmem_cache *ipv4_packet_task_cache; + +static const char ipv4_driver_name[] = "firewire-ipv4"; + +static const struct ieee1394_device_id ipv4_id_table[] = { + { + .match_flags = IEEE1394_MATCH_SPECIFIER_ID | + IEEE1394_MATCH_VERSION, + .specifier_id = IPV4_GASP_SPECIFIER_ID, + .version = IPV4_GASP_VERSION, + }, + { } +}; + +static u32 ipv4_unit_directory_data[] = { + 0x00040000, /* unit directory */ + 0x12000000 | IPV4_GASP_SPECIFIER_ID, /* specifier ID */ + 0x81000003, /* text descriptor */ + 0x13000000 | IPV4_GASP_VERSION, /* version */ + 0x81000005, /* text descriptor */ + + 0x00030000, /* Three quadlets */ + 0x00000000, /* Text */ + 0x00000000, /* Language 0 */ + 0x49414e41, /* I A N A */ + 0x00030000, /* Three quadlets */ + 0x00000000, /* Text */ + 0x00000000, /* Language 0 */ + 0x49507634, /* I P v 4 */ +}; + +static struct fw_descriptor ipv4_unit_directory = { + .length = ARRAY_SIZE(ipv4_unit_directory_data), + .key = 0xd1000000, + .data = ipv4_unit_directory_data +}; + +static int ipv4_send_packet(struct ipv4_packet_task *ptask ); + +/* ------------------------------------------------------------------ */ +/****************************************** + * HW Header net device functions + ******************************************/ + /* These functions have been adapted from net/ethernet/eth.c */ + +/* Create a fake MAC header for an arbitrary protocol layer. + * saddr=NULL means use device source address + * daddr=NULL means leave destination address (eg unresolved arp). */ + +static int ipv4_header ( struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, unsigned len) { + struct ipv4_ether_hdr *eth; + + eth = (struct ipv4_ether_hdr *)skb_push(skb, sizeof(*eth)); + eth->h_proto = htons(type); + + if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) { + memset(eth->h_dest, 0, dev->addr_len); + return dev->hard_header_len; + } + + if (daddr) { + memcpy(eth->h_dest, daddr, dev->addr_len); + return dev->hard_header_len; + } + + return -dev->hard_header_len; +} + +/* Rebuild the faked MAC header. This is called after an ARP + * (or in future other address resolution) has completed on this + * sk_buff. We now let ARP fill in the other fields. + * + * This routine CANNOT use cached dst->neigh! + * Really, it is used only when dst->neigh is wrong. 
+ */ + +static int ipv4_rebuild_header(struct sk_buff *skb) +{ + struct ipv4_ether_hdr *eth; + + eth = (struct ipv4_ether_hdr *)skb->data; + if (eth->h_proto == htons(ETH_P_IP)) + return arp_find((unsigned char *)ð->h_dest, skb); + + fw_notify ( "%s: unable to resolve type %04x addresses\n", + skb->dev->name,ntohs(eth->h_proto) ); + return 0; +} + +static int ipv4_header_cache(const struct neighbour *neigh, struct hh_cache *hh) { + unsigned short type = hh->hh_type; + struct net_device *dev; + struct ipv4_ether_hdr *eth; + + if (type == htons(ETH_P_802_3)) + return -1; + dev = neigh->dev; + eth = (struct ipv4_ether_hdr *)((u8 *)hh->hh_data + 16 - sizeof(*eth)); + eth->h_proto = type; + memcpy(eth->h_dest, neigh->ha, dev->addr_len); + + hh->hh_len = IPV4_HLEN; + return 0; +} + +/* Called by Address Resolution module to notify changes in address. */ +static void ipv4_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char * haddr ) { + memcpy((u8 *)hh->hh_data + 16 - IPV4_HLEN, haddr, dev->addr_len); +} + +static int ipv4_header_parse(const struct sk_buff *skb, unsigned char *haddr) { + memcpy(haddr, skb->dev->dev_addr, IPV4_ALEN); + return IPV4_ALEN; +} + +static const struct header_ops ipv4_header_ops = { + .create = ipv4_header, + .rebuild = ipv4_rebuild_header, + .cache = ipv4_header_cache, + .cache_update = ipv4_header_cache_update, + .parse = ipv4_header_parse, +}; + +/* ------------------------------------------------------------------ */ + +/* FIXME: is this correct for all cases? */ +static bool ipv4_frag_overlap(struct ipv4_partial_datagram *pd, unsigned offset, unsigned len) +{ + struct ipv4_fragment_info *fi; + unsigned end = offset + len; + + list_for_each_entry(fi, &pd->fragment_info, fragment_info) { + if (offset < fi->offset + fi->len && end > fi->offset) { + fw_debug ( "frag_overlap pd %p fi %p (%x@%x) with %x@%x\n", pd, fi, fi->len, fi->offset, len, offset ); + return true; + } + } + fw_debug ( "frag_overlap %p does not overlap with %x@%x\n", pd, len, offset ); + return false; +} + +/* Assumes that new fragment does not overlap any existing fragments */ +static struct ipv4_fragment_info *ipv4_frag_new ( struct ipv4_partial_datagram *pd, unsigned offset, unsigned len ) { + struct ipv4_fragment_info *fi, *fi2, *new; + struct list_head *list; + + fw_debug ( "frag_new pd %p %x@%x\n", pd, len, offset ); + list = &pd->fragment_info; + list_for_each_entry(fi, &pd->fragment_info, fragment_info) { + if (fi->offset + fi->len == offset) { + /* The new fragment can be tacked on to the end */ + /* Did the new fragment plug a hole? */ + fi2 = list_entry(fi->fragment_info.next, struct ipv4_fragment_info, fragment_info); + if (fi->offset + fi->len == fi2->offset) { + fw_debug ( "pd %p: hole filling %p (%x@%x) and %p(%x@%x): now %x@%x\n", pd, fi, fi->len, fi->offset, + fi2, fi2->len, fi2->offset, fi->len + len + fi2->len, fi->offset ); + /* glue fragments together */ + fi->len += len + fi2->len; + list_del(&fi2->fragment_info); + kfree(fi2); + } else { + fw_debug ( "pd %p: extending %p from %x@%x to %x@%x\n", pd, fi, fi->len, fi->offset, fi->len+len, fi->offset ); + fi->len += len; + } + return fi; + } + if (offset + len == fi->offset) { + /* The new fragment can be tacked on to the beginning */ + /* Did the new fragment plug a hole? 
*/ + fi2 = list_entry(fi->fragment_info.prev, struct ipv4_fragment_info, fragment_info); + if (fi2->offset + fi2->len == fi->offset) { + /* glue fragments together */ + fw_debug ( "pd %p: extending %p and merging with %p from %x@%x to %x@%x\n", + pd, fi2, fi, fi2->len, fi2->offset, fi2->len + fi->len + len, fi2->offset ); + fi2->len += fi->len + len; + list_del(&fi->fragment_info); + kfree(fi); + return fi2; + } + fw_debug ( "pd %p: extending %p from %x@%x to %x@%x\n", pd, fi, fi->len, fi->offset, offset, fi->len + len ); + fi->offset = offset; + fi->len += len; + return fi; + } + if (offset > fi->offset + fi->len) { + list = &fi->fragment_info; + break; + } + if (offset + len < fi->offset) { + list = fi->fragment_info.prev; + break; + } + } + + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (!new) { + fw_error ( "out of memory in fragment handling!\n" ); + return NULL; + } + + new->offset = offset; + new->len = len; + list_add(&new->fragment_info, list); + fw_debug ( "pd %p: new frag %p %x@%x\n", pd, new, new->len, new->offset ); + list_for_each_entry( fi, &pd->fragment_info, fragment_info ) + fw_debug ( "fi %p %x@%x\n", fi, fi->len, fi->offset ); + return new; +} + +/* ------------------------------------------------------------------ */ + +static struct ipv4_partial_datagram *ipv4_pd_new(struct net_device *netdev, + struct ipv4_node *node, u16 datagram_label, unsigned dg_size, u32 *frag_buf, + unsigned frag_off, unsigned frag_len) { + struct ipv4_partial_datagram *new; + struct ipv4_fragment_info *fi; + + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + goto fail; + INIT_LIST_HEAD(&new->fragment_info); + fi = ipv4_frag_new ( new, frag_off, frag_len); + if ( fi == NULL ) + goto fail_w_new; + new->datagram_label = datagram_label; + new->datagram_size = dg_size; + new->skb = dev_alloc_skb(dg_size + netdev->hard_header_len + 15); + if ( new->skb == NULL ) + goto fail_w_fi; + skb_reserve(new->skb, (netdev->hard_header_len + 15) & ~15); + new->pbuf = skb_put(new->skb, dg_size); + memcpy(new->pbuf + frag_off, frag_buf, frag_len); + list_add_tail(&new->pdg_list, &node->pdg_list); + fw_debug ( "pd_new: new pd %p { dgl %u, dg_size %u, skb %p, pbuf %p } on node %p\n", + new, new->datagram_label, new->datagram_size, new->skb, new->pbuf, node ); + return new; + +fail_w_fi: + kfree(fi); +fail_w_new: + kfree(new); +fail: + fw_error("ipv4_pd_new: no memory\n"); + return NULL; +} + +static struct ipv4_partial_datagram *ipv4_pd_find(struct ipv4_node *node, u16 datagram_label) { + struct ipv4_partial_datagram *pd; + + list_for_each_entry(pd, &node->pdg_list, pdg_list) { + if ( pd->datagram_label == datagram_label ) { + fw_debug ( "pd_find(node %p, label %u): pd %p\n", node, datagram_label, pd ); + return pd; + } + } + fw_debug ( "pd_find(node %p, label %u) no entry\n", node, datagram_label ); + return NULL; +} + + +static void ipv4_pd_delete ( struct ipv4_partial_datagram *old ) { + struct ipv4_fragment_info *fi, *n; + + fw_debug ( "pd_delete %p\n", old ); + list_for_each_entry_safe(fi, n, &old->fragment_info, fragment_info) { + fw_debug ( "Freeing fi %p\n", fi ); + kfree(fi); + } + list_del(&old->pdg_list); + dev_kfree_skb_any(old->skb); + kfree(old); +} + +static bool ipv4_pd_update ( struct ipv4_node *node, struct ipv4_partial_datagram *pd, + u32 *frag_buf, unsigned frag_off, unsigned frag_len) { + fw_debug ( "pd_update node %p, pd %p, frag_buf %p, %x@%x\n", node, pd, frag_buf, frag_len, frag_off ); + if ( ipv4_frag_new ( pd, frag_off, frag_len ) == NULL) + return false; + memcpy(pd->pbuf + 
frag_off, frag_buf, frag_len);
+
+	/*
+	 * Move the list entry to the end of the list so that the oldest
+	 * partial datagrams percolate to the beginning of the list
+	 */
+	list_move_tail(&pd->pdg_list, &node->pdg_list);
+	fw_debug ( "New pd list:\n" );
+	list_for_each_entry ( pd, &node->pdg_list, pdg_list ) {
+		fw_debug ( "pd %p\n", pd );
+	}
+	return true;
+}
+
+static bool ipv4_pd_is_complete ( struct ipv4_partial_datagram *pd ) {
+	struct ipv4_fragment_info *fi;
+	bool ret;
+
+	fi = list_entry(pd->fragment_info.next, struct ipv4_fragment_info, fragment_info);
+
+	ret = (fi->len == pd->datagram_size);
+	fw_debug ( "pd_is_complete (pd %p, dgs %x): fi %p (%x@%x) %s\n", pd, pd->datagram_size, fi, fi->len, fi->offset, ret ? "yes" : "no" );
+	return ret;
+}
+
+/* ------------------------------------------------------------------ */
+
+static int ipv4_node_new ( struct fw_card *card, struct fw_device *device ) {
+	struct ipv4_node *node;
+
+	node = kmalloc ( sizeof(*node), GFP_KERNEL );
+	if ( ! node ) {
+		fw_error ( "allocate new node failed\n" );
+		return -ENOMEM;
+	}
+	node->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
+	node->fifo = INVALID_FIFO_ADDR;
+	INIT_LIST_HEAD(&node->pdg_list);
+	spin_lock_init(&node->pdg_lock);
+	node->pdg_size = 0;
+	node->generation = device->generation;
+	rmb();
+	node->nodeid = device->node_id;
+	/* FIXME what should it really be? */
+	node->max_payload = S100_BUFFER_SIZE - IPV4_UNFRAG_HDR_SIZE;
+	node->datagram_label = 0U;
+	node->xmt_speed = device->max_speed;
+	list_add_tail ( &node->ipv4_nodes, &card->ipv4_nodes );
+	fw_debug ( "node_new: %p { guid %016llx, generation %u, nodeid %x, max_payload %x, xmt_speed %x } added\n",
+		node, (unsigned long long)node->guid, node->generation, node->nodeid, node->max_payload, node->xmt_speed );
+	return 0;
+}
+
+static struct ipv4_node *ipv4_node_find_by_guid(struct ipv4_priv *priv, u64 guid) {
+	struct ipv4_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(node, &priv->card->ipv4_nodes, ipv4_nodes)
+		if (node->guid == guid) {
+			/* FIXME: lock the node first? */
+			spin_unlock_irqrestore ( &priv->lock, flags );
+			fw_debug ( "node_find_by_guid (%016llx) found %p\n", (unsigned long long)guid, node );
+			return node;
+		}
+
+	spin_unlock_irqrestore ( &priv->lock, flags );
+	fw_debug ( "node_find_by_guid (%016llx) not found\n", (unsigned long long)guid );
+	return NULL;
+}
+
+static struct ipv4_node *ipv4_node_find_by_nodeid(struct ipv4_priv *priv, u16 nodeid) {
+	struct ipv4_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(node, &priv->card->ipv4_nodes, ipv4_nodes)
+		if (node->nodeid == nodeid) {
+			/* FIXME: lock the node first?
*/
+			spin_unlock_irqrestore ( &priv->lock, flags );
+			fw_debug ( "node_find_by_nodeid (%x) found %p\n", nodeid, node );
+			return node;
+		}
+	fw_debug ( "node_find_by_nodeid (%x) not found\n", nodeid );
+	spin_unlock_irqrestore ( &priv->lock, flags );
+	return NULL;
+}
+
+/* This is only complicated because we can't assume priv exists */
+static void ipv4_node_delete ( struct fw_card *card, struct fw_device *device ) {
+	struct net_device *netdev;
+	struct ipv4_priv *priv;
+	struct ipv4_node *node;
+	u64 guid;
+	unsigned long flags;
+	struct ipv4_partial_datagram *pd, *pd_next;
+
+	guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
+	netdev = card->netdev;
+	if ( netdev )
+		priv = netdev_priv ( netdev );
+	else
+		priv = NULL;
+	if ( priv )
+		spin_lock_irqsave ( &priv->lock, flags );
+	list_for_each_entry( node, &card->ipv4_nodes, ipv4_nodes ) {
+		if ( node->guid == guid ) {
+			list_del ( &node->ipv4_nodes );
+			list_for_each_entry_safe( pd, pd_next, &node->pdg_list, pdg_list )
+				ipv4_pd_delete ( pd );
+			break;
+		}
+	}
+	if ( priv )
+		spin_unlock_irqrestore ( &priv->lock, flags );
+}
+
+/* ------------------------------------------------------------------ */
+
+
+static int ipv4_finish_incoming_packet ( struct net_device *netdev,
+	struct sk_buff *skb, u16 source_node_id, bool is_broadcast, u16 ether_type ) {
+	struct ipv4_priv *priv;
+	static u64 broadcast_hw = ~0ULL;
+	int status;
+	u64 guid;
+
+	fw_debug ( "ipv4_finish_incoming_packet(%p, %p, %x, %s, %x)\n",
+		netdev, skb, source_node_id, is_broadcast ? "true" : "false", ether_type );
+	priv = netdev_priv(netdev);
+	/* Write metadata, and then pass to the receive level */
+	skb->dev = netdev;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;	/* don't check it */
+
+	/*
+	 * Parse the encapsulation header. This actually does the job of
+	 * converting to an ethernet frame header, as well as arp
+	 * conversion if needed. ARP conversion is easier in this
+	 * direction, since we are using ethernet as our backend.
+	 */
+	/*
+	 * If this is an ARP packet, convert it. First, we want to make
+	 * use of some of the fields, since they tell us a little bit
+	 * about the sending machine.
+	 */
+	if (ether_type == ETH_P_ARP) {
+		struct ipv4_arp *arp1394;
+		struct arphdr *arp;
+		unsigned char *arp_ptr;
+		u64 fifo_addr;
+		u8 max_rec;
+		u8 sspd;
+		u16 max_payload;
+		struct ipv4_node *node;
+		static const u16 ipv4_speed_to_max_payload[] = {
+			/* S100, S200, S400, S800, S1600, S3200 */
+			512, 1024, 2048, 4096, 4096, 4096
+		};
+
+		/* fw_debug ( "ARP packet\n" ); */
+		arp1394 = (struct ipv4_arp *)skb->data;
+		arp = (struct arphdr *)skb->data;
+		arp_ptr = (unsigned char *)(arp + 1);
+		fifo_addr = (u64)ntohs(arp1394->fifo_hi) << 32 |
+			ntohl(arp1394->fifo_lo);
+		max_rec = priv->card->max_receive;
+		if ( arp1394->max_rec < max_rec )
+			max_rec = arp1394->max_rec;
+		sspd = arp1394->sspd;
+		/*
+		 * Sanity check. MacOSX seems to be sending us 131 in this
+		 * field (at least on my Panther G5). Not sure why.
+		 */
+		if (sspd > 5 ) {
+			fw_notify ( "sspd %x out of range\n", sspd );
+			sspd = 0;
+		}
+
+		max_payload = min(ipv4_speed_to_max_payload[sspd],
+			(u16)(1 << (max_rec + 1))) - IPV4_UNFRAG_HDR_SIZE;
+
+		guid = be64_to_cpu(get_unaligned(&arp1394->s_uniq_id));
+		node = ipv4_node_find_by_guid(priv, guid);
+		if (!node) {
+			fw_notify ( "No node for ARP packet from %llx\n", guid );
+			goto failed_proto;
+		}
+		if ( node->nodeid != source_node_id || node->generation != priv->card->generation ) {
+			fw_notify ( "Internal error: node->nodeid (%x) != source_node_id (%x) or node->generation (%x) != priv->card->generation(%x)\n",
+				node->nodeid, source_node_id, node->generation, priv->card->generation );
+			node->nodeid = source_node_id;
+			node->generation = priv->card->generation;
+		}
+
+		/* FIXME: for debugging */
+		if ( sspd > SCODE_400 )
+			sspd = SCODE_400;
+		/* Update our speed/payload/fifo_offset table */
+		/*
+		 * FIXME: this does not handle cases where two high-speed endpoints must use a slower speed because of
+		 * a lower speed hub between them. We need to look at the actual topology map here.
+		 */
+		fw_debug ( "Setting node %p fifo %llx (was %llx), max_payload %x (was %x), speed %x (was %x)\n",
+			node, fifo_addr, node->fifo, max_payload, node->max_payload, sspd, node->xmt_speed );
+		node->fifo = fifo_addr;
+		node->max_payload = max_payload;
+		/*
+		 * Only allow speeds to go down from their initial value.
+		 * Otherwise a local node that can only do S400 or slower may
+		 * be told to transmit at S800 to a faster remote node.
+		 */
+		if ( node->xmt_speed > sspd )
+			node->xmt_speed = sspd;
+
+		/*
+		 * Now that we're done with the 1394 specific stuff, we'll
+		 * need to alter some of the data. Believe it or not, all
+		 * that needs to be done is that sender_IP_address needs to
+		 * be moved, the destination hardware address gets stuffed
+		 * in, and the hardware address length is set to 8.
+		 *
+		 * IMPORTANT: The code below overwrites 1394 specific data
+		 * needed above so keep the munging of the data for the
+		 * higher level IP stack last.
+		 */
+
+		arp->ar_hln = 8;
+		arp_ptr += arp->ar_hln;		/* skip over sender unique id */
+		*(u32 *)arp_ptr = arp1394->sip;	/* move sender IP addr */
+		arp_ptr += arp->ar_pln;		/* skip over sender IP addr */
+
+		if (arp->ar_op == htons(ARPOP_REQUEST))
+			memset(arp_ptr, 0, sizeof(u64));
+		else
+			memcpy(arp_ptr, netdev->dev_addr, sizeof(u64));
+	}
+
+	/* Now add the ethernet header. */
+	guid = cpu_to_be64(priv->card->guid);
+	if (dev_hard_header(skb, netdev, ether_type, is_broadcast ?
&broadcast_hw : &guid, NULL, + skb->len) >= 0) { + struct ipv4_ether_hdr *eth; + u16 *rawp; + __be16 protocol; + + skb_reset_mac_header(skb); + skb_pull(skb, sizeof(*eth)); + eth = ipv4_ether_hdr(skb); + if (*eth->h_dest & 1) { + if (memcmp(eth->h_dest, netdev->broadcast, netdev->addr_len) == 0) { + fw_debug ( "Broadcast\n" ); + skb->pkt_type = PACKET_BROADCAST; + } +#if 0 + else + skb->pkt_type = PACKET_MULTICAST; +#endif + } else { + if (memcmp(eth->h_dest, netdev->dev_addr, netdev->addr_len)) { + u64 a1, a2; + + memcpy ( &a1, eth->h_dest, sizeof(u64)); + memcpy ( &a2, netdev->dev_addr, sizeof(u64)); + fw_debug ( "Otherhost %llx %llx %x\n", a1, a2, netdev->addr_len ); + skb->pkt_type = PACKET_OTHERHOST; + } + } + if (ntohs(eth->h_proto) >= 1536) { + fw_debug ( " proto %x %x\n", eth->h_proto, ntohs(eth->h_proto) ); + protocol = eth->h_proto; + } else { + rawp = (u16 *)skb->data; + if (*rawp == 0xFFFF) { + fw_debug ( "proto 802_3\n" ); + protocol = htons(ETH_P_802_3); + } else { + fw_debug ( "proto 802_2\n" ); + protocol = htons(ETH_P_802_2); + } + } + skb->protocol = protocol; + } + status = netif_rx(skb); + if ( status == NET_RX_DROP) { + netdev->stats.rx_errors++; + netdev->stats.rx_dropped++; + } else { + netdev->stats.rx_packets++; + netdev->stats.rx_bytes += skb->len; + } + if (netif_queue_stopped(netdev)) + netif_wake_queue(netdev); + return 0; + + failed_proto: + netdev->stats.rx_errors++; + netdev->stats.rx_dropped++; + dev_kfree_skb_any(skb); + if (netif_queue_stopped(netdev)) + netif_wake_queue(netdev); + netdev->last_rx = jiffies; + return 0; +} + +/* ------------------------------------------------------------------ */ + +static int ipv4_incoming_packet ( struct ipv4_priv *priv, u32 *buf, int len, u16 source_node_id, bool is_broadcast ) { + struct sk_buff *skb; + struct net_device *netdev; + struct ipv4_hdr hdr; + unsigned lf; + unsigned long flags; + struct ipv4_node *node; + struct ipv4_partial_datagram *pd; + int fg_off; + int dg_size; + u16 datagram_label; + int retval; + u16 ether_type; + + fw_debug ( "ipv4_incoming_packet(%p, %p, %d, %x, %s)\n", priv, buf, len, source_node_id, is_broadcast ? "true" : "false" ); + netdev = priv->card->netdev; + + hdr.w0 = ntohl(buf[0]); + lf = ipv4_get_hdr_lf(&hdr); + if ( lf == IPV4_HDR_UNFRAG ) { + /* + * An unfragmented datagram has been received by the ieee1394 + * bus. Build an skbuff around it so we can pass it to the + * high level network layer. + */ + ether_type = ipv4_get_hdr_ether_type(&hdr); + fw_debug ( "header w0 = %x, lf = %x, ether_type = %x\n", hdr.w0, lf, ether_type ); + buf++; + len -= IPV4_UNFRAG_HDR_SIZE; + + skb = dev_alloc_skb(len + netdev->hard_header_len + 15); + if (unlikely(!skb)) { + fw_error ( "Out of memory for incoming packet\n"); + netdev->stats.rx_dropped++; + return -1; + } + skb_reserve(skb, (netdev->hard_header_len + 15) & ~15); + memcpy(skb_put(skb, len), buf, len ); + return ipv4_finish_incoming_packet(netdev, skb, source_node_id, is_broadcast, ether_type ); + } + /* A datagram fragment has been received, now the fun begins. */ + hdr.w1 = ntohl(buf[1]); + buf +=2; + len -= IPV4_FRAG_HDR_SIZE; + if ( lf ==IPV4_HDR_FIRSTFRAG ) { + ether_type = ipv4_get_hdr_ether_type(&hdr); + fg_off = 0; + } else { + fg_off = ipv4_get_hdr_fg_off(&hdr); + ether_type = 0; /* Shut up compiler! */ + } + datagram_label = ipv4_get_hdr_dgl(&hdr); + dg_size = ipv4_get_hdr_dg_size(&hdr); /* ??? 
+ 1 */ + fw_debug ( "fragmented: %x.%x = lf %x, ether_type %x, fg_off %x, dgl %x, dg_size %x\n", hdr.w0, hdr.w1, lf, ether_type, fg_off, datagram_label, dg_size ); + node = ipv4_node_find_by_nodeid ( priv, source_node_id); + spin_lock_irqsave(&node->pdg_lock, flags); + pd = ipv4_pd_find( node, datagram_label ); + if (pd == NULL) { + while ( node->pdg_size >= ipv4_mpd ) { + /* remove the oldest */ + ipv4_pd_delete ( list_first_entry(&node->pdg_list, struct ipv4_partial_datagram, pdg_list) ); + node->pdg_size--; + } + pd = ipv4_pd_new ( netdev, node, datagram_label, dg_size, + buf, fg_off, len); + if ( pd == NULL) { + retval = -ENOMEM; + goto bad_proto; + } + node->pdg_size++; + } else { + if (ipv4_frag_overlap(pd, fg_off, len) || pd->datagram_size != dg_size) { + /* + * Differing datagram sizes or overlapping fragments, + * Either way the remote machine is playing silly buggers + * with us: obliterate the old datagram and start a new one. + */ + ipv4_pd_delete ( pd ); + pd = ipv4_pd_new ( netdev, node, datagram_label, + dg_size, buf, fg_off, len); + if ( pd == NULL ) { + retval = -ENOMEM; + node->pdg_size--; + goto bad_proto; + } + } else { + bool worked; + + worked = ipv4_pd_update ( node, pd, + buf, fg_off, len ); + if ( ! worked ) { + /* + * Couldn't save off fragment anyway + * so might as well obliterate the + * datagram now. + */ + ipv4_pd_delete ( pd ); + node->pdg_size--; + goto bad_proto; + } + } + } /* new datagram or add to existing one */ + + if ( lf == IPV4_HDR_FIRSTFRAG ) + pd->ether_type = ether_type; + if ( ipv4_pd_is_complete ( pd ) ) { + ether_type = pd->ether_type; + node->pdg_size--; + skb = skb_get(pd->skb); + ipv4_pd_delete ( pd ); + spin_unlock_irqrestore(&node->pdg_lock, flags); + return ipv4_finish_incoming_packet ( netdev, skb, source_node_id, false, ether_type ); + } + /* + * Datagram is not complete, we're done for the + * moment. 
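+	 *
+	 * ipv4_pd_is_complete(), used above, is outside this hunk; given
+	 * the fragment coalescing done in ipv4_frag_new(), it is assumed
+	 * to reduce to checking whether the fragment list has collapsed
+	 * into one entry covering the whole datagram, roughly (a sketch,
+	 * not part of this patch):
+	 */
+#if 0
+	{
+		struct ipv4_fragment_info *fi =
+			list_first_entry(&pd->fragment_info,
+				struct ipv4_fragment_info, fragment_info);
+		bool complete = fi->offset == 0 &&
+				fi->len == pd->datagram_size;
+	}
+#endif
+	/*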
+ */ + spin_unlock_irqrestore(&node->pdg_lock, flags); + return 0; + + bad_proto: + spin_unlock_irqrestore(&node->pdg_lock, flags); + if (netif_queue_stopped(netdev)) + netif_wake_queue(netdev); + return 0; +} + +static void ipv4_receive_packet ( struct fw_card *card, struct fw_request *r, + int tcode, int destination, int source, int generation, int speed, + unsigned long long offset, void *payload, size_t length, void *callback_data ) { + struct ipv4_priv *priv; + int status; + + fw_debug ( "ipv4_receive_packet(%p,%p,%x,%x,%x,%x,%x,%llx,%p,%lx,%p)\n", + card, r, tcode, destination, source, generation, speed, offset, payload, + (unsigned long)length, callback_data); + print_hex_dump ( KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 32, 1, payload, length, false ); + priv = callback_data; + if ( tcode != TCODE_WRITE_BLOCK_REQUEST + || destination != card->node_id + || generation != card->generation + || offset != priv->handler.offset ) { + fw_send_response(card, r, RCODE_CONFLICT_ERROR); + fw_debug("Conflict error card node_id=%x, card generation=%x, local offset %llx\n", + card->node_id, card->generation, (unsigned long long)priv->handler.offset ); + return; + } + status = ipv4_incoming_packet ( priv, payload, length, source, false ); + if ( status != 0 ) { + fw_error ( "Incoming packet failure\n" ); + fw_send_response ( card, r, RCODE_CONFLICT_ERROR ); + return; + } + fw_send_response ( card, r, RCODE_COMPLETE ); +} + +static void ipv4_receive_broadcast(struct fw_iso_context *context, u32 cycle, + size_t header_length, void *header, void *data) { + struct ipv4_priv *priv; + struct fw_iso_packet packet; + struct fw_card *card; + u16 *hdr_ptr; + u32 *buf_ptr; + int retval; + u32 length; + u16 source_node_id; + u32 specifier_id; + u32 ver; + unsigned long offset; + unsigned long flags; + + fw_debug ( "ipv4_receive_broadcast ( context=%p, cycle=%x, header_length=%lx, header=%p, data=%p )\n", context, cycle, (unsigned long)header_length, header, data ); + print_hex_dump ( KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 32, 1, header, header_length, false ); + priv = data; + card = priv->card; + hdr_ptr = header; + length = ntohs(hdr_ptr[0]); + spin_lock_irqsave(&priv->lock,flags); + offset = priv->rcv_buffer_size * priv->broadcast_rcv_next_ptr; + buf_ptr = priv->broadcast_rcv_buffer_ptrs[priv->broadcast_rcv_next_ptr++]; + if ( priv->broadcast_rcv_next_ptr == priv->num_broadcast_rcv_ptrs ) + priv->broadcast_rcv_next_ptr = 0; + spin_unlock_irqrestore(&priv->lock,flags); + fw_debug ( "length %u at %p\n", length, buf_ptr ); + print_hex_dump ( KERN_DEBUG, "buffer: ", DUMP_PREFIX_OFFSET, 32, 1, buf_ptr, length, false ); + + specifier_id = (be32_to_cpu(buf_ptr[0]) & 0xffff) << 8 + | (be32_to_cpu(buf_ptr[1]) & 0xff000000) >> 24; + ver = be32_to_cpu(buf_ptr[1]) & 0xFFFFFF; + source_node_id = be32_to_cpu(buf_ptr[0]) >> 16; + /* fw_debug ( "source %x SpecID %x ver %x\n", source_node_id, specifier_id, ver ); */ + if ( specifier_id == IPV4_GASP_SPECIFIER_ID && ver == IPV4_GASP_VERSION ) { + buf_ptr += 2; + length -= IPV4_GASP_OVERHEAD; + ipv4_incoming_packet(priv, buf_ptr, length, source_node_id, true); + } else + fw_debug ( "Ignoring packet: not GASP\n" ); + packet.payload_length = priv->rcv_buffer_size; + packet.interrupt = 1; + packet.skip = 0; + packet.tag = 3; + packet.sy = 0; + packet.header_length = IPV4_GASP_OVERHEAD; + spin_lock_irqsave(&priv->lock,flags); + retval = fw_iso_context_queue ( priv->broadcast_rcv_context, &packet, + &priv->broadcast_rcv_buffer, offset ); + 
spin_unlock_irqrestore(&priv->lock,flags); + if ( retval < 0 ) + fw_error ( "requeue failed\n" ); +} + +static void debug_ptask ( struct ipv4_packet_task *ptask ) { + static const char *tx_types[] = { "Unknown", "GASP", "Write" }; + + fw_debug ( "packet %p { hdr { w0 %x w1 %x }, skb %p, priv %p," + " tx_type %s, outstanding_pkts %d, max_payload %x, fifo %llx," + " speed %x, dest_node %x, generation %x }\n", + ptask, ptask->hdr.w0, ptask->hdr.w1, ptask->skb, ptask->priv, + ptask->tx_type > IPV4_WRREQ ? "Invalid" : tx_types[ptask->tx_type], + ptask->outstanding_pkts, ptask->max_payload, + ptask->fifo_addr, ptask->speed, ptask->dest_node, ptask->generation ); + print_hex_dump ( KERN_DEBUG, "packet :", DUMP_PREFIX_OFFSET, 32, 1, + ptask->skb->data, ptask->skb->len, false ); +} + +static void ipv4_transmit_packet_done ( struct ipv4_packet_task *ptask ) { + struct ipv4_priv *priv; + unsigned long flags; + + priv = ptask->priv; + spin_lock_irqsave ( &priv->lock, flags ); + list_del ( &ptask->packet_list ); + spin_unlock_irqrestore ( &priv->lock, flags ); + ptask->outstanding_pkts--; + if ( ptask->outstanding_pkts > 0 ) { + u16 dg_size; + u16 fg_off; + u16 datagram_label; + u16 lf; + struct sk_buff *skb; + + /* Update the ptask to point to the next fragment and send it */ + lf = ipv4_get_hdr_lf(&ptask->hdr); + switch (lf) { + case IPV4_HDR_LASTFRAG: + case IPV4_HDR_UNFRAG: + default: + fw_error ( "Outstanding packet %x lf %x, header %x,%x\n", ptask->outstanding_pkts, lf, ptask->hdr.w0, ptask->hdr.w1 ); + BUG(); + + case IPV4_HDR_FIRSTFRAG: + /* Set frag type here for future interior fragments */ + dg_size = ipv4_get_hdr_dg_size(&ptask->hdr); + fg_off = ptask->max_payload - IPV4_FRAG_HDR_SIZE; + datagram_label = ipv4_get_hdr_dgl(&ptask->hdr); + break; + + case IPV4_HDR_INTFRAG: + dg_size = ipv4_get_hdr_dg_size(&ptask->hdr); + fg_off = ipv4_get_hdr_fg_off(&ptask->hdr) + ptask->max_payload - IPV4_FRAG_HDR_SIZE; + datagram_label = ipv4_get_hdr_dgl(&ptask->hdr); + break; + } + skb = ptask->skb; + skb_pull ( skb, ptask->max_payload ); + if ( ptask->outstanding_pkts > 1 ) { + ipv4_make_sf_hdr ( &ptask->hdr, + IPV4_HDR_INTFRAG, dg_size, fg_off, datagram_label ); + } else { + ipv4_make_sf_hdr ( &ptask->hdr, + IPV4_HDR_LASTFRAG, dg_size, fg_off, datagram_label ); + ptask->max_payload = skb->len + IPV4_FRAG_HDR_SIZE; + + } + ipv4_send_packet ( ptask ); + } else { + dev_kfree_skb_any ( ptask->skb ); + kmem_cache_free( ipv4_packet_task_cache, ptask ); + } +} + +static void ipv4_write_complete ( struct fw_card *card, int rcode, + void *payload, size_t length, void *data ) { + struct ipv4_packet_task *ptask; + + ptask = data; + fw_debug ( "ipv4_write_complete ( %p, %x, %p, %lx, %p )\n", + card, rcode, payload, (unsigned long)length, data ); + debug_ptask ( ptask ); + + if ( rcode == RCODE_COMPLETE ) { + ipv4_transmit_packet_done ( ptask ); + } else { + fw_error ( "ipv4_write_complete: failed: %x\n", rcode ); + /* ??? 
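+		 * One plausible policy, sketched here on the assumption
+		 * that a failed write is best dropped rather than leaking
+		 * the ptask and skb (not part of this patch):
+		 */
+#if 0
+		{
+			struct ipv4_priv *priv = ptask->priv;
+			unsigned long flags;
+
+			/* unlink exactly as the success path does */
+			spin_lock_irqsave ( &priv->lock, flags );
+			list_del ( &ptask->packet_list );
+			spin_unlock_irqrestore ( &priv->lock, flags );
+			dev_kfree_skb_any ( ptask->skb );
+			kmem_cache_free ( ipv4_packet_task_cache, ptask );
+		}
+#endif
+		/* still open: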
error recovery */ + } +} + +static int ipv4_send_packet ( struct ipv4_packet_task *ptask ) { + struct ipv4_priv *priv; + unsigned tx_len; + struct ipv4_hdr *bufhdr; + unsigned long flags; + struct net_device *netdev; +#if 0 /* stefanr */ + int retval; +#endif + + fw_debug ( "ipv4_send_packet\n" ); + debug_ptask ( ptask ); + priv = ptask->priv; + tx_len = ptask->max_payload; + switch (ipv4_get_hdr_lf(&ptask->hdr)) { + case IPV4_HDR_UNFRAG: + bufhdr = (struct ipv4_hdr *)skb_push(ptask->skb, IPV4_UNFRAG_HDR_SIZE); + bufhdr->w0 = htonl(ptask->hdr.w0); + break; + + case IPV4_HDR_FIRSTFRAG: + case IPV4_HDR_INTFRAG: + case IPV4_HDR_LASTFRAG: + bufhdr = (struct ipv4_hdr *)skb_push(ptask->skb, IPV4_FRAG_HDR_SIZE); + bufhdr->w0 = htonl(ptask->hdr.w0); + bufhdr->w1 = htonl(ptask->hdr.w1); + break; + + default: + BUG(); + } + if ( ptask->tx_type == IPV4_GASP ) { + u32 *packets; + int generation; + int nodeid; + + /* ptask->generation may not have been set yet */ + generation = priv->card->generation; + smp_rmb(); + nodeid = priv->card->node_id; + packets = (u32 *)skb_push(ptask->skb, sizeof(u32)*2); + packets[0] = htonl(nodeid << 16 | (IPV4_GASP_SPECIFIER_ID>>8)); + packets[1] = htonl((IPV4_GASP_SPECIFIER_ID & 0xFF) << 24 | IPV4_GASP_VERSION); + fw_send_request ( priv->card, &ptask->transaction, TCODE_STREAM_DATA, + fw_stream_packet_destination_id(3, BROADCAST_CHANNEL, 0), + generation, SCODE_100, 0ULL, ptask->skb->data, tx_len + 8, ipv4_write_complete, ptask ); + spin_lock_irqsave(&priv->lock,flags); + list_add_tail ( &ptask->packet_list, &priv->broadcasted_list ); + spin_unlock_irqrestore(&priv->lock,flags); +#if 0 /* stefanr */ + return retval; +#else + return 0; +#endif + } + fw_debug("send_request (%p, %p, WRITE_BLOCK, %x, %x, %x, %llx, %p, %d, %p, %p\n", + priv->card, &ptask->transaction, ptask->dest_node, ptask->generation, + ptask->speed, (unsigned long long)ptask->fifo_addr, ptask->skb->data, tx_len, + ipv4_write_complete, ptask ); + fw_send_request ( priv->card, &ptask->transaction, + TCODE_WRITE_BLOCK_REQUEST, ptask->dest_node, ptask->generation, ptask->speed, + ptask->fifo_addr, ptask->skb->data, tx_len, ipv4_write_complete, ptask ); + spin_lock_irqsave(&priv->lock,flags); + list_add_tail ( &ptask->packet_list, &priv->sent_list ); + spin_unlock_irqrestore(&priv->lock,flags); + netdev = priv->card->netdev; + netdev->trans_start = jiffies; + return 0; +} + +static int ipv4_broadcast_start ( struct ipv4_priv *priv ) { + struct fw_iso_context *context; + int retval; + unsigned num_packets; + unsigned max_receive; + struct fw_iso_packet packet; + unsigned long offset; + unsigned u; + /* unsigned transmit_speed; */ + +#if 0 /* stefanr */ + if ( priv->card->broadcast_channel != (BROADCAST_CHANNEL_VALID|BROADCAST_CHANNEL_INITIAL)) { + fw_notify ( "Invalid broadcast channel %x\n", priv->card->broadcast_channel ); + /* FIXME: try again later? */ + /* return -EINVAL; */ + } +#endif + if ( priv->local_fifo == INVALID_FIFO_ADDR ) { + struct fw_address_region region; + + priv->handler.length = FIFO_SIZE; + priv->handler.address_callback = ipv4_receive_packet; + priv->handler.callback_data = priv; + /* FIXME: this is OHCI, but what about others? */ + region.start = 0xffff00000000ULL; + region.end = 0xfffffffffffcULL; + + retval = fw_core_add_address_handler ( &priv->handler, ®ion ); + if ( retval < 0 ) + goto failed_initial; + priv->local_fifo = priv->handler.offset; + } + + /* + * FIXME: rawiso limits us to PAGE_SIZE. 
This only matters if we ever have + * a machine with PAGE_SIZE < 4096 + */ + max_receive = 1U << (priv->card->max_receive + 1); + num_packets = ( ipv4_iso_page_count * PAGE_SIZE ) / max_receive; + if ( ! priv->broadcast_rcv_context ) { + void **ptrptr; + + context = fw_iso_context_create ( priv->card, + FW_ISO_CONTEXT_RECEIVE, BROADCAST_CHANNEL, + priv->card->link_speed, 8, ipv4_receive_broadcast, priv ); + if (IS_ERR(context)) { + retval = PTR_ERR(context); + goto failed_context_create; + } + retval = fw_iso_buffer_init ( &priv->broadcast_rcv_buffer, + priv->card, ipv4_iso_page_count, DMA_FROM_DEVICE ); + if ( retval < 0 ) + goto failed_buffer_init; + ptrptr = kmalloc ( sizeof(void*)*num_packets, GFP_KERNEL ); + if ( ! ptrptr ) { + retval = -ENOMEM; + goto failed_ptrs_alloc; + } + priv->broadcast_rcv_buffer_ptrs = ptrptr; + for ( u = 0; u < ipv4_iso_page_count; u++ ) { + void *ptr; + unsigned v; + + ptr = kmap ( priv->broadcast_rcv_buffer.pages[u] ); + for ( v = 0; v < num_packets / ipv4_iso_page_count; v++ ) + *ptrptr++ = (void *)((char *)ptr + v * max_receive); + } + priv->broadcast_rcv_context = context; + } else + context = priv->broadcast_rcv_context; + + packet.payload_length = max_receive; + packet.interrupt = 1; + packet.skip = 0; + packet.tag = 3; + packet.sy = 0; + packet.header_length = IPV4_GASP_OVERHEAD; + offset = 0; + for ( u = 0; u < num_packets; u++ ) { + retval = fw_iso_context_queue ( context, &packet, + &priv->broadcast_rcv_buffer, offset ); + if ( retval < 0 ) + goto failed_rcv_queue; + offset += max_receive; + } + priv->num_broadcast_rcv_ptrs = num_packets; + priv->rcv_buffer_size = max_receive; + priv->broadcast_rcv_next_ptr = 0U; + retval = fw_iso_context_start ( context, -1, 0, FW_ISO_CONTEXT_MATCH_ALL_TAGS ); /* ??? sync */ + if ( retval < 0 ) + goto failed_rcv_queue; + /* FIXME: adjust this when we know the max receive speeds of all other IP nodes on the bus. */ + /* since we only xmt at S100 ??? 
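+	 * Worked out with the constants from this file:
+	 * S100_BUFFER_SIZE == 512, IPV4_GASP_OVERHEAD == 2 * sizeof(u32)
+	 * == 8 and IPV4_UNFRAG_HDR_SIZE == sizeof(u32) == 4, so the
+	 * assignment below comes to 512 - 8 - 4 = 500 octets per
+	 * broadcast datagram.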
*/ + priv->broadcast_xmt_max_payload = S100_BUFFER_SIZE - IPV4_GASP_OVERHEAD - IPV4_UNFRAG_HDR_SIZE; + priv->broadcast_state = IPV4_BROADCAST_RUNNING; + return 0; + + failed_rcv_queue: + kfree ( priv->broadcast_rcv_buffer_ptrs ); + priv->broadcast_rcv_buffer_ptrs = NULL; + failed_ptrs_alloc: + fw_iso_buffer_destroy ( &priv->broadcast_rcv_buffer, priv->card ); + failed_buffer_init: + fw_iso_context_destroy ( context ); + priv->broadcast_rcv_context = NULL; + failed_context_create: + fw_core_remove_address_handler ( &priv->handler ); + failed_initial: + priv->local_fifo = INVALID_FIFO_ADDR; + return retval; +} + +/* This is called after an "ifup" */ +static int ipv4_open(struct net_device *dev) { + struct ipv4_priv *priv; + int ret; + + priv = netdev_priv(dev); + if (priv->broadcast_state == IPV4_BROADCAST_ERROR) { + ret = ipv4_broadcast_start ( priv ); + if (ret) + return ret; + } + netif_start_queue(dev); + return 0; +} + +/* This is called after an "ifdown" */ +static int ipv4_stop(struct net_device *netdev) +{ + /* flush priv->wake */ + /* flush_scheduled_work(); */ + + netif_stop_queue(netdev); + return 0; +} + +/* Transmit a packet (called by kernel) */ +static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev) +{ + struct ipv4_ether_hdr hdr_buf; + struct ipv4_priv *priv = netdev_priv(netdev); + __be16 proto; + u16 dest_node; + enum ipv4_tx_type tx_type; + unsigned max_payload; + u16 dg_size; + u16 *datagram_label_ptr; + struct ipv4_packet_task *ptask; + struct ipv4_node *node = NULL; + + ptask = kmem_cache_alloc(ipv4_packet_task_cache, GFP_ATOMIC); + if (ptask == NULL) + goto fail; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto fail; + + /* + * Get rid of the fake ipv4 header, but first make a copy. + * We might need to rebuild the header on tx failure. + */ + memcpy(&hdr_buf, skb->data, sizeof(hdr_buf)); + skb_pull(skb, sizeof(hdr_buf)); + + proto = hdr_buf.h_proto; + dg_size = skb->len; + + /* + * Set the transmission type for the packet. ARP packets and IP + * broadcast packets are sent via GASP. 
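+	 * For example, an IGMP report to 224.0.0.1 satisfies
+	 * IN_MULTICAST() below and goes out as a GASP broadcast, while a
+	 * unicast TCP segment falls through to the per-node FIFO write
+	 * request path.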
+ */ + if ( memcmp(hdr_buf.h_dest, netdev->broadcast, IPV4_ALEN) == 0 + || proto == htons(ETH_P_ARP) + || ( proto == htons(ETH_P_IP) + && IN_MULTICAST(ntohl(ip_hdr(skb)->daddr)) ) ) { + /* fw_debug ( "transmitting arp or multicast packet\n" );*/ + tx_type = IPV4_GASP; + dest_node = ALL_NODES; + max_payload = priv->broadcast_xmt_max_payload; + /* BUG_ON(max_payload < S100_BUFFER_SIZE - IPV4_GASP_OVERHEAD); */ + datagram_label_ptr = &priv->broadcast_xmt_datagramlabel; + ptask->fifo_addr = INVALID_FIFO_ADDR; + ptask->generation = 0U; + ptask->dest_node = 0U; + ptask->speed = 0; + } else { + __be64 guid = get_unaligned((u64 *)hdr_buf.h_dest); + u8 generation; + + node = ipv4_node_find_by_guid(priv, be64_to_cpu(guid)); + if (!node) { + fw_debug ( "Normal packet but no node\n" ); + goto fail; + } + + if (node->fifo == INVALID_FIFO_ADDR) { + fw_debug ( "Normal packet but no fifo addr\n" ); + goto fail; + } + + /* fw_debug ( "Transmitting normal packet to %x at %llxx\n", node->nodeid, node->fifo ); */ + generation = node->generation; + dest_node = node->nodeid; + max_payload = node->max_payload; + /* BUG_ON(max_payload < S100_BUFFER_SIZE - IPV4_FRAG_HDR_SIZE); */ + + datagram_label_ptr = &node->datagram_label; + tx_type = IPV4_WRREQ; + ptask->fifo_addr = node->fifo; + ptask->generation = generation; + ptask->dest_node = dest_node; + ptask->speed = node->xmt_speed; + } + + /* If this is an ARP packet, convert it */ + if (proto == htons(ETH_P_ARP)) { + /* Convert a standard ARP packet to 1394 ARP. The first 8 bytes (the entire + * arphdr) is the same format as the ip1394 header, so they overlap. The rest + * needs to be munged a bit. The remainder of the arphdr is formatted based + * on hwaddr len and ipaddr len. We know what they'll be, so it's easy to + * judge. + * + * Now that the EUI is used for the hardware address all we need to do to make + * this work for 1394 is to insert 2 quadlets that contain max_rec size, + * speed, and unicast FIFO address information between the sender_unique_id + * and the IP addresses. + */ + struct arphdr *arp = (struct arphdr *)skb->data; + unsigned char *arp_ptr = (unsigned char *)(arp + 1); + struct ipv4_arp *arp1394 = (struct ipv4_arp *)skb->data; + u32 ipaddr; + + ipaddr = *(u32*)(arp_ptr + IPV4_ALEN); + arp1394->hw_addr_len = 16; + arp1394->max_rec = priv->card->max_receive; + arp1394->sspd = priv->card->link_speed; + arp1394->fifo_hi = htons(priv->local_fifo >> 32); + arp1394->fifo_lo = htonl(priv->local_fifo & 0xFFFFFFFF); + arp1394->sip = ipaddr; + } + if ( ipv4_max_xmt && max_payload > ipv4_max_xmt ) + max_payload = ipv4_max_xmt; + + ptask->hdr.w0 = 0; + ptask->hdr.w1 = 0; + ptask->skb = skb; + ptask->priv = priv; + ptask->tx_type = tx_type; + /* Does it all fit in one packet? 
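+	 * Worked example: with dg_size = 1500 and max_payload = 1024,
+	 * the else branch first reserves IPV4_FRAG_OVERHEAD (4) octets,
+	 * leaving 1020 payload octets per fragment, so outstanding_pkts
+	 * = DIV_ROUND_UP(1500, 1020) = 2; max_payload is then grown by
+	 * IPV4_FRAG_HDR_SIZE (8) to account for the on-wire fragment
+	 * header.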
*/ + if ( dg_size <= max_payload ) { + ipv4_make_uf_hdr(&ptask->hdr, be16_to_cpu(proto)); + ptask->outstanding_pkts = 1; + max_payload = dg_size + IPV4_UNFRAG_HDR_SIZE; + } else { + u16 datagram_label; + + max_payload -= IPV4_FRAG_OVERHEAD; + datagram_label = (*datagram_label_ptr)++; + ipv4_make_ff_hdr(&ptask->hdr, be16_to_cpu(proto), dg_size, datagram_label ); + ptask->outstanding_pkts = DIV_ROUND_UP(dg_size, max_payload); + max_payload += IPV4_FRAG_HDR_SIZE; + } + ptask->max_payload = max_payload; + ipv4_send_packet ( ptask ); + return NETDEV_TX_OK; + + fail: + if (ptask) + kmem_cache_free(ipv4_packet_task_cache, ptask); + + if (skb != NULL) + dev_kfree_skb(skb); + + netdev->stats.tx_dropped++; + netdev->stats.tx_errors++; + + /* + * FIXME: According to a patch from 2003-02-26, "returning non-zero + * causes serious problems" here, allegedly. Before that patch, + * -ERRNO was returned which is not appropriate under Linux 2.6. + * Perhaps more needs to be done? Stop the queue in serious + * conditions and restart it elsewhere? + */ + return NETDEV_TX_OK; +} + +/* + * FIXME: What to do if we timeout? I think a host reset is probably in order, + * so that's what we do. Should we increment the stat counters too? + */ +static void ipv4_tx_timeout(struct net_device *dev) { + struct ipv4_priv *priv; + + priv = netdev_priv(dev); + fw_error ( "%s: Timeout, resetting host\n", dev->name ); +#if 0 /* stefanr */ + fw_core_initiate_bus_reset ( priv->card, 1 ); +#endif +} + +static int ipv4_change_mtu ( struct net_device *dev, int new_mtu ) { +#if 0 + int max_mtu; + struct ipv4_priv *priv; +#endif + + if (new_mtu < 68) + return -EINVAL; + +#if 0 + priv = netdev_priv(dev); + /* This is not actually true because we can fragment packets at the firewire layer */ + max_mtu = (1 << (priv->card->max_receive + 1)) + - sizeof(struct ipv4_hdr) - IPV4_GASP_OVERHEAD; + if (new_mtu > max_mtu) { + fw_notify ( "%s: Local node constrains MTU to %d\n", dev->name, max_mtu); + return -ERANGE; + } +#endif + dev->mtu = new_mtu; + return 0; +} + +static void ipv4_get_drvinfo(struct net_device *dev, +struct ethtool_drvinfo *info) { + strcpy(info->driver, ipv4_driver_name); + strcpy(info->bus_info, "ieee1394"); /* FIXME provide more detail? */ +} + +static struct ethtool_ops ipv4_ethtool_ops = { + .get_drvinfo = ipv4_get_drvinfo, +}; + +static const struct net_device_ops ipv4_netdev_ops = { + .ndo_open = ipv4_open, + .ndo_stop = ipv4_stop, + .ndo_start_xmit = ipv4_tx, + .ndo_tx_timeout = ipv4_tx_timeout, + .ndo_change_mtu = ipv4_change_mtu, +}; + +static void ipv4_init_dev ( struct net_device *dev ) { + dev->header_ops = &ipv4_header_ops; + dev->netdev_ops = &ipv4_netdev_ops; + SET_ETHTOOL_OPS(dev, &ipv4_ethtool_ops); + + dev->watchdog_timeo = IPV4_TIMEOUT; + dev->flags = IFF_BROADCAST | IFF_MULTICAST; + dev->features = NETIF_F_HIGHDMA; + dev->addr_len = IPV4_ALEN; + dev->hard_header_len = IPV4_HLEN; + dev->type = ARPHRD_IEEE1394; + + /* FIXME: This value was copied from ether_setup(). Is it too much? */ + dev->tx_queue_len = 1000; +} + +static int ipv4_probe ( struct device *dev ) { + struct fw_unit * unit; + struct fw_device *device; + struct fw_card *card; + struct net_device *netdev; + struct ipv4_priv *priv; + unsigned max_mtu; + __be64 guid; + + fw_debug("ipv4 Probing\n" ); + unit = fw_unit ( dev ); + device = fw_device ( unit->device.parent ); + card = device->card; + + if ( ! 
device->is_local ) { + int added; + + fw_debug ( "Non-local, adding remote node entry\n" ); + added = ipv4_node_new ( card, device ); + return added; + } + fw_debug("ipv4 Local: adding netdev\n" ); + netdev = alloc_netdev ( sizeof(*priv), "fw-ipv4-%d", ipv4_init_dev ); + if ( netdev == NULL) { + fw_error( "Out of memory\n"); + goto out; + } + + SET_NETDEV_DEV(netdev, card->device); + priv = netdev_priv(netdev); + + spin_lock_init(&priv->lock); + priv->broadcast_state = IPV4_BROADCAST_ERROR; + priv->broadcast_rcv_context = NULL; + priv->broadcast_xmt_max_payload = 0; + priv->broadcast_xmt_datagramlabel = 0; + + priv->local_fifo = INVALID_FIFO_ADDR; + + /* INIT_WORK(&priv->wake, ipv4_handle_queue);*/ + INIT_LIST_HEAD(&priv->packet_list); + INIT_LIST_HEAD(&priv->broadcasted_list); + INIT_LIST_HEAD(&priv->sent_list ); + + priv->card = card; + + /* + * Use the RFC 2734 default 1500 octets or the maximum payload + * as initial MTU + */ + max_mtu = (1 << (card->max_receive + 1)) + - sizeof(struct ipv4_hdr) - IPV4_GASP_OVERHEAD; + netdev->mtu = min(1500U, max_mtu); + + /* Set our hardware address while we're at it */ + guid = cpu_to_be64(card->guid); + memcpy(netdev->dev_addr, &guid, sizeof(u64)); + memset(netdev->broadcast, 0xff, sizeof(u64)); + if ( register_netdev ( netdev ) ) { + fw_error ( "Cannot register the driver\n"); + goto out; + } + + fw_notify ( "%s: IPv4 over Firewire on device %016llx\n", + netdev->name, card->guid ); + card->netdev = netdev; + + return 0 /* ipv4_new_node ( ud ) */; + out: + if ( netdev ) + free_netdev ( netdev ); + return -ENOENT; +} + + +static int ipv4_remove ( struct device *dev ) { + struct fw_unit * unit; + struct fw_device *device; + struct fw_card *card; + struct net_device *netdev; + struct ipv4_priv *priv; + struct ipv4_node *node; + struct ipv4_partial_datagram *pd, *pd_next; + struct ipv4_packet_task *ptask, *pt_next; + + fw_debug("ipv4 Removing\n" ); + unit = fw_unit ( dev ); + device = fw_device ( unit->device.parent ); + card = device->card; + + if ( ! 
device->is_local ) { + fw_debug ( "Node %x is non-local, removing remote node entry\n", device->node_id ); + ipv4_node_delete ( card, device ); + return 0; + } + netdev = card->netdev; + if ( netdev ) { + fw_debug ( "Node %x is local: deleting netdev\n", device->node_id ); + priv = netdev_priv ( netdev ); + unregister_netdev ( netdev ); + fw_debug ( "unregistered\n" ); + if ( priv->local_fifo != INVALID_FIFO_ADDR ) + fw_core_remove_address_handler ( &priv->handler ); + fw_debug ( "address handler gone\n" ); + if ( priv->broadcast_rcv_context ) { + fw_iso_context_stop ( priv->broadcast_rcv_context ); + fw_iso_buffer_destroy ( &priv->broadcast_rcv_buffer, priv->card ); + fw_iso_context_destroy ( priv->broadcast_rcv_context ); + fw_debug ( "rcv stopped\n" ); + } + list_for_each_entry_safe( ptask, pt_next, &priv->packet_list, packet_list ) { + dev_kfree_skb_any ( ptask->skb ); + kmem_cache_free( ipv4_packet_task_cache, ptask ); + } + list_for_each_entry_safe( ptask, pt_next, &priv->broadcasted_list, packet_list ) { + dev_kfree_skb_any ( ptask->skb ); + kmem_cache_free( ipv4_packet_task_cache, ptask ); + } + list_for_each_entry_safe( ptask, pt_next, &priv->sent_list, packet_list ) { + dev_kfree_skb_any ( ptask->skb ); + kmem_cache_free( ipv4_packet_task_cache, ptask ); + } + fw_debug ( "lists emptied\n" ); + list_for_each_entry( node, &card->ipv4_nodes, ipv4_nodes ) { + if ( node->pdg_size ) { + list_for_each_entry_safe( pd, pd_next, &node->pdg_list, pdg_list ) + ipv4_pd_delete ( pd ); + node->pdg_size = 0; + } + node->fifo = INVALID_FIFO_ADDR; + } + fw_debug ( "nodes cleaned up\n" ); + free_netdev ( netdev ); + card->netdev = NULL; + fw_debug ( "done\n" ); + } + return 0; +} + +static void ipv4_update ( struct fw_unit *unit ) { + struct fw_device *device; + struct fw_card *card; + + fw_debug ( "ipv4_update unit %p\n", unit ); + device = fw_device ( unit->device.parent ); + card = device->card; + if ( ! device->is_local ) { + struct ipv4_node *node; + u64 guid; + struct net_device *netdev; + struct ipv4_priv *priv; + + netdev = card->netdev; + if ( netdev ) { + priv = netdev_priv ( netdev ); + guid = (u64)device->config_rom[3] << 32 | device->config_rom[4]; + node = ipv4_node_find_by_guid ( priv, guid ); + if ( ! node ) { + fw_error ( "ipv4_update: no node for device %llx\n", guid ); + return; + } + fw_debug ( "Non-local, updating remote node entry for guid %llx old generation %x, old nodeid %x\n", guid, node->generation, node->nodeid ); + node->generation = device->generation; + rmb(); + node->nodeid = device->node_id; + fw_debug ( "New generation %x, new nodeid %x\n", node->generation, node->nodeid ); + } else + fw_error ( "nonlocal, but no netdev? How can that be?\n" ); + } else { + /* FIXME: What do we need to do on bus reset? 
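+		 * At minimum the cached generation/nodeid pairs of the
+		 * remote nodes need re-validation, as the non-local branch
+		 * above does, and the broadcast receive context may need
+		 * restarting if the channel allocation changed; both
+		 * points are left open here.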
*/
+		fw_debug ( "Local, doing nothing\n" );
+	}
+}
+
+static struct fw_driver ipv4_driver = {
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = ipv4_driver_name,
+		.bus = &fw_bus_type,
+		.probe = ipv4_probe,
+		.remove = ipv4_remove,
+	},
+	.update = ipv4_update,
+	.id_table = ipv4_id_table,
+};
+
+static int __init ipv4_init ( void ) {
+	int added;
+
+	added = fw_core_add_descriptor ( &ipv4_unit_directory );
+	if ( added < 0 )
+		fw_error ( "Failed to add descriptor" );
+	ipv4_packet_task_cache = kmem_cache_create("packet_task",
+		sizeof(struct ipv4_packet_task), 0, 0, NULL);
+	fw_debug("Adding ipv4 module\n" );
+	return driver_register ( &ipv4_driver.driver );
+}
+
+static void __exit ipv4_cleanup ( void ) {
+	fw_core_remove_descriptor ( &ipv4_unit_directory );
+	fw_debug("Removing ipv4 module\n" );
+	driver_unregister ( &ipv4_driver.driver );
+}
+
+module_init(ipv4_init);
+module_exit(ipv4_cleanup);
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index e584b7215e8..d44f47d3b2d 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -3,6 +3,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -130,6 +131,13 @@ struct fw_card {
 	bool broadcast_channel_allocated;
 	u32 broadcast_channel;
 	u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
+	/* Only non-NULL if firewire-ipv4 is active on this card. */
+	void *netdev;
+	/*
+	 * The nodes get probed before the card, so we need a place to store
+	 * them independent of card->netdev
+	 */
+	struct list_head ipv4_nodes;
 };
 
 static inline struct fw_card *fw_card_get(struct fw_card *card)
@@ -355,4 +363,90 @@ int fw_run_transaction(struct fw_card *card, int tcode, int destination_id,
 		int generation, int speed, unsigned long long offset,
 		void *payload, size_t length);
 
+static inline int fw_stream_packet_destination_id(int tag, int channel, int sy)
+{
+	return tag << 14 | channel << 8 | sy;
+}
+
+struct fw_descriptor {
+	struct list_head link;
+	size_t length;
+	u32 immediate;
+	u32 key;
+	const u32 *data;
+};
+
+int fw_core_add_descriptor(struct fw_descriptor *desc);
+void fw_core_remove_descriptor(struct fw_descriptor *desc);
+
+/*
+ * The iso packet format allows for an immediate header/payload part
+ * stored in 'header' immediately after the packet info plus an
+ * indirect payload part that is pointed to by the 'payload' field.
+ * Applications can use one or the other or both to implement simple
+ * low-bandwidth streaming (e.g. audio) or more advanced
+ * scatter-gather streaming (e.g. assembling video frames automatically).
+ */
+struct fw_iso_packet {
+	u16 payload_length;	/* Length of indirect payload. */
+	u32 interrupt:1;	/* Generate interrupt on this packet */
+	u32 skip:1;		/* Set to not send packet at all. */
+	u32 tag:2;
+	u32 sy:4;
+	u32 header_length:8;	/* Length of immediate header. */
+	u32 header[0];
+};
+
+#define FW_ISO_CONTEXT_TRANSMIT	0
+#define FW_ISO_CONTEXT_RECEIVE	1
+
+#define FW_ISO_CONTEXT_MATCH_TAG0	 1
+#define FW_ISO_CONTEXT_MATCH_TAG1	 2
+#define FW_ISO_CONTEXT_MATCH_TAG2	 4
+#define FW_ISO_CONTEXT_MATCH_TAG3	 8
+#define FW_ISO_CONTEXT_MATCH_ALL_TAGS	15
+
+/*
+ * An iso buffer is just a set of pages mapped for DMA in the
+ * specified direction. Since the pages are to be used for DMA, they
+ * are not mapped into the kernel virtual address space. We store the
+ * DMA address in the page private. The helper function
+ * fw_iso_buffer_map() will map the pages into a given vma.
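+ *
+ * A condensed usage sketch, modelled on ipv4_broadcast_start() in the
+ * driver above; example_iso_rx_setup is a hypothetical helper, channel
+ * 31 is the IEEE 1394 broadcast channel used elsewhere in this patch,
+ * and the 2048 byte packet size is an arbitrary example value:
+ */
+#if 0	/* illustration only, not part of this patch */
+static int example_iso_rx_setup(struct fw_card *card,
+		fw_iso_callback_t cb, void *cb_data,
+		struct fw_iso_buffer *buf, int pages)
+{
+	struct fw_iso_context *ctx;
+	struct fw_iso_packet packet = {
+		.payload_length	= 2048,
+		.interrupt	= 1,
+		.tag		= 3,
+		.header_length	= 8,	/* e.g. a GASP header */
+	};
+	unsigned long offset;
+	int retval;
+
+	ctx = fw_iso_context_create(card, FW_ISO_CONTEXT_RECEIVE, 31,
+			card->link_speed, 8, cb, cb_data);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	retval = fw_iso_buffer_init(buf, card, pages, DMA_FROM_DEVICE);
+	if (retval < 0)
+		goto out_context;
+
+	/* carve the DMA buffer into fixed-size receive packets */
+	for (offset = 0; offset + 2048 <= pages * PAGE_SIZE; offset += 2048) {
+		retval = fw_iso_context_queue(ctx, &packet, buf, offset);
+		if (retval < 0)
+			goto out_buffer;
+	}
+
+	retval = fw_iso_context_start(ctx, -1, 0,
+			FW_ISO_CONTEXT_MATCH_ALL_TAGS);
+	if (retval < 0)
+		goto out_buffer;
+	return 0;
+
+out_buffer:
+	fw_iso_buffer_destroy(buf, card);
+out_context:
+	fw_iso_context_destroy(ctx);
+	return retval;
+}
+#endif
+/*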
+ */ +struct fw_iso_buffer { + enum dma_data_direction direction; + struct page **pages; + int page_count; +}; + +int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card, + int page_count, enum dma_data_direction direction); +void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, struct fw_card *card); + +struct fw_iso_context; +typedef void (*fw_iso_callback_t)(struct fw_iso_context *context, + u32 cycle, size_t header_length, + void *header, void *data); +struct fw_iso_context { + struct fw_card *card; + int type; + int channel; + int speed; + size_t header_size; + fw_iso_callback_t callback; + void *callback_data; +}; + +struct fw_iso_context *fw_iso_context_create(struct fw_card *card, + int type, int channel, int speed, size_t header_size, + fw_iso_callback_t callback, void *callback_data); +int fw_iso_context_queue(struct fw_iso_context *ctx, + struct fw_iso_packet *packet, + struct fw_iso_buffer *buffer, + unsigned long payload); +int fw_iso_context_start(struct fw_iso_context *ctx, + int cycle, int sync, int tags); +int fw_iso_context_stop(struct fw_iso_context *ctx); +void fw_iso_context_destroy(struct fw_iso_context *ctx); + #endif /* _LINUX_FIREWIRE_H */ -- cgit v1.2.3-70-g09d2 From f91e3bd842ec6f5cea245993926ee8ff26250467 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 7 Jun 2009 22:57:53 +0200 Subject: firewire: net: style changes Change names of types, variables, functions. Omit debug code. Use get_unaligned*, put_unaligned*. Annotate big endian data. Handle errors in __init. Change whitespace. Signed-off-by: Stefan Richter --- drivers/firewire/core-card.c | 2 +- drivers/firewire/net.c | 2041 ++++++++++++++++++++---------------------- include/linux/firewire.h | 9 +- 3 files changed, 969 insertions(+), 1083 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index cdab32b2067..8c45e43da7c 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -430,7 +430,7 @@ void fw_card_initialize(struct fw_card *card, INIT_DELAYED_WORK(&card->work, fw_card_bm_work); card->netdev = NULL; - INIT_LIST_HEAD(&card->ipv4_nodes); + INIT_LIST_HEAD(&card->peer_list); } EXPORT_SYMBOL(fw_card_initialize); diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c index 15353886bd8..ba6f924b1b1 100644 --- a/drivers/firewire/net.c +++ b/drivers/firewire/net.c @@ -6,6 +6,7 @@ * based on eth1394 by Ben Collins et al */ +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,181 +24,109 @@ #include #include -/* Things to potentially make runtime cofigurable */ -/* must be at least as large as our maximum receive size */ -#define FIFO_SIZE 4096 -/* Network timeout in glibbles */ -#define IPV4_TIMEOUT 100000 +#define FWNET_MAX_FRAGMENTS 25 /* arbitrary limit */ +#define FWNET_ISO_PAGE_COUNT (PAGE_SIZE < 16 * 1024 ? 4 : 2) -/* Runitme configurable paramaters */ -static int ipv4_mpd = 25; -static int ipv4_max_xmt = 0; -/* 16k for receiving arp and broadcast packets. Enough? 
*/ -static int ipv4_iso_page_count = 4; +#define IEEE1394_BROADCAST_CHANNEL 31 +#define IEEE1394_ALL_NODES (0xffc0 | 0x003f) +#define IEEE1394_MAX_PAYLOAD_S100 512 +#define FWNET_NO_FIFO_ADDR (~0ULL) -MODULE_AUTHOR("Jay Fenlason (fenlason@redhat.com)"); -MODULE_DESCRIPTION("Firewire IPv4 Driver (IPv4-over-IEEE1394 as per RFC 2734)"); -MODULE_LICENSE("GPL"); -MODULE_DEVICE_TABLE(ieee1394, ipv4_id_table); -module_param_named(max_partial_datagrams, ipv4_mpd, int, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(max_partial_datagrams, "Maximum number of received" - " incomplete fragmented datagrams (default = 25)."); - -/* Max xmt is useful for forcing fragmentation, which makes testing easier. */ -module_param_named(max_transmit, ipv4_max_xmt, int, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(max_transmit, "Maximum datagram size to transmit" - " (larger datagrams will be fragmented) (default = 0 (use hardware defaults)."); - -/* iso page count controls how many pages will be used for receiving broadcast packets. */ -module_param_named(iso_pages, ipv4_iso_page_count, int, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(iso_pages, "Number of pages to use for receiving broadcast packets" - " (default = 4)."); - -/* uncomment this line to do debugging */ -#define fw_debug(s, args...) printk(KERN_DEBUG KBUILD_MODNAME ": " s, ## args) - -/* comment out these lines to do debugging. */ -/* #undef fw_debug */ -/* #define fw_debug(s...) */ -/* #define print_hex_dump(l...) */ - -/* Define a fake hardware header format for the networking core. Note that - * header size cannot exceed 16 bytes as that is the size of the header cache. - * Also, we do not need the source address in the header so we omit it and - * keep the header to under 16 bytes */ -#define IPV4_ALEN (8) -/* This must equal sizeof(struct ipv4_ether_hdr) */ -#define IPV4_HLEN (10) - -/* FIXME: what's a good size for this? 
*/ -#define INVALID_FIFO_ADDR (u64)~0ULL - -/* Things specified by standards */ -#define BROADCAST_CHANNEL 31 - -#define S100_BUFFER_SIZE 512 -#define MAX_BUFFER_SIZE 4096 - -#define IPV4_GASP_SPECIFIER_ID 0x00005EU -#define IPV4_GASP_VERSION 0x00000001U - -#define IPV4_GASP_OVERHEAD (2 * sizeof(u32)) /* for GASP header */ - -#define IPV4_UNFRAG_HDR_SIZE sizeof(u32) -#define IPV4_FRAG_HDR_SIZE (2 * sizeof(u32)) -#define IPV4_FRAG_OVERHEAD sizeof(u32) - -#define ALL_NODES (0xffc0 | 0x003f) - -#define IPV4_HDR_UNFRAG 0 /* unfragmented */ -#define IPV4_HDR_FIRSTFRAG 1 /* first fragment */ -#define IPV4_HDR_LASTFRAG 2 /* last fragment */ -#define IPV4_HDR_INTFRAG 3 /* interior fragment */ - -/* Our arp packet (ARPHRD_IEEE1394) */ -/* FIXME: note that this is probably bogus on weird-endian machines */ -struct ipv4_arp { - u16 hw_type; /* 0x0018 */ - u16 proto_type; /* 0x0806 */ - u8 hw_addr_len; /* 16 */ - u8 ip_addr_len; /* 4 */ - u16 opcode; /* ARP Opcode */ - /* Above is exactly the same format as struct arphdr */ - - u64 s_uniq_id; /* Sender's 64bit EUI */ - u8 max_rec; /* Sender's max packet size */ - u8 sspd; /* Sender's max speed */ - u16 fifo_hi; /* hi 16bits of sender's FIFO addr */ - u32 fifo_lo; /* lo 32bits of sender's FIFO addr */ - u32 sip; /* Sender's IP Address */ - u32 tip; /* IP Address of requested hw addr */ -} __attribute__((packed)); +#define IANA_SPECIFIER_ID 0x00005eU +#define RFC2734_SW_VERSION 0x000001U -struct ipv4_ether_hdr { - unsigned char h_dest[IPV4_ALEN]; /* destination address */ - unsigned short h_proto; /* packet type ID field */ -} __attribute__((packed)); +#define IEEE1394_GASP_HDR_SIZE 8 -static inline struct ipv4_ether_hdr *ipv4_ether_hdr(const struct sk_buff *skb) -{ - return (struct ipv4_ether_hdr *)skb_mac_header(skb); -} +#define RFC2374_UNFRAG_HDR_SIZE 4 +#define RFC2374_FRAG_HDR_SIZE 8 +#define RFC2374_FRAG_OVERHEAD 4 -enum ipv4_tx_type { - IPV4_UNKNOWN = 0, - IPV4_GASP = 1, - IPV4_WRREQ = 2, -}; +#define RFC2374_HDR_UNFRAG 0 /* unfragmented */ +#define RFC2374_HDR_FIRSTFRAG 1 /* first fragment */ +#define RFC2374_HDR_LASTFRAG 2 /* last fragment */ +#define RFC2374_HDR_INTFRAG 3 /* interior fragment */ -enum ipv4_broadcast_state { - IPV4_BROADCAST_ERROR, - IPV4_BROADCAST_RUNNING, - IPV4_BROADCAST_STOPPED, -}; +#define RFC2734_HW_ADDR_LEN 16 -#define ipv4_get_hdr_lf(h) (((h)->w0&0xC0000000)>>30) -#define ipv4_get_hdr_ether_type(h) (((h)->w0&0x0000FFFF) ) -#define ipv4_get_hdr_dg_size(h) (((h)->w0&0x0FFF0000)>>16) -#define ipv4_get_hdr_fg_off(h) (((h)->w0&0x00000FFF) ) -#define ipv4_get_hdr_dgl(h) (((h)->w1&0xFFFF0000)>>16) +struct rfc2734_arp { + __be16 hw_type; /* 0x0018 */ + __be16 proto_type; /* 0x0806 */ + u8 hw_addr_len; /* 16 */ + u8 ip_addr_len; /* 4 */ + __be16 opcode; /* ARP Opcode */ + /* Above is exactly the same format as struct arphdr */ -#define ipv4_set_hdr_lf(lf) (( lf)<<30) -#define ipv4_set_hdr_ether_type(et) (( et) ) -#define ipv4_set_hdr_dg_size(dgs) ((dgs)<<16) -#define ipv4_set_hdr_fg_off(fgo) ((fgo) ) + __be64 s_uniq_id; /* Sender's 64bit EUI */ + u8 max_rec; /* Sender's max packet size */ + u8 sspd; /* Sender's max speed */ + __be16 fifo_hi; /* hi 16bits of sender's FIFO addr */ + __be32 fifo_lo; /* lo 32bits of sender's FIFO addr */ + __be32 sip; /* Sender's IP Address */ + __be32 tip; /* IP Address of requested hw addr */ +} __attribute__((packed)); -#define ipv4_set_hdr_dgl(dgl) ((dgl)<<16) +/* This header format is specific to this driver implementation. 
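+ * It packs an 8 byte EUI-64 destination and a 2 byte ethertype, so
+ * FWNET_HLEN == FWNET_ALEN + sizeof(__be16) == 10, which keeps the
+ * pseudo header within the 16 byte hh_cache slot that
+ * fwnet_header_cache() below relies on.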
*/ +#define FWNET_ALEN 8 +#define FWNET_HLEN 10 +struct fwnet_header { + u8 h_dest[FWNET_ALEN]; /* destination address */ + __be16 h_proto; /* packet type ID field */ +} __attribute__((packed)); -struct ipv4_hdr { +/* IPv4 and IPv6 encapsulation header */ +struct rfc2734_header { u32 w0; u32 w1; }; -static inline void ipv4_make_uf_hdr( struct ipv4_hdr *hdr, unsigned ether_type) { - hdr->w0 = ipv4_set_hdr_lf(IPV4_HDR_UNFRAG) - |ipv4_set_hdr_ether_type(ether_type); - fw_debug ( "Setting unfragmented header %p to %x\n", hdr, hdr->w0 ); -} +#define fwnet_get_hdr_lf(h) (((h)->w0 & 0xc0000000) >> 30) +#define fwnet_get_hdr_ether_type(h) (((h)->w0 & 0x0000ffff)) +#define fwnet_get_hdr_dg_size(h) (((h)->w0 & 0x0fff0000) >> 16) +#define fwnet_get_hdr_fg_off(h) (((h)->w0 & 0x00000fff)) +#define fwnet_get_hdr_dgl(h) (((h)->w1 & 0xffff0000) >> 16) -static inline void ipv4_make_ff_hdr ( struct ipv4_hdr *hdr, unsigned ether_type, unsigned dg_size, unsigned dgl ) { - hdr->w0 = ipv4_set_hdr_lf(IPV4_HDR_FIRSTFRAG) - |ipv4_set_hdr_dg_size(dg_size) - |ipv4_set_hdr_ether_type(ether_type); - hdr->w1 = ipv4_set_hdr_dgl(dgl); - fw_debug ( "Setting fragmented header %p to first_frag %x,%x (et %x, dgs %x, dgl %x)\n", hdr, hdr->w0, hdr->w1, - ether_type, dg_size, dgl ); -} +#define fwnet_set_hdr_lf(lf) ((lf) << 30) +#define fwnet_set_hdr_ether_type(et) (et) +#define fwnet_set_hdr_dg_size(dgs) ((dgs) << 16) +#define fwnet_set_hdr_fg_off(fgo) (fgo) -static inline void ipv4_make_sf_hdr ( struct ipv4_hdr *hdr, unsigned lf, unsigned dg_size, unsigned fg_off, unsigned dgl) { - hdr->w0 = ipv4_set_hdr_lf(lf) - |ipv4_set_hdr_dg_size(dg_size) - |ipv4_set_hdr_fg_off(fg_off); - hdr->w1 = ipv4_set_hdr_dgl(dgl); - fw_debug ( "Setting fragmented header %p to %x,%x (lf %x, dgs %x, fo %x dgl %x)\n", - hdr, hdr->w0, hdr->w1, - lf, dg_size, fg_off, dgl ); -} +#define fwnet_set_hdr_dgl(dgl) ((dgl) << 16) -/* End of IP1394 headers */ +static inline void fwnet_make_uf_hdr(struct rfc2734_header *hdr, + unsigned ether_type) +{ + hdr->w0 = fwnet_set_hdr_lf(RFC2374_HDR_UNFRAG) + | fwnet_set_hdr_ether_type(ether_type); +} -/* Fragment types */ -#define ETH1394_HDR_LF_UF 0 /* unfragmented */ -#define ETH1394_HDR_LF_FF 1 /* first fragment */ -#define ETH1394_HDR_LF_LF 2 /* last fragment */ -#define ETH1394_HDR_LF_IF 3 /* interior fragment */ +static inline void fwnet_make_ff_hdr(struct rfc2734_header *hdr, + unsigned ether_type, unsigned dg_size, unsigned dgl) +{ + hdr->w0 = fwnet_set_hdr_lf(RFC2374_HDR_FIRSTFRAG) + | fwnet_set_hdr_dg_size(dg_size) + | fwnet_set_hdr_ether_type(ether_type); + hdr->w1 = fwnet_set_hdr_dgl(dgl); +} -#define IP1394_HW_ADDR_LEN 16 /* As per RFC */ +static inline void fwnet_make_sf_hdr(struct rfc2734_header *hdr, + unsigned lf, unsigned dg_size, unsigned fg_off, unsigned dgl) +{ + hdr->w0 = fwnet_set_hdr_lf(lf) + | fwnet_set_hdr_dg_size(dg_size) + | fwnet_set_hdr_fg_off(fg_off); + hdr->w1 = fwnet_set_hdr_dgl(dgl); +} /* This list keeps track of what parts of the datagram have been filled in */ -struct ipv4_fragment_info { - struct list_head fragment_info; +struct fwnet_fragment_info { + struct list_head fi_link; u16 offset; u16 len; }; -struct ipv4_partial_datagram { - struct list_head pdg_list; - struct list_head fragment_info; +struct fwnet_partial_datagram { + struct list_head pd_link; + struct list_head fi_list; struct sk_buff *skb; /* FIXME Why not use skb->data? */ char *pbuf; @@ -208,40 +138,43 @@ struct ipv4_partial_datagram { /* * We keep one of these for each IPv4 capable device attached to a fw_card. 
* The list of them is stored in the fw_card structure rather than in the - * ipv4_priv because the remote IPv4 nodes may be probed before the card is, - * so we need a place to store them before the ipv4_priv structure is + * fwnet_device because the remote IPv4 nodes may be probed before the card is, + * so we need a place to store them before the fwnet_device structure is * allocated. */ -struct ipv4_node { - struct list_head ipv4_nodes; - /* guid of the remote node */ +struct fwnet_peer { + struct list_head peer_link; + /* guid of the remote peer */ u64 guid; - /* FIFO address to transmit datagrams to, or INVALID_FIFO_ADDR */ + /* FIFO address to transmit datagrams to, or FWNET_NO_FIFO_ADDR */ u64 fifo; spinlock_t pdg_lock; /* partial datagram lock */ - /* List of partial datagrams received from this node */ - struct list_head pdg_list; - /* Number of entries in pdg_list at the moment */ + /* List of partial datagrams received from this peer */ + struct list_head pd_list; + /* Number of entries in pd_list at the moment */ unsigned pdg_size; - /* max payload to transmit to this remote node */ - /* This already includes the IPV4_FRAG_HDR_SIZE overhead */ + /* max payload to transmit to this remote peer */ + /* This already includes the RFC2374_FRAG_HDR_SIZE overhead */ u16 max_payload; /* outgoing datagram label */ u16 datagram_label; - /* Current node_id of the remote node */ - u16 nodeid; - /* current generation of the remote node */ + /* Current node_id of the remote peer */ + u16 node_id; + /* current generation of the remote peer */ u8 generation; - /* max speed that this node can receive at */ + /* max speed that this peer can receive at */ u8 xmt_speed; }; -struct ipv4_priv { +struct fwnet_device { spinlock_t lock; - - enum ipv4_broadcast_state broadcast_state; + enum { + FWNET_BROADCAST_ERROR, + FWNET_BROADCAST_RUNNING, + FWNET_BROADCAST_STOPPED, + } broadcast_state; struct fw_iso_context *broadcast_rcv_context; struct fw_iso_buffer broadcast_rcv_buffer; void **broadcast_rcv_buffer_ptrs; @@ -257,14 +190,12 @@ struct ipv4_priv { u16 broadcast_xmt_datagramlabel; /* - * The csr address that remote nodes must send datagrams to for us to + * The CSR address that remote nodes must send datagrams to for us to * receive them. */ struct fw_address_handler handler; u64 local_fifo; - /* Wake up to xmt */ - /* struct work_struct wake;*/ /* List of packets to be sent */ struct list_head packet_list; /* @@ -279,17 +210,17 @@ struct ipv4_priv { }; /* This is our task struct. It's used for the packet complete callback. */ -struct ipv4_packet_task { +struct fwnet_packet_task { /* - * ptask can actually be on priv->packet_list, priv->broadcasted_list, - * or priv->sent_list depending on its current state. + * ptask can actually be on dev->packet_list, dev->broadcasted_list, + * or dev->sent_list depending on its current state. 
*/ - struct list_head packet_list; + struct list_head pt_link; struct fw_transaction transaction; - struct ipv4_hdr hdr; + struct rfc2734_header hdr; struct sk_buff *skb; - struct ipv4_priv *priv; - enum ipv4_tx_type tx_type; + struct fwnet_device *dev; + int outstanding_pkts; unsigned max_payload; u64 fifo_addr; @@ -298,243 +229,192 @@ struct ipv4_packet_task { u8 speed; }; -static struct kmem_cache *ipv4_packet_task_cache; - -static const char ipv4_driver_name[] = "firewire-ipv4"; - -static const struct ieee1394_device_id ipv4_id_table[] = { - { - .match_flags = IEEE1394_MATCH_SPECIFIER_ID | - IEEE1394_MATCH_VERSION, - .specifier_id = IPV4_GASP_SPECIFIER_ID, - .version = IPV4_GASP_VERSION, - }, - { } -}; - -static u32 ipv4_unit_directory_data[] = { - 0x00040000, /* unit directory */ - 0x12000000 | IPV4_GASP_SPECIFIER_ID, /* specifier ID */ - 0x81000003, /* text descriptor */ - 0x13000000 | IPV4_GASP_VERSION, /* version */ - 0x81000005, /* text descriptor */ - - 0x00030000, /* Three quadlets */ - 0x00000000, /* Text */ - 0x00000000, /* Language 0 */ - 0x49414e41, /* I A N A */ - 0x00030000, /* Three quadlets */ - 0x00000000, /* Text */ - 0x00000000, /* Language 0 */ - 0x49507634, /* I P v 4 */ -}; - -static struct fw_descriptor ipv4_unit_directory = { - .length = ARRAY_SIZE(ipv4_unit_directory_data), - .key = 0xd1000000, - .data = ipv4_unit_directory_data -}; - -static int ipv4_send_packet(struct ipv4_packet_task *ptask ); - -/* ------------------------------------------------------------------ */ -/****************************************** - * HW Header net device functions - ******************************************/ - /* These functions have been adapted from net/ethernet/eth.c */ - -/* Create a fake MAC header for an arbitrary protocol layer. - * saddr=NULL means use device source address - * daddr=NULL means leave destination address (eg unresolved arp). */ +/* + * saddr == NULL means use device source address. + * daddr == NULL means leave destination address (eg unresolved arp). + */ +static int fwnet_header_create(struct sk_buff *skb, struct net_device *net, + unsigned short type, const void *daddr, + const void *saddr, unsigned len) +{ + struct fwnet_header *h; -static int ipv4_header ( struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *daddr, - const void *saddr, unsigned len) { - struct ipv4_ether_hdr *eth; + h = (struct fwnet_header *)skb_push(skb, sizeof(*h)); + put_unaligned_be16(type, &h->h_proto); - eth = (struct ipv4_ether_hdr *)skb_push(skb, sizeof(*eth)); - eth->h_proto = htons(type); + if (net->flags & (IFF_LOOPBACK | IFF_NOARP)) { + memset(h->h_dest, 0, net->addr_len); - if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) { - memset(eth->h_dest, 0, dev->addr_len); - return dev->hard_header_len; + return net->hard_header_len; } if (daddr) { - memcpy(eth->h_dest, daddr, dev->addr_len); - return dev->hard_header_len; + memcpy(h->h_dest, daddr, net->addr_len); + + return net->hard_header_len; } - return -dev->hard_header_len; + return -net->hard_header_len; } -/* Rebuild the faked MAC header. This is called after an ARP - * (or in future other address resolution) has completed on this - * sk_buff. We now let ARP fill in the other fields. - * - * This routine CANNOT use cached dst->neigh! - * Really, it is used only when dst->neigh is wrong. 
- */ - -static int ipv4_rebuild_header(struct sk_buff *skb) +static int fwnet_header_rebuild(struct sk_buff *skb) { - struct ipv4_ether_hdr *eth; + struct fwnet_header *h = (struct fwnet_header *)skb->data; - eth = (struct ipv4_ether_hdr *)skb->data; - if (eth->h_proto == htons(ETH_P_IP)) - return arp_find((unsigned char *)ð->h_dest, skb); + if (get_unaligned_be16(&h->h_proto) == ETH_P_IP) + return arp_find((unsigned char *)&h->h_dest, skb); - fw_notify ( "%s: unable to resolve type %04x addresses\n", - skb->dev->name,ntohs(eth->h_proto) ); + fw_notify("%s: unable to resolve type %04x addresses\n", + skb->dev->name, be16_to_cpu(h->h_proto)); return 0; } -static int ipv4_header_cache(const struct neighbour *neigh, struct hh_cache *hh) { - unsigned short type = hh->hh_type; - struct net_device *dev; - struct ipv4_ether_hdr *eth; +static int fwnet_header_cache(const struct neighbour *neigh, + struct hh_cache *hh) +{ + struct net_device *net; + struct fwnet_header *h; - if (type == htons(ETH_P_802_3)) + if (hh->hh_type == cpu_to_be16(ETH_P_802_3)) return -1; - dev = neigh->dev; - eth = (struct ipv4_ether_hdr *)((u8 *)hh->hh_data + 16 - sizeof(*eth)); - eth->h_proto = type; - memcpy(eth->h_dest, neigh->ha, dev->addr_len); + net = neigh->dev; + h = (struct fwnet_header *)((u8 *)hh->hh_data + 16 - sizeof(*h)); + h->h_proto = hh->hh_type; + memcpy(h->h_dest, neigh->ha, net->addr_len); + hh->hh_len = FWNET_HLEN; - hh->hh_len = IPV4_HLEN; return 0; } /* Called by Address Resolution module to notify changes in address. */ -static void ipv4_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char * haddr ) { - memcpy((u8 *)hh->hh_data + 16 - IPV4_HLEN, haddr, dev->addr_len); +static void fwnet_header_cache_update(struct hh_cache *hh, + const struct net_device *net, const unsigned char *haddr) +{ + memcpy((u8 *)hh->hh_data + 16 - FWNET_HLEN, haddr, net->addr_len); } -static int ipv4_header_parse(const struct sk_buff *skb, unsigned char *haddr) { - memcpy(haddr, skb->dev->dev_addr, IPV4_ALEN); - return IPV4_ALEN; +static int fwnet_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + memcpy(haddr, skb->dev->dev_addr, FWNET_ALEN); + + return FWNET_ALEN; } -static const struct header_ops ipv4_header_ops = { - .create = ipv4_header, - .rebuild = ipv4_rebuild_header, - .cache = ipv4_header_cache, - .cache_update = ipv4_header_cache_update, - .parse = ipv4_header_parse, +static const struct header_ops fwnet_header_ops = { + .create = fwnet_header_create, + .rebuild = fwnet_header_rebuild, + .cache = fwnet_header_cache, + .cache_update = fwnet_header_cache_update, + .parse = fwnet_header_parse, }; -/* ------------------------------------------------------------------ */ - /* FIXME: is this correct for all cases? 
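 	 * It is the standard half-open interval test: [offset, offset + len)
 	 * and [fi->offset, fi->offset + fi->len) intersect iff each range
 	 * starts strictly before the other ends, so zero-length fragments
 	 * can never register as overlapping.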
*/ -static bool ipv4_frag_overlap(struct ipv4_partial_datagram *pd, unsigned offset, unsigned len) +static bool fwnet_frag_overlap(struct fwnet_partial_datagram *pd, + unsigned offset, unsigned len) { - struct ipv4_fragment_info *fi; + struct fwnet_fragment_info *fi; unsigned end = offset + len; - list_for_each_entry(fi, &pd->fragment_info, fragment_info) { - if (offset < fi->offset + fi->len && end > fi->offset) { - fw_debug ( "frag_overlap pd %p fi %p (%x@%x) with %x@%x\n", pd, fi, fi->len, fi->offset, len, offset ); + list_for_each_entry(fi, &pd->fi_list, fi_link) + if (offset < fi->offset + fi->len && end > fi->offset) return true; - } - } - fw_debug ( "frag_overlap %p does not overlap with %x@%x\n", pd, len, offset ); + return false; } /* Assumes that new fragment does not overlap any existing fragments */ -static struct ipv4_fragment_info *ipv4_frag_new ( struct ipv4_partial_datagram *pd, unsigned offset, unsigned len ) { - struct ipv4_fragment_info *fi, *fi2, *new; +static struct fwnet_fragment_info *fwnet_frag_new( + struct fwnet_partial_datagram *pd, unsigned offset, unsigned len) +{ + struct fwnet_fragment_info *fi, *fi2, *new; struct list_head *list; - fw_debug ( "frag_new pd %p %x@%x\n", pd, len, offset ); - list = &pd->fragment_info; - list_for_each_entry(fi, &pd->fragment_info, fragment_info) { + list = &pd->fi_list; + list_for_each_entry(fi, &pd->fi_list, fi_link) { if (fi->offset + fi->len == offset) { /* The new fragment can be tacked on to the end */ /* Did the new fragment plug a hole? */ - fi2 = list_entry(fi->fragment_info.next, struct ipv4_fragment_info, fragment_info); + fi2 = list_entry(fi->fi_link.next, + struct fwnet_fragment_info, fi_link); if (fi->offset + fi->len == fi2->offset) { - fw_debug ( "pd %p: hole filling %p (%x@%x) and %p(%x@%x): now %x@%x\n", pd, fi, fi->len, fi->offset, - fi2, fi2->len, fi2->offset, fi->len + len + fi2->len, fi->offset ); /* glue fragments together */ fi->len += len + fi2->len; - list_del(&fi2->fragment_info); + list_del(&fi2->fi_link); kfree(fi2); } else { - fw_debug ( "pd %p: extending %p from %x@%x to %x@%x\n", pd, fi, fi->len, fi->offset, fi->len+len, fi->offset ); fi->len += len; } + return fi; } if (offset + len == fi->offset) { /* The new fragment can be tacked on to the beginning */ /* Did the new fragment plug a hole? 
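 			 * For example, with a single existing entry
 			 * {offset 1024, len 512}, inserting
 			 * {offset 512, len 512} takes this branch and
 			 * extends the entry to {offset 512, len 1024}; had
 			 * another fragment ended exactly at offset 512, the
 			 * two entries would be glued into one below.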
*/ - fi2 = list_entry(fi->fragment_info.prev, struct ipv4_fragment_info, fragment_info); + fi2 = list_entry(fi->fi_link.prev, + struct fwnet_fragment_info, fi_link); if (fi2->offset + fi2->len == fi->offset) { /* glue fragments together */ - fw_debug ( "pd %p: extending %p and merging with %p from %x@%x to %x@%x\n", - pd, fi2, fi, fi2->len, fi2->offset, fi2->len + fi->len + len, fi2->offset ); fi2->len += fi->len + len; - list_del(&fi->fragment_info); + list_del(&fi->fi_link); kfree(fi); + return fi2; } - fw_debug ( "pd %p: extending %p from %x@%x to %x@%x\n", pd, fi, fi->len, fi->offset, offset, fi->len + len ); fi->offset = offset; fi->len += len; + return fi; } if (offset > fi->offset + fi->len) { - list = &fi->fragment_info; + list = &fi->fi_link; break; } if (offset + len < fi->offset) { - list = fi->fragment_info.prev; + list = fi->fi_link.prev; break; } } new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { - fw_error ( "out of memory in fragment handling!\n" ); + fw_error("out of memory\n"); return NULL; } new->offset = offset; new->len = len; - list_add(&new->fragment_info, list); - fw_debug ( "pd %p: new frag %p %x@%x\n", pd, new, new->len, new->offset ); - list_for_each_entry( fi, &pd->fragment_info, fragment_info ) - fw_debug ( "fi %p %x@%x\n", fi, fi->len, fi->offset ); + list_add(&new->fi_link, list); + return new; } -/* ------------------------------------------------------------------ */ - -static struct ipv4_partial_datagram *ipv4_pd_new(struct net_device *netdev, - struct ipv4_node *node, u16 datagram_label, unsigned dg_size, u32 *frag_buf, - unsigned frag_off, unsigned frag_len) { - struct ipv4_partial_datagram *new; - struct ipv4_fragment_info *fi; +static struct fwnet_partial_datagram *fwnet_pd_new(struct net_device *net, + struct fwnet_peer *peer, u16 datagram_label, unsigned dg_size, + void *frag_buf, unsigned frag_off, unsigned frag_len) +{ + struct fwnet_partial_datagram *new; + struct fwnet_fragment_info *fi; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) goto fail; - INIT_LIST_HEAD(&new->fragment_info); - fi = ipv4_frag_new ( new, frag_off, frag_len); - if ( fi == NULL ) + + INIT_LIST_HEAD(&new->fi_list); + fi = fwnet_frag_new(new, frag_off, frag_len); + if (fi == NULL) goto fail_w_new; + new->datagram_label = datagram_label; new->datagram_size = dg_size; - new->skb = dev_alloc_skb(dg_size + netdev->hard_header_len + 15); - if ( new->skb == NULL ) + new->skb = dev_alloc_skb(dg_size + net->hard_header_len + 15); + if (new->skb == NULL) goto fail_w_fi; - skb_reserve(new->skb, (netdev->hard_header_len + 15) & ~15); + + skb_reserve(new->skb, (net->hard_header_len + 15) & ~15); new->pbuf = skb_put(new->skb, dg_size); memcpy(new->pbuf + frag_off, frag_buf, frag_len); - list_add_tail(&new->pdg_list, &node->pdg_list); - fw_debug ( "pd_new: new pd %p { dgl %u, dg_size %u, skb %p, pbuf %p } on node %p\n", - new, new->datagram_label, new->datagram_size, new->skb, new->pbuf, node ); + list_add_tail(&new->pd_link, &peer->pd_list); + return new; fail_w_fi: @@ -542,174 +422,171 @@ fail_w_fi: fail_w_new: kfree(new); fail: - fw_error("ipv4_pd_new: no memory\n"); + fw_error("out of memory\n"); + return NULL; } -static struct ipv4_partial_datagram *ipv4_pd_find(struct ipv4_node *node, u16 datagram_label) { - struct ipv4_partial_datagram *pd; +static struct fwnet_partial_datagram *fwnet_pd_find(struct fwnet_peer *peer, + u16 datagram_label) +{ + struct fwnet_partial_datagram *pd; - list_for_each_entry(pd, &node->pdg_list, pdg_list) { - if ( pd->datagram_label == datagram_label ) 
{
-			fw_debug ( "pd_find(node %p, label %u): pd %p\n", node, datagram_label, pd );
+	list_for_each_entry(pd, &peer->pd_list, pd_link)
+		if (pd->datagram_label == datagram_label)
 			return pd;
-		}
-	}
-	fw_debug ( "pd_find(node %p, label %u) no entry\n", node, datagram_label );
+
 	return NULL;
 }
-static void ipv4_pd_delete ( struct ipv4_partial_datagram *old ) {
-	struct ipv4_fragment_info *fi, *n;
+static void fwnet_pd_delete(struct fwnet_partial_datagram *old)
+{
+	struct fwnet_fragment_info *fi, *n;
-	fw_debug ( "pd_delete %p\n", old );
-	list_for_each_entry_safe(fi, n, &old->fragment_info, fragment_info) {
-		fw_debug ( "Freeing fi %p\n", fi );
+	list_for_each_entry_safe(fi, n, &old->fi_list, fi_link)
 		kfree(fi);
-	}
-	list_del(&old->pdg_list);
+
+	list_del(&old->pd_link);
 	dev_kfree_skb_any(old->skb);
 	kfree(old);
 }
-static bool ipv4_pd_update ( struct ipv4_node *node, struct ipv4_partial_datagram *pd,
-	u32 *frag_buf, unsigned frag_off, unsigned frag_len) {
-	fw_debug ( "pd_update node %p, pd %p, frag_buf %p, %x@%x\n", node, pd, frag_buf, frag_len, frag_off );
-	if ( ipv4_frag_new ( pd, frag_off, frag_len ) == NULL)
+static bool fwnet_pd_update(struct fwnet_peer *peer,
+		struct fwnet_partial_datagram *pd, void *frag_buf,
+		unsigned frag_off, unsigned frag_len)
+{
+	if (fwnet_frag_new(pd, frag_off, frag_len) == NULL)
 		return false;
+
 	memcpy(pd->pbuf + frag_off, frag_buf, frag_len);
 	/*
	 * Move list entry to beginning of list so that oldest partial
	 * datagrams percolate to the end of the list
 	 */
-	list_move_tail(&pd->pdg_list, &node->pdg_list);
-	fw_debug ( "New pd list:\n" );
-	list_for_each_entry ( pd, &node->pdg_list, pdg_list ) {
-		fw_debug ( "pd %p\n", pd );
-	}
+	list_move_tail(&pd->pd_link, &peer->pd_list);
+
 	return true;
 }
-static bool ipv4_pd_is_complete ( struct ipv4_partial_datagram *pd ) {
-	struct ipv4_fragment_info *fi;
-	bool ret;
+static bool fwnet_pd_is_complete(struct fwnet_partial_datagram *pd)
+{
+	struct fwnet_fragment_info *fi;
-	fi = list_entry(pd->fragment_info.next, struct ipv4_fragment_info, fragment_info);
+	fi = list_entry(pd->fi_list.next, struct fwnet_fragment_info, fi_link);
-	ret = (fi->len == pd->datagram_size);
-	fw_debug ( "pd_is_complete (pd %p, dgs %x): fi %p (%x@%x) %s\n", pd, pd->datagram_size, fi, fi->len, fi->offset, ret ? "yes" : "no" );
-	return ret;
+	return fi->len == pd->datagram_size;
 }
-/* ------------------------------------------------------------------ */
+static int fwnet_peer_new(struct fw_card *card, struct fw_device *device)
+{
+	struct fwnet_peer *peer;
-static int ipv4_node_new ( struct fw_card *card, struct fw_device *device ) {
-	struct ipv4_node *node;
+	peer = kmalloc(sizeof(*peer), GFP_KERNEL);
+	if (!peer) {
+		fw_error("out of memory\n");
-	node = kmalloc ( sizeof(*node), GFP_KERNEL );
-	if ( ! node ) {
-		fw_error ( "allocate new node failed\n" );
 		return -ENOMEM;
 	}
-	node->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
-	node->fifo = INVALID_FIFO_ADDR;
-	INIT_LIST_HEAD(&node->pdg_list);
-	spin_lock_init(&node->pdg_lock);
-	node->pdg_size = 0;
-	node->generation = device->generation;
+	peer->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4];
+	peer->fifo = FWNET_NO_FIFO_ADDR;
+	INIT_LIST_HEAD(&peer->pd_list);
+	spin_lock_init(&peer->pdg_lock);
+	peer->pdg_size = 0;
+	peer->generation = device->generation;
 	rmb();
-	node->nodeid = device->node_id;
+	peer->node_id = device->node_id;
 	/* FIXME what should it really be?
*/ - node->max_payload = S100_BUFFER_SIZE - IPV4_UNFRAG_HDR_SIZE; - node->datagram_label = 0U; - node->xmt_speed = device->max_speed; - list_add_tail ( &node->ipv4_nodes, &card->ipv4_nodes ); - fw_debug ( "node_new: %p { guid %016llx, generation %u, nodeid %x, max_payload %x, xmt_speed %x } added\n", - node, (unsigned long long)node->guid, node->generation, node->nodeid, node->max_payload, node->xmt_speed ); + peer->max_payload = IEEE1394_MAX_PAYLOAD_S100 - RFC2374_UNFRAG_HDR_SIZE; + peer->datagram_label = 0U; + peer->xmt_speed = device->max_speed; + list_add_tail(&peer->peer_link, &card->peer_list); + return 0; } -static struct ipv4_node *ipv4_node_find_by_guid(struct ipv4_priv *priv, u64 guid) { - struct ipv4_node *node; +/* FIXME caller must take the lock, or peer needs to be reference-counted */ +static struct fwnet_peer *fwnet_peer_find_by_guid(struct fwnet_device *dev, + u64 guid) +{ + struct fwnet_peer *p, *peer = NULL; unsigned long flags; - spin_lock_irqsave(&priv->lock, flags); - list_for_each_entry(node, &priv->card->ipv4_nodes, ipv4_nodes) - if (node->guid == guid) { - /* FIXME: lock the node first? */ - spin_unlock_irqrestore ( &priv->lock, flags ); - fw_debug ( "node_find_by_guid (%016llx) found %p\n", (unsigned long long)guid, node ); - return node; + spin_lock_irqsave(&dev->lock, flags); + list_for_each_entry(p, &dev->card->peer_list, peer_link) + if (p->guid == guid) { + peer = p; + break; } + spin_unlock_irqrestore(&dev->lock, flags); - spin_unlock_irqrestore ( &priv->lock, flags ); - fw_debug ( "node_find_by_guid (%016llx) not found\n", (unsigned long long)guid ); - return NULL; + return peer; } -static struct ipv4_node *ipv4_node_find_by_nodeid(struct ipv4_priv *priv, u16 nodeid) { - struct ipv4_node *node; +/* FIXME caller must take the lock, or peer needs to be reference-counted */ +/* FIXME node_id doesn't mean anything without generation */ +static struct fwnet_peer *fwnet_peer_find_by_node_id(struct fwnet_device *dev, + u16 node_id) +{ + struct fwnet_peer *p, *peer = NULL; unsigned long flags; - spin_lock_irqsave(&priv->lock, flags); - list_for_each_entry(node, &priv->card->ipv4_nodes, ipv4_nodes) - if (node->nodeid == nodeid) { - /* FIXME: lock the node first? 
*/ - spin_unlock_irqrestore ( &priv->lock, flags ); - fw_debug ( "node_find_by_nodeid (%x) found %p\n", nodeid, node ); - return node; + spin_lock_irqsave(&dev->lock, flags); + list_for_each_entry(p, &dev->card->peer_list, peer_link) + if (p->node_id == node_id) { + peer = p; + break; } - fw_debug ( "node_find_by_nodeid (%x) not found\n", nodeid ); - spin_unlock_irqrestore ( &priv->lock, flags ); - return NULL; + spin_unlock_irqrestore(&dev->lock, flags); + + return peer; } -/* This is only complicated because we can't assume priv exists */ -static void ipv4_node_delete ( struct fw_card *card, struct fw_device *device ) { - struct net_device *netdev; - struct ipv4_priv *priv; - struct ipv4_node *node; +/* FIXME */ +static void fwnet_peer_delete(struct fw_card *card, struct fw_device *device) +{ + struct net_device *net; + struct fwnet_device *dev; + struct fwnet_peer *peer; u64 guid; unsigned long flags; - struct ipv4_partial_datagram *pd, *pd_next; + struct fwnet_partial_datagram *pd, *pd_next; guid = (u64)device->config_rom[3] << 32 | device->config_rom[4]; - netdev = card->netdev; - if ( netdev ) - priv = netdev_priv ( netdev ); + net = card->netdev; + if (net) + dev = netdev_priv(net); else - priv = NULL; - if ( priv ) - spin_lock_irqsave ( &priv->lock, flags ); - list_for_each_entry( node, &card->ipv4_nodes, ipv4_nodes ) { - if ( node->guid == guid ) { - list_del ( &node->ipv4_nodes ); - list_for_each_entry_safe( pd, pd_next, &node->pdg_list, pdg_list ) - ipv4_pd_delete ( pd ); + dev = NULL; + if (dev) + spin_lock_irqsave(&dev->lock, flags); + + list_for_each_entry(peer, &card->peer_list, peer_link) { + if (peer->guid == guid) { + list_del(&peer->peer_link); + list_for_each_entry_safe(pd, pd_next, &peer->pd_list, + pd_link) + fwnet_pd_delete(pd); break; } } - if ( priv ) - spin_unlock_irqrestore ( &priv->lock, flags ); + if (dev) + spin_unlock_irqrestore(&dev->lock, flags); } -/* ------------------------------------------------------------------ */ - - -static int ipv4_finish_incoming_packet ( struct net_device *netdev, - struct sk_buff *skb, u16 source_node_id, bool is_broadcast, u16 ether_type ) { - struct ipv4_priv *priv; - static u64 broadcast_hw = ~0ULL; +static int fwnet_finish_incoming_packet(struct net_device *net, + struct sk_buff *skb, u16 source_node_id, + bool is_broadcast, u16 ether_type) +{ + struct fwnet_device *dev; + static const __be64 broadcast_hw = cpu_to_be64(~0ULL); int status; - u64 guid; + __be64 guid; - fw_debug ( "ipv4_finish_incoming_packet(%p, %p, %x, %s, %x\n", - netdev, skb, source_node_id, is_broadcast ? "true" : "false", ether_type ); - priv = netdev_priv(netdev); + dev = netdev_priv(net); /* Write metadata, and then pass to the receive level */ - skb->dev = netdev; + skb->dev = net; skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */ /* @@ -724,73 +601,75 @@ static int ipv4_finish_incoming_packet ( struct net_device *netdev, * about the sending machine. 
*/ if (ether_type == ETH_P_ARP) { - struct ipv4_arp *arp1394; + struct rfc2734_arp *arp1394; struct arphdr *arp; unsigned char *arp_ptr; u64 fifo_addr; + u64 peer_guid; u8 max_rec; u8 sspd; u16 max_payload; - struct ipv4_node *node; - static const u16 ipv4_speed_to_max_payload[] = { + struct fwnet_peer *peer; + static const u16 fwnet_speed_to_max_payload[] = { /* S100, S200, S400, S800, S1600, S3200 */ 512, 1024, 2048, 4096, 4096, 4096 }; - /* fw_debug ( "ARP packet\n" ); */ - arp1394 = (struct ipv4_arp *)skb->data; + arp1394 = (struct rfc2734_arp *)skb->data; arp = (struct arphdr *)skb->data; arp_ptr = (unsigned char *)(arp + 1); - fifo_addr = (u64)ntohs(arp1394->fifo_hi) << 32 | - ntohl(arp1394->fifo_lo); - max_rec = priv->card->max_receive; - if ( arp1394->max_rec < max_rec ) + fifo_addr = (u64)ntohs(arp1394->fifo_hi) << 32 + | ntohl(arp1394->fifo_lo); + max_rec = dev->card->max_receive; + if (arp1394->max_rec < max_rec) max_rec = arp1394->max_rec; sspd = arp1394->sspd; - /* - * Sanity check. MacOSX seems to be sending us 131 in this - * field (atleast on my Panther G5). Not sure why. - */ - if (sspd > 5 ) { - fw_notify ( "sspd %x out of range\n", sspd ); + /* Sanity check. OS X 10.3 PPC reportedly sends 131. */ + if (sspd > SCODE_3200) { + fw_notify("sspd %x out of range\n", sspd); sspd = 0; } - max_payload = min(ipv4_speed_to_max_payload[sspd], - (u16)(1 << (max_rec + 1))) - IPV4_UNFRAG_HDR_SIZE; + max_payload = min(fwnet_speed_to_max_payload[sspd], + (u16)(1 << (max_rec + 1))) - RFC2374_UNFRAG_HDR_SIZE; - guid = be64_to_cpu(get_unaligned(&arp1394->s_uniq_id)); - node = ipv4_node_find_by_guid(priv, guid); - if (!node) { - fw_notify ( "No node for ARP packet from %llx\n", guid ); + peer_guid = get_unaligned_be64(&arp1394->s_uniq_id); + peer = fwnet_peer_find_by_guid(dev, peer_guid); + if (!peer) { + fw_notify("No peer for ARP packet from %016llx\n", + (unsigned long long)peer_guid); goto failed_proto; } - if ( node->nodeid != source_node_id || node->generation != priv->card->generation ) { - fw_notify ( "Internal error: node->nodeid (%x) != soucre_node_id (%x) or node->generation (%x) != priv->card->generation(%x)\n", - node->nodeid, source_node_id, node->generation, priv->card->generation ); - node->nodeid = source_node_id; - node->generation = priv->card->generation; + + /* FIXME don't use card->generation */ + if (peer->node_id != source_node_id || + peer->generation != dev->card->generation) { + fw_notify("Internal error: peer->node_id (%x) != " + "source_node_id (%x) or peer->generation (%x)" + " != dev->card->generation(%x)\n", + peer->node_id, source_node_id, + peer->generation, dev->card->generation); + peer->node_id = source_node_id; + peer->generation = dev->card->generation; } /* FIXME: for debugging */ - if ( sspd > SCODE_400 ) + if (sspd > SCODE_400) sspd = SCODE_400; /* Update our speed/payload/fifo_offset table */ /* * FIXME: this does not handle cases where two high-speed endpoints must use a slower speed because of * a lower speed hub between them. We need to look at the actual topology map here. */ - fw_debug ( "Setting node %p fifo %llx (was %llx), max_payload %x (was %x), speed %x (was %x)\n", - node, fifo_addr, node->fifo, max_payload, node->max_payload, sspd, node->xmt_speed ); - node->fifo = fifo_addr; - node->max_payload = max_payload; + peer->fifo = fifo_addr; + peer->max_payload = max_payload; /* * Only allow speeds to go down from their initial value. 
- * Otherwise a local node that can only do S400 or slower may - * be told to transmit at S800 to a faster remote node. + * Otherwise a local peer that can only do S400 or slower may + * be told to transmit at S800 to a faster remote peer. */ - if ( node->xmt_speed > sspd ) - node->xmt_speed = sspd; + if (peer->xmt_speed > sspd) + peer->xmt_speed = sspd; /* * Now that we're done with the 1394 specific stuff, we'll @@ -805,248 +684,257 @@ static int ipv4_finish_incoming_packet ( struct net_device *netdev, */ arp->ar_hln = 8; - arp_ptr += arp->ar_hln; /* skip over sender unique id */ - *(u32 *)arp_ptr = arp1394->sip; /* move sender IP addr */ - arp_ptr += arp->ar_pln; /* skip over sender IP addr */ + /* skip over sender unique id */ + arp_ptr += arp->ar_hln; + /* move sender IP addr */ + put_unaligned(arp1394->sip, (u32 *)arp_ptr); + /* skip over sender IP addr */ + arp_ptr += arp->ar_pln; if (arp->ar_op == htons(ARPOP_REQUEST)) memset(arp_ptr, 0, sizeof(u64)); else - memcpy(arp_ptr, netdev->dev_addr, sizeof(u64)); + memcpy(arp_ptr, net->dev_addr, sizeof(u64)); } /* Now add the ethernet header. */ - guid = cpu_to_be64(priv->card->guid); - if (dev_hard_header(skb, netdev, ether_type, is_broadcast ? &broadcast_hw : &guid, NULL, - skb->len) >= 0) { - struct ipv4_ether_hdr *eth; + guid = cpu_to_be64(dev->card->guid); + if (dev_hard_header(skb, net, ether_type, + is_broadcast ? &broadcast_hw : &guid, + NULL, skb->len) >= 0) { + struct fwnet_header *eth; u16 *rawp; __be16 protocol; skb_reset_mac_header(skb); skb_pull(skb, sizeof(*eth)); - eth = ipv4_ether_hdr(skb); + eth = (struct fwnet_header *)skb_mac_header(skb); if (*eth->h_dest & 1) { - if (memcmp(eth->h_dest, netdev->broadcast, netdev->addr_len) == 0) { - fw_debug ( "Broadcast\n" ); + if (memcmp(eth->h_dest, net->broadcast, + net->addr_len) == 0) skb->pkt_type = PACKET_BROADCAST; - } #if 0 else skb->pkt_type = PACKET_MULTICAST; #endif } else { - if (memcmp(eth->h_dest, netdev->dev_addr, netdev->addr_len)) { + if (memcmp(eth->h_dest, net->dev_addr, net->addr_len)) { u64 a1, a2; - memcpy ( &a1, eth->h_dest, sizeof(u64)); - memcpy ( &a2, netdev->dev_addr, sizeof(u64)); - fw_debug ( "Otherhost %llx %llx %x\n", a1, a2, netdev->addr_len ); + memcpy(&a1, eth->h_dest, sizeof(u64)); + memcpy(&a2, net->dev_addr, sizeof(u64)); skb->pkt_type = PACKET_OTHERHOST; } } if (ntohs(eth->h_proto) >= 1536) { - fw_debug ( " proto %x %x\n", eth->h_proto, ntohs(eth->h_proto) ); protocol = eth->h_proto; } else { rawp = (u16 *)skb->data; - if (*rawp == 0xFFFF) { - fw_debug ( "proto 802_3\n" ); + if (*rawp == 0xffff) protocol = htons(ETH_P_802_3); - } else { - fw_debug ( "proto 802_2\n" ); + else protocol = htons(ETH_P_802_2); - } } skb->protocol = protocol; } status = netif_rx(skb); - if ( status == NET_RX_DROP) { - netdev->stats.rx_errors++; - netdev->stats.rx_dropped++; + if (status == NET_RX_DROP) { + net->stats.rx_errors++; + net->stats.rx_dropped++; } else { - netdev->stats.rx_packets++; - netdev->stats.rx_bytes += skb->len; + net->stats.rx_packets++; + net->stats.rx_bytes += skb->len; } - if (netif_queue_stopped(netdev)) - netif_wake_queue(netdev); + if (netif_queue_stopped(net)) + netif_wake_queue(net); + return 0; failed_proto: - netdev->stats.rx_errors++; - netdev->stats.rx_dropped++; + net->stats.rx_errors++; + net->stats.rx_dropped++; + dev_kfree_skb_any(skb); - if (netif_queue_stopped(netdev)) - netif_wake_queue(netdev); - netdev->last_rx = jiffies; + if (netif_queue_stopped(net)) + netif_wake_queue(net); + + net->last_rx = jiffies; + return 0; } -/* 
------------------------------------------------------------------ */ - -static int ipv4_incoming_packet ( struct ipv4_priv *priv, u32 *buf, int len, u16 source_node_id, bool is_broadcast ) { +static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len, + u16 source_node_id, bool is_broadcast) +{ struct sk_buff *skb; - struct net_device *netdev; - struct ipv4_hdr hdr; + struct net_device *net; + struct rfc2734_header hdr; unsigned lf; unsigned long flags; - struct ipv4_node *node; - struct ipv4_partial_datagram *pd; + struct fwnet_peer *peer; + struct fwnet_partial_datagram *pd; int fg_off; int dg_size; u16 datagram_label; int retval; u16 ether_type; - fw_debug ( "ipv4_incoming_packet(%p, %p, %d, %x, %s)\n", priv, buf, len, source_node_id, is_broadcast ? "true" : "false" ); - netdev = priv->card->netdev; + net = dev->card->netdev; - hdr.w0 = ntohl(buf[0]); - lf = ipv4_get_hdr_lf(&hdr); - if ( lf == IPV4_HDR_UNFRAG ) { + hdr.w0 = be32_to_cpu(buf[0]); + lf = fwnet_get_hdr_lf(&hdr); + if (lf == RFC2374_HDR_UNFRAG) { /* * An unfragmented datagram has been received by the ieee1394 * bus. Build an skbuff around it so we can pass it to the * high level network layer. */ - ether_type = ipv4_get_hdr_ether_type(&hdr); - fw_debug ( "header w0 = %x, lf = %x, ether_type = %x\n", hdr.w0, lf, ether_type ); + ether_type = fwnet_get_hdr_ether_type(&hdr); buf++; - len -= IPV4_UNFRAG_HDR_SIZE; + len -= RFC2374_UNFRAG_HDR_SIZE; - skb = dev_alloc_skb(len + netdev->hard_header_len + 15); + skb = dev_alloc_skb(len + net->hard_header_len + 15); if (unlikely(!skb)) { - fw_error ( "Out of memory for incoming packet\n"); - netdev->stats.rx_dropped++; + fw_error("out of memory\n"); + net->stats.rx_dropped++; + return -1; } - skb_reserve(skb, (netdev->hard_header_len + 15) & ~15); - memcpy(skb_put(skb, len), buf, len ); - return ipv4_finish_incoming_packet(netdev, skb, source_node_id, is_broadcast, ether_type ); + skb_reserve(skb, (net->hard_header_len + 15) & ~15); + memcpy(skb_put(skb, len), buf, len); + + return fwnet_finish_incoming_packet(net, skb, source_node_id, + is_broadcast, ether_type); } /* A datagram fragment has been received, now the fun begins. */ hdr.w1 = ntohl(buf[1]); - buf +=2; - len -= IPV4_FRAG_HDR_SIZE; - if ( lf ==IPV4_HDR_FIRSTFRAG ) { - ether_type = ipv4_get_hdr_ether_type(&hdr); + buf += 2; + len -= RFC2374_FRAG_HDR_SIZE; + if (lf == RFC2374_HDR_FIRSTFRAG) { + ether_type = fwnet_get_hdr_ether_type(&hdr); fg_off = 0; } else { - fg_off = ipv4_get_hdr_fg_off(&hdr); - ether_type = 0; /* Shut up compiler! */ + ether_type = 0; + fg_off = fwnet_get_hdr_fg_off(&hdr); } - datagram_label = ipv4_get_hdr_dgl(&hdr); - dg_size = ipv4_get_hdr_dg_size(&hdr); /* ??? + 1 */ - fw_debug ( "fragmented: %x.%x = lf %x, ether_type %x, fg_off %x, dgl %x, dg_size %x\n", hdr.w0, hdr.w1, lf, ether_type, fg_off, datagram_label, dg_size ); - node = ipv4_node_find_by_nodeid ( priv, source_node_id); - spin_lock_irqsave(&node->pdg_lock, flags); - pd = ipv4_pd_find( node, datagram_label ); + datagram_label = fwnet_get_hdr_dgl(&hdr); + dg_size = fwnet_get_hdr_dg_size(&hdr); /* ??? 
+ 1 */ + peer = fwnet_peer_find_by_node_id(dev, source_node_id); + + spin_lock_irqsave(&peer->pdg_lock, flags); + + pd = fwnet_pd_find(peer, datagram_label); if (pd == NULL) { - while ( node->pdg_size >= ipv4_mpd ) { + while (peer->pdg_size >= FWNET_MAX_FRAGMENTS) { /* remove the oldest */ - ipv4_pd_delete ( list_first_entry(&node->pdg_list, struct ipv4_partial_datagram, pdg_list) ); - node->pdg_size--; + fwnet_pd_delete(list_first_entry(&peer->pd_list, + struct fwnet_partial_datagram, pd_link)); + peer->pdg_size--; } - pd = ipv4_pd_new ( netdev, node, datagram_label, dg_size, - buf, fg_off, len); - if ( pd == NULL) { + pd = fwnet_pd_new(net, peer, datagram_label, + dg_size, buf, fg_off, len); + if (pd == NULL) { retval = -ENOMEM; goto bad_proto; } - node->pdg_size++; + peer->pdg_size++; } else { - if (ipv4_frag_overlap(pd, fg_off, len) || pd->datagram_size != dg_size) { + if (fwnet_frag_overlap(pd, fg_off, len) || + pd->datagram_size != dg_size) { /* * Differing datagram sizes or overlapping fragments, - * Either way the remote machine is playing silly buggers - * with us: obliterate the old datagram and start a new one. + * discard old datagram and start a new one. */ - ipv4_pd_delete ( pd ); - pd = ipv4_pd_new ( netdev, node, datagram_label, - dg_size, buf, fg_off, len); - if ( pd == NULL ) { + fwnet_pd_delete(pd); + pd = fwnet_pd_new(net, peer, datagram_label, + dg_size, buf, fg_off, len); + if (pd == NULL) { retval = -ENOMEM; - node->pdg_size--; + peer->pdg_size--; goto bad_proto; } } else { - bool worked; - - worked = ipv4_pd_update ( node, pd, - buf, fg_off, len ); - if ( ! worked ) { + if (!fwnet_pd_update(peer, pd, buf, fg_off, len)) { /* * Couldn't save off fragment anyway * so might as well obliterate the * datagram now. */ - ipv4_pd_delete ( pd ); - node->pdg_size--; + fwnet_pd_delete(pd); + peer->pdg_size--; goto bad_proto; } } } /* new datagram or add to existing one */ - if ( lf == IPV4_HDR_FIRSTFRAG ) + if (lf == RFC2374_HDR_FIRSTFRAG) pd->ether_type = ether_type; - if ( ipv4_pd_is_complete ( pd ) ) { + + if (fwnet_pd_is_complete(pd)) { ether_type = pd->ether_type; - node->pdg_size--; + peer->pdg_size--; skb = skb_get(pd->skb); - ipv4_pd_delete ( pd ); - spin_unlock_irqrestore(&node->pdg_lock, flags); - return ipv4_finish_incoming_packet ( netdev, skb, source_node_id, false, ether_type ); + fwnet_pd_delete(pd); + + spin_unlock_irqrestore(&peer->pdg_lock, flags); + + return fwnet_finish_incoming_packet(net, skb, source_node_id, + false, ether_type); } /* * Datagram is not complete, we're done for the * moment. 
*/ - spin_unlock_irqrestore(&node->pdg_lock, flags); + spin_unlock_irqrestore(&peer->pdg_lock, flags); + return 0; bad_proto: - spin_unlock_irqrestore(&node->pdg_lock, flags); - if (netif_queue_stopped(netdev)) - netif_wake_queue(netdev); + spin_unlock_irqrestore(&peer->pdg_lock, flags); + + if (netif_queue_stopped(net)) + netif_wake_queue(net); + return 0; } -static void ipv4_receive_packet ( struct fw_card *card, struct fw_request *r, - int tcode, int destination, int source, int generation, int speed, - unsigned long long offset, void *payload, size_t length, void *callback_data ) { - struct ipv4_priv *priv; +static void fwnet_receive_packet(struct fw_card *card, struct fw_request *r, + int tcode, int destination, int source, int generation, + int speed, unsigned long long offset, void *payload, + size_t length, void *callback_data) +{ + struct fwnet_device *dev; int status; - fw_debug ( "ipv4_receive_packet(%p,%p,%x,%x,%x,%x,%x,%llx,%p,%lx,%p)\n", - card, r, tcode, destination, source, generation, speed, offset, payload, - (unsigned long)length, callback_data); - print_hex_dump ( KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 32, 1, payload, length, false ); - priv = callback_data; - if ( tcode != TCODE_WRITE_BLOCK_REQUEST - || destination != card->node_id - || generation != card->generation - || offset != priv->handler.offset ) { + dev = callback_data; + if (tcode != TCODE_WRITE_BLOCK_REQUEST + || destination != card->node_id /* <- FIXME */ + || generation != card->generation /* <- FIXME */ + || offset != dev->handler.offset) { fw_send_response(card, r, RCODE_CONFLICT_ERROR); - fw_debug("Conflict error card node_id=%x, card generation=%x, local offset %llx\n", - card->node_id, card->generation, (unsigned long long)priv->handler.offset ); + return; } - status = ipv4_incoming_packet ( priv, payload, length, source, false ); - if ( status != 0 ) { - fw_error ( "Incoming packet failure\n" ); - fw_send_response ( card, r, RCODE_CONFLICT_ERROR ); + + status = fwnet_incoming_packet(dev, payload, length, source, false); + if (status != 0) { + fw_error("Incoming packet failure\n"); + fw_send_response(card, r, RCODE_CONFLICT_ERROR); + return; } - fw_send_response ( card, r, RCODE_COMPLETE ); + + fw_send_response(card, r, RCODE_COMPLETE); } -static void ipv4_receive_broadcast(struct fw_iso_context *context, u32 cycle, - size_t header_length, void *header, void *data) { - struct ipv4_priv *priv; +static void fwnet_receive_broadcast(struct fw_iso_context *context, + u32 cycle, size_t header_length, void *header, void *data) +{ + struct fwnet_device *dev; struct fw_iso_packet packet; struct fw_card *card; - u16 *hdr_ptr; - u32 *buf_ptr; + __be16 *hdr_ptr; + __be32 *buf_ptr; int retval; u32 length; u16 source_node_id; @@ -1055,70 +943,68 @@ static void ipv4_receive_broadcast(struct fw_iso_context *context, u32 cycle, unsigned long offset; unsigned long flags; - fw_debug ( "ipv4_receive_broadcast ( context=%p, cycle=%x, header_length=%lx, header=%p, data=%p )\n", context, cycle, (unsigned long)header_length, header, data ); - print_hex_dump ( KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 32, 1, header, header_length, false ); - priv = data; - card = priv->card; + dev = data; + card = dev->card; hdr_ptr = header; - length = ntohs(hdr_ptr[0]); - spin_lock_irqsave(&priv->lock,flags); - offset = priv->rcv_buffer_size * priv->broadcast_rcv_next_ptr; - buf_ptr = priv->broadcast_rcv_buffer_ptrs[priv->broadcast_rcv_next_ptr++]; - if ( priv->broadcast_rcv_next_ptr == priv->num_broadcast_rcv_ptrs ) - 
priv->broadcast_rcv_next_ptr = 0; - spin_unlock_irqrestore(&priv->lock,flags); - fw_debug ( "length %u at %p\n", length, buf_ptr ); - print_hex_dump ( KERN_DEBUG, "buffer: ", DUMP_PREFIX_OFFSET, 32, 1, buf_ptr, length, false ); + length = be16_to_cpup(hdr_ptr); + + spin_lock_irqsave(&dev->lock, flags); + + offset = dev->rcv_buffer_size * dev->broadcast_rcv_next_ptr; + buf_ptr = dev->broadcast_rcv_buffer_ptrs[dev->broadcast_rcv_next_ptr++]; + if (dev->broadcast_rcv_next_ptr == dev->num_broadcast_rcv_ptrs) + dev->broadcast_rcv_next_ptr = 0; + + spin_unlock_irqrestore(&dev->lock, flags); specifier_id = (be32_to_cpu(buf_ptr[0]) & 0xffff) << 8 | (be32_to_cpu(buf_ptr[1]) & 0xff000000) >> 24; - ver = be32_to_cpu(buf_ptr[1]) & 0xFFFFFF; + ver = be32_to_cpu(buf_ptr[1]) & 0xffffff; source_node_id = be32_to_cpu(buf_ptr[0]) >> 16; - /* fw_debug ( "source %x SpecID %x ver %x\n", source_node_id, specifier_id, ver ); */ - if ( specifier_id == IPV4_GASP_SPECIFIER_ID && ver == IPV4_GASP_VERSION ) { + + if (specifier_id == IANA_SPECIFIER_ID && ver == RFC2734_SW_VERSION) { buf_ptr += 2; - length -= IPV4_GASP_OVERHEAD; - ipv4_incoming_packet(priv, buf_ptr, length, source_node_id, true); - } else - fw_debug ( "Ignoring packet: not GASP\n" ); - packet.payload_length = priv->rcv_buffer_size; + length -= IEEE1394_GASP_HDR_SIZE; + fwnet_incoming_packet(dev, buf_ptr, length, + source_node_id, true); + } + + packet.payload_length = dev->rcv_buffer_size; packet.interrupt = 1; packet.skip = 0; packet.tag = 3; packet.sy = 0; - packet.header_length = IPV4_GASP_OVERHEAD; - spin_lock_irqsave(&priv->lock,flags); - retval = fw_iso_context_queue ( priv->broadcast_rcv_context, &packet, - &priv->broadcast_rcv_buffer, offset ); - spin_unlock_irqrestore(&priv->lock,flags); - if ( retval < 0 ) - fw_error ( "requeue failed\n" ); -} + packet.header_length = IEEE1394_GASP_HDR_SIZE; + + spin_lock_irqsave(&dev->lock, flags); -static void debug_ptask ( struct ipv4_packet_task *ptask ) { - static const char *tx_types[] = { "Unknown", "GASP", "Write" }; - - fw_debug ( "packet %p { hdr { w0 %x w1 %x }, skb %p, priv %p," - " tx_type %s, outstanding_pkts %d, max_payload %x, fifo %llx," - " speed %x, dest_node %x, generation %x }\n", - ptask, ptask->hdr.w0, ptask->hdr.w1, ptask->skb, ptask->priv, - ptask->tx_type > IPV4_WRREQ ? 
"Invalid" : tx_types[ptask->tx_type], - ptask->outstanding_pkts, ptask->max_payload, - ptask->fifo_addr, ptask->speed, ptask->dest_node, ptask->generation ); - print_hex_dump ( KERN_DEBUG, "packet :", DUMP_PREFIX_OFFSET, 32, 1, - ptask->skb->data, ptask->skb->len, false ); + retval = fw_iso_context_queue(dev->broadcast_rcv_context, &packet, + &dev->broadcast_rcv_buffer, offset); + + spin_unlock_irqrestore(&dev->lock, flags); + + if (retval < 0) + fw_error("requeue failed\n"); } -static void ipv4_transmit_packet_done ( struct ipv4_packet_task *ptask ) { - struct ipv4_priv *priv; +static struct kmem_cache *fwnet_packet_task_cache; + +static int fwnet_send_packet(struct fwnet_packet_task *ptask); + +static void fwnet_transmit_packet_done(struct fwnet_packet_task *ptask) +{ + struct fwnet_device *dev; unsigned long flags; - priv = ptask->priv; - spin_lock_irqsave ( &priv->lock, flags ); - list_del ( &ptask->packet_list ); - spin_unlock_irqrestore ( &priv->lock, flags ); - ptask->outstanding_pkts--; - if ( ptask->outstanding_pkts > 0 ) { + dev = ptask->dev; + + spin_lock_irqsave(&dev->lock, flags); + list_del(&ptask->pt_link); + spin_unlock_irqrestore(&dev->lock, flags); + + ptask->outstanding_pkts--; /* FIXME access inside lock */ + + if (ptask->outstanding_pkts > 0) { u16 dg_size; u16 fg_off; u16 datagram_label; @@ -1126,133 +1012,139 @@ static void ipv4_transmit_packet_done ( struct ipv4_packet_task *ptask ) { struct sk_buff *skb; /* Update the ptask to point to the next fragment and send it */ - lf = ipv4_get_hdr_lf(&ptask->hdr); + lf = fwnet_get_hdr_lf(&ptask->hdr); switch (lf) { - case IPV4_HDR_LASTFRAG: - case IPV4_HDR_UNFRAG: + case RFC2374_HDR_LASTFRAG: + case RFC2374_HDR_UNFRAG: default: - fw_error ( "Outstanding packet %x lf %x, header %x,%x\n", ptask->outstanding_pkts, lf, ptask->hdr.w0, ptask->hdr.w1 ); + fw_error("Outstanding packet %x lf %x, header %x,%x\n", + ptask->outstanding_pkts, lf, ptask->hdr.w0, + ptask->hdr.w1); BUG(); - case IPV4_HDR_FIRSTFRAG: + case RFC2374_HDR_FIRSTFRAG: /* Set frag type here for future interior fragments */ - dg_size = ipv4_get_hdr_dg_size(&ptask->hdr); - fg_off = ptask->max_payload - IPV4_FRAG_HDR_SIZE; - datagram_label = ipv4_get_hdr_dgl(&ptask->hdr); + dg_size = fwnet_get_hdr_dg_size(&ptask->hdr); + fg_off = ptask->max_payload - RFC2374_FRAG_HDR_SIZE; + datagram_label = fwnet_get_hdr_dgl(&ptask->hdr); break; - case IPV4_HDR_INTFRAG: - dg_size = ipv4_get_hdr_dg_size(&ptask->hdr); - fg_off = ipv4_get_hdr_fg_off(&ptask->hdr) + ptask->max_payload - IPV4_FRAG_HDR_SIZE; - datagram_label = ipv4_get_hdr_dgl(&ptask->hdr); + case RFC2374_HDR_INTFRAG: + dg_size = fwnet_get_hdr_dg_size(&ptask->hdr); + fg_off = fwnet_get_hdr_fg_off(&ptask->hdr) + + ptask->max_payload - RFC2374_FRAG_HDR_SIZE; + datagram_label = fwnet_get_hdr_dgl(&ptask->hdr); break; } skb = ptask->skb; - skb_pull ( skb, ptask->max_payload ); - if ( ptask->outstanding_pkts > 1 ) { - ipv4_make_sf_hdr ( &ptask->hdr, - IPV4_HDR_INTFRAG, dg_size, fg_off, datagram_label ); + skb_pull(skb, ptask->max_payload); + if (ptask->outstanding_pkts > 1) { + fwnet_make_sf_hdr(&ptask->hdr, RFC2374_HDR_INTFRAG, + dg_size, fg_off, datagram_label); } else { - ipv4_make_sf_hdr ( &ptask->hdr, - IPV4_HDR_LASTFRAG, dg_size, fg_off, datagram_label ); - ptask->max_payload = skb->len + IPV4_FRAG_HDR_SIZE; - + fwnet_make_sf_hdr(&ptask->hdr, RFC2374_HDR_LASTFRAG, + dg_size, fg_off, datagram_label); + ptask->max_payload = skb->len + RFC2374_FRAG_HDR_SIZE; } - ipv4_send_packet ( ptask ); + fwnet_send_packet(ptask); } else { 
- dev_kfree_skb_any ( ptask->skb ); - kmem_cache_free( ipv4_packet_task_cache, ptask ); + dev_kfree_skb_any(ptask->skb); + kmem_cache_free(fwnet_packet_task_cache, ptask); } } -static void ipv4_write_complete ( struct fw_card *card, int rcode, - void *payload, size_t length, void *data ) { - struct ipv4_packet_task *ptask; +static void fwnet_write_complete(struct fw_card *card, int rcode, + void *payload, size_t length, void *data) +{ + struct fwnet_packet_task *ptask; ptask = data; - fw_debug ( "ipv4_write_complete ( %p, %x, %p, %lx, %p )\n", - card, rcode, payload, (unsigned long)length, data ); - debug_ptask ( ptask ); - if ( rcode == RCODE_COMPLETE ) { - ipv4_transmit_packet_done ( ptask ); - } else { - fw_error ( "ipv4_write_complete: failed: %x\n", rcode ); + if (rcode == RCODE_COMPLETE) + fwnet_transmit_packet_done(ptask); + else + fw_error("fwnet_write_complete: failed: %x\n", rcode); /* ??? error recovery */ - } } -static int ipv4_send_packet ( struct ipv4_packet_task *ptask ) { - struct ipv4_priv *priv; +static int fwnet_send_packet(struct fwnet_packet_task *ptask) +{ + struct fwnet_device *dev; unsigned tx_len; - struct ipv4_hdr *bufhdr; + struct rfc2734_header *bufhdr; unsigned long flags; - struct net_device *netdev; -#if 0 /* stefanr */ - int retval; -#endif + struct net_device *net; - fw_debug ( "ipv4_send_packet\n" ); - debug_ptask ( ptask ); - priv = ptask->priv; + dev = ptask->dev; tx_len = ptask->max_payload; - switch (ipv4_get_hdr_lf(&ptask->hdr)) { - case IPV4_HDR_UNFRAG: - bufhdr = (struct ipv4_hdr *)skb_push(ptask->skb, IPV4_UNFRAG_HDR_SIZE); - bufhdr->w0 = htonl(ptask->hdr.w0); + switch (fwnet_get_hdr_lf(&ptask->hdr)) { + case RFC2374_HDR_UNFRAG: + bufhdr = (struct rfc2734_header *) + skb_push(ptask->skb, RFC2374_UNFRAG_HDR_SIZE); + put_unaligned_be32(ptask->hdr.w0, &bufhdr->w0); break; - case IPV4_HDR_FIRSTFRAG: - case IPV4_HDR_INTFRAG: - case IPV4_HDR_LASTFRAG: - bufhdr = (struct ipv4_hdr *)skb_push(ptask->skb, IPV4_FRAG_HDR_SIZE); - bufhdr->w0 = htonl(ptask->hdr.w0); - bufhdr->w1 = htonl(ptask->hdr.w1); + case RFC2374_HDR_FIRSTFRAG: + case RFC2374_HDR_INTFRAG: + case RFC2374_HDR_LASTFRAG: + bufhdr = (struct rfc2734_header *) + skb_push(ptask->skb, RFC2374_FRAG_HDR_SIZE); + put_unaligned_be32(ptask->hdr.w0, &bufhdr->w0); + put_unaligned_be32(ptask->hdr.w1, &bufhdr->w1); break; default: BUG(); } - if ( ptask->tx_type == IPV4_GASP ) { - u32 *packets; + if (ptask->dest_node == IEEE1394_ALL_NODES) { + u8 *p; int generation; - int nodeid; + int node_id; /* ptask->generation may not have been set yet */ - generation = priv->card->generation; + generation = dev->card->generation; smp_rmb(); - nodeid = priv->card->node_id; - packets = (u32 *)skb_push(ptask->skb, sizeof(u32)*2); - packets[0] = htonl(nodeid << 16 | (IPV4_GASP_SPECIFIER_ID>>8)); - packets[1] = htonl((IPV4_GASP_SPECIFIER_ID & 0xFF) << 24 | IPV4_GASP_VERSION); - fw_send_request ( priv->card, &ptask->transaction, TCODE_STREAM_DATA, - fw_stream_packet_destination_id(3, BROADCAST_CHANNEL, 0), - generation, SCODE_100, 0ULL, ptask->skb->data, tx_len + 8, ipv4_write_complete, ptask ); - spin_lock_irqsave(&priv->lock,flags); - list_add_tail ( &ptask->packet_list, &priv->broadcasted_list ); - spin_unlock_irqrestore(&priv->lock,flags); -#if 0 /* stefanr */ - return retval; -#else + node_id = dev->card->node_id; + + p = skb_push(ptask->skb, 8); + put_unaligned_be32(node_id << 16 | IANA_SPECIFIER_ID >> 8, p); + put_unaligned_be32((IANA_SPECIFIER_ID & 0xff) << 24 + | RFC2734_SW_VERSION, &p[4]); + + /* We should not 
transmit if broadcast_channel.valid == 0. */
+		fw_send_request(dev->card, &ptask->transaction,
+				TCODE_STREAM_DATA,
+				fw_stream_packet_destination_id(3,
+					IEEE1394_BROADCAST_CHANNEL, 0),
+				generation, SCODE_100, 0ULL, ptask->skb->data,
+				tx_len + 8, fwnet_write_complete, ptask);
+
+		/* FIXME race? */
+		spin_lock_irqsave(&dev->lock, flags);
+		list_add_tail(&ptask->pt_link, &dev->broadcasted_list);
+		spin_unlock_irqrestore(&dev->lock, flags);
+
 		return 0;
-#endif
 	}
-	fw_debug("send_request (%p, %p, WRITE_BLOCK, %x, %x, %x, %llx, %p, %d, %p, %p\n",
-		priv->card, &ptask->transaction, ptask->dest_node, ptask->generation,
-		ptask->speed, (unsigned long long)ptask->fifo_addr, ptask->skb->data, tx_len,
-		ipv4_write_complete, ptask );
-	fw_send_request ( priv->card, &ptask->transaction,
-		TCODE_WRITE_BLOCK_REQUEST, ptask->dest_node, ptask->generation, ptask->speed,
-		ptask->fifo_addr, ptask->skb->data, tx_len, ipv4_write_complete, ptask );
-	spin_lock_irqsave(&priv->lock,flags);
-	list_add_tail ( &ptask->packet_list, &priv->sent_list );
-	spin_unlock_irqrestore(&priv->lock,flags);
-	netdev = priv->card->netdev;
-	netdev->trans_start = jiffies;
+
+	fw_send_request(dev->card, &ptask->transaction,
+			TCODE_WRITE_BLOCK_REQUEST, ptask->dest_node,
+			ptask->generation, ptask->speed, ptask->fifo_addr,
+			ptask->skb->data, tx_len, fwnet_write_complete, ptask);
+
+	/* FIXME race? */
+	spin_lock_irqsave(&dev->lock, flags);
+	list_add_tail(&ptask->pt_link, &dev->sent_list);
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	net = dev->card->netdev;
+	net->trans_start = jiffies;
+
 	return 0;
 }
-static int ipv4_broadcast_start ( struct ipv4_priv *priv ) {
+static int fwnet_broadcast_start(struct fwnet_device *dev)
+{
 	struct fw_iso_context *context;
 	int retval;
 	unsigned num_packets;
@@ -1260,150 +1152,151 @@ static int ipv4_broadcast_start ( struct ipv4_priv *priv ) {
 	struct fw_iso_packet packet;
 	unsigned long offset;
 	unsigned u;
-	/* unsigned transmit_speed; */
-#if 0 /* stefanr */
-	if ( priv->card->broadcast_channel != (BROADCAST_CHANNEL_VALID|BROADCAST_CHANNEL_INITIAL)) {
-		fw_notify ( "Invalid broadcast channel %x\n", priv->card->broadcast_channel );
-		/* FIXME: try again later? */
-		/* return -EINVAL; */
-	}
-#endif
-	if ( priv->local_fifo == INVALID_FIFO_ADDR ) {
-		struct fw_address_region region;
-
-		priv->handler.length = FIFO_SIZE;
-		priv->handler.address_callback = ipv4_receive_packet;
-		priv->handler.callback_data = priv;
-		/* FIXME: this is OHCI, but what about others? */
-		region.start = 0xffff00000000ULL;
-		region.end = 0xfffffffffffcULL;
-
-		retval = fw_core_add_address_handler ( &priv->handler, &region );
-		if ( retval < 0 )
+	if (dev->local_fifo == FWNET_NO_FIFO_ADDR) {
+		/* outside OHCI posted write area? */
+		static const struct fw_address_region region = {
+			.start = 0xffff00000000ULL,
+			.end = CSR_REGISTER_BASE,
+		};
+
+		dev->handler.length = 4096;
+		dev->handler.address_callback = fwnet_receive_packet;
+		dev->handler.callback_data = dev;
+
+		retval = fw_core_add_address_handler(&dev->handler, &region);
+		if (retval < 0)
			goto failed_initial;
-		priv->local_fifo = priv->handler.offset;
+
+		dev->local_fifo = dev->handler.offset;
 	}
-	/*
-	 * FIXME: rawiso limits us to PAGE_SIZE. This only matters if we ever have
-	 * a machine with PAGE_SIZE < 4096
-	 */
-	max_receive = 1U << (priv->card->max_receive + 1);
-	num_packets = ( ipv4_iso_page_count * PAGE_SIZE ) / max_receive;
-	if ( !
priv->broadcast_rcv_context ) { + max_receive = 1U << (dev->card->max_receive + 1); + num_packets = (FWNET_ISO_PAGE_COUNT * PAGE_SIZE) / max_receive; + + if (!dev->broadcast_rcv_context) { void **ptrptr; - context = fw_iso_context_create ( priv->card, - FW_ISO_CONTEXT_RECEIVE, BROADCAST_CHANNEL, - priv->card->link_speed, 8, ipv4_receive_broadcast, priv ); + context = fw_iso_context_create(dev->card, + FW_ISO_CONTEXT_RECEIVE, IEEE1394_BROADCAST_CHANNEL, + dev->card->link_speed, 8, fwnet_receive_broadcast, dev); if (IS_ERR(context)) { retval = PTR_ERR(context); goto failed_context_create; } - retval = fw_iso_buffer_init ( &priv->broadcast_rcv_buffer, - priv->card, ipv4_iso_page_count, DMA_FROM_DEVICE ); - if ( retval < 0 ) + + retval = fw_iso_buffer_init(&dev->broadcast_rcv_buffer, + dev->card, FWNET_ISO_PAGE_COUNT, DMA_FROM_DEVICE); + if (retval < 0) goto failed_buffer_init; - ptrptr = kmalloc ( sizeof(void*)*num_packets, GFP_KERNEL ); - if ( ! ptrptr ) { + + ptrptr = kmalloc(sizeof(void *) * num_packets, GFP_KERNEL); + if (!ptrptr) { retval = -ENOMEM; goto failed_ptrs_alloc; } - priv->broadcast_rcv_buffer_ptrs = ptrptr; - for ( u = 0; u < ipv4_iso_page_count; u++ ) { + + dev->broadcast_rcv_buffer_ptrs = ptrptr; + for (u = 0; u < FWNET_ISO_PAGE_COUNT; u++) { void *ptr; unsigned v; - ptr = kmap ( priv->broadcast_rcv_buffer.pages[u] ); - for ( v = 0; v < num_packets / ipv4_iso_page_count; v++ ) - *ptrptr++ = (void *)((char *)ptr + v * max_receive); + ptr = kmap(dev->broadcast_rcv_buffer.pages[u]); + for (v = 0; v < num_packets / FWNET_ISO_PAGE_COUNT; v++) + *ptrptr++ = (void *) + ((char *)ptr + v * max_receive); } - priv->broadcast_rcv_context = context; - } else - context = priv->broadcast_rcv_context; + dev->broadcast_rcv_context = context; + } else { + context = dev->broadcast_rcv_context; + } packet.payload_length = max_receive; packet.interrupt = 1; packet.skip = 0; packet.tag = 3; packet.sy = 0; - packet.header_length = IPV4_GASP_OVERHEAD; + packet.header_length = IEEE1394_GASP_HDR_SIZE; offset = 0; - for ( u = 0; u < num_packets; u++ ) { - retval = fw_iso_context_queue ( context, &packet, - &priv->broadcast_rcv_buffer, offset ); - if ( retval < 0 ) + + for (u = 0; u < num_packets; u++) { + retval = fw_iso_context_queue(context, &packet, + &dev->broadcast_rcv_buffer, offset); + if (retval < 0) goto failed_rcv_queue; + offset += max_receive; } - priv->num_broadcast_rcv_ptrs = num_packets; - priv->rcv_buffer_size = max_receive; - priv->broadcast_rcv_next_ptr = 0U; - retval = fw_iso_context_start ( context, -1, 0, FW_ISO_CONTEXT_MATCH_ALL_TAGS ); /* ??? sync */ - if ( retval < 0 ) + dev->num_broadcast_rcv_ptrs = num_packets; + dev->rcv_buffer_size = max_receive; + dev->broadcast_rcv_next_ptr = 0U; + retval = fw_iso_context_start(context, -1, 0, + FW_ISO_CONTEXT_MATCH_ALL_TAGS); /* ??? sync */ + if (retval < 0) goto failed_rcv_queue; - /* FIXME: adjust this when we know the max receive speeds of all other IP nodes on the bus. */ - /* since we only xmt at S100 ??? */ - priv->broadcast_xmt_max_payload = S100_BUFFER_SIZE - IPV4_GASP_OVERHEAD - IPV4_UNFRAG_HDR_SIZE; - priv->broadcast_state = IPV4_BROADCAST_RUNNING; + + /* FIXME: adjust it according to the min. speed of all known peers? 
*/ + dev->broadcast_xmt_max_payload = IEEE1394_MAX_PAYLOAD_S100 + - IEEE1394_GASP_HDR_SIZE - RFC2374_UNFRAG_HDR_SIZE; + dev->broadcast_state = FWNET_BROADCAST_RUNNING; + return 0; failed_rcv_queue: - kfree ( priv->broadcast_rcv_buffer_ptrs ); - priv->broadcast_rcv_buffer_ptrs = NULL; + kfree(dev->broadcast_rcv_buffer_ptrs); + dev->broadcast_rcv_buffer_ptrs = NULL; failed_ptrs_alloc: - fw_iso_buffer_destroy ( &priv->broadcast_rcv_buffer, priv->card ); + fw_iso_buffer_destroy(&dev->broadcast_rcv_buffer, dev->card); failed_buffer_init: - fw_iso_context_destroy ( context ); - priv->broadcast_rcv_context = NULL; + fw_iso_context_destroy(context); + dev->broadcast_rcv_context = NULL; failed_context_create: - fw_core_remove_address_handler ( &priv->handler ); + fw_core_remove_address_handler(&dev->handler); failed_initial: - priv->local_fifo = INVALID_FIFO_ADDR; + dev->local_fifo = FWNET_NO_FIFO_ADDR; + return retval; } -/* This is called after an "ifup" */ -static int ipv4_open(struct net_device *dev) { - struct ipv4_priv *priv; +/* ifup */ +static int fwnet_open(struct net_device *net) +{ + struct fwnet_device *dev = netdev_priv(net); int ret; - priv = netdev_priv(dev); - if (priv->broadcast_state == IPV4_BROADCAST_ERROR) { - ret = ipv4_broadcast_start ( priv ); + if (dev->broadcast_state == FWNET_BROADCAST_ERROR) { + ret = fwnet_broadcast_start(dev); if (ret) return ret; } - netif_start_queue(dev); + netif_start_queue(net); + return 0; } -/* This is called after an "ifdown" */ -static int ipv4_stop(struct net_device *netdev) +/* ifdown */ +static int fwnet_stop(struct net_device *net) { - /* flush priv->wake */ - /* flush_scheduled_work(); */ + netif_stop_queue(net); + + /* Deallocate iso context for use by other applications? */ - netif_stop_queue(netdev); return 0; } -/* Transmit a packet (called by kernel) */ -static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev) +static int fwnet_tx(struct sk_buff *skb, struct net_device *net) { - struct ipv4_ether_hdr hdr_buf; - struct ipv4_priv *priv = netdev_priv(netdev); + struct fwnet_header hdr_buf; + struct fwnet_device *dev = netdev_priv(net); __be16 proto; u16 dest_node; - enum ipv4_tx_type tx_type; unsigned max_payload; u16 dg_size; u16 *datagram_label_ptr; - struct ipv4_packet_task *ptask; - struct ipv4_node *node = NULL; + struct fwnet_packet_task *ptask; + struct fwnet_peer *peer = NULL; - ptask = kmem_cache_alloc(ipv4_packet_task_cache, GFP_ATOMIC); + ptask = kmem_cache_alloc(fwnet_packet_task_cache, GFP_ATOMIC); if (ptask == NULL) goto fail; @@ -1412,7 +1305,7 @@ static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev) goto fail; /* - * Get rid of the fake ipv4 header, but first make a copy. + * Make a copy of the driver-specific header. * We might need to rebuild the header on tx failure. */ memcpy(&hdr_buf, skb->data, sizeof(hdr_buf)); @@ -1425,110 +1318,95 @@ static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev) * Set the transmission type for the packet. ARP packets and IP * broadcast packets are sent via GASP. 
*/ - if ( memcmp(hdr_buf.h_dest, netdev->broadcast, IPV4_ALEN) == 0 + if (memcmp(hdr_buf.h_dest, net->broadcast, FWNET_ALEN) == 0 || proto == htons(ETH_P_ARP) - || ( proto == htons(ETH_P_IP) - && IN_MULTICAST(ntohl(ip_hdr(skb)->daddr)) ) ) { - /* fw_debug ( "transmitting arp or multicast packet\n" );*/ - tx_type = IPV4_GASP; - dest_node = ALL_NODES; - max_payload = priv->broadcast_xmt_max_payload; - /* BUG_ON(max_payload < S100_BUFFER_SIZE - IPV4_GASP_OVERHEAD); */ - datagram_label_ptr = &priv->broadcast_xmt_datagramlabel; - ptask->fifo_addr = INVALID_FIFO_ADDR; - ptask->generation = 0U; - ptask->dest_node = 0U; - ptask->speed = 0; + || (proto == htons(ETH_P_IP) + && IN_MULTICAST(ntohl(ip_hdr(skb)->daddr)))) { + max_payload = dev->broadcast_xmt_max_payload; + datagram_label_ptr = &dev->broadcast_xmt_datagramlabel; + + ptask->fifo_addr = FWNET_NO_FIFO_ADDR; + ptask->generation = 0; + ptask->dest_node = IEEE1394_ALL_NODES; + ptask->speed = SCODE_100; } else { - __be64 guid = get_unaligned((u64 *)hdr_buf.h_dest); + __be64 guid = get_unaligned((__be64 *)hdr_buf.h_dest); u8 generation; - node = ipv4_node_find_by_guid(priv, be64_to_cpu(guid)); - if (!node) { - fw_debug ( "Normal packet but no node\n" ); + peer = fwnet_peer_find_by_guid(dev, be64_to_cpu(guid)); + if (!peer) goto fail; - } - if (node->fifo == INVALID_FIFO_ADDR) { - fw_debug ( "Normal packet but no fifo addr\n" ); + if (peer->fifo == FWNET_NO_FIFO_ADDR) goto fail; - } - /* fw_debug ( "Transmitting normal packet to %x at %llxx\n", node->nodeid, node->fifo ); */ - generation = node->generation; - dest_node = node->nodeid; - max_payload = node->max_payload; - /* BUG_ON(max_payload < S100_BUFFER_SIZE - IPV4_FRAG_HDR_SIZE); */ + generation = peer->generation; + smp_rmb(); + dest_node = peer->node_id; + + max_payload = peer->max_payload; + datagram_label_ptr = &peer->datagram_label; - datagram_label_ptr = &node->datagram_label; - tx_type = IPV4_WRREQ; - ptask->fifo_addr = node->fifo; + ptask->fifo_addr = peer->fifo; ptask->generation = generation; ptask->dest_node = dest_node; - ptask->speed = node->xmt_speed; + ptask->speed = peer->xmt_speed; } /* If this is an ARP packet, convert it */ if (proto == htons(ETH_P_ARP)) { - /* Convert a standard ARP packet to 1394 ARP. The first 8 bytes (the entire - * arphdr) is the same format as the ip1394 header, so they overlap. The rest - * needs to be munged a bit. The remainder of the arphdr is formatted based - * on hwaddr len and ipaddr len. We know what they'll be, so it's easy to - * judge. - * - * Now that the EUI is used for the hardware address all we need to do to make - * this work for 1394 is to insert 2 quadlets that contain max_rec size, - * speed, and unicast FIFO address information between the sender_unique_id - * and the IP addresses. 
- */ struct arphdr *arp = (struct arphdr *)skb->data; unsigned char *arp_ptr = (unsigned char *)(arp + 1); - struct ipv4_arp *arp1394 = (struct ipv4_arp *)skb->data; - u32 ipaddr; - - ipaddr = *(u32*)(arp_ptr + IPV4_ALEN); - arp1394->hw_addr_len = 16; - arp1394->max_rec = priv->card->max_receive; - arp1394->sspd = priv->card->link_speed; - arp1394->fifo_hi = htons(priv->local_fifo >> 32); - arp1394->fifo_lo = htonl(priv->local_fifo & 0xFFFFFFFF); - arp1394->sip = ipaddr; + struct rfc2734_arp *arp1394 = (struct rfc2734_arp *)skb->data; + __be32 ipaddr; + + ipaddr = get_unaligned((__be32 *)(arp_ptr + FWNET_ALEN)); + + arp1394->hw_addr_len = RFC2734_HW_ADDR_LEN; + arp1394->max_rec = dev->card->max_receive; + arp1394->sspd = dev->card->link_speed; + + put_unaligned_be16(dev->local_fifo >> 32, + &arp1394->fifo_hi); + put_unaligned_be32(dev->local_fifo & 0xffffffff, + &arp1394->fifo_lo); + put_unaligned(ipaddr, &arp1394->sip); } - if ( ipv4_max_xmt && max_payload > ipv4_max_xmt ) - max_payload = ipv4_max_xmt; ptask->hdr.w0 = 0; ptask->hdr.w1 = 0; ptask->skb = skb; - ptask->priv = priv; - ptask->tx_type = tx_type; + ptask->dev = dev; + /* Does it all fit in one packet? */ - if ( dg_size <= max_payload ) { - ipv4_make_uf_hdr(&ptask->hdr, be16_to_cpu(proto)); + if (dg_size <= max_payload) { + fwnet_make_uf_hdr(&ptask->hdr, ntohs(proto)); ptask->outstanding_pkts = 1; - max_payload = dg_size + IPV4_UNFRAG_HDR_SIZE; + max_payload = dg_size + RFC2374_UNFRAG_HDR_SIZE; } else { u16 datagram_label; - max_payload -= IPV4_FRAG_OVERHEAD; + max_payload -= RFC2374_FRAG_OVERHEAD; datagram_label = (*datagram_label_ptr)++; - ipv4_make_ff_hdr(&ptask->hdr, be16_to_cpu(proto), dg_size, datagram_label ); + fwnet_make_ff_hdr(&ptask->hdr, ntohs(proto), dg_size, + datagram_label); ptask->outstanding_pkts = DIV_ROUND_UP(dg_size, max_payload); - max_payload += IPV4_FRAG_HDR_SIZE; + max_payload += RFC2374_FRAG_HDR_SIZE; } ptask->max_payload = max_payload; - ipv4_send_packet ( ptask ); + fwnet_send_packet(ptask); + return NETDEV_TX_OK; fail: if (ptask) - kmem_cache_free(ipv4_packet_task_cache, ptask); + kmem_cache_free(fwnet_packet_task_cache, ptask); if (skb != NULL) dev_kfree_skb(skb); - netdev->stats.tx_dropped++; - netdev->stats.tx_errors++; + net->stats.tx_dropped++; + net->stats.tx_errors++; /* * FIXME: According to a patch from 2003-02-26, "returning non-zero @@ -1540,280 +1418,291 @@ static int ipv4_tx(struct sk_buff *skb, struct net_device *netdev) return NETDEV_TX_OK; } -/* - * FIXME: What to do if we timeout? I think a host reset is probably in order, - * so that's what we do. Should we increment the stat counters too? - */ -static void ipv4_tx_timeout(struct net_device *dev) { - struct ipv4_priv *priv; +static void fwnet_tx_timeout(struct net_device *net) +{ + fw_error("%s: timeout\n", net->name); - priv = netdev_priv(dev); - fw_error ( "%s: Timeout, resetting host\n", dev->name ); -#if 0 /* stefanr */ - fw_core_initiate_bus_reset ( priv->card, 1 ); -#endif + /* FIXME: What to do if we timeout? 
*/ } -static int ipv4_change_mtu ( struct net_device *dev, int new_mtu ) { -#if 0 - int max_mtu; - struct ipv4_priv *priv; -#endif - +static int fwnet_change_mtu(struct net_device *net, int new_mtu) +{ if (new_mtu < 68) return -EINVAL; -#if 0 - priv = netdev_priv(dev); - /* This is not actually true because we can fragment packets at the firewire layer */ - max_mtu = (1 << (priv->card->max_receive + 1)) - - sizeof(struct ipv4_hdr) - IPV4_GASP_OVERHEAD; - if (new_mtu > max_mtu) { - fw_notify ( "%s: Local node constrains MTU to %d\n", dev->name, max_mtu); - return -ERANGE; - } -#endif - dev->mtu = new_mtu; + net->mtu = new_mtu; return 0; } -static void ipv4_get_drvinfo(struct net_device *dev, -struct ethtool_drvinfo *info) { - strcpy(info->driver, ipv4_driver_name); - strcpy(info->bus_info, "ieee1394"); /* FIXME provide more detail? */ +static void fwnet_get_drvinfo(struct net_device *net, + struct ethtool_drvinfo *info) +{ + strcpy(info->driver, KBUILD_MODNAME); + strcpy(info->bus_info, "ieee1394"); } -static struct ethtool_ops ipv4_ethtool_ops = { - .get_drvinfo = ipv4_get_drvinfo, +static struct ethtool_ops fwnet_ethtool_ops = { + .get_drvinfo = fwnet_get_drvinfo, }; -static const struct net_device_ops ipv4_netdev_ops = { - .ndo_open = ipv4_open, - .ndo_stop = ipv4_stop, - .ndo_start_xmit = ipv4_tx, - .ndo_tx_timeout = ipv4_tx_timeout, - .ndo_change_mtu = ipv4_change_mtu, +static const struct net_device_ops fwnet_netdev_ops = { + .ndo_open = fwnet_open, + .ndo_stop = fwnet_stop, + .ndo_start_xmit = fwnet_tx, + .ndo_tx_timeout = fwnet_tx_timeout, + .ndo_change_mtu = fwnet_change_mtu, }; -static void ipv4_init_dev ( struct net_device *dev ) { - dev->header_ops = &ipv4_header_ops; - dev->netdev_ops = &ipv4_netdev_ops; - SET_ETHTOOL_OPS(dev, &ipv4_ethtool_ops); - - dev->watchdog_timeo = IPV4_TIMEOUT; - dev->flags = IFF_BROADCAST | IFF_MULTICAST; - dev->features = NETIF_F_HIGHDMA; - dev->addr_len = IPV4_ALEN; - dev->hard_header_len = IPV4_HLEN; - dev->type = ARPHRD_IEEE1394; - - /* FIXME: This value was copied from ether_setup(). Is it too much? */ - dev->tx_queue_len = 1000; +static void fwnet_init_dev(struct net_device *net) +{ + net->header_ops = &fwnet_header_ops; + net->netdev_ops = &fwnet_netdev_ops; + net->watchdog_timeo = 100000; /* ? FIXME */ + net->flags = IFF_BROADCAST | IFF_MULTICAST; + net->features = NETIF_F_HIGHDMA; + net->addr_len = FWNET_ALEN; + net->hard_header_len = FWNET_HLEN; + net->type = ARPHRD_IEEE1394; + net->tx_queue_len = 1000; /* ? FIXME */ + SET_ETHTOOL_OPS(net, &fwnet_ethtool_ops); } -static int ipv4_probe ( struct device *dev ) { - struct fw_unit * unit; - struct fw_device *device; - struct fw_card *card; - struct net_device *netdev; - struct ipv4_priv *priv; +/* FIXME create netdev upon first fw_unit of a card, not upon local fw_unit */ +static int fwnet_probe(struct device *_dev) +{ + struct fw_unit *unit = fw_unit(_dev); + struct fw_device *device = fw_parent_device(unit); + struct fw_card *card = device->card; + struct net_device *net; + struct fwnet_device *dev; unsigned max_mtu; - __be64 guid; - - fw_debug("ipv4 Probing\n" ); - unit = fw_unit ( dev ); - device = fw_device ( unit->device.parent ); - card = device->card; - if ( ! 
device->is_local ) { + if (!device->is_local) { int added; - fw_debug ( "Non-local, adding remote node entry\n" ); - added = ipv4_node_new ( card, device ); + added = fwnet_peer_new(card, device); return added; } - fw_debug("ipv4 Local: adding netdev\n" ); - netdev = alloc_netdev ( sizeof(*priv), "firewire%d", ipv4_init_dev ); - if ( netdev == NULL) { - fw_error( "Out of memory\n"); + net = alloc_netdev(sizeof(*dev), "firewire%d", fwnet_init_dev); + if (net == NULL) { + fw_error("out of memory\n"); goto out; } - SET_NETDEV_DEV(netdev, card->device); - priv = netdev_priv(netdev); + SET_NETDEV_DEV(net, card->device); + dev = netdev_priv(net); - spin_lock_init(&priv->lock); - priv->broadcast_state = IPV4_BROADCAST_ERROR; - priv->broadcast_rcv_context = NULL; - priv->broadcast_xmt_max_payload = 0; - priv->broadcast_xmt_datagramlabel = 0; + spin_lock_init(&dev->lock); + dev->broadcast_state = FWNET_BROADCAST_ERROR; + dev->broadcast_rcv_context = NULL; + dev->broadcast_xmt_max_payload = 0; + dev->broadcast_xmt_datagramlabel = 0; - priv->local_fifo = INVALID_FIFO_ADDR; + dev->local_fifo = FWNET_NO_FIFO_ADDR; - /* INIT_WORK(&priv->wake, ipv4_handle_queue);*/ - INIT_LIST_HEAD(&priv->packet_list); - INIT_LIST_HEAD(&priv->broadcasted_list); - INIT_LIST_HEAD(&priv->sent_list ); + /* INIT_WORK(&dev->wake, fwnet_handle_queue);*/ + INIT_LIST_HEAD(&dev->packet_list); + INIT_LIST_HEAD(&dev->broadcasted_list); + INIT_LIST_HEAD(&dev->sent_list); - priv->card = card; + dev->card = card; /* * Use the RFC 2734 default 1500 octets or the maximum payload * as initial MTU */ max_mtu = (1 << (card->max_receive + 1)) - - sizeof(struct ipv4_hdr) - IPV4_GASP_OVERHEAD; - netdev->mtu = min(1500U, max_mtu); + - sizeof(struct rfc2734_header) - IEEE1394_GASP_HDR_SIZE; + net->mtu = min(1500U, max_mtu); /* Set our hardware address while we're at it */ - guid = cpu_to_be64(card->guid); - memcpy(netdev->dev_addr, &guid, sizeof(u64)); - memset(netdev->broadcast, 0xff, sizeof(u64)); - if ( register_netdev ( netdev ) ) { - fw_error ( "Cannot register the driver\n"); + put_unaligned_be64(card->guid, net->dev_addr); + put_unaligned_be64(~0ULL, net->broadcast); + if (register_netdev(net)) { + fw_error("Cannot register the driver\n"); goto out; } - fw_notify ( "%s: IPv4 over Firewire on device %016llx\n", - netdev->name, card->guid ); - card->netdev = netdev; + fw_notify("%s: IPv4 over FireWire on device %016llx\n", + net->name, (unsigned long long)card->guid); + card->netdev = net; - return 0 /* ipv4_new_node ( ud ) */; + return 0; out: - if ( netdev ) - free_netdev ( netdev ); + if (net) + free_netdev(net); + return -ENOENT; } +static int fwnet_remove(struct device *_dev) +{ + struct fw_unit *unit = fw_unit(_dev); + struct fw_device *device = fw_parent_device(unit); + struct fw_card *card = device->card; + struct net_device *net; + struct fwnet_device *dev; + struct fwnet_peer *peer; + struct fwnet_partial_datagram *pd, *pd_next; + struct fwnet_packet_task *ptask, *pt_next; + + if (!device->is_local) { + fwnet_peer_delete(card, device); -static int ipv4_remove ( struct device *dev ) { - struct fw_unit * unit; - struct fw_device *device; - struct fw_card *card; - struct net_device *netdev; - struct ipv4_priv *priv; - struct ipv4_node *node; - struct ipv4_partial_datagram *pd, *pd_next; - struct ipv4_packet_task *ptask, *pt_next; - - fw_debug("ipv4 Removing\n" ); - unit = fw_unit ( dev ); - device = fw_device ( unit->device.parent ); - card = device->card; - - if ( ! 
device->is_local ) { - fw_debug ( "Node %x is non-local, removing remote node entry\n", device->node_id ); - ipv4_node_delete ( card, device ); return 0; } - netdev = card->netdev; - if ( netdev ) { - fw_debug ( "Node %x is local: deleting netdev\n", device->node_id ); - priv = netdev_priv ( netdev ); - unregister_netdev ( netdev ); - fw_debug ( "unregistered\n" ); - if ( priv->local_fifo != INVALID_FIFO_ADDR ) - fw_core_remove_address_handler ( &priv->handler ); - fw_debug ( "address handler gone\n" ); - if ( priv->broadcast_rcv_context ) { - fw_iso_context_stop ( priv->broadcast_rcv_context ); - fw_iso_buffer_destroy ( &priv->broadcast_rcv_buffer, priv->card ); - fw_iso_context_destroy ( priv->broadcast_rcv_context ); - fw_debug ( "rcv stopped\n" ); + + net = card->netdev; + if (net) { + dev = netdev_priv(net); + unregister_netdev(net); + + if (dev->local_fifo != FWNET_NO_FIFO_ADDR) + fw_core_remove_address_handler(&dev->handler); + if (dev->broadcast_rcv_context) { + fw_iso_context_stop(dev->broadcast_rcv_context); + fw_iso_buffer_destroy(&dev->broadcast_rcv_buffer, + dev->card); + fw_iso_context_destroy(dev->broadcast_rcv_context); } - list_for_each_entry_safe( ptask, pt_next, &priv->packet_list, packet_list ) { - dev_kfree_skb_any ( ptask->skb ); - kmem_cache_free( ipv4_packet_task_cache, ptask ); + list_for_each_entry_safe(ptask, pt_next, + &dev->packet_list, pt_link) { + dev_kfree_skb_any(ptask->skb); + kmem_cache_free(fwnet_packet_task_cache, ptask); } - list_for_each_entry_safe( ptask, pt_next, &priv->broadcasted_list, packet_list ) { - dev_kfree_skb_any ( ptask->skb ); - kmem_cache_free( ipv4_packet_task_cache, ptask ); + list_for_each_entry_safe(ptask, pt_next, + &dev->broadcasted_list, pt_link) { + dev_kfree_skb_any(ptask->skb); + kmem_cache_free(fwnet_packet_task_cache, ptask); } - list_for_each_entry_safe( ptask, pt_next, &priv->sent_list, packet_list ) { - dev_kfree_skb_any ( ptask->skb ); - kmem_cache_free( ipv4_packet_task_cache, ptask ); + list_for_each_entry_safe(ptask, pt_next, + &dev->sent_list, pt_link) { + dev_kfree_skb_any(ptask->skb); + kmem_cache_free(fwnet_packet_task_cache, ptask); } - fw_debug ( "lists emptied\n" ); - list_for_each_entry( node, &card->ipv4_nodes, ipv4_nodes ) { - if ( node->pdg_size ) { - list_for_each_entry_safe( pd, pd_next, &node->pdg_list, pdg_list ) - ipv4_pd_delete ( pd ); - node->pdg_size = 0; + list_for_each_entry(peer, &card->peer_list, peer_link) { + if (peer->pdg_size) { + list_for_each_entry_safe(pd, pd_next, + &peer->pd_list, pd_link) + fwnet_pd_delete(pd); + peer->pdg_size = 0; } - node->fifo = INVALID_FIFO_ADDR; + peer->fifo = FWNET_NO_FIFO_ADDR; } - fw_debug ( "nodes cleaned up\n" ); - free_netdev ( netdev ); + free_netdev(net); card->netdev = NULL; - fw_debug ( "done\n" ); } + return 0; } -static void ipv4_update ( struct fw_unit *unit ) { - struct fw_device *device; - struct fw_card *card; +/* + * FIXME abort partially sent fragmented datagrams, + * discard partially received fragmented datagrams + */ +static void fwnet_update(struct fw_unit *unit) +{ + struct fw_device *device = fw_parent_device(unit); + struct net_device *net = device->card->netdev; + struct fwnet_device *dev; + struct fwnet_peer *peer; + u64 guid; - fw_debug ( "ipv4_update unit %p\n", unit ); - device = fw_device ( unit->device.parent ); - card = device->card; - if ( ! 
device->is_local ) { - struct ipv4_node *node; - u64 guid; - struct net_device *netdev; - struct ipv4_priv *priv; - - netdev = card->netdev; - if ( netdev ) { - priv = netdev_priv ( netdev ); - guid = (u64)device->config_rom[3] << 32 | device->config_rom[4]; - node = ipv4_node_find_by_guid ( priv, guid ); - if ( ! node ) { - fw_error ( "ipv4_update: no node for device %llx\n", guid ); - return; - } - fw_debug ( "Non-local, updating remote node entry for guid %llx old generation %x, old nodeid %x\n", guid, node->generation, node->nodeid ); - node->generation = device->generation; - rmb(); - node->nodeid = device->node_id; - fw_debug ( "New generation %x, new nodeid %x\n", node->generation, node->nodeid ); - } else - fw_error ( "nonlocal, but no netdev? How can that be?\n" ); - } else { - /* FIXME: What do we need to do on bus reset? */ - fw_debug ( "Local, doing nothing\n" ); + if (net && !device->is_local) { + dev = netdev_priv(net); + guid = (u64)device->config_rom[3] << 32 | device->config_rom[4]; + peer = fwnet_peer_find_by_guid(dev, guid); + if (!peer) { + fw_error("fwnet_update: no peer for device %016llx\n", + (unsigned long long)guid); + return; + } + peer->generation = device->generation; + rmb(); + peer->node_id = device->node_id; } } -static struct fw_driver ipv4_driver = { +static const struct ieee1394_device_id fwnet_id_table[] = { + { + .match_flags = IEEE1394_MATCH_SPECIFIER_ID | + IEEE1394_MATCH_VERSION, + .specifier_id = IANA_SPECIFIER_ID, + .version = RFC2734_SW_VERSION, + }, + { } +}; + +static struct fw_driver fwnet_driver = { .driver = { - .owner = THIS_MODULE, - .name = ipv4_driver_name, - .bus = &fw_bus_type, - .probe = ipv4_probe, - .remove = ipv4_remove, + .owner = THIS_MODULE, + .name = "net", + .bus = &fw_bus_type, + .probe = fwnet_probe, + .remove = fwnet_remove, }, - .update = ipv4_update, - .id_table = ipv4_id_table, + .update = fwnet_update, + .id_table = fwnet_id_table, +}; + +static const u32 rfc2374_unit_directory_data[] = { + 0x00040000, /* directory_length */ + 0x1200005e, /* unit_specifier_id: IANA */ + 0x81000003, /* textual descriptor offset */ + 0x13000001, /* unit_sw_version: RFC 2734 */ + 0x81000005, /* textual descriptor offset */ + 0x00030000, /* descriptor_length */ + 0x00000000, /* text */ + 0x00000000, /* minimal ASCII, en */ + 0x49414e41, /* I A N A */ + 0x00030000, /* descriptor_length */ + 0x00000000, /* text */ + 0x00000000, /* minimal ASCII, en */ + 0x49507634, /* I P v 4 */ +}; + +static struct fw_descriptor rfc2374_unit_directory = { + .length = ARRAY_SIZE(rfc2374_unit_directory_data), + .key = (CSR_DIRECTORY | CSR_UNIT) << 24, + .data = rfc2374_unit_directory_data }; -static int __init ipv4_init ( void ) { - int added; +static int __init fwnet_init(void) +{ + int err; + + err = fw_core_add_descriptor(&rfc2374_unit_directory); + if (err) + return err; - added = fw_core_add_descriptor ( &ipv4_unit_directory ); - if ( added < 0 ) - fw_error ( "Failed to add descriptor" ); - ipv4_packet_task_cache = kmem_cache_create("packet_task", - sizeof(struct ipv4_packet_task), 0, 0, NULL); - fw_debug("Adding ipv4 module\n" ); - return driver_register ( &ipv4_driver.driver ); + fwnet_packet_task_cache = kmem_cache_create("packet_task", + sizeof(struct fwnet_packet_task), 0, 0, NULL); + if (!fwnet_packet_task_cache) { + err = -ENOMEM; + goto out; + } + + err = driver_register(&fwnet_driver.driver); + if (!err) + return 0; + + kmem_cache_destroy(fwnet_packet_task_cache); +out: + fw_core_remove_descriptor(&rfc2374_unit_directory); + + return err; } 
+module_init(fwnet_init); -static void __exit ipv4_cleanup ( void ) { - fw_core_remove_descriptor ( &ipv4_unit_directory ); - fw_debug("Removing ipv4 module\n" ); - driver_unregister ( &ipv4_driver.driver ); +static void __exit fwnet_cleanup(void) +{ + driver_unregister(&fwnet_driver.driver); + kmem_cache_destroy(fwnet_packet_task_cache); + fw_core_remove_descriptor(&rfc2374_unit_directory); } +module_exit(fwnet_cleanup); -module_init(ipv4_init); -module_exit(ipv4_cleanup); +MODULE_AUTHOR("Jay Fenlason "); +MODULE_DESCRIPTION("IPv4 over IEEE1394 as per RFC 2734"); +MODULE_LICENSE("GPL"); +MODULE_DEVICE_TABLE(ieee1394, fwnet_id_table); diff --git a/include/linux/firewire.h b/include/linux/firewire.h index d44f47d3b2d..5cb0c1549ff 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -131,13 +131,10 @@ struct fw_card { bool broadcast_channel_allocated; u32 broadcast_channel; u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; - /* Only non-NULL if firewire-ipv4 is active on this card. */ + + /* firewire-net driver data */ void *netdev; - /* - * The nodes get probed before the card, so we need a place to store - * them independent of card->netdev - */ - struct list_head ipv4_nodes; + struct list_head peer_list; }; static inline struct fw_card *fw_card_get(struct fw_card *card) -- cgit v1.2.3-70-g09d2 From 5a124d382ea5c97be43c779e4f481455e0287654 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 14 Jun 2009 11:45:27 +0200 Subject: firewire: net: allow for unordered unit discovery Decouple the creation and destruction of the net_device from the order of discovery and removal of nodes with RFC 2734 unit directories since there is no reliable order. The net_device is now created when the first RFC 2734 unit on a card is discovered, and destroyed when the last RFC 2734 unit on a card goes away. This includes all remote units as well as the local unit, which is therefore tracked as a peer now too. Also, locking around the list of peers is slightly extended to guard against peer removal. As a side effect, fwnet_peer.pdg_lock has become superfluous and is deleted. Peer data (max_rec, speed, node ID, generation) are updated more carefully. Signed-off-by: Stefan Richter --- drivers/firewire/core-card.c | 2 - drivers/firewire/net.c | 454 ++++++++++++++++++++----------------------- include/linux/firewire.h | 4 - 3 files changed, 207 insertions(+), 253 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 8c45e43da7c..667603ac14b 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -429,8 +429,6 @@ void fw_card_initialize(struct fw_card *card, card->local_node = NULL; INIT_DELAYED_WORK(&card->work, fw_card_bm_work); - card->netdev = NULL; - INIT_LIST_HEAD(&card->peer_list); } EXPORT_SYMBOL(fw_card_initialize); diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c index ba6f924b1b1..d83c54587a6 100644 --- a/drivers/firewire/net.c +++ b/drivers/firewire/net.c @@ -18,8 +18,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -135,40 +137,11 @@ struct fwnet_partial_datagram { u16 datagram_size; }; -/* - * We keep one of these for each IPv4 capable device attached to a fw_card. - * The list of them is stored in the fw_card structure rather than in the - * fwnet_device because the remote IPv4 nodes may be probed before the card is, - * so we need a place to store them before the fwnet_device structure is - * allocated.
- */ -struct fwnet_peer { - struct list_head peer_link; - /* guid of the remote peer */ - u64 guid; - /* FIFO address to transmit datagrams to, or FWNET_NO_FIFO_ADDR */ - u64 fifo; - - spinlock_t pdg_lock; /* partial datagram lock */ - /* List of partial datagrams received from this peer */ - struct list_head pd_list; - /* Number of entries in pd_list at the moment */ - unsigned pdg_size; - - /* max payload to transmit to this remote peer */ - /* This already includes the RFC2374_FRAG_HDR_SIZE overhead */ - u16 max_payload; - /* outgoing datagram label */ - u16 datagram_label; - /* Current node_id of the remote peer */ - u16 node_id; - /* current generation of the remote peer */ - u8 generation; - /* max speed that this peer can receive at */ - u8 xmt_speed; -}; +static DEFINE_MUTEX(fwnet_device_mutex); +static LIST_HEAD(fwnet_device_list); struct fwnet_device { + struct list_head dev_link; spinlock_t lock; enum { FWNET_BROADCAST_ERROR, @@ -206,7 +179,26 @@ struct fwnet_device { /* List of packets that have been sent but not yet acked */ struct list_head sent_list; + struct list_head peer_list; struct fw_card *card; + struct net_device *netdev; +}; + +struct fwnet_peer { + struct list_head peer_link; + struct fwnet_device *dev; + u64 guid; + u64 fifo; + + /* guarded by dev->lock */ + struct list_head pd_list; /* received partial datagrams */ + unsigned pdg_size; /* pd_list size */ + + u16 datagram_label; /* outgoing datagram label */ + unsigned max_payload; /* includes RFC2374_FRAG_HDR_SIZE overhead */ + int node_id; + int generation; + unsigned speed; }; /* This is our task struct. It's used for the packet complete callback. */ @@ -479,102 +471,47 @@ static bool fwnet_pd_is_complete(struct fwnet_partial_datagram *pd) return fi->len == pd->datagram_size; } -static int fwnet_peer_new(struct fw_card *card, struct fw_device *device) -{ - struct fwnet_peer *peer; - - peer = kmalloc(sizeof(*peer), GFP_KERNEL); - if (!peer) { - fw_error("out of memory\n"); - - return -ENOMEM; - } - peer->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4]; - peer->fifo = FWNET_NO_FIFO_ADDR; - INIT_LIST_HEAD(&peer->pd_list); - spin_lock_init(&peer->pdg_lock); - peer->pdg_size = 0; - peer->generation = device->generation; - rmb(); - peer->node_id = device->node_id; - /* FIXME what should it really be? 
*/ - peer->max_payload = IEEE1394_MAX_PAYLOAD_S100 - RFC2374_UNFRAG_HDR_SIZE; - peer->datagram_label = 0U; - peer->xmt_speed = device->max_speed; - list_add_tail(&peer->peer_link, &card->peer_list); - - return 0; -} - -/* FIXME caller must take the lock, or peer needs to be reference-counted */ +/* caller must hold dev->lock */ static struct fwnet_peer *fwnet_peer_find_by_guid(struct fwnet_device *dev, u64 guid) { - struct fwnet_peer *p, *peer = NULL; - unsigned long flags; + struct fwnet_peer *peer; - spin_lock_irqsave(&dev->lock, flags); - list_for_each_entry(p, &dev->card->peer_list, peer_link) - if (p->guid == guid) { - peer = p; - break; - } - spin_unlock_irqrestore(&dev->lock, flags); + list_for_each_entry(peer, &dev->peer_list, peer_link) + if (peer->guid == guid) + return peer; - return peer; + return NULL; } -/* FIXME caller must take the lock, or peer needs to be reference-counted */ -/* FIXME node_id doesn't mean anything without generation */ +/* caller must hold dev->lock */ static struct fwnet_peer *fwnet_peer_find_by_node_id(struct fwnet_device *dev, - u16 node_id) + int node_id, int generation) { - struct fwnet_peer *p, *peer = NULL; - unsigned long flags; + struct fwnet_peer *peer; - spin_lock_irqsave(&dev->lock, flags); - list_for_each_entry(p, &dev->card->peer_list, peer_link) - if (p->node_id == node_id) { - peer = p; - break; - } - spin_unlock_irqrestore(&dev->lock, flags); + list_for_each_entry(peer, &dev->peer_list, peer_link) + if (peer->node_id == node_id && + peer->generation == generation) + return peer; - return peer; + return NULL; } -/* FIXME */ -static void fwnet_peer_delete(struct fw_card *card, struct fw_device *device) +/* See IEEE 1394-2008 table 6-4, table 8-8, table 16-18. */ +static unsigned fwnet_max_payload(unsigned max_rec, unsigned speed) { - struct net_device *net; - struct fwnet_device *dev; - struct fwnet_peer *peer; - u64 guid; - unsigned long flags; - struct fwnet_partial_datagram *pd, *pd_next; - - guid = (u64)device->config_rom[3] << 32 | device->config_rom[4]; - net = card->netdev; - if (net) - dev = netdev_priv(net); - else - dev = NULL; - if (dev) - spin_lock_irqsave(&dev->lock, flags); - - list_for_each_entry(peer, &card->peer_list, peer_link) { - if (peer->guid == guid) { - list_del(&peer->peer_link); - list_for_each_entry_safe(pd, pd_next, &peer->pd_list, - pd_link) - fwnet_pd_delete(pd); - break; - } + max_rec = min(max_rec, speed + 8); + max_rec = min(max_rec, 0xbU); /* <= 4096 */ + if (max_rec < 8) { + fw_notify("max_rec %x out of range\n", max_rec); + max_rec = 8; } - if (dev) - spin_unlock_irqrestore(&dev->lock, flags); + + return (1 << (max_rec + 1)) - RFC2374_FRAG_HDR_SIZE; } + static int fwnet_finish_incoming_packet(struct net_device *net, struct sk_buff *skb, u16 source_node_id, bool is_broadcast, u16 ether_type) @@ -606,71 +543,44 @@ static int fwnet_finish_incoming_packet(struct net_device *net, unsigned char *arp_ptr; u64 fifo_addr; u64 peer_guid; - u8 max_rec; - u8 sspd; + unsigned sspd; u16 max_payload; struct fwnet_peer *peer; - static const u16 fwnet_speed_to_max_payload[] = { - /* S100, S200, S400, S800, S1600, S3200 */ - 512, 1024, 2048, 4096, 4096, 4096 - }; + unsigned long flags; + + arp1394 = (struct rfc2734_arp *)skb->data; + arp = (struct arphdr *)skb->data; + arp_ptr = (unsigned char *)(arp + 1); + peer_guid = get_unaligned_be64(&arp1394->s_uniq_id); + fifo_addr = (u64)get_unaligned_be16(&arp1394->fifo_hi) << 32 + | get_unaligned_be32(&arp1394->fifo_lo); - arp1394 = (struct rfc2734_arp *)skb->data; - arp = 
(struct arphdr *)skb->data; - arp_ptr = (unsigned char *)(arp + 1); - fifo_addr = (u64)ntohs(arp1394->fifo_hi) << 32 - | ntohl(arp1394->fifo_lo); - max_rec = dev->card->max_receive; - if (arp1394->max_rec < max_rec) - max_rec = arp1394->max_rec; sspd = arp1394->sspd; /* Sanity check. OS X 10.3 PPC reportedly sends 131. */ if (sspd > SCODE_3200) { fw_notify("sspd %x out of range\n", sspd); - sspd = 0; + sspd = SCODE_3200; } + max_payload = fwnet_max_payload(arp1394->max_rec, sspd); - max_payload = min(fwnet_speed_to_max_payload[sspd], - (u16)(1 << (max_rec + 1))) - RFC2374_UNFRAG_HDR_SIZE; - - peer_guid = get_unaligned_be64(&arp1394->s_uniq_id); + spin_lock_irqsave(&dev->lock, flags); peer = fwnet_peer_find_by_guid(dev, peer_guid); + if (peer) { + peer->fifo = fifo_addr; + + if (peer->speed > sspd) + peer->speed = sspd; + if (peer->max_payload > max_payload) + peer->max_payload = max_payload; + } + spin_unlock_irqrestore(&dev->lock, flags); + if (!peer) { fw_notify("No peer for ARP packet from %016llx\n", (unsigned long long)peer_guid); goto failed_proto; } - /* FIXME don't use card->generation */ - if (peer->node_id != source_node_id || - peer->generation != dev->card->generation) { - fw_notify("Internal error: peer->node_id (%x) != " - "source_node_id (%x) or peer->generation (%x)" - " != dev->card->generation(%x)\n", - peer->node_id, source_node_id, - peer->generation, dev->card->generation); - peer->node_id = source_node_id; - peer->generation = dev->card->generation; - } - - /* FIXME: for debugging */ - if (sspd > SCODE_400) - sspd = SCODE_400; - /* Update our speed/payload/fifo_offset table */ - /* - * FIXME: this does not handle cases where two high-speed endpoints must use a slower speed because of - * a lower speed hub between them. We need to look at the actual topology map here. - */ - peer->fifo = fifo_addr; - peer->max_payload = max_payload; - /* - * Only allow speeds to go down from their initial value. - * Otherwise a local peer that can only do S400 or slower may - * be told to transmit at S800 to a faster remote peer. - */ - if (peer->xmt_speed > sspd) - peer->xmt_speed = sspd; - /* * Now that we're done with the 1394 specific stuff, we'll * need to alter some of the data. Believe it or not, all @@ -764,10 +674,11 @@ static int fwnet_finish_incoming_packet(struct net_device *net, } static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len, - u16 source_node_id, bool is_broadcast) + int source_node_id, int generation, + bool is_broadcast) { struct sk_buff *skb; - struct net_device *net; + struct net_device *net = dev->netdev; struct rfc2734_header hdr; unsigned lf; unsigned long flags; @@ -779,8 +690,6 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len, int retval; u16 ether_type; - net = dev->card->netdev; - hdr.w0 = be32_to_cpu(buf[0]); lf = fwnet_get_hdr_lf(&hdr); if (lf == RFC2374_HDR_UNFRAG) { @@ -819,9 +728,12 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len, } datagram_label = fwnet_get_hdr_dgl(&hdr); dg_size = fwnet_get_hdr_dg_size(&hdr); /* ??? 
+ 1 */ - peer = fwnet_peer_find_by_node_id(dev, source_node_id); - spin_lock_irqsave(&peer->pdg_lock, flags); + spin_lock_irqsave(&dev->lock, flags); + + peer = fwnet_peer_find_by_node_id(dev, source_node_id, generation); + if (!peer) + goto bad_proto; pd = fwnet_pd_find(peer, datagram_label); if (pd == NULL) { @@ -876,7 +788,7 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len, skb = skb_get(pd->skb); fwnet_pd_delete(pd); - spin_unlock_irqrestore(&peer->pdg_lock, flags); + spin_unlock_irqrestore(&dev->lock, flags); return fwnet_finish_incoming_packet(net, skb, source_node_id, false, ether_type); @@ -885,12 +797,12 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len, * Datagram is not complete, we're done for the * moment. */ - spin_unlock_irqrestore(&peer->pdg_lock, flags); + spin_unlock_irqrestore(&dev->lock, flags); return 0; bad_proto: - spin_unlock_irqrestore(&peer->pdg_lock, flags); + spin_unlock_irqrestore(&dev->lock, flags); if (netif_queue_stopped(net)) netif_wake_queue(net); @@ -916,7 +828,8 @@ static void fwnet_receive_packet(struct fw_card *card, struct fw_request *r, return; } - status = fwnet_incoming_packet(dev, payload, length, source, false); + status = fwnet_incoming_packet(dev, payload, length, + source, generation, false); if (status != 0) { fw_error("Incoming packet failure\n"); fw_send_response(card, r, RCODE_CONFLICT_ERROR); @@ -966,7 +879,7 @@ static void fwnet_receive_broadcast(struct fw_iso_context *context, buf_ptr += 2; length -= IEEE1394_GASP_HDR_SIZE; fwnet_incoming_packet(dev, buf_ptr, length, - source_node_id, true); + source_node_id, -1, true); } packet.payload_length = dev->rcv_buffer_size; @@ -1073,7 +986,6 @@ static int fwnet_send_packet(struct fwnet_packet_task *ptask) unsigned tx_len; struct rfc2734_header *bufhdr; unsigned long flags; - struct net_device *net; dev = ptask->dev; tx_len = ptask->max_payload; @@ -1137,8 +1049,7 @@ static int fwnet_send_packet(struct fwnet_packet_task *ptask) list_add_tail(&ptask->pt_link, &dev->sent_list); spin_unlock_irqrestore(&dev->lock, flags); - net = dev->card->netdev; - net->trans_start = jiffies; + dev->netdev->trans_start = jiffies; return 0; } @@ -1294,7 +1205,8 @@ static int fwnet_tx(struct sk_buff *skb, struct net_device *net) u16 dg_size; u16 *datagram_label_ptr; struct fwnet_packet_task *ptask; - struct fwnet_peer *peer = NULL; + struct fwnet_peer *peer; + unsigned long flags; ptask = kmem_cache_alloc(fwnet_packet_task_cache, GFP_ATOMIC); if (ptask == NULL) @@ -1314,6 +1226,9 @@ static int fwnet_tx(struct sk_buff *skb, struct net_device *net) proto = hdr_buf.h_proto; dg_size = skb->len; + /* serialize access to peer, including peer->datagram_label */ + spin_lock_irqsave(&dev->lock, flags); + /* * Set the transmission type for the packet. ARP packets and IP * broadcast packets are sent via GASP. 
@@ -1322,35 +1237,30 @@ static int fwnet_tx(struct sk_buff *skb, struct net_device *net) || proto == htons(ETH_P_ARP) || (proto == htons(ETH_P_IP) && IN_MULTICAST(ntohl(ip_hdr(skb)->daddr)))) { - max_payload = dev->broadcast_xmt_max_payload; + max_payload = dev->broadcast_xmt_max_payload; datagram_label_ptr = &dev->broadcast_xmt_datagramlabel; - ptask->fifo_addr = FWNET_NO_FIFO_ADDR; - ptask->generation = 0; - ptask->dest_node = IEEE1394_ALL_NODES; - ptask->speed = SCODE_100; + ptask->fifo_addr = FWNET_NO_FIFO_ADDR; + ptask->generation = 0; + ptask->dest_node = IEEE1394_ALL_NODES; + ptask->speed = SCODE_100; } else { __be64 guid = get_unaligned((__be64 *)hdr_buf.h_dest); u8 generation; peer = fwnet_peer_find_by_guid(dev, be64_to_cpu(guid)); - if (!peer) - goto fail; - - if (peer->fifo == FWNET_NO_FIFO_ADDR) - goto fail; + if (!peer || peer->fifo == FWNET_NO_FIFO_ADDR) + goto fail_unlock; - generation = peer->generation; - smp_rmb(); - dest_node = peer->node_id; - - max_payload = peer->max_payload; + generation = peer->generation; + dest_node = peer->node_id; + max_payload = peer->max_payload; datagram_label_ptr = &peer->datagram_label; - ptask->fifo_addr = peer->fifo; - ptask->generation = generation; - ptask->dest_node = dest_node; - ptask->speed = peer->xmt_speed; + ptask->fifo_addr = peer->fifo; + ptask->generation = generation; + ptask->dest_node = dest_node; + ptask->speed = peer->speed; } /* If this is an ARP packet, convert it */ @@ -1393,11 +1303,16 @@ static int fwnet_tx(struct sk_buff *skb, struct net_device *net) ptask->outstanding_pkts = DIV_ROUND_UP(dg_size, max_payload); max_payload += RFC2374_FRAG_HDR_SIZE; } + + spin_unlock_irqrestore(&dev->lock, flags); + ptask->max_payload = max_payload; fwnet_send_packet(ptask); return NETDEV_TX_OK; + fail_unlock: + spin_unlock_irqrestore(&dev->lock, flags); fail: if (ptask) kmem_cache_free(fwnet_packet_task_cache, ptask); @@ -1467,7 +1382,48 @@ static void fwnet_init_dev(struct net_device *net) SET_ETHTOOL_OPS(net, &fwnet_ethtool_ops); } -/* FIXME create netdev upon first fw_unit of a card, not upon local fw_unit */ +/* caller must hold fwnet_device_mutex */ +static struct fwnet_device *fwnet_dev_find(struct fw_card *card) +{ + struct fwnet_device *dev; + + list_for_each_entry(dev, &fwnet_device_list, dev_link) + if (dev->card == card) + return dev; + + return NULL; +} + +static int fwnet_add_peer(struct fwnet_device *dev, + struct fw_unit *unit, struct fw_device *device) +{ + struct fwnet_peer *peer; + + peer = kmalloc(sizeof(*peer), GFP_KERNEL); + if (!peer) + return -ENOMEM; + + unit->device.driver_data = peer; + peer->dev = dev; + peer->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4]; + peer->fifo = FWNET_NO_FIFO_ADDR; + INIT_LIST_HEAD(&peer->pd_list); + peer->pdg_size = 0; + peer->datagram_label = 0; + peer->speed = device->max_speed; + peer->max_payload = fwnet_max_payload(device->max_rec, peer->speed); + + peer->generation = device->generation; + smp_rmb(); + peer->node_id = device->node_id; + + spin_lock_irq(&dev->lock); + list_add_tail(&peer->peer_link, &dev->peer_list); + spin_unlock_irq(&dev->lock); + + return 0; +} + static int fwnet_probe(struct device *_dev) { struct fw_unit *unit = fw_unit(_dev); @@ -1476,16 +1432,22 @@ static int fwnet_probe(struct device *_dev) struct net_device *net; struct fwnet_device *dev; unsigned max_mtu; + bool new_netdev; + int ret; - if (!device->is_local) { - int added; + mutex_lock(&fwnet_device_mutex); - added = fwnet_peer_new(card, device); - return added; + dev = 
fwnet_dev_find(card); + if (dev) { + new_netdev = false; + net = dev->netdev; + goto have_dev; } + + new_netdev = true; net = alloc_netdev(sizeof(*dev), "firewire%d", fwnet_init_dev); if (net == NULL) { - fw_error("out of memory\n"); + ret = -ENOMEM; goto out; } @@ -1500,12 +1462,13 @@ static int fwnet_probe(struct device *_dev) dev->local_fifo = FWNET_NO_FIFO_ADDR; - /* INIT_WORK(&dev->wake, fwnet_handle_queue);*/ INIT_LIST_HEAD(&dev->packet_list); INIT_LIST_HEAD(&dev->broadcasted_list); INIT_LIST_HEAD(&dev->sent_list); + INIT_LIST_HEAD(&dev->peer_list); dev->card = card; + dev->netdev = net; /* * Use the RFC 2734 default 1500 octets or the maximum payload @@ -1518,43 +1481,57 @@ static int fwnet_probe(struct device *_dev) /* Set our hardware address while we're at it */ put_unaligned_be64(card->guid, net->dev_addr); put_unaligned_be64(~0ULL, net->broadcast); - if (register_netdev(net)) { + ret = register_netdev(net); + if (ret) { fw_error("Cannot register the driver\n"); goto out; } + list_add_tail(&dev->dev_link, &fwnet_device_list); fw_notify("%s: IPv4 over FireWire on device %016llx\n", net->name, (unsigned long long)card->guid); - card->netdev = net; - - return 0; + have_dev: + ret = fwnet_add_peer(dev, unit, device); + if (ret && new_netdev) { + unregister_netdev(net); + list_del(&dev->dev_link); + } out: - if (net) + if (ret && new_netdev) free_netdev(net); - return -ENOENT; + mutex_unlock(&fwnet_device_mutex); + + return ret; +} + +static void fwnet_remove_peer(struct fwnet_peer *peer) +{ + struct fwnet_partial_datagram *pd, *pd_next; + + spin_lock_irq(&peer->dev->lock); + list_del(&peer->peer_link); + spin_unlock_irq(&peer->dev->lock); + + list_for_each_entry_safe(pd, pd_next, &peer->pd_list, pd_link) + fwnet_pd_delete(pd); + + kfree(peer); } static int fwnet_remove(struct device *_dev) { - struct fw_unit *unit = fw_unit(_dev); - struct fw_device *device = fw_parent_device(unit); - struct fw_card *card = device->card; + struct fwnet_peer *peer = _dev->driver_data; + struct fwnet_device *dev = peer->dev; struct net_device *net; - struct fwnet_device *dev; - struct fwnet_peer *peer; - struct fwnet_partial_datagram *pd, *pd_next; struct fwnet_packet_task *ptask, *pt_next; - if (!device->is_local) { - fwnet_peer_delete(card, device); + mutex_lock(&fwnet_device_mutex); - return 0; - } + fwnet_remove_peer(peer); - net = card->netdev; - if (net) { - dev = netdev_priv(net); + if (list_empty(&dev->peer_list)) { + net = dev->netdev; unregister_netdev(net); if (dev->local_fifo != FWNET_NO_FIFO_ADDR) @@ -1580,19 +1557,11 @@ static int fwnet_remove(struct device *_dev) dev_kfree_skb_any(ptask->skb); kmem_cache_free(fwnet_packet_task_cache, ptask); } - list_for_each_entry(peer, &card->peer_list, peer_link) { - if (peer->pdg_size) { - list_for_each_entry_safe(pd, pd_next, - &peer->pd_list, pd_link) - fwnet_pd_delete(pd); - peer->pdg_size = 0; - } - peer->fifo = FWNET_NO_FIFO_ADDR; - } free_netdev(net); - card->netdev = NULL; } + mutex_unlock(&fwnet_device_mutex); + return 0; } @@ -1603,24 +1572,15 @@ static int fwnet_remove(struct device *_dev) static void fwnet_update(struct fw_unit *unit) { struct fw_device *device = fw_parent_device(unit); - struct net_device *net = device->card->netdev; - struct fwnet_device *dev; - struct fwnet_peer *peer; - u64 guid; + struct fwnet_peer *peer = unit->device.driver_data; + int generation; - if (net && !device->is_local) { - dev = netdev_priv(net); - guid = (u64)device->config_rom[3] << 32 | device->config_rom[4]; - peer = fwnet_peer_find_by_guid(dev, 
guid); - if (!peer) { - fw_error("fwnet_update: no peer for device %016llx\n", - (unsigned long long)guid); - return; - } - peer->generation = device->generation; - rmb(); - peer->node_id = device->node_id; - } + generation = device->generation; + + spin_lock_irq(&peer->dev->lock); + peer->node_id = device->node_id; + peer->generation = generation; + spin_unlock_irq(&peer->dev->lock); } static const struct ieee1394_device_id fwnet_id_table[] = { diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 5cb0c1549ff..9823946adbc 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -131,10 +131,6 @@ struct fw_card { bool broadcast_channel_allocated; u32 broadcast_channel; u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; - - /* firewire-net driver data */ - void *netdev; - struct list_head peer_list; }; static inline struct fw_card *fw_card_get(struct fw_card *card) -- cgit v1.2.3-70-g09d2 From 7923f90fffa8746f6457d4eea2109fd3d6414189 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 14 Jun 2009 22:10:41 +0200 Subject: vmlinux.lds.h update Updated after review by Tim Abbott. - Use HEAD_TEXT_SECTION - Drop use of section-names.h and delete file - Introduce EXIT_CALL Deleting section-names.h required a few simple updates of init.h Signed-off-by: Sam Ravnborg Cc: Tim Abbott --- include/asm-generic/vmlinux.lds.h | 17 ++++++++++------- include/linux/init.h | 4 +--- include/linux/section-names.h | 6 ------ 3 files changed, 11 insertions(+), 16 deletions(-) delete mode 100644 include/linux/section-names.h (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index fba42236e94..7381f701f3f 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -12,7 +12,7 @@ * { * . = START; * __init_begin = .; - * HEAD_SECTION + * HEAD_TEXT_SECTION * INIT_TEXT_SECTION(PAGE_SIZE) * INIT_DATA_SECTION(...) * PERCPU(PAGE_SIZE) @@ -38,7 +38,7 @@ * /DISCARD/ : { * EXIT_TEXT * EXIT_DATA - * *(.exitcall.exit) + * EXIT_CALL * } * STABS_DEBUG * DWARF_DEBUG @@ -52,7 +52,6 @@ * Examples are: [__initramfs_start, __initramfs_end] for initramfs and * [__nosave_begin, __nosave_end] for the nosave data */ - #include #ifndef LOAD_OFFSET #define LOAD_OFFSET 0 @@ -414,9 +413,9 @@ #endif /* Section used for early init (in .S files) */ -#define HEAD_TEXT *(HEAD_TEXT_SECTION) +#define HEAD_TEXT *(.head.text) -#define HEAD_SECTION \ +#define HEAD_TEXT_SECTION \ .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { \ HEAD_TEXT \ } @@ -473,6 +472,9 @@ CPU_DISCARD(exit.text) \ MEM_DISCARD(exit.text) +#define EXIT_CALL \ + *(.exitcall.exit) + /* * bss (Block Started by Symbol) - uninitialized data * zeroed during startup @@ -692,7 +694,7 @@ * NOSAVE_DATA starts and ends with a PAGE_SIZE alignment which * matches the requirment of PAGE_ALIGNED_DATA. * -/* use 0 as page_align if page_aligned data is not used */ + * use 0 as page_align if page_aligned data is not used */ #define RW_DATA_SECTION(cacheline, nosave, pagealigned, inittask) \ . = ALIGN(PAGE_SIZE); \ .data : AT(ADDR(.data) - LOAD_OFFSET) { \ @@ -726,4 +728,5 @@ #define BSS_SECTION(sbss_align, bss_align) \ SBSS \ BSS(bss_align) \ - . = ALIGN(4); \ + . 
= ALIGN(4); + diff --git a/include/linux/init.h b/include/linux/init.h index 9f70c9f25d4..b2189803f19 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -2,8 +2,6 @@ #define _LINUX_INIT_H #include -#include -#include /* These macros are used to mark some functions or * initialized data (doesn't apply to uninitialized data) @@ -101,7 +99,7 @@ #define __memexitconst __section(.memexit.rodata) /* For assembly routines */ -#define __HEAD .section __stringify(HEAD_TEXT_SECTION),"ax" +#define __HEAD .section ".head.text","ax" #define __INIT .section ".init.text","ax" #define __FINIT .previous diff --git a/include/linux/section-names.h b/include/linux/section-names.h deleted file mode 100644 index c956f4eb2ad..00000000000 --- a/include/linux/section-names.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __LINUX_SECTION_NAMES_H -#define __LINUX_SECTION_NAMES_H - -#define HEAD_TEXT_SECTION .head.text - -#endif /* !__LINUX_SECTION_NAMES_H */ -- cgit v1.2.3-70-g09d2 From 5a7e3d1281bbc4404b250b4a18d3ecb07c77640c Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sat, 13 Jun 2009 14:52:33 +0200 Subject: keyboard: advertise KT_DEAD2 extended diacriticals In addition to KT_DEAD, which has limited support for diacriticals, there is KT_DEAD2, which can support 256 diacriticals, so let's advertise it in include/linux/keyboard.h. This lets userland know about the drivers/char/keyboard.c function k_dead2, which supports more than the few trivial ones that k_dead supports. Signed-off-by: Samuel Thibault Signed-off-by: Linus Torvalds --- include/linux/keyboard.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/keyboard.h b/include/linux/keyboard.h index a3c984d780f..33a63f62d57 100644 --- a/include/linux/keyboard.h +++ b/include/linux/keyboard.h @@ -56,6 +56,7 @@ extern int unregister_keyboard_notifier(struct notifier_block *nb); #define KT_ASCII 9 #define KT_LOCK 10 #define KT_SLOCK 12 +#define KT_DEAD2 13 #define KT_BRL 14 #define K(t,v) (((t)<<8)|(v)) -- cgit v1.2.3-70-g09d2 From 13685a1654b65357fb34066a98ef40445f7820fc Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 10 Jun 2009 04:38:40 +0000 Subject: block: Add bio_list_peek() Introduce bio_list_peek(), to obtain a pointer to the first bio on the bio_list without actually removing it from the list. This is needed when you want to serialize based on the list being empty or not. Signed-off-by: Geert Uytterhoeven Acked-by: Jens Axboe Signed-off-by: Benjamin Herrenschmidt --- include/linux/bio.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 12737be5860..2a04eb54c0d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -590,6 +590,11 @@ static inline void bio_list_merge_head(struct bio_list *bl, bl->head = bl2->head; } +static inline struct bio *bio_list_peek(struct bio_list *bl) +{ + return bl->head; +} + static inline struct bio *bio_list_pop(struct bio_list *bl) { struct bio *bio = bl->head; -- cgit v1.2.3-70-g09d2 From 55f4fa4e33e90c6b25b4c8ed038392a73b654fef Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Thu, 23 Apr 2009 20:10:43 +0200 Subject: Maxim 1586 regulator driver The Maxim 1586 regulator is a voltage regulator with 2 voltage outputs, especially suitable for Marvell PXA chips. One output is in the VCC_CORE range required by PXA27x chips, the other in the VCC_USIM range, which PXA27x chips also require. The chip is controlled through the I2C bus.
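For illustration, board-support code could wire the chip up roughly as follows. This is a minimal sketch against the platform interface added in include/linux/regulator/max1586.h below; the I2C address 0x14, the consumer name "vcc_core" and the constraint values are invented examples, not part of this patch:

	#include <linux/kernel.h>
	#include <linux/init.h>
	#include <linux/i2c.h>
	#include <linux/regulator/machine.h>
	#include <linux/regulator/max1586.h>

	/* V3 feeds VCC_CORE on this hypothetical PXA27x board */
	static struct regulator_init_data board_v3_data = {
		.constraints = {
			.min_uV = 1000000,
			.max_uV = 1475000,
			.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
		},
	};

	static struct max1586_subdev_data board_max1586_subdevs[] = {
		{
			.id = MAX1586_V3,
			.name = "vcc_core",
			.platform_data = &board_v3_data,
		},
	};

	static struct max1586_platform_data board_max1586_data = {
		.num_subdevs = ARRAY_SIZE(board_max1586_subdevs),
		.subdevs = board_max1586_subdevs,
	};

	static struct i2c_board_info board_i2c_devs[] __initdata = {
		{
			I2C_BOARD_INFO("max1586", 0x14),
			.platform_data = &board_max1586_data,
		},
	};

The "max1586" string matches the i2c_device_id table in the driver; the board would register board_i2c_devs with i2c_register_board_info() during machine init.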
Signed-off-by: Robert Jarzmik Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/Kconfig | 9 ++ drivers/regulator/Makefile | 1 + drivers/regulator/max1586.c | 249 ++++++++++++++++++++++++++++++++++++++ include/linux/regulator/max1586.h | 52 ++++++++ 4 files changed, 311 insertions(+) create mode 100644 drivers/regulator/max1586.c create mode 100644 include/linux/regulator/max1586.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index e58c0ce65aa..707da4d2353 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -56,6 +56,15 @@ config REGULATOR_BQ24022 charging select between 100 mA and 500 mA charging current limit. +config REGULATOR_MAX1586 + tristate "Maxim 1586/1587 voltage regulator" + depends on I2C + default n + help + This driver controls a Maxim 1586 or 1587 voltage output + regulator via I2C bus. The provided regulator is suitable + for PXA27x chips to control VCC_CORE and VCC_USIM voltages. + config REGULATOR_TWL4030 bool "TI TWL4030/TWL5030/TPS695x0 PMIC" depends on TWL4030_CORE diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index bac133afc06..1d7de87a8e1 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o +obj-$(CONFIG_REGULATOR_MAX1586) += max1586.o obj-$(CONFIG_REGULATOR_TWL4030) += twl4030-regulator.o obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o diff --git a/drivers/regulator/max1586.c b/drivers/regulator/max1586.c new file mode 100644 index 00000000000..bbbb55fcfe8 --- /dev/null +++ b/drivers/regulator/max1586.c @@ -0,0 +1,249 @@ +/* + * max1586.c -- Voltage and current regulation for the Maxim 1586 + * + * Copyright (C) 2008 Robert Jarzmik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include + +#define MAX1586_V3_MAX_VSEL 31 +#define MAX1586_V6_MAX_VSEL 3 + +#define MAX1586_V3_MIN_UV 700000 +#define MAX1586_V3_MAX_UV 1475000 +#define MAX1586_V3_STEP_UV 25000 + +#define MAX1586_V6_MIN_UV 0 +#define MAX1586_V6_MAX_UV 3000000 + +#define I2C_V3_SELECT (0 << 5) +#define I2C_V6_SELECT (1 << 5) + +/* + * V3 voltage + * On the I2C bus, sending a byte "x" to the max1586 means: + * set V3 to 0.700V + (x & 0x1f) * 0.025V + */ +static int max1586_v3_calc_voltage(unsigned selector) +{ + return MAX1586_V3_MIN_UV + (MAX1586_V3_STEP_UV * selector); +} + +static int max1586_v3_set(struct regulator_dev *rdev, int min_uV, int max_uV) +{ + struct i2c_client *client = rdev_get_drvdata(rdev); + unsigned selector; + u8 v3_prog; + + if (min_uV < MAX1586_V3_MIN_UV || min_uV > MAX1586_V3_MAX_UV) + return -EINVAL; + if (max_uV < MAX1586_V3_MIN_UV || max_uV > MAX1586_V3_MAX_UV) + return -EINVAL; + + selector = (min_uV - MAX1586_V3_MIN_UV) / MAX1586_V3_STEP_UV; + if (max1586_v3_calc_voltage(selector) > max_uV) + return -EINVAL; + + dev_dbg(&client->dev, "changing voltage v3 to %dmV\n", + max1586_v3_calc_voltage(selector) / 1000); + + v3_prog = I2C_V3_SELECT | (u8) selector; + return i2c_smbus_write_byte(client, v3_prog); +} + +static int max1586_v3_list(struct regulator_dev *rdev, unsigned selector) +{ + if (selector > MAX1586_V3_MAX_VSEL) + return -EINVAL; + return max1586_v3_calc_voltage(selector); +} + +/* + * V6 voltage + * On the I2C bus, sending a byte "x" to the max1586 means: + * set V6 to either 0V, 1.8V, 2.5V or 3V depending on (x & 0x3) + * As the regulator framework doesn't accept a voltage of 0V, we use 1uV. + */ +static int max1586_v6_calc_voltage(unsigned selector) +{ + static int voltages_uv[] = { 1, 1800000, 2500000, 3000000 }; + + return voltages_uv[selector]; +} + +static int max1586_v6_set(struct regulator_dev *rdev, int min_uV, int max_uV) +{ + struct i2c_client *client = rdev_get_drvdata(rdev); + unsigned selector; + u8 v6_prog; + + if (min_uV < MAX1586_V6_MIN_UV || min_uV > MAX1586_V6_MAX_UV) + return -EINVAL; + if (max_uV < MAX1586_V6_MIN_UV || max_uV > MAX1586_V6_MAX_UV) + return -EINVAL; + + if (min_uV >= 3000000) + selector = 3; + if (min_uV < 3000000) + selector = 2; + if (min_uV < 2500000) + selector = 1; + if (min_uV < 1800000) + selector = 0; + + if (max1586_v6_calc_voltage(selector) > max_uV) + return -EINVAL; + + dev_dbg(&client->dev, "changing voltage v6 to %dmV\n", + max1586_v6_calc_voltage(selector) / 1000); + + v6_prog = I2C_V6_SELECT | (u8) selector; + return i2c_smbus_write_byte(client, v6_prog); +} + +static int max1586_v6_list(struct regulator_dev *rdev, unsigned selector) +{ + if (selector > MAX1586_V6_MAX_VSEL) + return -EINVAL; + return max1586_v6_calc_voltage(selector); +} + +/* + * The Maxim 1586 controls V3 and V6 voltages, but offers no way of reading back + * the configured value.
+ */ +static struct regulator_ops max1586_v3_ops = { + .set_voltage = max1586_v3_set, + .list_voltage = max1586_v3_list, +}; + +static struct regulator_ops max1586_v6_ops = { + .set_voltage = max1586_v6_set, + .list_voltage = max1586_v6_list, +}; + +static struct regulator_desc max1586_reg[] = { + { + .name = "Output_V3", + .id = MAX1586_V3, + .ops = &max1586_v3_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = MAX1586_V3_MAX_VSEL + 1, + .owner = THIS_MODULE, + }, + { + .name = "Output_V6", + .id = MAX1586_V6, + .ops = &max1586_v6_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = MAX1586_V6_MAX_VSEL + 1, + .owner = THIS_MODULE, + }, +}; + +static int max1586_pmic_probe(struct i2c_client *client, + const struct i2c_device_id *i2c_id) +{ + struct regulator_dev **rdev; + struct max1586_platform_data *pdata = client->dev.platform_data; + int i, id, ret = 0; + + rdev = kzalloc(sizeof(struct regulator_dev *) * (MAX1586_V6 + 1), + GFP_KERNEL); + if (!rdev) + return -ENOMEM; + + ret = -EINVAL; + for (i = 0; i < pdata->num_subdevs && i <= MAX1586_V6; i++) { + id = pdata->subdevs[i].id; + if (!pdata->subdevs[i].platform_data) + continue; + if (id < MAX1586_V3 || id > MAX1586_V6) { + dev_err(&client->dev, "invalid regulator id %d\n", id); + goto err; + } + rdev[i] = regulator_register(&max1586_reg[id], &client->dev, + pdata->subdevs[i].platform_data, + client); + if (IS_ERR(rdev[i])) { + ret = PTR_ERR(rdev[i]); + dev_err(&client->dev, "failed to register %s\n", + max1586_reg[id].name); + goto err; + } + } + + i2c_set_clientdata(client, rdev); + dev_info(&client->dev, "Maxim 1586 regulator driver loaded\n"); + return 0; + +err: + while (--i >= 0) + regulator_unregister(rdev[i]); + kfree(rdev); + return ret; +} + +static int max1586_pmic_remove(struct i2c_client *client) +{ + struct regulator_dev **rdev = i2c_get_clientdata(client); + int i; + + for (i = 0; i <= MAX1586_V6; i++) + if (rdev[i]) + regulator_unregister(rdev[i]); + kfree(rdev); + i2c_set_clientdata(client, NULL); + + return 0; +} + +static const struct i2c_device_id max1586_id[] = { + { "max1586", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, max1586_id); + +static struct i2c_driver max1586_pmic_driver = { + .probe = max1586_pmic_probe, + .remove = max1586_pmic_remove, + .driver = { + .name = "max1586", + }, + .id_table = max1586_id, +}; + +static int __init max1586_pmic_init(void) +{ + return i2c_add_driver(&max1586_pmic_driver); +} +subsys_initcall(max1586_pmic_init); + +static void __exit max1586_pmic_exit(void) +{ + i2c_del_driver(&max1586_pmic_driver); +} +module_exit(max1586_pmic_exit); + +/* Module information */ +MODULE_DESCRIPTION("MAXIM 1586 voltage regulator driver"); +MODULE_AUTHOR("Robert Jarzmik"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/regulator/max1586.h b/include/linux/regulator/max1586.h new file mode 100644 index 00000000000..2056973396b --- /dev/null +++ b/include/linux/regulator/max1586.h @@ -0,0 +1,52 @@ +/* + * max1586.h -- Voltage regulation for the Maxim 1586 + * + * Copyright (C) 2008 Robert Jarzmik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef REGULATOR_MAX1586 +#define REGULATOR_MAX1586 + +#include + +#define MAX1586_V3 0 +#define MAX1586_V6 1 + +/** + * max1586_subdev_data - regulator data + * @id: regulator Id (either MAX1586_V3 or MAX1586_V6) + * @name: regulator cute name (example for V3: "vcc_core") + * @platform_data: regulator init data (constraints, supplies, ...) + */ +struct max1586_subdev_data { + int id; + char *name; + struct regulator_init_data *platform_data; +}; + +/** + * max1586_platform_data - platform data for max1586 + * @num_subdevs: number of regulators used (may be 1 or 2) + * @subdevs: regulators used + * At most, there will be one regulator for V3 and one for V6 voltages. + */ +struct max1586_platform_data { + int num_subdevs; + struct max1586_subdev_data *subdevs; +}; + +#endif -- cgit v1.2.3-70-g09d2 From 1d98cccf7f8b944ba4ea56d14bbb7c2eeee59bfe Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 26 Apr 2009 16:49:39 +0300 Subject: regulator: add userspace-consumer driver The userspace-consumer driver allows control of voltage and current regulator state from userspace. This is required for fine-grained power management of devices that are completely controlled by userspace applications, e.g. a GPS transceiver connected to a serial port. Signed-off-by: Mike Rapoport Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/Kconfig | 10 ++ drivers/regulator/Makefile | 1 + drivers/regulator/userspace-consumer.c | 200 +++++++++++++++++++++++++++ include/linux/regulator/userspace-consumer.h | 25 ++++ 4 files changed, 236 insertions(+) create mode 100644 drivers/regulator/userspace-consumer.c create mode 100644 include/linux/regulator/userspace-consumer.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 707da4d2353..5bec17cf1d5 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -47,6 +47,16 @@ config REGULATOR_VIRTUAL_CONSUMER If unsure, say no. +config REGULATOR_USERSPACE_CONSUMER + tristate "Userspace regulator consumer support" + default n + help + There are some classes of devices that are controlled entirely + from user space. The userspace consumer driver provides the + ability to control power supplies for such devices. + + If unsure, say no. + config REGULATOR_BQ24022 tristate "TI bq24022 Dual Input 1-Cell Li-Ion Charger IC" default n diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 1d7de87a8e1..faf7bcc1af9 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_REGULATOR) += core.o obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o +obj-$(CONFIG_REGULATOR_USERSPACE_CONSUMER) += userspace-consumer.o obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o obj-$(CONFIG_REGULATOR_MAX1586) += max1586.o diff --git a/drivers/regulator/userspace-consumer.c b/drivers/regulator/userspace-consumer.c new file mode 100644 index 00000000000..71fcf9df00f --- /dev/null +++ b/drivers/regulator/userspace-consumer.c @@ -0,0 +1,200 @@ +/* + * userspace-consumer.c + * + * Copyright 2009 CompuLab, Ltd. + * + * Author: Mike Rapoport + * + * Based on the virtual consumer driver: + * Copyright 2008 Wolfson Microelectronics PLC.
+ * Author: Mark Brown + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include + +struct userspace_consumer_data { + const char *name; + + struct mutex lock; + bool enabled; + + int num_supplies; + struct regulator_bulk_data *supplies; +}; + +static ssize_t show_name(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct userspace_consumer_data *data = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", data->name); +} + +static ssize_t show_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct userspace_consumer_data *data = dev_get_drvdata(dev); + + if (data->enabled) + return sprintf(buf, "enabled\n"); + + return sprintf(buf, "disabled\n"); +} + +static ssize_t set_state(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct userspace_consumer_data *data = dev_get_drvdata(dev); + bool enabled; + int ret; + + /* + * sysfs_streq() doesn't need the \n's, but we add them so the strings + * will be shared with show_state(), above. + */ + if (sysfs_streq(buf, "enabled\n") || sysfs_streq(buf, "1")) + enabled = true; + else if (sysfs_streq(buf, "disabled\n") || sysfs_streq(buf, "0")) + enabled = false; + else { + dev_err(dev, "Configuring invalid mode\n"); + return count; + } + + mutex_lock(&data->lock); + if (enabled != data->enabled) { + if (enabled) + ret = regulator_bulk_enable(data->num_supplies, + data->supplies); + else + ret = regulator_bulk_disable(data->num_supplies, + data->supplies); + + if (ret == 0) + data->enabled = enabled; + else + dev_err(dev, "Failed to configure state: %d\n", ret); + } + mutex_unlock(&data->lock); + + return count; +} + +static DEVICE_ATTR(name, 0444, show_name, NULL); +static DEVICE_ATTR(state, 0644, show_state, set_state); + +static struct device_attribute *attributes[] = { + &dev_attr_name, + &dev_attr_state, +}; + +static int regulator_userspace_consumer_probe(struct platform_device *pdev) +{ + struct regulator_userspace_consumer_data *pdata; + struct userspace_consumer_data *drvdata; + int ret, i; + + pdata = pdev->dev.platform_data; + if (!pdata) + return -EINVAL; + + drvdata = kzalloc(sizeof(struct userspace_consumer_data), GFP_KERNEL); + if (drvdata == NULL) + return -ENOMEM; + + drvdata->name = pdata->name; + drvdata->num_supplies = pdata->num_supplies; + drvdata->supplies = pdata->supplies; + + mutex_init(&drvdata->lock); + + ret = regulator_bulk_get(&pdev->dev, drvdata->num_supplies, + drvdata->supplies); + if (ret) { + dev_err(&pdev->dev, "Failed to get supplies: %d\n", ret); + goto err_alloc_supplies; + } + + for (i = 0; i < ARRAY_SIZE(attributes); i++) { + ret = device_create_file(&pdev->dev, attributes[i]); + if (ret != 0) + goto err_create_attrs; + } + + if (pdata->init_on) + ret = regulator_bulk_enable(drvdata->num_supplies, + drvdata->supplies); + + drvdata->enabled = pdata->init_on; + + if (ret) { + dev_err(&pdev->dev, "Failed to set initial state: %d\n", ret); + goto err_create_attrs; + } + + platform_set_drvdata(pdev, drvdata); + + return 0; + +err_create_attrs: + for (i = 0; i < ARRAY_SIZE(attributes); i++) + device_remove_file(&pdev->dev, attributes[i]); + + regulator_bulk_free(drvdata->num_supplies, drvdata->supplies); + +err_alloc_supplies: + kfree(drvdata); + return ret; +} + 
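+/*
+ * Usage sketch (hypothetical example; the instance name "gps" and the
+ * device id 0 are made up): given platform data with .name = "gps", a
+ * "reg-userspace-consumer" platform device exposes
+ *
+ *   /sys/devices/platform/reg-userspace-consumer.0/name
+ *   /sys/devices/platform/reg-userspace-consumer.0/state
+ *
+ * and userspace switches the supplies with
+ *
+ *   echo enabled > state   (or "1")
+ *   echo disabled > state  (or "0")
+ */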
+static int regulator_userspace_consumer_remove(struct platform_device *pdev) +{ + struct userspace_consumer_data *data = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < ARRAY_SIZE(attributes); i++) + device_remove_file(&pdev->dev, attributes[i]); + + if (data->enabled) + regulator_bulk_disable(data->num_supplies, data->supplies); + + regulator_bulk_free(data->num_supplies, data->supplies); + kfree(data); + + return 0; +} + +static struct platform_driver regulator_userspace_consumer_driver = { + .probe = regulator_userspace_consumer_probe, + .remove = regulator_userspace_consumer_remove, + .driver = { + .name = "reg-userspace-consumer", + }, +}; + + +static int __init regulator_userspace_consumer_init(void) +{ + return platform_driver_register(&regulator_userspace_consumer_driver); +} +module_init(regulator_userspace_consumer_init); + +static void __exit regulator_userspace_consumer_exit(void) +{ + platform_driver_unregister(&regulator_userspace_consumer_driver); +} +module_exit(regulator_userspace_consumer_exit); + +MODULE_AUTHOR("Mike Rapoport "); +MODULE_DESCRIPTION("Userspace consumer for voltage and current regulators"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/regulator/userspace-consumer.h b/include/linux/regulator/userspace-consumer.h new file mode 100644 index 00000000000..b4554ce9d4b --- /dev/null +++ b/include/linux/regulator/userspace-consumer.h @@ -0,0 +1,25 @@ +#ifndef __REGULATOR_PLATFORM_CONSUMER_H_ +#define __REGULATOR_PLATFORM_CONSUMER_H_ + +struct regulator_consumer_supply; + +/** + * struct regulator_userspace_consumer_data - line consumer + * initialisation data. + * + * @name: Name for the consumer line + * @num_supplies: Number of supplies feeding the line + * @supplies: Supplies configuration. + * @init_on: Set if the regulators supplying the line should be + * enabled during initialisation + */ +struct regulator_userspace_consumer_data { + const char *name; + + int num_supplies; + struct regulator_bulk_data *supplies; + + bool init_on; +}; + +#endif /* __REGULATOR_PLATFORM_CONSUMER_H_ */ -- cgit v1.2.3-70-g09d2 From 0cbdf7bce5b98807b946d1a96956f30dcae24a50 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 19 May 2009 07:33:55 +0200 Subject: LP3971 PMIC regulator driver (updated and combined version) This patch adds a regulator driver for the National Semiconductor LP3971 PMIC. The LP3971 PMIC controller has 3 DC/DC voltage converters and 5 low drop-out (LDO) regulators. The LP3971 PMIC controller uses an I2C interface.
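Once the LP3971 outputs are registered, client drivers reach them through the standard regulator consumer API. A minimal sketch, assuming a board that maps a hypothetical "vcc_lcd" supply onto one of the chip's LDOs; the consumer device, supply name and voltage are invented examples, not part of this patch:

	#include <linux/err.h>
	#include <linux/regulator/consumer.h>

	static struct regulator *lcd_reg;

	static int lcd_power_on(struct device *dev)
	{
		int ret;

		/* "vcc_lcd" must be mapped to an LP3971 output in the
		 * board's regulator_init_data (hypothetical name) */
		lcd_reg = regulator_get(dev, "vcc_lcd");
		if (IS_ERR(lcd_reg))
			return PTR_ERR(lcd_reg);

		/* 3.3V is within the range of the chip's LDO1-LDO3 */
		ret = regulator_set_voltage(lcd_reg, 3300000, 3300000);
		if (!ret)
			ret = regulator_enable(lcd_reg);
		if (ret) {
			regulator_put(lcd_reg);
			lcd_reg = NULL;
		}
		return ret;
	}

	static void lcd_power_off(void)
	{
		if (!lcd_reg)
			return;
		regulator_disable(lcd_reg);
		regulator_put(lcd_reg);
		lcd_reg = NULL;
	}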
Reviewed-by: Kyungmin Park Signed-off-by: Marek Szyprowski Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/Kconfig | 7 + drivers/regulator/Makefile | 1 + drivers/regulator/lp3971.c | 562 +++++++++++++++++++++++++++++++++++++++ include/linux/regulator/lp3971.h | 51 ++++ 4 files changed, 621 insertions(+) create mode 100644 drivers/regulator/lp3971.c create mode 100644 include/linux/regulator/lp3971.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 5bec17cf1d5..f4317798e47 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -110,4 +110,11 @@ config REGULATOR_PCF50633 Say Y here to support the voltage regulators and convertors on PCF50633 +config REGULATOR_LP3971 + tristate "National Semiconductor LP3971 PMIC regulator driver" + depends on I2C + help + Say Y here to support the voltage regulators and converters + on the National Semiconductor LP3971 PMIC + endif diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index faf7bcc1af9..4d762c4cccf 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o obj-$(CONFIG_REGULATOR_USERSPACE_CONSUMER) += userspace-consumer.o obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o +obj-$(CONFIG_REGULATOR_LP3971) += lp3971.o obj-$(CONFIG_REGULATOR_MAX1586) += max1586.o obj-$(CONFIG_REGULATOR_TWL4030) += twl4030-regulator.o obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o diff --git a/drivers/regulator/lp3971.c b/drivers/regulator/lp3971.c new file mode 100644 index 00000000000..35abebea4ed --- /dev/null +++ b/drivers/regulator/lp3971.c @@ -0,0 +1,562 @@ +/* + * Regulator driver for the National Semiconductor LP3971 PMIC chip + * + * Copyright (C) 2009 Samsung Electronics + * Author: Marek Szyprowski + * + * Based on wm8350.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ * + */ + +#include +#include +#include +#include +#include +#include + +struct lp3971 { + struct device *dev; + struct mutex io_lock; + struct i2c_client *i2c; + int num_regulators; + struct regulator_dev **rdev; +}; + +static u8 lp3971_reg_read(struct lp3971 *lp3971, u8 reg); +static int lp3971_set_bits(struct lp3971 *lp3971, u8 reg, u16 mask, u16 val); + +#define LP3971_SYS_CONTROL1_REG 0x07 + +/* System control register 1 initial value, + bits 4 and 5 are EPROM programmable */ +#define SYS_CONTROL1_INIT_VAL 0x40 +#define SYS_CONTROL1_INIT_MASK 0xCF + +#define LP3971_BUCK_VOL_ENABLE_REG 0x10 +#define LP3971_BUCK_VOL_CHANGE_REG 0x20 + +/* Voltage control registers shift: + LP3971_BUCK1 -> 0 + LP3971_BUCK2 -> 4 + LP3971_BUCK3 -> 6 +*/ +#define BUCK_VOL_CHANGE_SHIFT(x) (((!!(x)) << 2) | ((x) & ~0x01)) +#define BUCK_VOL_CHANGE_FLAG_GO 0x01 +#define BUCK_VOL_CHANGE_FLAG_TARGET 0x02 +#define BUCK_VOL_CHANGE_FLAG_MASK 0x03 + +#define LP3971_BUCK1_BASE 0x23 +#define LP3971_BUCK2_BASE 0x29 +#define LP3971_BUCK3_BASE 0x32 + +static const int buck_base_addr[] = { + LP3971_BUCK1_BASE, + LP3971_BUCK2_BASE, + LP3971_BUCK3_BASE, +}; + +#define LP3971_BUCK_TARGET_VOL1_REG(x) (buck_base_addr[x]) +#define LP3971_BUCK_TARGET_VOL2_REG(x) (buck_base_addr[x]+1) + +static const int buck_voltage_map[] = { + 0, 800, 850, 900, 950, 1000, 1050, 1100, + 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, + 1550, 1600, 1650, 1700, 1800, 1900, 2500, 2800, + 3000, 3300, +}; + +#define BUCK_TARGET_VOL_MASK 0x3f +#define BUCK_TARGET_VOL_MIN_IDX 0x01 +#define BUCK_TARGET_VOL_MAX_IDX 0x19 + +#define LP3971_BUCK_RAMP_REG(x) (buck_base_addr[x]+2) + +#define LP3971_LDO_ENABLE_REG 0x12 +#define LP3971_LDO_VOL_CONTR_BASE 0x39 + +/* Voltage control registers: + LP3971_LDO1 -> LP3971_LDO_VOL_CONTR_BASE + 0 + LP3971_LDO2 -> LP3971_LDO_VOL_CONTR_BASE + 0 + LP3971_LDO3 -> LP3971_LDO_VOL_CONTR_BASE + 1 + LP3971_LDO4 -> LP3971_LDO_VOL_CONTR_BASE + 1 + LP3971_LDO5 -> LP3971_LDO_VOL_CONTR_BASE + 2 +*/ +#define LP3971_LDO_VOL_CONTR_REG(x) (LP3971_LDO_VOL_CONTR_BASE + (x >> 1)) + +/* Voltage control registers shift: + LP3971_LDO1 -> 0, LP3971_LDO2 -> 4 + LP3971_LDO3 -> 0, LP3971_LDO4 -> 4 + LP3971_LDO5 -> 0 +*/ +#define LDO_VOL_CONTR_SHIFT(x) ((x & 1) << 2) +#define LDO_VOL_CONTR_MASK 0x0f + +static const int ldo45_voltage_map[] = { + 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, + 1400, 1500, 1800, 1900, 2500, 2800, 3000, 3300, +}; + +static const int ldo123_voltage_map[] = { + 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, + 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, +}; + +static const int *ldo_voltage_map[] = { + ldo123_voltage_map, /* LDO1 */ + ldo123_voltage_map, /* LDO2 */ + ldo123_voltage_map, /* LDO3 */ + ldo45_voltage_map, /* LDO4 */ + ldo45_voltage_map, /* LDO5 */ +}; + +#define LDO_VOL_VALUE_MAP(x) (ldo_voltage_map[(x - LP3971_LDO1)]) + +#define LDO_VOL_MIN_IDX 0x00 +#define LDO_VOL_MAX_IDX 0x0f + +static int lp3971_ldo_list_voltage(struct regulator_dev *dev, unsigned index) +{ + int ldo = rdev_get_id(dev) - LP3971_LDO1; + return 1000 * LDO_VOL_VALUE_MAP(ldo)[index]; +} + +static int lp3971_ldo_is_enabled(struct regulator_dev *dev) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int ldo = rdev_get_id(dev) - LP3971_LDO1; + u16 mask = 1 << (1 + ldo); + u16 val; + + val = lp3971_reg_read(lp3971, LP3971_LDO_ENABLE_REG); + return (val & mask) != 0; +} + +static int lp3971_ldo_enable(struct regulator_dev *dev) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int ldo = rdev_get_id(dev) - LP3971_LDO1; + u16 mask = 1 << (1 +
ldo); + + return lp3971_set_bits(lp3971, LP3971_LDO_ENABLE_REG, mask, mask); +} + +static int lp3971_ldo_disable(struct regulator_dev *dev) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int ldo = rdev_get_id(dev) - LP3971_LDO1; + u16 mask = 1 << (1 + ldo); + + return lp3971_set_bits(lp3971, LP3971_LDO_ENABLE_REG, mask, 0); +} + +static int lp3971_ldo_get_voltage(struct regulator_dev *dev) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int ldo = rdev_get_id(dev) - LP3971_LDO1; + u16 val, reg; + + reg = lp3971_reg_read(lp3971, LP3971_LDO_VOL_CONTR_REG(ldo)); + val = (reg >> LDO_VOL_CONTR_SHIFT(ldo)) & LDO_VOL_CONTR_MASK; + + return 1000 * LDO_VOL_VALUE_MAP(ldo)[val]; +} + +static int lp3971_ldo_set_voltage(struct regulator_dev *dev, + int min_uV, int max_uV) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int ldo = rdev_get_id(dev) - LP3971_LDO1; + int min_vol = min_uV / 1000, max_vol = max_uV / 1000; + const int *vol_map = LDO_VOL_VALUE_MAP(ldo); + u16 val; + + if (min_vol < vol_map[LDO_VOL_MIN_IDX] || + min_vol > vol_map[LDO_VOL_MAX_IDX]) + return -EINVAL; + + for (val = LDO_VOL_MIN_IDX; val <= LDO_VOL_MAX_IDX; val++) + if (vol_map[val] >= min_vol) + break; + + if (vol_map[val] > max_vol) + return -EINVAL; + + return lp3971_set_bits(lp3971, LP3971_LDO_VOL_CONTR_REG(ldo), + LDO_VOL_CONTR_MASK << LDO_VOL_CONTR_SHIFT(ldo), val); +} + +static struct regulator_ops lp3971_ldo_ops = { + .list_voltage = lp3971_ldo_list_voltage, + .is_enabled = lp3971_ldo_is_enabled, + .enable = lp3971_ldo_enable, + .disable = lp3971_ldo_disable, + .get_voltage = lp3971_ldo_get_voltage, + .set_voltage = lp3971_ldo_set_voltage, +}; + +static int lp3971_dcdc_list_voltage(struct regulator_dev *dev, unsigned index) +{ + return 1000 * buck_voltage_map[index]; +} + +static int lp3971_dcdc_is_enabled(struct regulator_dev *dev) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int buck = rdev_get_id(dev) - LP3971_DCDC1; + u16 mask = 1 << (buck * 2); + u16 val; + + val = lp3971_reg_read(lp3971, LP3971_BUCK_VOL_ENABLE_REG); + return (val & mask) != 0; +} + +static int lp3971_dcdc_enable(struct regulator_dev *dev) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int buck = rdev_get_id(dev) - LP3971_DCDC1; + u16 mask = 1 << (buck * 2); + + return lp3971_set_bits(lp3971, LP3971_BUCK_VOL_ENABLE_REG, mask, mask); +} + +static int lp3971_dcdc_disable(struct regulator_dev *dev) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int buck = rdev_get_id(dev) - LP3971_DCDC1; + u16 mask = 1 << (buck * 2); + + return lp3971_set_bits(lp3971, LP3971_BUCK_VOL_ENABLE_REG, mask, 0); +} + +static int lp3971_dcdc_get_voltage(struct regulator_dev *dev) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int buck = rdev_get_id(dev) - LP3971_DCDC1; + u16 reg; + int val; + + reg = lp3971_reg_read(lp3971, LP3971_BUCK_TARGET_VOL1_REG(buck)); + reg &= BUCK_TARGET_VOL_MASK; + + if (reg <= BUCK_TARGET_VOL_MAX_IDX) + val = 1000 * buck_voltage_map[reg]; + else { + val = 0; + dev_warn(&dev->dev, "chip reported incorrect voltage value.\n"); + } + + return val; +} + +static int lp3971_dcdc_set_voltage(struct regulator_dev *dev, + int min_uV, int max_uV) +{ + struct lp3971 *lp3971 = rdev_get_drvdata(dev); + int buck = rdev_get_id(dev) - LP3971_DCDC1; + int min_vol = min_uV / 1000, max_vol = max_uV / 1000; + const int *vol_map = buck_voltage_map; + u16 val; + int ret; + + if (min_vol < vol_map[BUCK_TARGET_VOL_MIN_IDX] || + min_vol > vol_map[BUCK_TARGET_VOL_MAX_IDX]) + return -EINVAL; + + for (val = BUCK_TARGET_VOL_MIN_IDX; 
val <= BUCK_TARGET_VOL_MAX_IDX; + val++) + if (vol_map[val] >= min_vol) + break; + + if (vol_map[val] > max_vol) + return -EINVAL; + + ret = lp3971_set_bits(lp3971, LP3971_BUCK_TARGET_VOL1_REG(buck), + BUCK_TARGET_VOL_MASK, val); + if (ret) + return ret; + + ret = lp3971_set_bits(lp3971, LP3971_BUCK_VOL_CHANGE_REG, + BUCK_VOL_CHANGE_FLAG_MASK << BUCK_VOL_CHANGE_SHIFT(buck), + BUCK_VOL_CHANGE_FLAG_GO << BUCK_VOL_CHANGE_SHIFT(buck)); + if (ret) + return ret; + + return lp3971_set_bits(lp3971, LP3971_BUCK_VOL_CHANGE_REG, + BUCK_VOL_CHANGE_FLAG_MASK << BUCK_VOL_CHANGE_SHIFT(buck), + 0 << BUCK_VOL_CHANGE_SHIFT(buck)); +} + +static struct regulator_ops lp3971_dcdc_ops = { + .list_voltage = lp3971_dcdc_list_voltage, + .is_enabled = lp3971_dcdc_is_enabled, + .enable = lp3971_dcdc_enable, + .disable = lp3971_dcdc_disable, + .get_voltage = lp3971_dcdc_get_voltage, + .set_voltage = lp3971_dcdc_set_voltage, +}; + +static struct regulator_desc regulators[] = { + { + .name = "LDO1", + .id = LP3971_LDO1, + .ops = &lp3971_ldo_ops, + .n_voltages = ARRAY_SIZE(ldo123_voltage_map), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + }, + { + .name = "LDO2", + .id = LP3971_LDO2, + .ops = &lp3971_ldo_ops, + .n_voltages = ARRAY_SIZE(ldo123_voltage_map), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + }, + { + .name = "LDO3", + .id = LP3971_LDO3, + .ops = &lp3971_ldo_ops, + .n_voltages = ARRAY_SIZE(ldo123_voltage_map), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + }, + { + .name = "LDO4", + .id = LP3971_LDO4, + .ops = &lp3971_ldo_ops, + .n_voltages = ARRAY_SIZE(ldo45_voltage_map), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + }, + { + .name = "LDO5", + .id = LP3971_LDO5, + .ops = &lp3971_ldo_ops, + .n_voltages = ARRAY_SIZE(ldo45_voltage_map), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + }, + { + .name = "DCDC1", + .id = LP3971_DCDC1, + .ops = &lp3971_dcdc_ops, + .n_voltages = ARRAY_SIZE(buck_voltage_map), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + }, + { + .name = "DCDC2", + .id = LP3971_DCDC2, + .ops = &lp3971_dcdc_ops, + .n_voltages = ARRAY_SIZE(buck_voltage_map), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + }, + { + .name = "DCDC3", + .id = LP3971_DCDC3, + .ops = &lp3971_dcdc_ops, + .n_voltages = ARRAY_SIZE(buck_voltage_map), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + }, +}; + +static int lp3971_i2c_read(struct i2c_client *i2c, char reg, int count, + u16 *dest) +{ + int ret; + + if (count != 1) + return -EIO; + ret = i2c_smbus_read_byte_data(i2c, reg); + if (ret < 0) + return -EIO; + + *dest = ret; + return 0; +} + +static int lp3971_i2c_write(struct i2c_client *i2c, char reg, int count, + const u16 *src) +{ + int ret; + + if (count != 1) + return -EIO; + ret = i2c_smbus_write_byte_data(i2c, reg, *src); + if (ret >= 0) + return 0; + + return ret; +} + +static u8 lp3971_reg_read(struct lp3971 *lp3971, u8 reg) +{ + u16 val = 0; + + mutex_lock(&lp3971->io_lock); + + lp3971_i2c_read(lp3971->i2c, reg, 1, &val); + + dev_dbg(lp3971->dev, "reg read 0x%02x -> 0x%02x\n", (int)reg, + (unsigned)val & 0xff); + + mutex_unlock(&lp3971->io_lock); + + return val & 0xff; +} + +static int lp3971_set_bits(struct lp3971 *lp3971, u8 reg, u16 mask, u16 val) +{ + u16 tmp; + int ret; + + mutex_lock(&lp3971->io_lock); + + ret = lp3971_i2c_read(lp3971->i2c, reg, 1, &tmp); + tmp = (tmp & ~mask) | val; + if (ret == 0) { + ret = lp3971_i2c_write(lp3971->i2c, reg, 1, &tmp); + dev_dbg(lp3971->dev, "reg write 0x%02x -> 0x%02x\n", (int)reg, +
(unsigned)val & 0xff); + } + mutex_unlock(&lp3971->io_lock); + + return ret; +} + +static int setup_regulators(struct lp3971 *lp3971, + struct lp3971_platform_data *pdata) +{ + int i, err; + int num_regulators = pdata->num_regulators; + lp3971->num_regulators = num_regulators; + lp3971->rdev = kzalloc(sizeof(struct regulator_dev *) * num_regulators, + GFP_KERNEL); + if (!lp3971->rdev) + return -ENOMEM; + + /* Instantiate the regulators */ + for (i = 0; i < num_regulators; i++) { + int id = pdata->regulators[i].id; + lp3971->rdev[i] = regulator_register(&regulators[id], + lp3971->dev, pdata->regulators[i].initdata, lp3971); + + if (IS_ERR(lp3971->rdev[i])) { + err = PTR_ERR(lp3971->rdev[i]); + dev_err(lp3971->dev, "regulator init failed: %d\n", + err); + lp3971->rdev[i] = NULL; + goto error; + } + } + + return 0; +error: + for (i = 0; i < num_regulators; i++) + if (lp3971->rdev[i]) + regulator_unregister(lp3971->rdev[i]); + kfree(lp3971->rdev); + lp3971->rdev = NULL; + return err; +} + +static int __devinit lp3971_i2c_probe(struct i2c_client *i2c, + const struct i2c_device_id *id) +{ + struct lp3971 *lp3971; + struct lp3971_platform_data *pdata = i2c->dev.platform_data; + int ret; + u16 val; + + lp3971 = kzalloc(sizeof(struct lp3971), GFP_KERNEL); + if (lp3971 == NULL) { + ret = -ENOMEM; + goto err; + } + + lp3971->i2c = i2c; + lp3971->dev = &i2c->dev; + i2c_set_clientdata(i2c, lp3971); + + mutex_init(&lp3971->io_lock); + + /* Detect LP3971 */ + ret = lp3971_i2c_read(i2c, LP3971_SYS_CONTROL1_REG, 1, &val); + if (ret == 0 && (val & SYS_CONTROL1_INIT_MASK) != SYS_CONTROL1_INIT_VAL) + ret = -ENODEV; + if (ret < 0) { + dev_err(&i2c->dev, "failed to detect device\n"); + goto err_detect; + } + + if (pdata) { + ret = setup_regulators(lp3971, pdata); + if (ret < 0) + goto err_detect; + } else + dev_warn(lp3971->dev, "No platform init data supplied\n"); + + return 0; + +err_detect: + i2c_set_clientdata(i2c, NULL); + kfree(lp3971); +err: + return ret; +} + +static int __devexit lp3971_i2c_remove(struct i2c_client *i2c) +{ + struct lp3971 *lp3971 = i2c_get_clientdata(i2c); + int i; + for (i = 0; i < lp3971->num_regulators; i++) + if (lp3971->rdev[i]) + regulator_unregister(lp3971->rdev[i]); + kfree(lp3971->rdev); + i2c_set_clientdata(i2c, NULL); + kfree(lp3971); + + return 0; +} + +static const struct i2c_device_id lp3971_i2c_id[] = { + { "lp3971", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, lp3971_i2c_id); + +static struct i2c_driver lp3971_i2c_driver = { + .driver = { + .name = "LP3971", + .owner = THIS_MODULE, + }, + .probe = lp3971_i2c_probe, + .remove = lp3971_i2c_remove, + .id_table = lp3971_i2c_id, +}; + +static int __init lp3971_module_init(void) +{ + int ret; + + ret = i2c_add_driver(&lp3971_i2c_driver); + if (ret != 0) + pr_err("Failed to register I2C driver: %d\n", ret); + + return ret; +} +module_init(lp3971_module_init); + +static void __exit lp3971_module_exit(void) +{ + i2c_del_driver(&lp3971_i2c_driver); +} +module_exit(lp3971_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marek Szyprowski "); +MODULE_DESCRIPTION("LP3971 PMIC driver"); diff --git a/include/linux/regulator/lp3971.h b/include/linux/regulator/lp3971.h new file mode 100644 index 00000000000..61401649fe7 --- /dev/null +++ b/include/linux/regulator/lp3971.h @@ -0,0 +1,51 @@ +/* + * National Semiconductor LP3971 PMIC chip client interface + * + * Copyright (C) 2009 Samsung Electronics + * Author: Marek Szyprowski + * + * Based on wm8400.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free
Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __LINUX_REGULATOR_LP3971_H +#define __LINUX_REGULATOR_LP3971_H + +#include + +#define LP3971_LDO1 0 +#define LP3971_LDO2 1 +#define LP3971_LDO3 2 +#define LP3971_LDO4 3 +#define LP3971_LDO5 4 + +#define LP3971_DCDC1 5 +#define LP3971_DCDC2 6 +#define LP3971_DCDC3 7 + +#define LP3971_NUM_REGULATORS 8 + +struct lp3971_regulator_subdev { + int id; + struct regulator_init_data *initdata; +}; + +struct lp3971_platform_data { + int num_regulators; + struct lp3971_regulator_subdev *regulators; +}; + +#endif -- cgit v1.2.3-70-g09d2 From b110a8fb242bc34e4b7686252899ce0fca956e2c Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 28 May 2009 07:15:16 +0200 Subject: regulator/max1586: support increased V3 voltage range The V3 regulator can be configured with an external resistor connected to the feedback pin (R24 in the data sheet) to increase the voltage range. For example, hx4700 has R24 = 3.32 kOhm to achieve a maximum V3 voltage of 1.55 V which is needed for 624 MHz CPU frequency. Signed-off-by: Philipp Zabel Acked-by: Mark Brown Acked-by: Robert Jarzmik Signed-off-by: Liam Girdwood --- drivers/regulator/max1586.c | 71 ++++++++++++++++++++++++++++----------- include/linux/regulator/max1586.h | 11 ++++++ 2 files changed, 63 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/max1586.c b/drivers/regulator/max1586.c index bbbb55fcfe8..92799f40c86 100644 --- a/drivers/regulator/max1586.c +++ b/drivers/regulator/max1586.c @@ -29,7 +29,6 @@ #define MAX1586_V3_MIN_UV 700000 #define MAX1586_V3_MAX_UV 1475000 -#define MAX1586_V3_STEP_UV 25000 #define MAX1586_V6_MIN_UV 0 #define MAX1586_V6_MAX_UV 3000000 @@ -37,33 +36,52 @@ #define I2C_V3_SELECT (0 << 5) #define I2C_V6_SELECT (1 << 5) +struct max1586_data { + struct i2c_client *client; + + /* min/max V3 voltage */ + int min_uV; + int max_uV; + + struct regulator_dev *rdev[0]; +}; + /* * V3 voltage * On I2C bus, sending a "x" byte to the max1586 means : * set V3 to 0.700V + (x & 0x1f) * 0.025V + * This voltage can be increased by external resistors + * R24 and R25=100kOhm as described in the data sheet. 
+ * The gain is approximately: 1 + R24/R25 + R24/185.5kOhm */ -static int max1586_v3_calc_voltage(unsigned selector) +static int max1586_v3_calc_voltage(struct max1586_data *max1586, + unsigned selector) { - return MAX1586_V3_MIN_UV + (MAX1586_V3_STEP_UV * selector); + unsigned range_uV = max1586->max_uV - max1586->min_uV; + + return max1586->min_uV + (selector * range_uV / MAX1586_V3_MAX_VSEL); } static int max1586_v3_set(struct regulator_dev *rdev, int min_uV, int max_uV) { - struct i2c_client *client = rdev_get_drvdata(rdev); + struct max1586_data *max1586 = rdev_get_drvdata(rdev); + struct i2c_client *client = max1586->client; + unsigned range_uV = max1586->max_uV - max1586->min_uV; unsigned selector; u8 v3_prog; - if (min_uV < MAX1586_V3_MIN_UV || min_uV > MAX1586_V3_MAX_UV) - return -EINVAL; - if (max_uV < MAX1586_V3_MIN_UV || max_uV > MAX1586_V3_MAX_UV) + if (min_uV > max1586->max_uV || max_uV < max1586->min_uV) return -EINVAL; + if (min_uV < max1586->min_uV) + min_uV = max1586->min_uV; - selector = (min_uV - MAX1586_V3_MIN_UV) / MAX1586_V3_STEP_UV; - if (max1586_v3_calc_voltage(selector) > max_uV) + selector = ((min_uV - max1586->min_uV) * MAX1586_V3_MAX_VSEL + + range_uV - 1) / range_uV; + if (max1586_v3_calc_voltage(max1586, selector) > max_uV) return -EINVAL; dev_dbg(&client->dev, "changing voltage v3 to %dmv\n", - max1586_v3_calc_voltage(selector) / 1000); + max1586_v3_calc_voltage(max1586, selector) / 1000); v3_prog = I2C_V3_SELECT | (u8) selector; return i2c_smbus_write_byte(client, v3_prog); @@ -71,9 +89,11 @@ static int max1586_v3_set(struct regulator_dev *rdev, int min_uV, int max_uV) static int max1586_v3_list(struct regulator_dev *rdev, unsigned selector) { + struct max1586_data *max1586 = rdev_get_drvdata(rdev); + if (selector > MAX1586_V3_MAX_VSEL) return -EINVAL; - return max1586_v3_calc_voltage(selector); + return max1586_v3_calc_voltage(max1586, selector); } /* @@ -164,14 +184,25 @@ static int max1586_pmic_probe(struct i2c_client *client, { struct regulator_dev **rdev; struct max1586_platform_data *pdata = client->dev.platform_data; - int i, id, ret = 0; + struct max1586_data *max1586; + int i, id, ret = -ENOMEM; + + max1586 = kzalloc(sizeof(struct max1586_data) + + sizeof(struct regulator_dev *) * (MAX1586_V6 + 1), + GFP_KERNEL); + if (!max1586) + goto out; - rdev = kzalloc(sizeof(struct regulator_dev *) * (MAX1586_V6 + 1), - GFP_KERNEL); - if (!rdev) - return -ENOMEM; + max1586->client = client; - ret = -EINVAL; + if (!pdata->v3_gain) { + ret = -EINVAL; + goto out_unmap; + } + max1586->min_uV = MAX1586_V3_MIN_UV * pdata->v3_gain / 1000000; + max1586->max_uV = MAX1586_V3_MAX_UV * pdata->v3_gain / 1000000; + + rdev = max1586->rdev; for (i = 0; i < pdata->num_subdevs && i <= MAX1586_V6; i++) { id = pdata->subdevs[i].id; if (!pdata->subdevs[i].platform_data) @@ -182,7 +213,7 @@ static int max1586_pmic_probe(struct i2c_client *client, } rdev[i] = regulator_register(&max1586_reg[id], &client->dev, pdata->subdevs[i].platform_data, - client); + max1586); if (IS_ERR(rdev[i])) { ret = PTR_ERR(rdev[i]); dev_err(&client->dev, "failed to register %s\n", @@ -198,7 +229,9 @@ static int max1586_pmic_probe(struct i2c_client *client, err: while (--i >= 0) regulator_unregister(rdev[i]); - kfree(rdev); +out_unmap: + kfree(max1586); +out: return ret; } diff --git a/include/linux/regulator/max1586.h b/include/linux/regulator/max1586.h index 2056973396b..44563192bf1 100644 --- a/include/linux/regulator/max1586.h +++ b/include/linux/regulator/max1586.h @@ -26,6 +26,12 @@ #define 
MAX1586_V3 0 #define MAX1586_V6 1 +/* precalculated values for v3_gain */ +#define MAX1586_GAIN_NO_R24 1000000 /* 700000 .. 1475000 uV */ +#define MAX1586_GAIN_R24_3k32 1051098 /* 735768 .. 1550369 uV */ +#define MAX1586_GAIN_R24_5k11 1078648 /* 755053 .. 1591005 uV */ +#define MAX1586_GAIN_R24_7k5 1115432 /* 780802 .. 1645262 uV */ + /** * max1586_subdev_data - regulator data * @id: regulator Id (either MAX1586_V3 or MAX1586_V6) @@ -43,10 +49,15 @@ struct max1586_subdev_data { * @num_subdevs: number of regultors used (may be 1 or 2) * @subdevs: regulator used At most, there will be a regulator for V3 and one for V6 voltages. + * @v3_gain: gain on the V3 voltage output multiplied by 1e6. + * This can be calculated as ((1 + R24/R25 + R24/185.5kOhm) * 1e6) + * for an external resistor configuration as described in the + * data sheet (R25=100kOhm). */ struct max1586_platform_data { int num_subdevs; struct max1586_subdev_data *subdevs; + int v3_gain; }; #endif -- cgit v1.2.3-70-g09d2 From 2dff440525f8faba8836e9f05297b76f23b4af30 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 31 May 2008 15:56:17 +0200 Subject: kmemcheck: add mm functions With kmemcheck enabled, the slab allocator needs to do this: 1. Tell kmemcheck to allocate the shadow memory which stores the status of each byte in the allocation proper, e.g. whether it is initialized or uninitialized. 2. Tell kmemcheck which parts of memory should be marked uninitialized. There are actually a few more states, such as "not yet allocated" and "recently freed". If a slab cache is set up using the SLAB_NOTRACK flag, it will never return memory that can take page faults because of kmemcheck. If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still request memory with the __GFP_NOTRACK flag. This does not prevent the page faults from occurring, however; it merely marks the object in question as initialized so that no warnings will ever be produced for this object. In addition to (and in contrast to) __GFP_NOTRACK, the __GFP_NOTRACK_FALSE_POSITIVE flag indicates that the allocation should not be tracked _because_ it would produce a false positive. Their values are identical, but need not be so in the future (for example, we could then enable/disable false positives with a config option). Parts of this patch were contributed by Pekka Enberg but merged for atomicity.
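To make the two opt-out mechanisms concrete, here is a brief sketch (the cache and structure names are hypothetical, not from this patch; the flags are the ones introduced here):

	struct example { int field; };

	/* Whole-cache opt-out: objects from this cache never take
	 * kmemcheck page faults at all. */
	static struct kmem_cache *example_cachep;

	static void example_cache_init(void)
	{
		example_cachep = kmem_cache_create("example_cache",
				sizeof(struct example), 0, SLAB_NOTRACK, NULL);
	}

	/* Per-allocation opt-out from an otherwise tracked cache: the
	 * object still takes page faults, but is marked initialized, so
	 * it is never reported as uninitialized. */
	static struct example *example_alloc(void)
	{
		return kmalloc(sizeof(struct example),
			       GFP_KERNEL | __GFP_NOTRACK);
	}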
Signed-off-by: Vegard Nossum Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- arch/x86/kernel/process.c | 2 +- include/linux/gfp.h | 9 +++- include/linux/kmemcheck.h | 47 +++++++++++++++++++++ include/linux/slab.h | 7 ++++ kernel/fork.c | 14 +++---- mm/Makefile | 1 + mm/kmemcheck.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 174 insertions(+), 9 deletions(-) create mode 100644 mm/kmemcheck.c (limited to 'include/linux') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3bb2be1649b..994dd6a4a2a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -63,7 +63,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC, NULL); + SLAB_PANIC | SLAB_NOTRACK, NULL); } /* diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0bbc15f5453..daeaa8fe1bb 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -51,8 +51,15 @@ struct vm_area_struct; #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ #define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ +#define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */ -#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ +/* + * This may seem redundant, but it's a way of annotating false positives vs. + * allocations that simply cannot be supported (e.g. page tables). + */ +#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) + +#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 39480c91b2f..5b65f4ebead 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -7,11 +7,58 @@ #ifdef CONFIG_KMEMCHECK extern int kmemcheck_enabled; +/* The slab-related functions. 
*/ +void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, + struct page *page, int order); +void kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order); +void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size); +void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); + +void kmemcheck_show_pages(struct page *p, unsigned int n); +void kmemcheck_hide_pages(struct page *p, unsigned int n); + +bool kmemcheck_page_is_tracked(struct page *p); + +void kmemcheck_mark_unallocated(void *address, unsigned int n); +void kmemcheck_mark_uninitialized(void *address, unsigned int n); +void kmemcheck_mark_initialized(void *address, unsigned int n); +void kmemcheck_mark_freed(void *address, unsigned int n); + +void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); +void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); + int kmemcheck_show_addr(unsigned long address); int kmemcheck_hide_addr(unsigned long address); #else #define kmemcheck_enabled 0 +static inline void +kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, + struct page *page, int order) +{ +} + +static inline void +kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order) +{ +} + +static inline void +kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size) +{ +} + +static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, + size_t size) +{ +} + +static inline bool kmemcheck_page_is_tracked(struct page *p) +{ + return false; +} #endif /* CONFIG_KMEMCHECK */ #endif /* LINUX_KMEMCHECK_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 48803064ced..e339fcf17cd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -64,6 +64,13 @@ #define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ +/* Don't track use of uninitialized memory */ +#ifdef CONFIG_KMEMCHECK +# define SLAB_NOTRACK 0x01000000UL +#else +# define SLAB_NOTRACK 0x00000000UL +#endif + /* The following flags affect the page allocator grouping pages by mobility */ #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ diff --git a/kernel/fork.c b/kernel/fork.c index 4430eb1376f..be022c200da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -178,7 +178,7 @@ void __init fork_init(unsigned long mempages) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif /* do the arch specific task caches init */ @@ -1470,20 +1470,20 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, - sighand_ctor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_NOTRACK, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + 
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); } diff --git a/mm/Makefile b/mm/Makefile index e89acb090b4..c379ce08354 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c new file mode 100644 index 00000000000..eaa41b80261 --- /dev/null +++ b/mm/kmemcheck.c @@ -0,0 +1,103 @@ +#include +#include +#include +#include + +void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, + struct page *page, int order) +{ + struct page *shadow; + int pages; + int i; + + pages = 1 << order; + + /* + * With kmemcheck enabled, we need to allocate a memory area for the + * shadow bits as well. + */ + shadow = alloc_pages_node(node, flags, order); + if (!shadow) { + if (printk_ratelimit()) + printk(KERN_ERR "kmemcheck: failed to allocate " + "shadow bitmap\n"); + return; + } + + for (i = 0; i < pages; ++i) + page[i].shadow = page_address(&shadow[i]); + + /* + * Mark it as non-present for the MMU so that our accesses to + * this memory will trigger a page fault and let us analyze + * the memory accesses. + */ + kmemcheck_hide_pages(page, pages); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); +} + +void kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order) +{ + struct page *shadow; + int pages; + int i; + + pages = 1 << order; + + kmemcheck_show_pages(page, pages); + + shadow = virt_to_page(page[0].shadow); + + for (i = 0; i < pages; ++i) + page[i].shadow = NULL; + + __free_pages(shadow, order); +} + +void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size) +{ + /* + * Has already been memset(), which initializes the shadow for us + * as well. + */ + if (gfpflags & __GFP_ZERO) + return; + + /* No need to initialize the shadow of a non-tracked slab. */ + if (s->flags & SLAB_NOTRACK) + return; + + if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { + /* + * Allow notracked objects to be allocated from + * tracked caches. Note however that these objects + * will still get page faults on access, they just + * won't ever be flagged as uninitialized. If page + * faults are not acceptable, the slab cache itself + * should be marked NOTRACK. + */ + kmemcheck_mark_initialized(object, size); + } else if (!s->ctor) { + /* + * New objects should be marked uninitialized before + * they're returned to the caller. + */ + kmemcheck_mark_uninitialized(object, size); + } +} + +void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) +{ + /* TODO: RCU freeing is unsupported for now; hide false positives.
*/ + if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + kmemcheck_mark_freed(object, size); +} -- cgit v1.2.3-70-g09d2 From d7002857dee6e9a3ce1f78d23f37caba106b29c5 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 20 Jul 2008 10:44:54 +0200 Subject: kmemcheck: add DMA hooks This patch hooks into the DMA API to suppress the false positives that would otherwise be reported when memory that is also used directly by devices is accessed. [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- arch/x86/include/asm/dma-mapping.h | 2 ++ include/linux/kmemcheck.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index f82fdc412c6..d57d0c1857b 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -6,6 +6,7 @@ * Documentation/DMA-API.txt for documentation. */ +#include #include #include #include @@ -60,6 +61,7 @@ dma_map_single(struct device *hwdev, void *ptr, size_t size, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); + kmemcheck_mark_initialized(ptr, size); addr = ops->map_page(hwdev, virt_to_page(ptr), (unsigned long)ptr & ~PAGE_MASK, size, dir, NULL); diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 5b65f4ebead..71f21ae33d1 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -59,6 +59,22 @@ static inline bool kmemcheck_page_is_tracked(struct page *p) { return false; } + +static inline void kmemcheck_mark_unallocated(void *address, unsigned int n) +{ +} + +static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n) +{ +} + +static inline void kmemcheck_mark_initialized(void *address, unsigned int n) +{ +} + +static inline void kmemcheck_mark_freed(void *address, unsigned int n) +{ +} #endif /* CONFIG_KMEMCHECK */ #endif /* LINUX_KMEMCHECK_H */ -- cgit v1.2.3-70-g09d2 From b1eeab67682a5e397aecf172046b3a8bd4808ae4 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 25 Nov 2008 16:55:53 +0100 Subject: kmemcheck: add hooks for the page allocator This adds support for tracking the initializedness of memory that was allocated with the page allocator. Highmem requests are not tracked.
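As an illustrative sketch (the surrounding function is hypothetical), a caller that wants a page excluded from tracking passes __GFP_NOTRACK exactly as with the slab hooks; __GFP_ZERO pages remain tracked but start out marked initialized:

	static void *example_get_untracked_buffer(void)
	{
		struct page *page;

		/* __GFP_NOTRACK makes kmemcheck_pagealloc_alloc() skip the
		 * shadow allocation for this page entirely. */
		page = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
		if (!page)
			return NULL;

		return page_address(page);
	}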
Cc: Dave Hansen Acked-by: Pekka Enberg [build fix for !CONFIG_KMEMCHECK] Signed-off-by: Ingo Molnar [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- arch/x86/include/asm/thread_info.h | 4 ++-- arch/x86/mm/kmemcheck/shadow.c | 8 +++++++ include/linux/gfp.h | 5 +++++ include/linux/kmemcheck.h | 35 ++++++++++++++++++++++++----- mm/kmemcheck.c | 45 +++++++++++++++++++++++++++----------- mm/page_alloc.c | 18 +++++++++++++++ mm/slab.c | 15 ++++++++----- mm/slub.c | 23 ++++++++++++++----- 8 files changed, 122 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 602c769fc98..b0783520988 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -154,9 +154,9 @@ struct thread_info { /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE -#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) +#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) #else -#define THREAD_FLAGS GFP_KERNEL +#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK) #endif #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index e7346d3873b..e773b6bd007 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -116,6 +116,14 @@ void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); } +void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); +} + enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) { uint8_t *x; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index daeaa8fe1bb..3885e7f7556 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -51,7 +51,12 @@ struct vm_area_struct; #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ #define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ + +#ifdef CONFIG_KMEMCHECK #define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */ +#else +#define __GFP_NOTRACK ((__force gfp_t)0) +#endif /* * This may seem redundant, but it's a way of annotating false positives vs. diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 71f21ae33d1..093d23969b1 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -8,13 +8,15 @@ extern int kmemcheck_enabled; /* The slab-related functions. 
*/ -void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, - struct page *page, int order); -void kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order); +void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node); +void kmemcheck_free_shadow(struct page *page, int order); void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, size_t size); void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); +void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order, + gfp_t gfpflags); + void kmemcheck_show_pages(struct page *p, unsigned int n); void kmemcheck_hide_pages(struct page *p, unsigned int n); @@ -27,6 +29,7 @@ void kmemcheck_mark_freed(void *address, unsigned int n); void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); +void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); int kmemcheck_show_addr(unsigned long address); int kmemcheck_hide_addr(unsigned long address); @@ -34,13 +37,12 @@ int kmemcheck_hide_addr(unsigned long address); #define kmemcheck_enabled 0 static inline void -kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, - struct page *page, int order) +kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) { } static inline void -kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order) +kmemcheck_free_shadow(struct page *page, int order) { } @@ -55,6 +57,11 @@ static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, { } +static inline void kmemcheck_pagealloc_alloc(struct page *p, + unsigned int order, gfp_t gfpflags) +{ +} + static inline bool kmemcheck_page_is_tracked(struct page *p) { return false; @@ -75,6 +82,22 @@ static inline void kmemcheck_mark_initialized(void *address, unsigned int n) static inline void kmemcheck_mark_freed(void *address, unsigned int n) { } + +static inline void kmemcheck_mark_unallocated_pages(struct page *p, + unsigned int n) +{ +} + +static inline void kmemcheck_mark_uninitialized_pages(struct page *p, + unsigned int n) +{ +} + +static inline void kmemcheck_mark_initialized_pages(struct page *p, + unsigned int n) +{ +} + #endif /* CONFIG_KMEMCHECK */ #endif /* LINUX_KMEMCHECK_H */ diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index eaa41b80261..fd814fd6131 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -1,10 +1,10 @@ +#include #include #include #include #include -void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, - struct page *page, int order) +void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) { struct page *shadow; int pages; @@ -16,7 +16,7 @@ void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, * With kmemcheck enabled, we need to allocate a memory area for the * shadow bits as well. */ - shadow = alloc_pages_node(node, flags, order); + shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); if (!shadow) { if (printk_ratelimit()) printk(KERN_ERR "kmemcheck: failed to allocate " @@ -33,23 +33,17 @@ void kmemcheck_alloc_shadow(struct kmem_cache *s, gfp_t flags, int node, * the memory accesses. */ kmemcheck_hide_pages(page, pages); - - /* - * Objects from caches that have a constructor don't get - * cleared when they're allocated, so we need to do it here. 
- */ - if (s->ctor) - kmemcheck_mark_uninitialized_pages(page, pages); - else - kmemcheck_mark_unallocated_pages(page, pages); } -void kmemcheck_free_shadow(struct kmem_cache *s, struct page *page, int order) +void kmemcheck_free_shadow(struct page *page, int order) { struct page *shadow; int pages; int i; + if (!kmemcheck_page_is_tracked(page)) + return; + pages = 1 << order; kmemcheck_show_pages(page, pages); @@ -101,3 +95,28 @@ void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) kmemcheck_mark_freed(object, size); } + +void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, + gfp_t gfpflags) +{ + int pages; + + if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) + return; + + pages = 1 << order; + + /* + * NOTE: We choose to track GFP_ZERO pages too; in fact, they + * can become uninitialized by copying uninitialized memory + * into them. + */ + + /* XXX: Can use zone->node for node? */ + kmemcheck_alloc_shadow(page, order, gfpflags, -1); + + if (gfpflags & __GFP_ZERO) + kmemcheck_mark_initialized_pages(page, pages); + else + kmemcheck_mark_uninitialized_pages(page, pages); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17d5f539a9a..0727896a88a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -546,6 +547,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) int i; int bad = 0; + kmemcheck_free_shadow(page, order); + for (i = 0 ; i < (1 << order) ; ++i) bad += free_pages_check(page + i); if (bad) @@ -994,6 +997,8 @@ static void free_hot_cold_page(struct page *page, int cold) struct per_cpu_pages *pcp; unsigned long flags; + kmemcheck_free_shadow(page, 0); + if (PageAnon(page)) page->mapping = NULL; if (free_pages_check(page)) @@ -1047,6 +1052,16 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON(PageCompound(page)); VM_BUG_ON(!page_count(page)); + +#ifdef CONFIG_KMEMCHECK + /* + * Split shadow pages too, because free(page[0]) would + * otherwise free the whole shadow. 
+ */ + if (kmemcheck_page_is_tracked(page)) + split_page(virt_to_page(page[0].shadow), order); +#endif + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } @@ -1667,7 +1682,10 @@ nopage: dump_stack(); show_mem(); } + return page; got_pg: + if (kmemcheck_enabled) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); return page; } EXPORT_SYMBOL(__alloc_pages_internal); diff --git a/mm/slab.c b/mm/slab.c index 95b6c5eb40b..6a1ad0b9a94 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1612,7 +1612,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) return NULL; @@ -1626,8 +1626,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) for (i = 0; i < nr_pages; i++) __SetPageSlab(page + i); - if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) - kmemcheck_alloc_shadow(cachep, flags, nodeid, page, cachep->gfporder); + if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { + kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); + + if (cachep->ctor) + kmemcheck_mark_uninitialized_pages(page, nr_pages); + else + kmemcheck_mark_unallocated_pages(page, nr_pages); + } return page_address(page); } @@ -1641,8 +1647,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; - if (kmemcheck_page_is_tracked(page)) - kmemcheck_free_shadow(cachep, page, cachep->gfporder); + kmemcheck_free_shadow(page, cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), diff --git a/mm/slub.c b/mm/slub.c index 1cebaa747ad..898fb5047dc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1066,6 +1066,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, { int order = oo_order(oo); + flags |= __GFP_NOTRACK; + if (node == -1) return alloc_pages(flags, order); else @@ -1097,7 +1099,18 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (kmemcheck_enabled && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { - kmemcheck_alloc_shadow(s, flags, node, page, compound_order(page)); + int pages = 1 << oo_order(oo); + + kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); } page->objects = oo_objects(oo); @@ -1173,8 +1186,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlubDebug(page); } - if (kmemcheck_page_is_tracked(page)) - kmemcheck_free_shadow(s, page, compound_order(page)); + kmemcheck_free_shadow(page, compound_order(page)); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
@@ -2734,9 +2746,10 @@ EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct page *page = alloc_pages_node(node, flags | __GFP_COMP, - get_order(size)); + struct page *page; + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_pages_node(node, flags, get_order(size)); if (page) return page_address(page); else -- cgit v1.2.3-70-g09d2 From fc7d0c9f2122e8bf58deaf1252b0e750df5b0e91 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 30 Aug 2008 12:16:05 +0200 Subject: kmemcheck: introduce bitfield API Add the bitfield API which can be used to annotate bitfields in structs and get rid of false positive reports. According to Al Viro, the syntax we were using (putting #ifdef inside macro arguments) was not valid C. He also suggested using begin/end markers instead, which is what we do now. [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- include/linux/kmemcheck.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 093d23969b1..47b39b7c7e8 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -33,6 +33,7 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); int kmemcheck_show_addr(unsigned long address); int kmemcheck_hide_addr(unsigned long address); + #else #define kmemcheck_enabled 0 @@ -100,4 +101,53 @@ static inline void kmemcheck_mark_initialized_pages(struct page *p, #endif /* CONFIG_KMEMCHECK */ +/* + * Bitfield annotations + * + * How to use: If you have a struct using bitfields, for example + * + * struct a { + * int x:8, y:8; + * }; + * + * then this should be rewritten as + * + * struct a { + * kmemcheck_bitfield_begin(flags); + * int x:8, y:8; + * kmemcheck_bitfield_end(flags); + * }; + * + * Now the "flags_begin" and "flags_end" members may be used to refer to the + * beginning and end, respectively, of the bitfield (and things like + * &x.flags_begin is allowed). As soon as the struct is allocated, the bit- + * fields should be annotated: + * + * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL); + * kmemcheck_annotate_bitfield(a, flags); + * + * Note: We provide the same definitions for both kmemcheck and non- + * kmemcheck kernels. This makes it harder to introduce accidental errors. It + * is also allowed to pass NULL pointers to kmemcheck_annotate_bitfield(). 
+ */ +#define kmemcheck_bitfield_begin(name) \ + int name##_begin[0]; + +#define kmemcheck_bitfield_end(name) \ + int name##_end[0]; + +#define kmemcheck_annotate_bitfield(ptr, name) \ + do if (ptr) { \ + int _n = (long) &((ptr)->name##_end) \ + - (long) &((ptr)->name##_begin); \ + BUILD_BUG_ON(_n < 0); \ + \ + kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ + } while (0) + +#define kmemcheck_annotate_variable(var) \ + do { \ + kmemcheck_mark_initialized(&(var), sizeof(var)); \ + } while (0) \ + #endif /* LINUX_KMEMCHECK_H */ -- cgit v1.2.3-70-g09d2 From fe55f6d5c0cfec4a710ef6ff63f162b99d5f7842 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 30 Aug 2008 12:16:35 +0200 Subject: net: use kmemcheck bitfields API for skbuff Signed-off-by: Vegard Nossum --- include/linux/skbuff.h | 7 +++++++ net/core/skbuff.c | 8 ++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5fd389162f0..ed6537fc5b4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -15,6 +15,7 @@ #define _LINUX_SKBUFF_H #include +#include #include #include #include @@ -346,6 +347,7 @@ struct sk_buff { }; }; __u32 priority; + kmemcheck_bitfield_begin(flags1); __u8 local_df:1, cloned:1, ip_summed:2, @@ -356,6 +358,7 @@ struct sk_buff { ipvs_property:1, peeked:1, nf_trace:1; + kmemcheck_bitfield_end(flags1); __be16 protocol; void (*destructor)(struct sk_buff *skb); @@ -375,6 +378,8 @@ struct sk_buff { __u16 tc_verd; /* traffic control verdict */ #endif #endif + + kmemcheck_bitfield_begin(flags2); #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif @@ -382,6 +387,8 @@ struct sk_buff { __u8 do_not_encrypt:1; __u8 requeue:1; #endif + kmemcheck_bitfield_end(flags2); + /* 0/13/14 bit hole */ #ifdef CONFIG_NET_DMA diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c2e4fb8f354..f0c4c6ad774 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -201,6 +202,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; + kmemcheck_annotate_bitfield(skb, flags1); + kmemcheck_annotate_bitfield(skb, flags2); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); @@ -217,6 +220,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, struct sk_buff *child = skb + 1; atomic_t *fclone_ref = (atomic_t *) (child + 1); + kmemcheck_annotate_bitfield(child, flags1); + kmemcheck_annotate_bitfield(child, flags2); skb->fclone = SKB_FCLONE_ORIG; atomic_set(fclone_ref, 1); @@ -633,6 +638,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); if (!n) return NULL; + + kmemcheck_annotate_bitfield(n, flags1); + kmemcheck_annotate_bitfield(n, flags2); n->fclone = SKB_FCLONE_UNAVAILABLE; } -- cgit v1.2.3-70-g09d2 From c53bd2e1949ddbe06fe2a6079c0658d58ce25edb Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 26 Feb 2009 14:05:59 +0100 Subject: c2port: annotate bitfield for kmemcheck This silences a false positive warning with kmemcheck. 
Signed-off-by: Vegard Nossum --- drivers/misc/c2port/core.c | 2 ++ include/linux/c2port.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c index 0207dd59090..b5346b4db91 100644 --- a/drivers/misc/c2port/core.c +++ b/drivers/misc/c2port/core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -891,6 +892,7 @@ struct c2port_device *c2port_device_register(char *name, return ERR_PTR(-EINVAL); c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); + kmemcheck_annotate_bitfield(c2dev, flags); if (unlikely(!c2dev)) return ERR_PTR(-ENOMEM); diff --git a/include/linux/c2port.h b/include/linux/c2port.h index 7b5a2388ba6..2a5cd867c36 100644 --- a/include/linux/c2port.h +++ b/include/linux/c2port.h @@ -10,6 +10,7 @@ */ #include +#include #define C2PORT_NAME_LEN 32 @@ -20,8 +21,10 @@ /* Main struct */ struct c2port_ops; struct c2port_device { + kmemcheck_bitfield_begin(flags); unsigned int access:1; unsigned int flash_access:1; + kmemcheck_bitfield_end(flags); int id; char name[C2PORT_NAME_LEN]; -- cgit v1.2.3-70-g09d2 From 1744a21d57d9c60136461adb6afa85e51b3e94d9 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 28 Feb 2009 08:29:44 +0100 Subject: trace: annotate bitfields in struct ring_buffer_event This gets rid of a heap of false-positive warnings from the tracer code due to the use of bitfields. [rebased for mainline inclusion] Signed-off-by: Vegard Nossum --- include/linux/ring_buffer.h | 4 ++++ kernel/trace/ring_buffer.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8670f1575fe..29f8599e6be 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -1,6 +1,7 @@ #ifndef _LINUX_RING_BUFFER_H #define _LINUX_RING_BUFFER_H +#include #include #include @@ -11,7 +12,10 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { + kmemcheck_bitfield_begin(bitfield); u32 type_len:5, time_delta:27; + kmemcheck_bitfield_end(bitfield); + u32 array[]; }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2e642b2b725..dc4dc70171c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -1270,6 +1271,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, if (tail < BUF_PAGE_SIZE) { /* Mark the rest of the page with padding */ event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); rb_event_set_padding(event); } @@ -1327,6 +1329,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return NULL; event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(event, type, length); /* The passed in type is zero for DATA */ -- cgit v1.2.3-70-g09d2 From 3446a8aa7ebcbc0a799e5e8fc4f2da0738d6bc21 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 16 May 2009 11:22:14 +0200 Subject: fs: introduce __getname_gfp() The purpose of this change is to allow __getname() users to pass a custom GFP mask to kmem_cache_alloc(). This is needed for annotating a certain kmemcheck false positive. 
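A minimal sketch of the intended use (the surrounding function is hypothetical; the real annotated call site lands in a later patch):

	static int example_use_name_buffer(void)
	{
		char *tmp;

		/* Opt this buffer out of kmemcheck tracking to silence a
		 * known false positive: */
		tmp = __getname_gfp(GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);
		if (!tmp)
			return -ENOMEM;

		/* ... fill and use the buffer ... */

		__putname(tmp);
		return 0;
	}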
Cc: Al Viro Signed-off-by: Vegard Nossum --- include/linux/fs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ede84fa7da5..6d12174fbe1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1919,8 +1919,9 @@ extern void __init vfs_caches_init(unsigned long); extern struct kmem_cache *names_cachep; -#define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) -#define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) +#define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp)) +#define __getname() __getname_gfp(GFP_KERNEL) +#define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) #ifndef CONFIG_AUDITSYSCALL #define putname(name) __putname(name) #else -- cgit v1.2.3-70-g09d2 From 465a454f254ee2ff7acc4aececbe31f8af046bc0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2009 12:31:37 +0200 Subject: x86, mm: Add __get_user_pages_fast() Introduce a gup_fast() variant which is usable from IRQ/NMI context. Signed-off-by: Peter Zijlstra CC: Nick Piggin Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/gup.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 6 ++++++ 2 files changed, 62 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 6340cef6798..697d5727c11 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -219,6 +219,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +/* + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall + * back to the regular GUP. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + unsigned long flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + (void __user *)start, len))) + return 0; + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the page and take a ref on it.
+ */ + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address diff --git a/include/linux/mm.h b/include/linux/mm.h index ad613ed66ab..b457bc047ab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -862,6 +862,12 @@ extern int mprotect_fixup(struct vm_area_struct *vma, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +/* + * doesn't attempt to fault and will return short. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages); + /* * A callback you can register to apply pressure to ageable caches. * -- cgit v1.2.3-70-g09d2 From 9974458e2f9a11dbd2f4bd14fab5a79af4907b41 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Jun 2009 21:45:16 +1000 Subject: perf_counter: Make set_perf_counter_pending() declaration common At present, every architecture that supports perf_counters has to declare set_perf_counter_pending() in its arch-specific headers. This consolidates the declarations into a single declaration in one common place, include/linux/perf_counter.h. On powerpc, we continue to provide a static inline definition of set_perf_counter_pending() in the powerpc hw_irq.h. Also, this removes from the x86 perf_counter.h the unused null definitions of {test,clear}_perf_counter_pending. Reported-by: Mike Frysinger Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: benh@kernel.crashing.org LKML-Reference: <18998.13388.920691.523227@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/hw_irq.h | 1 - arch/powerpc/include/asm/perf_counter.h | 2 ++ arch/x86/include/asm/perf_counter.h | 5 ----- include/linux/perf_counter.h | 1 + 4 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 53512374e1c..1974cf191b0 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -163,7 +163,6 @@ static inline unsigned long test_perf_counter_pending(void) return 0; } -static inline void set_perf_counter_pending(void) {} static inline void clear_perf_counter_pending(void) {} #endif /* CONFIG_PERF_COUNTERS */ diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index cc7c887705b..b398a84edce 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -10,6 +10,8 @@ */ #include +#include + #define MAX_HWCOUNTERS 8 #define MAX_EVENT_ALTERNATIVES 8 #define MAX_LIMITED_HWCOUNTERS 2 diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 876ed97147b..5fb33e160ea 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,11 +84,6 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) -extern void set_perf_counter_pending(void); - -#define clear_perf_counter_pending() do { } while (0) -#define test_perf_counter_pending() (0) - #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(void); diff --git a/include/linux/perf_counter.h 
b/include/linux/perf_counter.h index 1b3118a1023..eccae437fe3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -604,6 +604,7 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_free_task(struct task_struct *task); +extern void set_perf_counter_pending(void); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void __perf_disable(void); -- cgit v1.2.3-70-g09d2 From 78be6914cb5c6d648617c51bb99bf81f28471d89 Mon Sep 17 00:00:00 2001 From: Wu Zhangjin Date: Sun, 14 Jun 2009 14:52:30 +0800 Subject: tracing: fix undeclared 'PAGE_SIZE' in include/linux/trace_seq.h When compiling linux-mips with kmemtrace enabled, the build fails with: include/linux/trace_seq.h:12: error: 'PAGE_SIZE' undeclared here (not in a function) trace_seq.h uses PAGE_SIZE but does not include the header that defines it, so fix the build by adding the missing include. Acked-by: Frederic Weisbecker Acked-by: Pekka Enberg Signed-off-by: Wu Zhangjin LKML-Reference: <1244962350-28702-1-git-send-email-wuzhangjin@gmail.com> Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index c68bccba207..c134dd1fe6b 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -3,6 +3,8 @@ #include +#include + /* * Trace sequences are used to allow a function to call several other functions * to create a string of data to use (up to a max of PAGE_SIZE. -- cgit v1.2.3-70-g09d2 From 6dae44f9a55f13765c877687602cd2bf1a057cfd Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 15 Jun 2009 18:52:52 +0200 Subject: ata: add ata_id_pio_need_iordy() helper (v2) v2: Minor fixes per Sergei's review. Cc: Sergei Shtylyov Cc: Jeff Garzik Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ata.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 915da43edee..9c75921f0c1 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -800,6 +800,20 @@ static inline int ata_id_is_ssd(const u16 *id) return id[ATA_ID_ROT_SPEED] == 0x01; } +static inline int ata_id_pio_need_iordy(const u16 *id, const u8 pio) +{ + /* CF spec. r4.1 Table 22 says no IORDY on PIO5 and PIO6. */ + if (pio > 4 && ata_id_is_cfa(id)) + return 0; + /* For PIO3 and higher it is mandatory. */ + if (pio > 2) + return 1; + /* Turn it on when possible. */ + if (ata_id_has_iordy(id)) + return 1; + return 0; +} + static inline int ata_drive_40wire(const u16 *dev_id) { if (ata_id_is_sata(dev_id)) -- cgit v1.2.3-70-g09d2 From c9ef59ff01b6bd1c7360a64fcc8556a1193c2ed0 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 15 Jun 2009 18:52:53 +0200 Subject: ide: IORDY handling fixes Add ide_pio_need_iordy() helper and convert host drivers to use it. This fixes it8172, it8213, pdc202xx_old, piix, slc90e66 and siimage host drivers to handle IORDY correctly.
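Before the driver-by-driver conversion in the diff below, a hedged sketch of what the helper buys a set_pio_mode() hook; the foo_* accessors and the FOO_IORDY_EN bit are hypothetical and not taken from any of the drivers touched here:

	/* Hypothetical host driver: let the helper decide about IORDY. */
	static void foo_set_pio_mode(ide_drive_t *drive, const u8 pio)
	{
		u8 timing = foo_read_timing(drive);	/* hypothetical */

		if (ide_pio_need_iordy(drive, pio))
			timing |= FOO_IORDY_EN;		/* hypothetical bit */
		else
			timing &= ~FOO_IORDY_EN;

		foo_write_timing(drive, timing);	/* hypothetical */
	}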
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 3 +-- drivers/ide/ide-xfer-mode.c | 6 ++++++ drivers/ide/it8172.c | 2 +- drivers/ide/it8213.c | 2 +- drivers/ide/pdc202xx_old.c | 2 +- drivers/ide/piix.c | 2 +- drivers/ide/siimage.c | 15 ++++++++------- drivers/ide/sl82c105.c | 3 +-- drivers/ide/slc90e66.c | 2 +- include/linux/ide.h | 1 + 10 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index fc0949a8cfd..dbfeda42b94 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -185,8 +185,7 @@ static void at91_ide_set_pio_mode(ide_drive_t *drive, const u8 pio) timing = ide_timing_find_mode(XFER_PIO_0 + pio); BUG_ON(!timing); - if ((pio > 2 || ata_id_has_iordy(drive->id)) && - !(ata_id_is_cfa(drive->id) && pio > 4)) + if (ide_pio_need_iordy(drive, pio)) use_iordy = 1; apply_timings(chipselect, pio, timing, use_iordy); diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index af44be9d546..0b47ca13907 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -107,6 +107,12 @@ u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) } EXPORT_SYMBOL_GPL(ide_get_best_pio_mode); +int ide_pio_need_iordy(ide_drive_t *drive, const u8 pio) +{ + return ata_id_pio_need_iordy(drive->id, pio); +} +EXPORT_SYMBOL_GPL(ide_pio_need_iordy); + int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) { ide_hwif_t *hwif = drive->hwif; diff --git a/drivers/ide/it8172.c b/drivers/ide/it8172.c index e021078cd06..0d266a5b524 100644 --- a/drivers/ide/it8172.c +++ b/drivers/ide/it8172.c @@ -66,7 +66,7 @@ static void it8172_set_pio_mode(ide_drive_t *drive, const u8 pio) if (drive->media == ide_disk) /* enable prefetch */ drive_enables |= 0x0004 << (drive->dn * 4); - if (ata_id_has_iordy(drive->id)) + if (ide_pio_need_iordy(drive, pio)) /* enable IORDY sample-point */ drive_enables |= 0x0002 << (drive->dn * 4); diff --git a/drivers/ide/it8213.c b/drivers/ide/it8213.c index d7969b6d139..47976167796 100644 --- a/drivers/ide/it8213.c +++ b/drivers/ide/it8213.c @@ -50,7 +50,7 @@ static void it8213_set_pio_mode(ide_drive_t *drive, const u8 pio) control |= 1; /* Programmable timing on */ if (drive->media != ide_disk) control |= 4; /* ATAPI */ - if (pio > 2) + if (ide_pio_need_iordy(drive, pio)) control |= 2; /* IORDY */ if (is_slave) { master_data |= 0x4000; diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index b6abf7e52ca..4f5b536282a 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -73,7 +73,7 @@ static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed) * Prefetch_EN / IORDY_EN / PA[3:0] bits of register A */ AP &= ~0x3f; - if (ata_id_iordy_disable(drive->id)) + if (ide_pio_need_iordy(drive, speed - XFER_PIO_0)) AP |= 0x20; /* set IORDY_EN bit */ if (drive->media == ide_disk) AP |= 0x10; /* set Prefetch_EN bit */ diff --git a/drivers/ide/piix.c b/drivers/ide/piix.c index 69860dea382..bf14f39bd3a 100644 --- a/drivers/ide/piix.c +++ b/drivers/ide/piix.c @@ -98,7 +98,7 @@ static void piix_set_pio_mode(ide_drive_t *drive, const u8 pio) control |= 1; /* Programmable timing on */ if (drive->media == ide_disk) control |= 4; /* Prefetch, post write */ - if (pio > 2) + if (ide_pio_need_iordy(drive, pio)) control |= 2; /* IORDY */ if (is_slave) { master_data |= 0x4000; diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index bd82d228608..6a643fdf0e6 100644 --- a/drivers/ide/siimage.c +++ 
b/drivers/ide/siimage.c @@ -32,7 +32,6 @@ * smarter code in libata. * * TODO: - * - IORDY fixes * - VDMA support */ @@ -234,8 +233,7 @@ static u8 sil_sata_udma_filter(ide_drive_t *drive) * @pio: PIO mode number * * Load the timing settings for this device mode into the - * controller. If we are in PIO mode 3 or 4 turn on IORDY - * monitoring (bit 9). The TF timing is bits 31:16 + * controller. */ static void sil_set_pio_mode(ide_drive_t *drive, u8 pio) @@ -276,13 +274,16 @@ static void sil_set_pio_mode(ide_drive_t *drive, u8 pio) /* now set up IORDY */ speedp = sil_ioread16(dev, tfaddr - 2); speedp &= ~0x200; - if (pio > 2) - speedp |= 0x200; - sil_iowrite16(dev, speedp, tfaddr - 2); mode = sil_ioread8(dev, base + addr_mask); mode &= ~(unit ? 0x30 : 0x03); - mode |= unit ? 0x10 : 0x01; + + if (ide_pio_need_iordy(drive, pio)) { + speedp |= 0x200; + mode |= unit ? 0x10 : 0x01; + } + + sil_iowrite16(dev, speedp, tfaddr - 2); sil_iowrite8(dev, mode, base + addr_mask); } diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index 0924abff52f..88ac47085cd 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -61,8 +61,7 @@ static unsigned int get_pio_timings(ide_drive_t *drive, u8 pio) if (cmd_off == 0) cmd_off = 1; - if ((pio > 2 || ata_id_has_iordy(drive->id)) && - !(pio > 4 && ata_id_is_cfa(drive->id))) + if (ide_pio_need_iordy(drive, pio)) iordy = 0x40; return (cmd_on - 1) << 8 | (cmd_off - 1) | iordy; diff --git a/drivers/ide/slc90e66.c b/drivers/ide/slc90e66.c index f55d7d6313e..9aec78d3bcf 100644 --- a/drivers/ide/slc90e66.c +++ b/drivers/ide/slc90e66.c @@ -44,7 +44,7 @@ static void slc90e66_set_pio_mode(ide_drive_t *drive, const u8 pio) control |= 1; /* Programmable timing on */ if (drive->media == ide_disk) control |= 4; /* Prefetch, post write */ - if (pio > 2) + if (ide_pio_need_iordy(drive, pio)) control |= 2; /* IORDY */ if (is_slave) { master_data |= 0x4000; diff --git a/include/linux/ide.h b/include/linux/ide.h index cdb29b6c195..8c7f5e50e91 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1514,6 +1514,7 @@ int ide_timing_compute(ide_drive_t *, u8, struct ide_timing *, int, int); int ide_scan_pio_blacklist(char *); const char *ide_xfer_verbose(u8); u8 ide_get_best_pio_mode(ide_drive_t *, u8, u8); +int ide_pio_need_iordy(ide_drive_t *, const u8); int ide_set_pio_mode(ide_drive_t *, u8); int ide_set_dma_mode(ide_drive_t *, u8); void ide_set_pio(ide_drive_t *, u8); -- cgit v1.2.3-70-g09d2 From 5880b5de7101cc123778c5d17d4f3986351f3122 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 15 Jun 2009 18:52:54 +0200 Subject: ide: don't enable IORDY at a probe time * Add 'unsigned long port_flags' field to ide_hwif_t. * Add IDE_PFLAG_PROBING port flag and keep it set during probing. * Fix ide_pio_need_iordy() to not enable IORDY at a probe time (IORDY may lead to controller lock up on certain controllers if the port is not occupied). Loosely based on the recent libata's fix by Tejun, thanks to Alan for the hint that IDE may also need it. 
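The change boils down to the pattern below, condensed from the diff that follows: the probe is bracketed by the new port flag, and ide_pio_need_iordy() refuses to enable IORDY for PIO0 while the flag is set.

	hwif->port_flags |= IDE_PFLAG_PROBING;
	rc = ide_probe_port(hwif);
	hwif->port_flags &= ~IDE_PFLAG_PROBING;

	/* ... and in ide_pio_need_iordy(): */
	if (pio == 0 && (drive->hwif->port_flags & IDE_PFLAG_PROBING))
		return 0;	/* port may be empty; don't risk a lock-up */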
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 16 +++++++++++++++- drivers/ide/ide-xfer-mode.c | 6 ++++++ include/linux/ide.h | 6 ++++++ 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index f371b0de314..fdd04bcd556 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1378,6 +1378,9 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d, ide_init_port(hwif, i & 1, d); ide_port_cable_detect(hwif); + + hwif->port_flags |= IDE_PFLAG_PROBING; + ide_port_init_devices(hwif); } @@ -1388,6 +1391,8 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d, if (ide_probe_port(hwif) == 0) hwif->present = 1; + hwif->port_flags &= ~IDE_PFLAG_PROBING; + if ((hwif->host_flags & IDE_HFLAG_4DRIVES) == 0 || hwif->mate == NULL || hwif->mate->present == 0) { if (ide_register_port(hwif)) { @@ -1569,11 +1574,20 @@ EXPORT_SYMBOL_GPL(ide_host_remove); void ide_port_scan(ide_hwif_t *hwif) { + int rc; + ide_port_apply_params(hwif); ide_port_cable_detect(hwif); + + hwif->port_flags |= IDE_PFLAG_PROBING; + ide_port_init_devices(hwif); - if (ide_probe_port(hwif) < 0) + rc = ide_probe_port(hwif); + + hwif->port_flags &= ~IDE_PFLAG_PROBING; + + if (rc < 0) return; hwif->present = 1; diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index 0b47ca13907..46d203ce60c 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -109,6 +109,12 @@ EXPORT_SYMBOL_GPL(ide_get_best_pio_mode); int ide_pio_need_iordy(ide_drive_t *drive, const u8 pio) { + /* + * IORDY may lead to controller lock up on certain controllers + * if the port is not occupied. + */ + if (pio == 0 && (drive->hwif->port_flags & IDE_PFLAG_PROBING)) + return 0; return ata_id_pio_need_iordy(drive->id, pio); } EXPORT_SYMBOL_GPL(ide_pio_need_iordy); diff --git a/include/linux/ide.h b/include/linux/ide.h index 8c7f5e50e91..8771d49aa87 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -661,6 +661,10 @@ struct ide_dma_ops { u8 (*dma_sff_read_status)(struct hwif_s *); }; +enum { + IDE_PFLAG_PROBING = (1 << 0), +}; + struct ide_host; typedef struct hwif_s { @@ -677,6 +681,8 @@ typedef struct hwif_s { ide_drive_t *devices[MAX_DRIVES + 1]; + unsigned long port_flags; + u8 major; /* our major number */ u8 index; /* 0 for ide0; 1 for ide1; ... */ u8 channel; /* for dual-port chips: 0=primary, 1=secondary */ -- cgit v1.2.3-70-g09d2 From f4d3ffa52a402ec9e8699571cf3811763d284459 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Mon, 15 Jun 2009 18:52:58 +0200 Subject: ide: move ack_intr() method into 'struct ide_port_ops' (take 2) Move the ack_intr() method into 'struct ide_port_ops', also renaming it to test_irq() while at it... 
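A minimal sketch of the new shape: instead of passing an ack_intr() function pointer through struct ide_hw, a driver supplies test_irq() in its ide_port_ops, and ide_intr() calls it to ask whether this port raised the interrupt. The register read and status bit below are hypothetical:

	/* Hypothetical driver: did our port raise this IRQ? */
	static int foo_test_irq(ide_hwif_t *hwif)
	{
		u8 ch = readb((void __iomem *)hwif->io_ports.irq_addr);

		return !!(ch & 0x80);	/* hypothetical status bit */
	}

	static const struct ide_port_ops foo_port_ops = {
		.test_irq	= foo_test_irq,
	};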
Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/buddha.c | 15 +++++++++------ drivers/ide/falconide.c | 1 - drivers/ide/gayle.c | 14 +++++++++----- drivers/ide/ide-io.c | 3 ++- drivers/ide/ide-probe.c | 1 - drivers/ide/macide.c | 12 ++++-------- drivers/ide/q40ide.c | 7 ++----- include/linux/ide.h | 10 +--------- 8 files changed, 27 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/buddha.c b/drivers/ide/buddha.c index 9cd7b115763..ab4f169d083 100644 --- a/drivers/ide/buddha.c +++ b/drivers/ide/buddha.c @@ -99,7 +99,7 @@ static const char *buddha_board_name[] = { "Buddha", "Catweasel", "X-Surf" }; * Check and acknowledge the interrupt status */ -static int buddha_ack_intr(ide_hwif_t *hwif) +static int buddha_test_irq(ide_hwif_t *hwif) { unsigned char ch; @@ -118,8 +118,7 @@ static void xsurf_clear_irq(ide_drive_t *drive) } static void __init buddha_setup_ports(struct ide_hw *hw, unsigned long base, - unsigned long ctl, unsigned long irq_port, - ide_ack_intr_t *ack_intr) + unsigned long ctl, unsigned long irq_port) { int i; @@ -134,14 +133,19 @@ static void __init buddha_setup_ports(struct ide_hw *hw, unsigned long base, hw->io_ports.irq_addr = irq_port; hw->irq = IRQ_AMIGA_PORTS; - hw->ack_intr = ack_intr; } +static const struct ide_port_ops buddha_port_ops = { + .test_irq = buddha_test_irq, +}; + static const struct ide_port_ops xsurf_port_ops = { .clear_irq = xsurf_clear_irq, + .test_irq = buddha_test_irq, }; static const struct ide_port_info buddha_port_info = { + .port_ops = &buddha_port_ops, .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, .irq_flags = IRQF_SHARED, .chipset = ide_generic, @@ -217,8 +221,7 @@ fail_base2: irq_port = buddha_board + xsurf_irqports[i]; } - buddha_setup_ports(&hw[i], base, ctl, irq_port, - buddha_ack_intr); + buddha_setup_ports(&hw[i], base, ctl, irq_port); hws[i] = &hw[i]; } diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index 22fa27389c3..a5a07ccb81a 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -128,7 +128,6 @@ static void __init falconide_setup_ports(struct ide_hw *hw) hw->io_ports.ctl_addr = ATA_HD_BASE + ATA_HD_CONTROL; hw->irq = IRQ_MFP_IDE; - hw->ack_intr = NULL; } /* diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c index c5dd1e5cca4..b9e517de6a8 100644 --- a/drivers/ide/gayle.c +++ b/drivers/ide/gayle.c @@ -66,7 +66,7 @@ MODULE_PARM_DESC(doubler, "enable support for IDE doublers"); * Check and acknowledge the interrupt status */ -static int gayle_ack_intr(ide_hwif_t *hwif) +static int gayle_test_irq(ide_hwif_t *hwif) { unsigned char ch; @@ -85,8 +85,7 @@ static void gayle_a1200_clear_irq(ide_drive_t *drive) } static void __init gayle_setup_ports(struct ide_hw *hw, unsigned long base, - unsigned long ctl, unsigned long irq_port, - ide_ack_intr_t *ack_intr) + unsigned long ctl, unsigned long irq_port) { int i; @@ -101,11 +100,15 @@ static void __init gayle_setup_ports(struct ide_hw *hw, unsigned long base, hw->io_ports.irq_addr = irq_port; hw->irq = IRQ_AMIGA_PORTS; - hw->ack_intr = ack_intr; } +static const struct ide_port_ops gayle_a4000_port_ops = { + .test_irq = gayle_test_irq, +}; + static const struct ide_port_ops gayle_a1200_port_ops = { .clear_irq = gayle_a1200_clear_irq, + .test_irq = gayle_test_irq, }; static const struct ide_port_info gayle_port_info = { @@ -148,6 +151,7 @@ found: if (a4000) { phys_base = GAYLE_BASE_4000; irqport = (unsigned long)ZTWO_VADDR(GAYLE_IRQ_4000); + d.port_ops = &gayle_a4000_port_ops; } else { 
phys_base = GAYLE_BASE_1200; irqport = (unsigned long)ZTWO_VADDR(GAYLE_IRQ_1200); @@ -164,7 +168,7 @@ found: base = (unsigned long)ZTWO_VADDR(phys_base + i * GAYLE_NEXT_PORT); ctrlport = GAYLE_HAS_CONTROL_REG ? (base + GAYLE_CONTROL) : 0; - gayle_setup_ports(&hw[i], base, ctrlport, irqport, gayle_ack_intr); + gayle_setup_ports(&hw[i], base, ctrlport, irqport); hws[i] = &hw[i]; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9e53efe9fb2..1059f809b80 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -804,7 +804,8 @@ irqreturn_t ide_intr (int irq, void *dev_id) spin_lock_irqsave(&hwif->lock, flags); - if (hwif->ack_intr && hwif->ack_intr(hwif) == 0) + if (hwif->port_ops && hwif->port_ops->test_irq && + hwif->port_ops->test_irq(hwif) == 0) goto out; handler = hwif->handler; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index fdd04bcd556..c2e7159d793 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1170,7 +1170,6 @@ static void ide_init_port_hw(ide_hwif_t *hwif, struct ide_hw *hw) hwif->irq = hw->irq; hwif->dev = hw->dev; hwif->gendev.parent = hw->parent ? hw->parent : hw->dev; - hwif->ack_intr = hw->ack_intr; hwif->config_data = hw->config; } diff --git a/drivers/ide/macide.c b/drivers/ide/macide.c index 05cdab35a75..505ec43e560 100644 --- a/drivers/ide/macide.c +++ b/drivers/ide/macide.c @@ -53,7 +53,7 @@ volatile unsigned char *ide_ifr = (unsigned char *) (IDE_BASE + IDE_IFR); -int macide_ack_intr(ide_hwif_t* hwif) +int macide_test_irq(ide_hwif_t *hwif) { if (*ide_ifr & 0x20) return 1; @@ -66,7 +66,7 @@ static void macide_clear_irq(ide_drive_t *drive) } static void __init macide_setup_ports(struct ide_hw *hw, unsigned long base, - int irq, ide_ack_intr_t *ack_intr) + int irq) { int i; @@ -78,11 +78,11 @@ static void __init macide_setup_ports(struct ide_hw *hw, unsigned long base, hw->io_ports.ctl_addr = base + IDE_CONTROL; hw->irq = irq; - hw->ack_intr = ack_intr; } static const struct ide_port_ops macide_port_ops = { .clear_irq = macide_clear_irq, + .test_irq = macide_test_irq, }; static const struct ide_port_info macide_port_info = { @@ -101,7 +101,6 @@ static const char *mac_ide_name[] = static int __init macide_init(void) { - ide_ack_intr_t *ack_intr; unsigned long base; int irq; struct ide_hw hw, *hws[] = { &hw }; @@ -113,17 +112,14 @@ static int __init macide_init(void) switch (macintosh_config->ide_type) { case MAC_IDE_QUADRA: base = IDE_BASE; - ack_intr = macide_ack_intr; irq = IRQ_NUBUS_F; break; case MAC_IDE_PB: base = IDE_BASE; - ack_intr = macide_ack_intr; irq = IRQ_NUBUS_C; break; case MAC_IDE_BABOON: base = BABOON_BASE; - ack_intr = NULL; d.port_ops = NULL; irq = IRQ_BABOON_1; break; @@ -134,7 +130,7 @@ static int __init macide_init(void) printk(KERN_INFO "ide: Macintosh %s IDE controller\n", mac_ide_name[macintosh_config->ide_type - 1]); - macide_setup_ports(&hw, base, irq, ack_intr); + macide_setup_ports(&hw, base, irq); return ide_host_add(&d, hws, 1, NULL); } diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index ab49a97023d..90786083b43 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -51,9 +51,7 @@ static int q40ide_default_irq(unsigned long base) /* * Addresses are pretranslated for Q40 ISA access. 
*/ -static void q40_ide_setup_ports(struct ide_hw *hw, unsigned long base, - ide_ack_intr_t *ack_intr, - int irq) +static void q40_ide_setup_ports(struct ide_hw *hw, unsigned long base, int irq) { memset(hw, 0, sizeof(*hw)); /* BIG FAT WARNING: @@ -69,7 +67,6 @@ static void q40_ide_setup_ports(struct ide_hw *hw, unsigned long base, hw->io_ports.ctl_addr = Q40_ISA_IO_B(base + 0x206); hw->irq = irq; - hw->ack_intr = ack_intr; } static void q40ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, @@ -156,7 +153,7 @@ static int __init q40ide_init(void) release_region(pcide_bases[i], 8); continue; } - q40_ide_setup_ports(&hw[i], pcide_bases[i], NULL, + q40_ide_setup_ports(&hw[i], pcide_bases[i], q40ide_default_irq(pcide_bases[i])); hws[i] = &hw[i]; diff --git a/include/linux/ide.h b/include/linux/ide.h index 8771d49aa87..08c91e21cf4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -156,12 +156,6 @@ enum { #define REQ_PARK_HEADS 0x22 #define REQ_UNPARK_HEADS 0x23 -/* - * Check for an interrupt and acknowledge the interrupt status - */ -struct hwif_s; -typedef int (ide_ack_intr_t)(struct hwif_s *); - /* * hwif_chipset_t is used to keep track of the specific hardware * chipset used by each IDE interface, if known. @@ -185,7 +179,6 @@ struct ide_hw { }; int irq; /* our irq number */ - ide_ack_intr_t *ack_intr; /* acknowledge interrupt */ struct device *dev, *parent; unsigned long config; }; @@ -636,6 +629,7 @@ struct ide_port_ops { void (*maskproc)(ide_drive_t *, int); void (*quirkproc)(ide_drive_t *); void (*clear_irq)(ide_drive_t *); + int (*test_irq)(struct hwif_s *); u8 (*mdma_filter)(ide_drive_t *); u8 (*udma_filter)(ide_drive_t *); @@ -701,8 +695,6 @@ typedef struct hwif_s { struct device *dev; - ide_ack_intr_t *ack_intr; - void (*rw_disk)(ide_drive_t *, struct request *); const struct ide_tp_ops *tp_ops; -- cgit v1.2.3-70-g09d2 From 5bfb151f1f565e6082304a30e8c81dfb6ed0b0c8 Mon Sep 17 00:00:00 2001 From: Joao Ramos Date: Mon, 15 Jun 2009 22:13:44 +0200 Subject: ide: do not access ide_drive_t 'drive_data' field directly Change ide_drive_t 'drive_data' field from 'unsigned int' type to 'void *' type, allowing a wider range of values/types to be stored in this field. Added 'ide_get_drivedata' and 'ide_set_drivedata' helpers to get and set the 'drive_data' field. Fixed all host drivers to maintain coherency with the change in the 'drive_data' field type. 
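The conversion pattern, condensed from the diffs below: drivers that used to keep a scalar in drive_data now round-trip it through the opaque void * slot with casts via unsigned long.

	/* Read, update and store a per-drive timing word (sketch). */
	unsigned long timings = (unsigned long)ide_get_drivedata(drive);

	timings &= 0xff00;
	timings |= new_timing;	/* 'new_timing' is a placeholder */
	ide_set_drivedata(drive, (void *)timings);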
Signed-off-by: Joao Ramos [bart: fix qd65xx build, cast to 'unsigned long', minor Coding Style fixups] Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/cmd64x.c | 8 +++++--- drivers/ide/cs5536.c | 23 +++++++++++++++-------- drivers/ide/ht6560b.c | 33 ++++++++++++++++++++++++--------- drivers/ide/icside.c | 10 ++++++---- drivers/ide/opti621.c | 8 +++++--- drivers/ide/qd65xx.c | 11 +++++++---- drivers/ide/qd65xx.h | 11 +++++++++-- drivers/ide/sl82c105.c | 18 ++++++++++++------ include/linux/ide.h | 12 +++++++++++- 9 files changed, 94 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index fd904e923a9..03c86209446 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -118,8 +118,9 @@ static void cmd64x_tune_pio(ide_drive_t *drive, const u8 pio) ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio); + unsigned long setup_count; unsigned int cycle_time; - u8 setup_count, arttim = 0; + u8 arttim = 0; static const u8 setup_values[] = {0x40, 0x40, 0x40, 0x80, 0, 0xc0}; static const u8 arttim_regs[4] = {ARTTIM0, ARTTIM1, ARTTIM23, ARTTIM23}; @@ -140,10 +141,11 @@ static void cmd64x_tune_pio(ide_drive_t *drive, const u8 pio) if (hwif->channel) { ide_drive_t *pair = ide_get_pair_dev(drive); - drive->drive_data = setup_count; + ide_set_drivedata(drive, (void *)setup_count); if (pair) - setup_count = max_t(u8, setup_count, pair->drive_data); + setup_count = max_t(u8, setup_count, + (unsigned long)ide_get_drivedata(pair)); } if (setup_count > 5) /* shouldn't actually happen... */ diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c index 0332a95eefd..9623b852c61 100644 --- a/drivers/ide/cs5536.c +++ b/drivers/ide/cs5536.c @@ -146,14 +146,16 @@ static void cs5536_set_pio_mode(ide_drive_t *drive, const u8 pio) struct pci_dev *pdev = to_pci_dev(drive->hwif->dev); ide_drive_t *pair = ide_get_pair_dev(drive); int cshift = (drive->dn & 1) ? IDE_CAST_D1_SHIFT : IDE_CAST_D0_SHIFT; + unsigned long timings = (unsigned long)ide_get_drivedata(drive); u32 cast; u8 cmd_pio = pio; if (pair) cmd_pio = min(pio, ide_get_best_pio_mode(pair, 255, 4)); - drive->drive_data &= (IDE_DRV_MASK << 8); - drive->drive_data |= drv_timings[pio]; + timings &= (IDE_DRV_MASK << 8); + timings |= drv_timings[pio]; + ide_set_drivedata(drive, (void *)timings); cs5536_program_dtc(drive, drv_timings[pio]); @@ -186,6 +188,7 @@ static void cs5536_set_dma_mode(ide_drive_t *drive, const u8 mode) struct pci_dev *pdev = to_pci_dev(drive->hwif->dev); int dshift = (drive->dn & 1) ? 
IDE_D1_SHIFT : IDE_D0_SHIFT; + unsigned long timings = (unsigned long)ide_get_drivedata(drive); u32 etc; cs5536_read(pdev, ETC, &etc); @@ -195,8 +198,9 @@ static void cs5536_set_dma_mode(ide_drive_t *drive, const u8 mode) etc |= udma_timings[mode - XFER_UDMA_0] << dshift; } else { /* MWDMA */ etc &= ~(IDE_ETC_UDMA_MASK << dshift); - drive->drive_data &= IDE_DRV_MASK; - drive->drive_data |= mwdma_timings[mode - XFER_MW_DMA_0] << 8; + timings &= IDE_DRV_MASK; + timings |= mwdma_timings[mode - XFER_MW_DMA_0] << 8; + ide_set_drivedata(drive, (void *)timings); } cs5536_write(pdev, ETC, etc); @@ -204,9 +208,11 @@ static void cs5536_set_dma_mode(ide_drive_t *drive, const u8 mode) static void cs5536_dma_start(ide_drive_t *drive) { + unsigned long timings = (unsigned long)ide_get_drivedata(drive); + if (drive->current_speed < XFER_UDMA_0 && - (drive->drive_data >> 8) != (drive->drive_data & IDE_DRV_MASK)) - cs5536_program_dtc(drive, drive->drive_data >> 8); + (timings >> 8) != (timings & IDE_DRV_MASK)) + cs5536_program_dtc(drive, timings >> 8); ide_dma_start(drive); } @@ -214,10 +220,11 @@ static void cs5536_dma_start(ide_drive_t *drive) static int cs5536_dma_end(ide_drive_t *drive) { int ret = ide_dma_end(drive); + unsigned long timings = (unsigned long)ide_get_drivedata(drive); if (drive->current_speed < XFER_UDMA_0 && - (drive->drive_data >> 8) != (drive->drive_data & IDE_DRV_MASK)) - cs5536_program_dtc(drive, drive->drive_data & IDE_DRV_MASK); + (timings >> 8) != (timings & IDE_DRV_MASK)) + cs5536_program_dtc(drive, timings & IDE_DRV_MASK); return ret; } diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c index 2fb0f296500..aafed8060e1 100644 --- a/drivers/ide/ht6560b.c +++ b/drivers/ide/ht6560b.c @@ -44,7 +44,12 @@ * bit3 (0x08): "1" 3 cycle time, "0" 2 cycle time (?) */ #define HT_CONFIG_PORT 0x3e6 -#define HT_CONFIG(drivea) (u8)(((drivea)->drive_data & 0xff00) >> 8) + +static inline u8 HT_CONFIG(ide_drive_t *drive) +{ + return ((unsigned long)ide_get_drivedata(drive) & 0xff00) >> 8; +} + /* * FIFO + PREFETCH (both a/b-model) */ @@ -90,7 +95,11 @@ * Active Time for each drive. Smaller value gives higher speed. * In case of failures you should probably fall back to a higher value. 
*/ -#define HT_TIMING(drivea) (u8)((drivea)->drive_data & 0x00ff) +static inline u8 HT_TIMING(ide_drive_t *drive) +{ + return (unsigned long)ide_get_drivedata(drive) & 0x00ff; +} + #define HT_TIMING_DEFAULT 0xff /* @@ -242,23 +251,27 @@ static DEFINE_SPINLOCK(ht6560b_lock); */ static void ht_set_prefetch(ide_drive_t *drive, u8 state) { - unsigned long flags; + unsigned long flags, config; int t = HT_PREFETCH_MODE << 8; spin_lock_irqsave(&ht6560b_lock, flags); + config = (unsigned long)ide_get_drivedata(drive); + /* * Prefetch mode and unmask irq seems to conflict */ if (state) { - drive->drive_data |= t; /* enable prefetch mode */ + config |= t; /* enable prefetch mode */ drive->dev_flags |= IDE_DFLAG_NO_UNMASK; drive->dev_flags &= ~IDE_DFLAG_UNMASK; } else { - drive->drive_data &= ~t; /* disable prefetch mode */ + config &= ~t; /* disable prefetch mode */ drive->dev_flags &= ~IDE_DFLAG_NO_UNMASK; } + ide_set_drivedata(drive, (void *)config); + spin_unlock_irqrestore(&ht6560b_lock, flags); #ifdef DEBUG @@ -268,7 +281,7 @@ static void ht_set_prefetch(ide_drive_t *drive, u8 state) static void ht6560b_set_pio_mode(ide_drive_t *drive, const u8 pio) { - unsigned long flags; + unsigned long flags, config; u8 timing; switch (pio) { @@ -281,8 +294,10 @@ static void ht6560b_set_pio_mode(ide_drive_t *drive, const u8 pio) timing = ht_pio2timings(drive, pio); spin_lock_irqsave(&ht6560b_lock, flags); - drive->drive_data &= 0xff00; - drive->drive_data |= timing; + config = (unsigned long)ide_get_drivedata(drive); + config &= 0xff00; + config |= timing; + ide_set_drivedata(drive, (void *)config); spin_unlock_irqrestore(&ht6560b_lock, flags); #ifdef DEBUG @@ -299,7 +314,7 @@ static void __init ht6560b_init_dev(ide_drive_t *drive) if (hwif->channel) t |= (HT_SECONDARY_IF << 8); - drive->drive_data = t; + ide_set_drivedata(drive, (void *)t); } static int probe_ht6560b; diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 5af3d0ffaf0..0f67f1abbbd 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -187,7 +187,8 @@ static const expansioncard_ops_t icside_ops_arcin_v6 = { */ static void icside_set_dma_mode(ide_drive_t *drive, const u8 xfer_mode) { - int cycle_time, use_dma_info = 0; + unsigned long cycle_time; + int use_dma_info = 0; switch (xfer_mode) { case XFER_MW_DMA_2: @@ -218,10 +219,11 @@ static void icside_set_dma_mode(ide_drive_t *drive, const u8 xfer_mode) if (use_dma_info && drive->id[ATA_ID_EIDE_DMA_TIME] > cycle_time) cycle_time = drive->id[ATA_ID_EIDE_DMA_TIME]; - drive->drive_data = cycle_time; + ide_set_drivedata(drive, (void *)cycle_time); printk("%s: %s selected (peak %dMB/s)\n", drive->name, - ide_xfer_verbose(xfer_mode), 2000 / drive->drive_data); + ide_xfer_verbose(xfer_mode), + 2000 / (unsigned long)ide_get_drivedata(drive)); } static const struct ide_port_ops icside_v6_port_ops = { @@ -277,7 +279,7 @@ static int icside_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) /* * Select the correct timing for this drive. 
*/ - set_dma_speed(ec->dma, drive->drive_data); + set_dma_speed(ec->dma, (unsigned long)ide_get_drivedata(drive)); /* * Tell the DMA engine about the SG table and diff --git a/drivers/ide/opti621.c b/drivers/ide/opti621.c index 6048eda3cd6..f1d70d6630f 100644 --- a/drivers/ide/opti621.c +++ b/drivers/ide/opti621.c @@ -138,6 +138,7 @@ static void opti621_set_pio_mode(ide_drive_t *drive, const u8 pio) ide_hwif_t *hwif = drive->hwif; ide_drive_t *pair = ide_get_pair_dev(drive); unsigned long flags; + unsigned long mode = XFER_PIO_0 + pio, pair_mode; u8 tim, misc, addr_pio = pio, clk; /* DRDY is default 2 (by OPTi Databook) */ @@ -150,11 +151,12 @@ static void opti621_set_pio_mode(ide_drive_t *drive, const u8 pio) { 0x48, 0x34, 0x21, 0x10, 0x10 } /* 25 MHz */ }; - drive->drive_data = XFER_PIO_0 + pio; + ide_set_drivedata(drive, (void *)mode); if (pair) { - if (pair->drive_data && pair->drive_data < drive->drive_data) - addr_pio = pair->drive_data - XFER_PIO_0; + pair_mode = (unsigned long)ide_get_drivedata(pair); + if (pair_mode && pair_mode < mode) + addr_pio = pair_mode - XFER_PIO_0; } spin_lock_irqsave(&opti621_lock, flags); diff --git a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c index c9a13498689..74696edc8d1 100644 --- a/drivers/ide/qd65xx.c +++ b/drivers/ide/qd65xx.c @@ -180,8 +180,11 @@ static int qd_find_disk_type (ide_drive_t *drive, static void qd_set_timing (ide_drive_t *drive, u8 timing) { - drive->drive_data &= 0xff00; - drive->drive_data |= timing; + unsigned long data = (unsigned long)ide_get_drivedata(drive); + + data &= 0xff00; + data |= timing; + ide_set_drivedata(drive, (void *)data); printk(KERN_DEBUG "%s: %#x\n", drive->name, timing); } @@ -292,7 +295,7 @@ static void __init qd6500_init_dev(ide_drive_t *drive) u8 base = (hwif->config_data & 0xff00) >> 8; u8 config = QD_CONFIG(hwif); - drive->drive_data = QD6500_DEF_DATA; + ide_set_drivedata(drive, (void *)QD6500_DEF_DATA); } static void __init qd6580_init_dev(ide_drive_t *drive) @@ -308,7 +311,7 @@ static void __init qd6580_init_dev(ide_drive_t *drive) } else t2 = t1 = hwif->channel ? QD6580_DEF_DATA2 : QD6580_DEF_DATA; - drive->drive_data = (drive->dn & 1) ? t2 : t1; + ide_set_drivedata(drive, (void *)((drive->dn & 1) ? t2 : t1)); } static const struct ide_tp_ops qd65xx_tp_ops = { diff --git a/drivers/ide/qd65xx.h b/drivers/ide/qd65xx.h index d7e67a1a1dc..1fba2a5f281 100644 --- a/drivers/ide/qd65xx.h +++ b/drivers/ide/qd65xx.h @@ -31,8 +31,15 @@ #define QD_CONFIG(hwif) ((hwif)->config_data & 0x00ff) -#define QD_TIMING(drive) (u8)(((drive)->drive_data) & 0x00ff) -#define QD_TIMREG(drive) (u8)((((drive)->drive_data) & 0xff00) >> 8) +static inline u8 QD_TIMING(ide_drive_t *drive) +{ + return (unsigned long)ide_get_drivedata(drive) & 0x00ff; +} + +static inline u8 QD_TIMREG(ide_drive_t *drive) +{ + return ((unsigned long)ide_get_drivedata(drive) & 0xff00) >> 8; +} #define QD6500_DEF_DATA ((QD_TIM1_PORT<<8) | (QD_ID3 ? 0x0c : 0x08)) #define QD6580_DEF_DATA ((QD_TIM1_PORT<<8) | (QD_ID3 ? 
0x0a : 0x00)) diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index 6246bea585c..d698da470d6 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -73,6 +73,7 @@ static unsigned int get_pio_timings(ide_drive_t *drive, u8 pio) static void sl82c105_set_pio_mode(ide_drive_t *drive, const u8 pio) { struct pci_dev *dev = to_pci_dev(drive->hwif->dev); + unsigned long timings = (unsigned long)ide_get_drivedata(drive); int reg = 0x44 + drive->dn * 4; u16 drv_ctrl; @@ -82,8 +83,9 @@ static void sl82c105_set_pio_mode(ide_drive_t *drive, const u8 pio) * Store the PIO timings so that we can restore them * in case DMA will be turned off... */ - drive->drive_data &= 0xffff0000; - drive->drive_data |= drv_ctrl; + timings &= 0xffff0000; + timings |= drv_ctrl; + ide_set_drivedata(drive, (void *)timings); pci_write_config_word(dev, reg, drv_ctrl); pci_read_config_word (dev, reg, &drv_ctrl); @@ -99,6 +101,7 @@ static void sl82c105_set_pio_mode(ide_drive_t *drive, const u8 pio) static void sl82c105_set_dma_mode(ide_drive_t *drive, const u8 speed) { static u16 mwdma_timings[] = {0x0707, 0x0201, 0x0200}; + unsigned long timings = (unsigned long)ide_get_drivedata(drive); u16 drv_ctrl; DBG(("sl82c105_tune_chipset(drive:%s, speed:%s)\n", @@ -110,8 +113,9 @@ static void sl82c105_set_dma_mode(ide_drive_t *drive, const u8 speed) * Store the DMA timings so that we can actually program * them when DMA will be turned on... */ - drive->drive_data &= 0x0000ffff; - drive->drive_data |= (unsigned long)drv_ctrl << 16; + timings &= 0x0000ffff; + timings |= (unsigned long)drv_ctrl << 16; + ide_set_drivedata(drive, (void *)timings); } static int sl82c105_test_irq(ide_hwif_t *hwif) @@ -194,7 +198,8 @@ static void sl82c105_dma_start(ide_drive_t *drive) DBG(("%s(drive:%s)\n", __func__, drive->name)); - pci_write_config_word(dev, reg, drive->drive_data >> 16); + pci_write_config_word(dev, reg, + (unsigned long)ide_get_drivedata(drive) >> 16); sl82c105_reset_host(dev); ide_dma_start(drive); @@ -219,7 +224,8 @@ static int sl82c105_dma_end(ide_drive_t *drive) ret = ide_dma_end(drive); - pci_write_config_word(dev, reg, drive->drive_data); + pci_write_config_word(dev, reg, + (unsigned long)ide_get_drivedata(drive)); return ret; } diff --git a/include/linux/ide.h b/include/linux/ide.h index 08c91e21cf4..95c6e00a72e 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -532,7 +532,7 @@ struct ide_drive_s { unsigned int bios_cyl; /* BIOS/fdisk/LILO number of cyls */ unsigned int cyl; /* "real" number of cyls */ - unsigned int drive_data; /* used by set_pio_mode/dev_select() */ + void *drive_data; /* used by set_pio_mode/dev_select() */ unsigned int failures; /* current failure count */ unsigned int max_failures; /* maximum allowed failure count */ u64 probed_capacity;/* initial/native media capacity */ @@ -1550,6 +1550,16 @@ static inline ide_drive_t *ide_get_pair_dev(ide_drive_t *drive) return (peer->dev_flags & IDE_DFLAG_PRESENT) ? 
peer : NULL; } +static inline void *ide_get_drivedata(ide_drive_t *drive) +{ + return drive->drive_data; +} + +static inline void ide_set_drivedata(ide_drive_t *drive, void *data) +{ + drive->drive_data = data; +} + #define ide_port_for_each_dev(i, dev, port) \ for ((i) = 0; ((dev) = (port)->devices[i]) || (i) < MAX_DRIVES; (i)++) -- cgit v1.2.3-70-g09d2 From 3959214f971417f4162926ac52ad4cd042958caa Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 24 Mar 2009 15:43:30 +0100 Subject: sched: delayed cleanup of user_struct During bootup performance tracing we see repeated occurrences of /sys/kernel/uid/* events for the same uid, leading, in this case, to rather pointless userspace processing for the same uid over and over. This is usually caused by tools which change their uid to "nobody" so they can run without privileges while reading data supplied by untrusted users. This change delays the execution of the (already existing) scheduled work so that the uid is cleaned up after one second, allowing the allocated and announced uid to be re-used by another process. This is the current behavior, where almost every invocation of a binary, which changes the uid, creates two events: $ read START < /sys/kernel/uevent_seqnum; \ for i in `seq 100`; do su --shell=/bin/true bin; done; \ read END < /sys/kernel/uevent_seqnum; \ echo $(($END - $START)) 178 With the delayed cleanup, we get only two events, and userspace finishes a bit faster too: $ read START < /sys/kernel/uevent_seqnum; \ for i in `seq 100`; do su --shell=/bin/true bin; done; \ read END < /sys/kernel/uevent_seqnum; \ echo $(($END - $START)) 1 Acked-by: Dhaval Giani Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 2 +- kernel/user.c | 67 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 40 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c900aa53007..7531b1c2820 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -674,7 +674,7 @@ struct user_struct { struct task_group *tg; #ifdef CONFIG_SYSFS struct kobject kobj; - struct work_struct work; + struct delayed_work work; #endif #endif diff --git a/kernel/user.c b/kernel/user.c index 850e0ba41c1..2c000e7132a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up) put_user_ns(up->user_ns); } -static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) -{ - struct user_struct *user; - struct hlist_node *h; - - hlist_for_each_entry(user, h, hashent, uidhash_node) { - if (user->uid == uid) { - atomic_inc(&user->__count); - return user; - } - } - - return NULL; -} - #ifdef CONFIG_USER_SCHED static void sched_destroy_user(struct user_struct *up) @@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; } #if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) +static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) +{ + struct user_struct *user; + struct hlist_node *h; + + hlist_for_each_entry(user, h, hashent, uidhash_node) { + if (user->uid == uid) { + /* possibly resurrect an "almost deleted" object */ + if (atomic_inc_return(&user->__count) == 1) + cancel_delayed_work(&user->work); + return user; + } + } + + return NULL; +} + static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ static DEFINE_MUTEX(uids_mutex); @@ -283,12 +285,12 @@ int __init uids_sysfs_init(void) return uids_user_create(&root_user); } -/*
work function to remove sysfs directory for a user and free up +/* delayed work function to remove sysfs directory for a user and free up * corresponding structures. */ static void cleanup_user_struct(struct work_struct *w) { - struct user_struct *up = container_of(w, struct user_struct, work); + struct user_struct *up = container_of(w, struct user_struct, work.work); unsigned long flags; int remove_user = 0; @@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w) */ uids_mutex_lock(); - local_irq_save(flags); - - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { + spin_lock_irqsave(&uidhash_lock, flags); + if (atomic_read(&up->__count) == 0) { uid_hash_remove(up); remove_user = 1; - spin_unlock_irqrestore(&uidhash_lock, flags); - } else { - local_irq_restore(flags); } + spin_unlock_irqrestore(&uidhash_lock, flags); if (!remove_user) goto done; @@ -331,16 +330,28 @@ done: */ static void free_user(struct user_struct *up, unsigned long flags) { - /* restore back the count */ - atomic_inc(&up->__count); spin_unlock_irqrestore(&uidhash_lock, flags); - - INIT_WORK(&up->work, cleanup_user_struct); - schedule_work(&up->work); + INIT_DELAYED_WORK(&up->work, cleanup_user_struct); + schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); } #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ +static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) +{ + struct user_struct *user; + struct hlist_node *h; + + hlist_for_each_entry(user, h, hashent, uidhash_node) { + if (user->uid == uid) { + atomic_inc(&user->__count); + return user; + } + } + + return NULL; +} + int uids_sysfs_init(void) { return 0; } static inline int uids_user_create(struct user_struct *up) { return 0; } static inline void uids_mutex_lock(void) { } -- cgit v1.2.3-70-g09d2 From 309b7d60a345f402bec3cf9caadb53de4028e2aa Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 24 Apr 2009 14:57:00 +0200 Subject: driver core: add BUS_NOTIFY_UNBOUND_DRIVER event This patch adds a new bus notifier event which is emitted _after_ a device is removed from its driver. This event will be used by the dma-api debug code to check if a driver has released all dma allocations for that device. 
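A minimal sketch of a consumer, loosely modeled on the dma-api debug use case mentioned above; the leak-check function is hypothetical:

	static int foo_bus_notify(struct notifier_block *nb,
				  unsigned long action, void *data)
	{
		struct device *dev = data;

		if (action == BUS_NOTIFY_UNBOUND_DRIVER)
			foo_check_for_leaks(dev);	/* hypothetical */

		return NOTIFY_DONE;
	}

	static struct notifier_block foo_nb = {
		.notifier_call = foo_bus_notify,
	};

	/* registered per bus, e.g.: bus_register_notifier(&pci_bus_type, &foo_nb); */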
Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 4 ++++ include/linux/device.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 742cbe6b042..efd00de183b 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -320,6 +320,10 @@ static void __device_release_driver(struct device *dev) devres_release_all(dev); dev->driver = NULL; klist_remove(&dev->p->knode_driver); + if (dev->bus) + blocking_notifier_call_chain(&dev->bus->p->bus_notifier, + BUS_NOTIFY_UNBOUND_DRIVER, + dev); } } diff --git a/include/linux/device.h b/include/linux/device.h index a4a7b10aaa4..4410464b134 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -114,6 +114,8 @@ extern int bus_unregister_notifier(struct bus_type *bus, #define BUS_NOTIFY_BOUND_DRIVER 0x00000003 /* driver bound to device */ #define BUS_NOTIFY_UNBIND_DRIVER 0x00000004 /* driver about to be unbound */ +#define BUS_NOTIFY_UNBOUND_DRIVER 0x00000005 /* driver is unbound + from the device */ extern struct kset *bus_get_kset(struct bus_type *bus); extern struct klist *bus_get_device_klist(struct bus_type *bus); -- cgit v1.2.3-70-g09d2 From c0afe7ba5e71d8ab66bc42f90b3e237581d3c509 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 27 Apr 2009 02:38:16 +0200 Subject: driver core: Const-correct platform getbyname functions This converts the resource and IRQ getbyname functions for the platform bus to use const char *; I ran into compiler moaning when I tried to use a const char * for looking up a certain resource. Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 5 +++-- include/linux/platform_device.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index ead3f64c41d..59df6290951 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -69,7 +69,8 @@ EXPORT_SYMBOL_GPL(platform_get_irq); * @name: resource name */ struct resource *platform_get_resource_byname(struct platform_device *dev, - unsigned int type, char *name) + unsigned int type, + const char *name) { int i; @@ -88,7 +89,7 @@ EXPORT_SYMBOL_GPL(platform_get_resource_byname); * @dev: platform device * @name: IRQ name */ -int platform_get_irq_byname(struct platform_device *dev, char *name) +int platform_get_irq_byname(struct platform_device *dev, const char *name) { struct resource *r = platform_get_resource_byname(dev, IORESOURCE_IRQ, name); diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index b67bb5d7b22..8dc5123b630 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -36,8 +36,8 @@ extern struct device platform_bus; extern struct resource *platform_get_resource(struct platform_device *, unsigned int, unsigned int); extern int platform_get_irq(struct platform_device *, unsigned int); -extern struct resource *platform_get_resource_byname(struct platform_device *, unsigned int, char *); -extern int platform_get_irq_byname(struct platform_device *, char *); +extern struct resource *platform_get_resource_byname(struct platform_device *, unsigned int, const char *); +extern int platform_get_irq_byname(struct platform_device *, const char *); extern int platform_add_devices(struct platform_device **, int); extern struct platform_device *platform_device_register_simple(const char *, int id, -- cgit v1.2.3-70-g09d2 From 5e8e9245f9bb4c6ed2a2f4af42f2ec9733dc5378
Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 27 May 2009 00:49:37 +0200 Subject: firmware: FIRMWARE_NAME_MAX removal As we're allocating the firmware name dynamically, we no longer need this definition. This patch must be applied only after the 5 previous patches from this patch set have been applied. Signed-off-by: Samuel Ortiz Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index c8ecf5b2a20..d3154462843 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -5,7 +5,6 @@ #include #include -#define FIRMWARE_NAME_MAX 30 #define FW_ACTION_NOHOTPLUG 0 #define FW_ACTION_HOTPLUG 1 -- cgit v1.2.3-70-g09d2 From 6fcf53acccf85b4b0d0260e66c692a341760f464 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 30 Apr 2009 15:23:42 +0200 Subject: Driver Core: add nodename callbacks This adds the nodename callback for struct class, struct device_type and struct device, to allow drivers to give userspace hints about the device name and subdirectory that should be used for it. Signed-off-by: Kay Sievers Signed-off-by: Jan Blunck Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/device.h | 3 +++ 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 4d59975c24a..7ecb1938e59 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -162,10 +162,18 @@ static int dev_uevent(struct kset *kset, struct kobject *kobj, struct device *dev = to_dev(kobj); int retval = 0; - /* add the major/minor if present */ + /* add device node properties if present */ if (MAJOR(dev->devt)) { + const char *tmp; + const char *name; + add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt)); add_uevent_var(env, "MINOR=%u", MINOR(dev->devt)); + name = device_get_nodename(dev, &tmp); + if (name) { + add_uevent_var(env, "DEVNAME=%s", name); + kfree(tmp); + } } if (dev->type && dev->type->name) @@ -1128,6 +1136,47 @@ static struct device *next_device(struct klist_iter *i) return dev; } +/** + * device_get_nodename - path of device node file + * @dev: device + * @tmp: possibly allocated string + * + * Return the relative path of a possible device node. + * Non-default names may need to allocate memory to compose + * a name. This memory is returned in tmp and needs to be + * freed by the caller. + */ +const char *device_get_nodename(struct device *dev, const char **tmp) +{ + char *s; + + *tmp = NULL; + + /* the device type may provide a specific name */ + if (dev->type && dev->type->nodename) + *tmp = dev->type->nodename(dev); + if (*tmp) + return *tmp; + + /* the class may provide a specific name */ + if (dev->class && dev->class->nodename) + *tmp = dev->class->nodename(dev); + if (*tmp) + return *tmp; + + /* return name without allocation, tmp == NULL */ + if (strchr(dev_name(dev), '!') == NULL) + return dev_name(dev); + + /* replace '!' in the name with '/' */ + *tmp = kstrdup(dev_name(dev), GFP_KERNEL); + if (!*tmp) + return NULL; + while ((s = strchr(*tmp, '!'))) + s[0] = '/'; + return *tmp; +} + /** * device_for_each_child - device child iterator. * @parent: parent struct device.
diff --git a/include/linux/device.h b/include/linux/device.h index 4410464b134..ed4e39f2c42 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -194,6 +194,7 @@ struct class { struct kobject *dev_kobj; int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); + char *(*nodename)(struct device *dev); void (*class_release)(struct class *class); void (*dev_release)(struct device *dev); @@ -289,6 +290,7 @@ struct device_type { const char *name; struct attribute_group **groups; int (*uevent)(struct device *dev, struct kobj_uevent_env *env); + char *(*nodename)(struct device *dev); void (*release)(struct device *dev); struct dev_pm_ops *pm; @@ -488,6 +490,7 @@ extern struct device *device_find_child(struct device *dev, void *data, extern int device_rename(struct device *dev, char *new_name); extern int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); +extern const char *device_get_nodename(struct device *dev, const char **tmp); /* * Root device objects for grouping under /sys/devices -- cgit v1.2.3-70-g09d2 From d405640539555b601e52f7d18f1f0b1345d18bf5 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 30 Apr 2009 15:23:42 +0200 Subject: Driver Core: misc: add nodename support for misc devices. This adds support for misc devices to report their requested nodename to userspace. It also updates a number of misc drivers to provide the needed subdirectory and device name to be used for them. Signed-off-by: Kay Sievers Signed-off-by: Jan Blunck Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/microcode_core.c | 1 + drivers/char/hw_random/core.c | 1 + drivers/char/misc.c | 15 ++++++++++++--- drivers/md/dm-ioctl.c | 1 + drivers/net/tun.c | 1 + include/linux/miscdevice.h | 1 + 6 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9c4461501fc..9371448290a 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -236,6 +236,7 @@ static const struct file_operations microcode_fops = { static struct miscdevice microcode_dev = { .minor = MICROCODE_MINOR, .name = "microcode", + .devnode = "cpu/microcode", .fops = µcode_fops, }; diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index e5d583c84e4..fc93e2fc7c7 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -153,6 +153,7 @@ static const struct file_operations rng_chrdev_ops = { static struct miscdevice rng_miscdev = { .minor = RNG_MISCDEV_MINOR, .name = RNG_MODULE_NAME, + .devnode = "hwrng", .fops = &rng_chrdev_ops, }; diff --git a/drivers/char/misc.c b/drivers/char/misc.c index a5e0db9d766..62c99fa59e2 100644 --- a/drivers/char/misc.c +++ b/drivers/char/misc.c @@ -168,7 +168,6 @@ static const struct file_operations misc_fops = { .open = misc_open, }; - /** * misc_register - register a miscellaneous device * @misc: device structure @@ -217,8 +216,8 @@ int misc_register(struct miscdevice * misc) misc_minors[misc->minor >> 3] |= 1 << (misc->minor & 7); dev = MKDEV(MISC_MAJOR, misc->minor); - misc->this_device = device_create(misc_class, misc->parent, dev, NULL, - "%s", misc->name); + misc->this_device = device_create(misc_class, misc->parent, dev, + misc, "%s", misc->name); if (IS_ERR(misc->this_device)) { err = PTR_ERR(misc->this_device); goto out; @@ -264,6 +263,15 @@ int misc_deregister(struct miscdevice *misc) EXPORT_SYMBOL(misc_register); EXPORT_SYMBOL(misc_deregister); +static char 
*misc_nodename(struct device *dev) +{ + struct miscdevice *c = dev_get_drvdata(dev); + + if (c->devnode) + return kstrdup(c->devnode, GFP_KERNEL); + return NULL; +} + static int __init misc_init(void) { int err; @@ -279,6 +287,7 @@ static int __init misc_init(void) err = -EIO; if (register_chrdev(MISC_MAJOR,"misc",&misc_fops)) goto fail_printk; + misc_class->nodename = misc_nodename; return 0; fail_printk: diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 823ceba6efa..1128d3fba79 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1513,6 +1513,7 @@ static const struct file_operations _ctl_fops = { static struct miscdevice _dm_misc = { .minor = MISC_DYNAMIC_MINOR, .name = DM_NAME, + .devnode = "mapper/control", .fops = &_ctl_fops }; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 811d3517fce..11a0ba47b67 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1366,6 +1366,7 @@ static const struct file_operations tun_fops = { static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", + .devnode = "net/tun", .fops = &tun_fops, }; diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index beb6ec99cfe..05211774462 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -41,6 +41,7 @@ struct miscdevice { struct list_head list; struct device *parent; struct device *this_device; + const char *devnode; }; extern int misc_register(struct miscdevice * misc); -- cgit v1.2.3-70-g09d2 From f7a386c5b8ff34cd84ae922603d1c6f9d234edee Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 30 Apr 2009 15:23:42 +0200 Subject: Driver Core: usb: add nodename support for usb drivers. This adds support for USB drivers to report their requested nodename to userspace. It also updates a number of USB drivers to provide the needed subdirectory and device name to be used for them. 
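The per-driver pattern, as the conversions below show: the class driver returns the node path it wants under /dev from a nodename() callback. The driver name, fops and minor base here are hypothetical:

	static char *foo_nodename(struct device *dev)
	{
		/* request a node in the usb/ subdirectory of /dev */
		return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
	}

	static struct usb_class_driver foo_class = {
		.name		= "foo%d",		/* hypothetical */
		.nodename	= foo_nodename,
		.fops		= &foo_fops,		/* hypothetical */
		.minor_base	= 192,			/* hypothetical */
	};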
Signed-off-by: Kay Sievers Signed-off-by: Jan Blunck Signed-off-by: Greg Kroah-Hartman --- drivers/hid/usbhid/hiddev.c | 7 ++++++- drivers/media/video/dabusb.c | 6 ++++++ drivers/usb/class/usblp.c | 6 ++++++ drivers/usb/core/file.c | 14 +++++++++++++- drivers/usb/core/usb.c | 11 +++++++++++ drivers/usb/misc/iowarrior.c | 6 ++++++ drivers/usb/misc/legousbtower.c | 6 ++++++ include/linux/usb.h | 3 +++ 8 files changed, 57 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index e9b436d2d94..9e9421525fb 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -850,8 +850,14 @@ static const struct file_operations hiddev_fops = { #endif }; +static char *hiddev_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); +} + static struct usb_class_driver hiddev_class = { .name = "hiddev%d", + .nodename = hiddev_nodename, .fops = &hiddev_fops, .minor_base = HIDDEV_MINOR_BASE, }; @@ -955,7 +961,6 @@ static int hiddev_usbd_probe(struct usb_interface *intf, return -ENODEV; } - static /* const */ struct usb_driver hiddev_driver = { .name = "hiddev", .probe = hiddev_usbd_probe, diff --git a/drivers/media/video/dabusb.c b/drivers/media/video/dabusb.c index ba3709bec3f..ec2f45dde16 100644 --- a/drivers/media/video/dabusb.c +++ b/drivers/media/video/dabusb.c @@ -747,8 +747,14 @@ static const struct file_operations dabusb_fops = .release = dabusb_release, }; +static char *dabusb_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); +} + static struct usb_class_driver dabusb_class = { .name = "dabusb%d", + .nodename = dabusb_nodename, .fops = &dabusb_fops, .minor_base = DABUSB_MINOR, }; diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index d2747a49b97..26c09f0257d 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -1057,8 +1057,14 @@ static const struct file_operations usblp_fops = { .release = usblp_release, }; +static char *usblp_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); +} + static struct usb_class_driver usblp_class = { .name = "lp%d", + .nodename = usblp_nodename, .fops = &usblp_fops, .minor_base = USBLP_MINOR_BASE, }; diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c index 997e659ff69..5cef88929b3 100644 --- a/drivers/usb/core/file.c +++ b/drivers/usb/core/file.c @@ -67,6 +67,16 @@ static struct usb_class { struct class *class; } *usb_class; +static char *usb_nodename(struct device *dev) +{ + struct usb_class_driver *drv; + + drv = dev_get_drvdata(dev); + if (!drv || !drv->nodename) + return NULL; + return drv->nodename(dev); +} + static int init_usb_class(void) { int result = 0; @@ -90,6 +100,8 @@ static int init_usb_class(void) kfree(usb_class); usb_class = NULL; + goto exit; } + usb_class->class->nodename = usb_nodename; exit: return result; @@ -198,7 +209,7 @@ int usb_register_dev(struct usb_interface *intf, else temp = name; intf->usb_dev = device_create(usb_class->class, &intf->dev, - MKDEV(USB_MAJOR, minor), NULL, + MKDEV(USB_MAJOR, minor), class_driver, "%s", temp); if (IS_ERR(intf->usb_dev)) { down_write(&minor_rwsem); diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 7eee400d3e3..927a27dd2f8 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -305,10 +305,21 @@ static struct dev_pm_ops usb_device_pm_ops = { #endif /* CONFIG_PM */ + +static char *usb_nodename(struct device *dev) +{ + struct usb_device *usb_dev;
+ + usb_dev = to_usb_device(dev); + return kasprintf(GFP_KERNEL, "bus/usb/%03d/%03d", + usb_dev->bus->busnum, usb_dev->devnum); +} + struct device_type usb_device_type = { .name = "usb_device", .release = usb_release_dev, .uevent = usb_dev_uevent, + .nodename = usb_nodename, .pm = &usb_device_pm_ops, }; diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index a4ef77ef917..3c5fe5cee05 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -726,12 +726,18 @@ static const struct file_operations iowarrior_fops = { .poll = iowarrior_poll, }; +static char *iowarrior_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); +} + /* * usb class driver info in order to get a minor number from the usb core, * and to have the device registered with devfs and the driver core */ static struct usb_class_driver iowarrior_class = { .name = "iowarrior%d", + .nodename = iowarrior_nodename, .fops = &iowarrior_fops, .minor_base = IOWARRIOR_MINOR_BASE, }; diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c index ab0f3226158..c1e2433f640 100644 --- a/drivers/usb/misc/legousbtower.c +++ b/drivers/usb/misc/legousbtower.c @@ -266,12 +266,18 @@ static const struct file_operations tower_fops = { .llseek = tower_llseek, }; +static char *legousbtower_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); +} + /* * usb class driver info in order to get a minor number from the usb core, * and to have the device registered with the driver core */ static struct usb_class_driver tower_class = { .name = "legousbtower%d", + .nodename = legousbtower_nodename, .fops = &tower_fops, .minor_base = LEGO_USB_TOWER_MINOR_BASE, }; diff --git a/include/linux/usb.h b/include/linux/usb.h index 3aa2cd1f8d0..34cdfcac455 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -869,6 +869,8 @@ struct usb_driver { * struct usb_device_driver - identifies USB device driver to usbcore * @name: The driver name should be unique among USB drivers, * and should normally be the same as the module name. + * @nodename: Callback to provide a naming hint for a possible + * device node to create. * @probe: Called to see if the driver is willing to manage a particular * device. If it is, probe returns zero and uses dev_set_drvdata() * to associate driver-specific data with the device. If unwilling @@ -912,6 +914,7 @@ extern struct bus_type usb_bus_type; */ struct usb_class_driver { char *name; + char *(*nodename)(struct device *dev); const struct file_operations *fops; int minor_base; }; -- cgit v1.2.3-70-g09d2 From b03f38b685e2e1db174fb8982930e789a516f414 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 30 Apr 2009 15:23:42 +0200 Subject: Driver Core: block: add nodename support for block drivers. This adds support for block drivers to report their requested nodename to userspace. It also updates a number of block drivers to provide the needed subdirectory and device name to be used for them. 
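As an illustrative sketch (not part of this patch; the "foo" names are hypothetical), a block driver would wire up the new gendisk callback roughly like this:

static char *foo_disk_nodename(struct gendisk *gd)
{
	/* request the node at foo/<disk_name> instead of the top level */
	return kasprintf(GFP_KERNEL, "foo/%s", gd->disk_name);
}

	/* in the driver's setup path, after the disk is allocated: */
	disk->nodename = foo_disk_nodename;

As with the class and device_type callbacks, the returned string is heap-allocated and freed by the caller, matching the pktcdvd change below.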
Signed-off-by: Kay Sievers Signed-off-by: Jan Blunck Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 10 ++++++++++ drivers/block/pktcdvd.c | 7 +++++++ include/linux/genhd.h | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index fe7ccc0a618..f4c64c2b303 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -996,10 +996,20 @@ struct class block_class = { .name = "block", }; +static char *block_nodename(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->nodename) + return disk->nodename(disk); + return NULL; +} + static struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, + .nodename = block_nodename, }; #ifdef CONFIG_PROC_FS diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d57f1175948..37e0f81cada 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2855,6 +2855,11 @@ static struct block_device_operations pktcdvd_ops = { .media_changed = pkt_media_changed, }; +static char *pktcdvd_nodename(struct gendisk *gd) +{ + return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name); +} + /* * Set up mapping from pktcdvd device to CD-ROM device. */ @@ -2907,6 +2912,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->fops = &pktcdvd_ops; disk->flags = GENHD_FL_REMOVABLE; strcpy(disk->disk_name, pd->name); + disk->nodename = pktcdvd_nodename; disk->private_data = pd; disk->queue = blk_alloc_queue(GFP_KERNEL); if (!disk->queue) @@ -3062,6 +3068,7 @@ static const struct file_operations pkt_ctl_fops = { static struct miscdevice pkt_misc = { .minor = MISC_DYNAMIC_MINOR, .name = DRIVER_NAME, + .devnode = "pktcdvd/control", .fops = &pkt_ctl_fops }; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7cbd38d363a..45fc320a53c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -142,7 +142,7 @@ struct gendisk { * disks that can't be partitioned. */ char disk_name[DISK_NAME_LEN]; /* name of major driver */ - + char *(*nodename)(struct gendisk *gd); /* Array of pointers to partitions indexed by partno. * Protected with matching bdev lock but stat and other * non-critical accesses use RCU. Always access through -- cgit v1.2.3-70-g09d2 From 4b9d0d3b81ec0900971cbe8aba8ba0ae9c08a087 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 30 Apr 2009 14:43:31 -0700 Subject: eisa: remove driver_data direct access of struct device In the near future, the driver core will no longer allow direct access to the driver_data pointer in struct device. Instead, the functions dev_get_drvdata() and dev_set_drvdata() should be used. These functions have been around since the beginning, so they are backward compatible with all older kernel versions.
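A minimal before/after sketch of the conversion; the "foo" structure and functions are hypothetical, not taken from the drivers below:

struct foo { /* driver-private state */ int dummy; };

static int foo_probe(struct device *dev)
{
	struct foo *priv = kzalloc(sizeof(*priv), GFP_KERNEL);

	if (!priv)
		return -ENOMEM;
	/* was: dev->driver_data = priv; */
	dev_set_drvdata(dev, priv);
	return 0;
}

static void foo_remove(struct device *dev)
{
	/* was: struct foo *priv = dev->driver_data; */
	struct foo *priv = dev_get_drvdata(dev);

	kfree(priv);
}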
Signed-off-by: Greg Kroah-Hartman --- drivers/eisa/pci_eisa.c | 2 +- drivers/eisa/virtual_root.c | 2 +- include/linux/eisa.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/eisa/pci_eisa.c b/drivers/eisa/pci_eisa.c index 74edb1d0110..0dd0f633b18 100644 --- a/drivers/eisa/pci_eisa.c +++ b/drivers/eisa/pci_eisa.c @@ -31,11 +31,11 @@ static int __init pci_eisa_init(struct pci_dev *pdev, } pci_eisa_root.dev = &pdev->dev; - pci_eisa_root.dev->driver_data = &pci_eisa_root; pci_eisa_root.res = pdev->bus->resource[0]; pci_eisa_root.bus_base_addr = pdev->bus->resource[0]->start; pci_eisa_root.slots = EISA_MAX_SLOTS; pci_eisa_root.dma_mask = pdev->dma_mask; + dev_set_drvdata(pci_eisa_root.dev, &pci_eisa_root); if (eisa_root_register (&pci_eisa_root)) { printk (KERN_ERR "pci_eisa : Could not register EISA root\n"); diff --git a/drivers/eisa/virtual_root.c b/drivers/eisa/virtual_root.c index 3074879f231..535e4f9c83f 100644 --- a/drivers/eisa/virtual_root.c +++ b/drivers/eisa/virtual_root.c @@ -57,7 +57,7 @@ static int __init virtual_eisa_root_init (void) eisa_bus_root.force_probe = force_probe; - eisa_root_dev.dev.driver_data = &eisa_bus_root; + dev_set_drvdata(&eisa_root_dev.dev, &eisa_bus_root); if (eisa_root_register (&eisa_bus_root)) { /* A real bridge may have been registered before diff --git a/include/linux/eisa.h b/include/linux/eisa.h index e61c0be2a45..6925249a5ac 100644 --- a/include/linux/eisa.h +++ b/include/linux/eisa.h @@ -78,12 +78,12 @@ static inline void eisa_driver_unregister (struct eisa_driver *edrv) { } /* Mimics pci.h... */ static inline void *eisa_get_drvdata (struct eisa_device *edev) { - return edev->dev.driver_data; + return dev_get_drvdata(&edev->dev); } static inline void eisa_set_drvdata (struct eisa_device *edev, void *data) { - edev->dev.driver_data = data; + dev_set_drvdata(&edev->dev, data); } /* The EISA root device. There's rumours about machines with multiple -- cgit v1.2.3-70-g09d2 From 156f5a7801195fa2ce44aeeb62d6cf8468f3332a Mon Sep 17 00:00:00 2001 From: GeunSik Lim Date: Tue, 2 Jun 2009 15:01:37 +0900 Subject: debugfs: Fix terminology inconsistency of dir name to mount debugfs filesystem. Developers variously use "/debug/", "/debugfs/", and "/sys/kernel/debug/" as the directory at which to mount the debugfs filesystem for ftrace, following Documentation/trace/ftrace.txt, and all three directory names appear across the kernel source (ftrace, DRM, wireless, Documentation, the sky2 network driver). debugfs is the debug filesystem introduced by Greg Kroah-Hartman to make debugging easy, and "/sys/kernel/debug/" is the canonical directory name for mounting it. - debugfs related reference: http://lwn.net/Articles/334546/ Fix the inconsistent directory names used to mount the debugfs filesystem. * From Steven Rostedt - find_debugfs() and tracing_files() in this patch.
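For reference, the canonical mount can also be performed programmatically; a minimal userspace sketch using mount(2), equivalent to "mount -t debugfs nodev /sys/kernel/debug" (error handling trimmed to the essentials):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* mount debugfs at the canonical location; requires root */
	if (mount("nodev", "/sys/kernel/debug", "debugfs", 0, NULL) < 0) {
		perror("mount debugfs");
		return 1;
	}
	return 0;
}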
Signed-off-by: GeunSik Lim Acked-by : Inaky Perez-Gonzalez Reviewed-by : Steven Rostedt Reviewed-by : James Smart CC: Jiri Kosina CC: David Airlie CC: Peter Osterlund CC: Ananth N Mavinakayanahalli CC: Anil S Keshavamurthy CC: Masami Hiramatsu Signed-off-by: Greg Kroah-Hartman --- Documentation/DocBook/debugobjects.tmpl | 2 +- Documentation/cdrom/packet-writing.txt | 2 +- Documentation/fault-injection/fault-injection.txt | 70 +++---- Documentation/kprobes.txt | 6 +- Documentation/trace/ftrace.txt | 233 +++++++++++++--------- Documentation/trace/mmiotrace.txt | 26 +-- drivers/block/pktcdvd.c | 2 +- drivers/gpu/drm/drm_debugfs.c | 12 +- drivers/gpu/drm/drm_drv.c | 2 +- drivers/gpu/drm/drm_stub.c | 2 +- drivers/net/Kconfig | 4 +- drivers/net/wimax/i2400m/i2400m.h | 2 +- drivers/net/wireless/ath/ath5k/Kconfig | 5 +- drivers/net/wireless/libertas/README | 12 +- drivers/scsi/lpfc/lpfc_debugfs.c | 3 +- include/linux/kernel.h | 2 +- include/linux/tracepoint.h | 4 +- kernel/trace/Kconfig | 10 +- kernel/trace/trace.c | 23 +-- scripts/tracing/draw_functrace.py | 7 +- 20 files changed, 238 insertions(+), 191 deletions(-) (limited to 'include/linux') diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl index 7f5f218015f..08ff908aa7a 100644 --- a/Documentation/DocBook/debugobjects.tmpl +++ b/Documentation/DocBook/debugobjects.tmpl @@ -106,7 +106,7 @@ number of errors are printk'ed including a full stack trace. - The statistics are available via debugfs/debug_objects/stats. + The statistics are available via /sys/kernel/debug/debug_objects/stats. They provide information about the number of warnings and the number of successful fixups along with information about the usage of the internal tracking objects and the state of the diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.txt index cf1f8126991..1c407778c8b 100644 --- a/Documentation/cdrom/packet-writing.txt +++ b/Documentation/cdrom/packet-writing.txt @@ -117,7 +117,7 @@ Using the pktcdvd debugfs interface To read pktcdvd device infos in human readable form, do: - # cat /debug/pktcdvd/pktcdvd[0-7]/info + # cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info For a description of the debugfs interface look into the file: diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 4bc374a1434..07930564079 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -29,16 +29,16 @@ o debugfs entries fault-inject-debugfs kernel module provides some debugfs entries for runtime configuration of fault-injection capabilities. -- /debug/fail*/probability: +- /sys/kernel/debug/fail*/probability: likelihood of failure injection, in percent. Format: Note that one-failure-per-hundred is a very high error rate for some testcases. Consider setting probability=100 and configure - /debug/fail*/interval for such testcases. + /sys/kernel/debug/fail*/interval for such testcases. -- /debug/fail*/interval: +- /sys/kernel/debug/fail*/interval: specifies the interval between failures, for calls to should_fail() that pass all the other tests. @@ -46,18 +46,18 @@ configuration of fault-injection capabilities. Note that if you enable this, by setting interval>1, you will probably want to set probability=100. -- /debug/fail*/times: +- /sys/kernel/debug/fail*/times: specifies how many times failures may happen at most. A value of -1 means "no limit". 
-- /debug/fail*/space: +- /sys/kernel/debug/fail*/space: specifies an initial resource "budget", decremented by "size" on each call to should_fail(,size). Failure injection is suppressed until "space" reaches zero. -- /debug/fail*/verbose +- /sys/kernel/debug/fail*/verbose Format: { 0 | 1 | 2 } specifies the verbosity of the messages when failure is @@ -65,17 +65,17 @@ configuration of fault-injection capabilities. log line per failure; '2' will print a call trace too -- useful to debug the problems revealed by fault injection. -- /debug/fail*/task-filter: +- /sys/kernel/debug/fail*/task-filter: Format: { 'Y' | 'N' } A value of 'N' disables filtering by process (default). Any positive value limits failures to only processes indicated by /proc//make-it-fail==1. -- /debug/fail*/require-start: -- /debug/fail*/require-end: -- /debug/fail*/reject-start: -- /debug/fail*/reject-end: +- /sys/kernel/debug/fail*/require-start: +- /sys/kernel/debug/fail*/require-end: +- /sys/kernel/debug/fail*/reject-start: +- /sys/kernel/debug/fail*/reject-end: specifies the range of virtual addresses tested during stacktrace walking. Failure is injected only if some caller @@ -84,26 +84,26 @@ configuration of fault-injection capabilities. Default required range is [0,ULONG_MAX) (whole of virtual address space). Default rejected range is [0,0). -- /debug/fail*/stacktrace-depth: +- /sys/kernel/debug/fail*/stacktrace-depth: specifies the maximum stacktrace depth walked during search for a caller within [require-start,require-end) OR [reject-start,reject-end). -- /debug/fail_page_alloc/ignore-gfp-highmem: +- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem: Format: { 'Y' | 'N' } default is 'N', setting it to 'Y' won't inject failures into highmem/user allocations. -- /debug/failslab/ignore-gfp-wait: -- /debug/fail_page_alloc/ignore-gfp-wait: +- /sys/kernel/debug/failslab/ignore-gfp-wait: +- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait: Format: { 'Y' | 'N' } default is 'N', setting it to 'Y' will inject failures only into non-sleep allocations (GFP_ATOMIC allocations). -- /debug/fail_page_alloc/min-order: +- /sys/kernel/debug/fail_page_alloc/min-order: specifies the minimum page allocation order to be injected failures. 
@@ -166,13 +166,13 @@ o Inject slab allocation failures into module init/exit code #!/bin/bash FAILTYPE=failslab -echo Y > /debug/$FAILTYPE/task-filter -echo 10 > /debug/$FAILTYPE/probability -echo 100 > /debug/$FAILTYPE/interval -echo -1 > /debug/$FAILTYPE/times -echo 0 > /debug/$FAILTYPE/space -echo 2 > /debug/$FAILTYPE/verbose -echo 1 > /debug/$FAILTYPE/ignore-gfp-wait +echo Y > /sys/kernel/debug/$FAILTYPE/task-filter +echo 10 > /sys/kernel/debug/$FAILTYPE/probability +echo 100 > /sys/kernel/debug/$FAILTYPE/interval +echo -1 > /sys/kernel/debug/$FAILTYPE/times +echo 0 > /sys/kernel/debug/$FAILTYPE/space +echo 2 > /sys/kernel/debug/$FAILTYPE/verbose +echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait faulty_system() { @@ -217,20 +217,20 @@ then exit 1 fi -cat /sys/module/$module/sections/.text > /debug/$FAILTYPE/require-start -cat /sys/module/$module/sections/.data > /debug/$FAILTYPE/require-end +cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start +cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end -echo N > /debug/$FAILTYPE/task-filter -echo 10 > /debug/$FAILTYPE/probability -echo 100 > /debug/$FAILTYPE/interval -echo -1 > /debug/$FAILTYPE/times -echo 0 > /debug/$FAILTYPE/space -echo 2 > /debug/$FAILTYPE/verbose -echo 1 > /debug/$FAILTYPE/ignore-gfp-wait -echo 1 > /debug/$FAILTYPE/ignore-gfp-highmem -echo 10 > /debug/$FAILTYPE/stacktrace-depth +echo N > /sys/kernel/debug/$FAILTYPE/task-filter +echo 10 > /sys/kernel/debug/$FAILTYPE/probability +echo 100 > /sys/kernel/debug/$FAILTYPE/interval +echo -1 > /sys/kernel/debug/$FAILTYPE/times +echo 0 > /sys/kernel/debug/$FAILTYPE/space +echo 2 > /sys/kernel/debug/$FAILTYPE/verbose +echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait +echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem +echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth -trap "echo 0 > /debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT +trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT echo "Injecting errors into the module $module... (interrupt to stop)" sleep 1000000 diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 1e7a769a10f..053037a1fe6 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -507,9 +507,9 @@ http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115) Appendix A: The kprobes debugfs interface With recent kernels (> 2.6.20) the list of registered kprobes is visible -under the /debug/kprobes/ directory (assuming debugfs is mounted at /debug). +under the /sys/kernel/debug/kprobes/ directory (assuming debugfs is mounted at /sys/kernel/debug). -/debug/kprobes/list: Lists all registered probes on the system +/sys/kernel/debug/kprobes/list: Lists all registered probes on the system c015d71a k vfs_read+0x0 c011a316 j do_fork+0x0 @@ -525,7 +525,7 @@ virtual addresses that correspond to modules that've been unloaded), such probes are marked with [GONE]. If the probe is temporarily disabled, such probes are marked with [DISABLED]. -/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. +/sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. Provides a knob to globally and forcibly turn registered kprobes ON or OFF. By default, all kprobes are enabled.
By echoing "0" to this file, all diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 7bd27f0e288..a39b3c749de 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -7,7 +7,6 @@ Copyright 2008 Red Hat Inc. (dual licensed under the GPL v2) Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton, John Kacur, and David Teigland. - Written for: 2.6.28-rc2 Introduction @@ -33,13 +32,26 @@ The File System Ftrace uses the debugfs file system to hold the control files as well as the files to display output. -To mount the debugfs system: +When debugfs is configured into the kernel (which selecting any ftrace +option will do) the directory /sys/kernel/debug will be created. To mount +this directory, you can add to your /etc/fstab file: + + debugfs /sys/kernel/debug debugfs defaults 0 0 + +Or you can mount it at run time with: + + mount -t debugfs nodev /sys/kernel/debug - # mkdir /debug - # mount -t debugfs nodev /debug +For quicker access to that directory you may want to make a soft link to +it: -( Note: it is more common to mount at /sys/kernel/debug, but for - simplicity this document will use /debug) + ln -s /sys/kernel/debug /debug + +Any selected ftrace option will also create a directory called tracing +within the debugfs. The rest of the document will assume that you are in +the ftrace directory (cd /sys/kernel/debug/tracing) and will only concentrate +on the files within that directory and not distract from the content with +the extended "/sys/kernel/debug/tracing" path name. That's it! (assuming that you have ftrace configured into your kernel) @@ -389,18 +401,18 @@ trace_options The trace_options file is used to control what gets printed in the trace output. To see what is available, simply cat the file: - cat /debug/tracing/trace_options + cat trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj To disable one of the options, echo in the option prepended with "no". - echo noprint-parent > /debug/tracing/trace_options + echo noprint-parent > trace_options To enable an option, leave off the "no". - echo sym-offset > /debug/tracing/trace_options + echo sym-offset > trace_options Here are the available options: @@ -476,11 +488,11 @@ sched_switch This tracer simply records schedule switches. Here is an example of how to use it. - # echo sched_switch > /debug/tracing/current_tracer - # echo 1 > /debug/tracing/tracing_enabled + # echo sched_switch > current_tracer + # echo 1 > tracing_enabled # sleep 1 - # echo 0 > /debug/tracing/tracing_enabled - # cat /debug/tracing/trace + # echo 0 > tracing_enabled + # cat trace # tracer: sched_switch # @@ -583,13 +595,13 @@ new trace is saved. To reset the maximum, echo 0 into tracing_max_latency. Here is an example: - # echo irqsoff > /debug/tracing/current_tracer - # echo 0 > /debug/tracing/tracing_max_latency - # echo 1 > /debug/tracing/tracing_enabled + # echo irqsoff > current_tracer + # echo 0 > tracing_max_latency + # echo 1 > tracing_enabled # ls -ltr [...] - # echo 0 > /debug/tracing/tracing_enabled - # cat /debug/tracing/latency_trace + # echo 0 > tracing_enabled + # cat latency_trace # tracer: irqsoff # irqsoff latency trace v1.1.5 on 2.6.26 @@ -690,13 +702,13 @@ Like the irqsoff tracer, it records the maximum latency for which preemption was disabled. The control of preemptoff tracer is much like the irqsoff tracer. 
- # echo preemptoff > /debug/tracing/current_tracer - # echo 0 > /debug/tracing/tracing_max_latency - # echo 1 > /debug/tracing/tracing_enabled + # echo preemptoff > current_tracer + # echo 0 > tracing_max_latency + # echo 1 > tracing_enabled # ls -ltr [...] - # echo 0 > /debug/tracing/tracing_enabled - # cat /debug/tracing/latency_trace + # echo 0 > tracing_enabled + # cat latency_trace # tracer: preemptoff # preemptoff latency trace v1.1.5 on 2.6.26-rc8 @@ -837,13 +849,13 @@ tracer. Again, using this trace is much like the irqsoff and preemptoff tracers. - # echo preemptirqsoff > /debug/tracing/current_tracer - # echo 0 > /debug/tracing/tracing_max_latency - # echo 1 > /debug/tracing/tracing_enabled + # echo preemptirqsoff > current_tracer + # echo 0 > tracing_max_latency + # echo 1 > tracing_enabled # ls -ltr [...] - # echo 0 > /debug/tracing/tracing_enabled - # cat /debug/tracing/latency_trace + # echo 0 > tracing_enabled + # cat latency_trace # tracer: preemptirqsoff # preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 @@ -999,12 +1011,12 @@ slightly differently than we did with the previous tracers. Instead of performing an 'ls', we will run 'sleep 1' under 'chrt' which changes the priority of the task. - # echo wakeup > /debug/tracing/current_tracer - # echo 0 > /debug/tracing/tracing_max_latency - # echo 1 > /debug/tracing/tracing_enabled + # echo wakeup > current_tracer + # echo 0 > tracing_max_latency + # echo 1 > tracing_enabled # chrt -f 5 sleep 1 - # echo 0 > /debug/tracing/tracing_enabled - # cat /debug/tracing/latency_trace + # echo 0 > tracing_enabled + # cat latency_trace # tracer: wakeup # wakeup latency trace v1.1.5 on 2.6.26-rc8 @@ -1114,11 +1126,11 @@ can be done from the debug file system. Make sure the ftrace_enabled is set; otherwise this tracer is a nop. # sysctl kernel.ftrace_enabled=1 - # echo function > /debug/tracing/current_tracer - # echo 1 > /debug/tracing/tracing_enabled + # echo function > current_tracer + # echo 1 > tracing_enabled # usleep 1 - # echo 0 > /debug/tracing/tracing_enabled - # cat /debug/tracing/trace + # echo 0 > tracing_enabled + # cat trace # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION @@ -1155,7 +1167,7 @@ int trace_fd; [...] int main(int argc, char *argv[]) { [...] - trace_fd = open("/debug/tracing/tracing_enabled", O_WRONLY); + trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY); [...] if (condition_hit()) { write(trace_fd, "0", 1); @@ -1163,26 +1175,20 @@ int main(int argc, char *argv[]) { [...] } -Note: Here we hard coded the path name. The debugfs mount is not -guaranteed to be at /debug (and is more commonly at -/sys/kernel/debug). For simple one time traces, the above is -sufficent. For anything else, a search through /proc/mounts may -be needed to find where the debugfs file-system is mounted. - Single thread tracing --------------------- -By writing into /debug/tracing/set_ftrace_pid you can trace a +By writing into set_ftrace_pid you can trace a single thread. 
For example: -# cat /debug/tracing/set_ftrace_pid +# cat set_ftrace_pid no pid -# echo 3111 > /debug/tracing/set_ftrace_pid -# cat /debug/tracing/set_ftrace_pid +# echo 3111 > set_ftrace_pid +# cat set_ftrace_pid 3111 -# echo function > /debug/tracing/current_tracer -# cat /debug/tracing/trace | head +# echo function > current_tracer +# cat trace | head # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION @@ -1193,8 +1199,8 @@ no pid yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll -# echo -1 > /debug/tracing/set_ftrace_pid -# cat /debug/tracing/trace |head +# echo -1 > set_ftrace_pid +# cat trace |head # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION @@ -1216,6 +1222,51 @@ something like this simple program: #include #include +#define _STR(x) #x +#define STR(x) _STR(x) +#define MAX_PATH 256 + +const char *find_debugfs(void) +{ + static char debugfs[MAX_PATH+1]; + static int debugfs_found; + char type[100]; + FILE *fp; + + if (debugfs_found) + return debugfs; + + if ((fp = fopen("/proc/mounts","r")) == NULL) { + perror("/proc/mounts"); + return NULL; + } + + while (fscanf(fp, "%*s %" + STR(MAX_PATH) + "s %99s %*s %*d %*d\n", + debugfs, type) == 2) { + if (strcmp(type, "debugfs") == 0) + break; + } + fclose(fp); + + if (strcmp(type, "debugfs") != 0) { + fprintf(stderr, "debugfs not mounted"); + return NULL; + } + + debugfs_found = 1; + + return debugfs; +} + +const char *tracing_file(const char *file_name) +{ + static char trace_file[MAX_PATH+1]; + snprintf(trace_file, MAX_PATH, "%s/%s", find_debugfs(), file_name); + return trace_file; +} + int main (int argc, char **argv) { if (argc < 1) @@ -1226,12 +1277,12 @@ int main (int argc, char **argv) char line[64]; int s; - ffd = open("/debug/tracing/current_tracer", O_WRONLY); + ffd = open(tracing_file("current_tracer"), O_WRONLY); if (ffd < 0) exit(-1); write(ffd, "nop", 3); - fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY); + fd = open(tracing_file("set_ftrace_pid"), O_WRONLY); s = sprintf(line, "%d\n", getpid()); write(fd, line, s); @@ -1383,22 +1434,22 @@ want, depending on your needs. tracing_cpu_mask file) or you might sometimes see unordered function calls while cpu tracing switch. - hide: echo nofuncgraph-cpu > /debug/tracing/trace_options - show: echo funcgraph-cpu > /debug/tracing/trace_options + hide: echo nofuncgraph-cpu > trace_options + show: echo funcgraph-cpu > trace_options - The duration (function's time of execution) is displayed on the closing bracket line of a function or on the same line than the current function in case of a leaf one. It is default enabled. - hide: echo nofuncgraph-duration > /debug/tracing/trace_options - show: echo funcgraph-duration > /debug/tracing/trace_options + hide: echo nofuncgraph-duration > trace_options + show: echo funcgraph-duration > trace_options - The overhead field precedes the duration field in case of reached duration thresholds. - hide: echo nofuncgraph-overhead > /debug/tracing/trace_options - show: echo funcgraph-overhead > /debug/tracing/trace_options + hide: echo nofuncgraph-overhead > trace_options + show: echo funcgraph-overhead > trace_options depends on: funcgraph-duration ie: @@ -1427,8 +1478,8 @@ want, depending on your needs. - The task/pid field displays the thread cmdline and pid which executed the function. It is default disabled. 
- hide: echo nofuncgraph-proc > /debug/tracing/trace_options - show: echo funcgraph-proc > /debug/tracing/trace_options + hide: echo nofuncgraph-proc > trace_options + show: echo funcgraph-proc > trace_options ie: @@ -1451,8 +1502,8 @@ want, depending on your needs. system clock since it started. A snapshot of this time is given on each entry/exit of functions - hide: echo nofuncgraph-abstime > /debug/tracing/trace_options - show: echo funcgraph-abstime > /debug/tracing/trace_options + hide: echo nofuncgraph-abstime > trace_options + show: echo funcgraph-abstime > trace_options ie: @@ -1549,7 +1600,7 @@ listed in: available_filter_functions - # cat /debug/tracing/available_filter_functions + # cat available_filter_functions put_prev_task_idle kmem_cache_create pick_next_task_rt @@ -1561,12 +1612,12 @@ mutex_lock If I am only interested in sys_nanosleep and hrtimer_interrupt: # echo sys_nanosleep hrtimer_interrupt \ - > /debug/tracing/set_ftrace_filter - # echo ftrace > /debug/tracing/current_tracer - # echo 1 > /debug/tracing/tracing_enabled + > set_ftrace_filter + # echo ftrace > current_tracer + # echo 1 > tracing_enabled # usleep 1 - # echo 0 > /debug/tracing/tracing_enabled - # cat /debug/tracing/trace + # echo 0 > tracing_enabled + # cat trace # tracer: ftrace # # TASK-PID CPU# TIMESTAMP FUNCTION @@ -1577,7 +1628,7 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt: To see which functions are being traced, you can cat the file: - # cat /debug/tracing/set_ftrace_filter + # cat set_ftrace_filter hrtimer_interrupt sys_nanosleep @@ -1597,7 +1648,7 @@ Note: It is better to use quotes to enclose the wild cards, otherwise the shell may expand the parameters into names of files in the local directory. - # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter + # echo 'hrtimer_*' > set_ftrace_filter Produces: @@ -1618,7 +1669,7 @@ Produces: Notice that we lost the sys_nanosleep. - # cat /debug/tracing/set_ftrace_filter + # cat set_ftrace_filter hrtimer_run_queues hrtimer_run_pending hrtimer_init @@ -1644,17 +1695,17 @@ To append to the filters, use '>>' To clear out a filter so that all functions will be recorded again: - # echo > /debug/tracing/set_ftrace_filter - # cat /debug/tracing/set_ftrace_filter + # echo > set_ftrace_filter + # cat set_ftrace_filter # Again, now we want to append. - # echo sys_nanosleep > /debug/tracing/set_ftrace_filter - # cat /debug/tracing/set_ftrace_filter + # echo sys_nanosleep > set_ftrace_filter + # cat set_ftrace_filter sys_nanosleep - # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter - # cat /debug/tracing/set_ftrace_filter + # echo 'hrtimer_*' >> set_ftrace_filter + # cat set_ftrace_filter hrtimer_run_queues hrtimer_run_pending hrtimer_init @@ -1677,7 +1728,7 @@ hrtimer_init_sleeper The set_ftrace_notrace prevents those functions from being traced. - # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace + # echo '*preempt*' '*lock*' > set_ftrace_notrace Produces: @@ -1767,13 +1818,13 @@ the effect on the tracing is different. Every read from trace_pipe is consumed. This means that subsequent reads will be different. The trace is live. 
- # echo function > /debug/tracing/current_tracer - # cat /debug/tracing/trace_pipe > /tmp/trace.out & + # echo function > current_tracer + # cat trace_pipe > /tmp/trace.out & [1] 4153 - # echo 1 > /debug/tracing/tracing_enabled + # echo 1 > tracing_enabled # usleep 1 - # echo 0 > /debug/tracing/tracing_enabled - # cat /debug/tracing/trace + # echo 0 > tracing_enabled + # cat trace # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION @@ -1809,7 +1860,7 @@ number listed is the number of entries that can be recorded per CPU. To know the full size, multiply the number of possible CPUS with the number of entries. - # cat /debug/tracing/buffer_size_kb + # cat buffer_size_kb 1408 (units kilobytes) Note, to modify this, you must have tracing completely disabled. @@ -1817,18 +1868,18 @@ To do that, echo "nop" into the current_tracer. If the current_tracer is not set to "nop", an EINVAL error will be returned. - # echo nop > /debug/tracing/current_tracer - # echo 10000 > /debug/tracing/buffer_size_kb - # cat /debug/tracing/buffer_size_kb + # echo nop > current_tracer + # echo 10000 > buffer_size_kb + # cat buffer_size_kb 10000 (units kilobytes) The number of pages which will be allocated is limited to a percentage of available memory. Allocating too much will produce an error. - # echo 1000000000000 > /debug/tracing/buffer_size_kb + # echo 1000000000000 > buffer_size_kb -bash: echo: write error: Cannot allocate memory - # cat /debug/tracing/buffer_size_kb + # cat buffer_size_kb 85 ----------- diff --git a/Documentation/trace/mmiotrace.txt b/Documentation/trace/mmiotrace.txt index 5731c67abc5..162effbfbde 100644 --- a/Documentation/trace/mmiotrace.txt +++ b/Documentation/trace/mmiotrace.txt @@ -32,41 +32,41 @@ is no way to automatically detect if you are losing events due to CPUs racing. Usage Quick Reference --------------------- -$ mount -t debugfs debugfs /debug -$ echo mmiotrace > /debug/tracing/current_tracer -$ cat /debug/tracing/trace_pipe > mydump.txt & +$ mount -t debugfs debugfs /sys/kernel/debug +$ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer +$ cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt & Start X or whatever. -$ echo "X is up" > /debug/tracing/trace_marker -$ echo nop > /debug/tracing/current_tracer +$ echo "X is up" > /sys/kernel/debug/tracing/trace_marker +$ echo nop > /sys/kernel/debug/tracing/current_tracer Check for lost events. Usage ----- -Make sure debugfs is mounted to /debug. If not, (requires root privileges) -$ mount -t debugfs debugfs /debug +Make sure debugfs is mounted to /sys/kernel/debug. If not, (requires root privileges) +$ mount -t debugfs debugfs /sys/kernel/debug Check that the driver you are about to trace is not loaded. Activate mmiotrace (requires root privileges): -$ echo mmiotrace > /debug/tracing/current_tracer +$ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer Start storing the trace: -$ cat /debug/tracing/trace_pipe > mydump.txt & +$ cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt & The 'cat' process should stay running (sleeping) in the background. Load the driver you want to trace and use it. Mmiotrace will only catch MMIO accesses to areas that are ioremapped while mmiotrace is active. During tracing you can place comments (markers) into the trace by -$ echo "X is up" > /debug/tracing/trace_marker +$ echo "X is up" > /sys/kernel/debug/tracing/trace_marker This makes it easier to see which part of the (huge) trace corresponds to which action. It is recommended to place descriptive markers about what you do. 
Shut down mmiotrace (requires root privileges): -$ echo nop > /debug/tracing/current_tracer +$ echo nop > /sys/kernel/debug/tracing/current_tracer The 'cat' process exits. If it does not, kill it by issuing 'fg' command and pressing ctrl+c. @@ -78,10 +78,10 @@ to view your kernel log and look for "mmiotrace has lost events" warning. If events were lost, the trace is incomplete. You should enlarge the buffers and try again. Buffers are enlarged by first seeing how large the current buffers are: -$ cat /debug/tracing/buffer_size_kb +$ cat /sys/kernel/debug/tracing/buffer_size_kb gives you a number. Approximately double this number and write it back, for instance: -$ echo 128000 > /debug/tracing/buffer_size_kb +$ echo 128000 > /sys/kernel/debug/tracing/buffer_size_kb Then start again from the top. If you are doing a trace for a driver project, e.g. Nouveau, you should also diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 37e0f81cada..83650e00632 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -430,7 +430,7 @@ static void pkt_sysfs_cleanup(void) /******************************************************************** entries in debugfs - /debugfs/pktcdvd[0-7]/ + /sys/kernel/debug/pktcdvd[0-7]/ info *******************************************************************/ diff --git a/drivers/gpu/drm/drm_debugfs.c b/drivers/gpu/drm/drm_debugfs.c index c77c6c6d9d2..6ce0e2667a8 100644 --- a/drivers/gpu/drm/drm_debugfs.c +++ b/drivers/gpu/drm/drm_debugfs.c @@ -105,7 +105,7 @@ int drm_debugfs_create_files(struct drm_info_list *files, int count, ent = debugfs_create_file(files[i].name, S_IFREG | S_IRUGO, root, tmp, &drm_debugfs_fops); if (!ent) { - DRM_ERROR("Cannot create /debugfs/dri/%s/%s\n", + DRM_ERROR("Cannot create /sys/kernel/debug/dri/%s/%s\n", name, files[i].name); drm_free(tmp, sizeof(struct drm_info_node), _DRM_DRIVER); @@ -133,9 +133,9 @@ EXPORT_SYMBOL(drm_debugfs_create_files); * \param minor device minor number * \param root DRI debugfs dir entry. * - * Create the DRI debugfs root entry "/debugfs/dri", the device debugfs root entry - * "/debugfs/dri/%minor%/", and each entry in debugfs_list as - * "/debugfs/dri/%minor%/%name%". + * Create the DRI debugfs root entry "/sys/kernel/debug/dri", the device debugfs root entry + * "/sys/kernel/debug/dri/%minor%/", and each entry in debugfs_list as + * "/sys/kernel/debug/dri/%minor%/%name%". 
*/ int drm_debugfs_init(struct drm_minor *minor, int minor_id, struct dentry *root) @@ -148,7 +148,7 @@ int drm_debugfs_init(struct drm_minor *minor, int minor_id, sprintf(name, "%d", minor_id); minor->debugfs_root = debugfs_create_dir(name, root); if (!minor->debugfs_root) { - DRM_ERROR("Cannot create /debugfs/dri/%s\n", name); + DRM_ERROR("Cannot create /sys/kernel/debug/dri/%s\n", name); return -1; } @@ -165,7 +165,7 @@ int drm_debugfs_init(struct drm_minor *minor, int minor_id, ret = dev->driver->debugfs_init(minor); if (ret) { DRM_ERROR("DRM: Driver failed to initialize " - "/debugfs/dri.\n"); + "/sys/kernel/debug/dri.\n"); return ret; } } diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 019b7c57823..1bf7efd8d33 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -339,7 +339,7 @@ static int __init drm_core_init(void) drm_debugfs_root = debugfs_create_dir("dri", NULL); if (!drm_debugfs_root) { - DRM_ERROR("Cannot create /debugfs/dri\n"); + DRM_ERROR("Cannot create /sys/kernel/debug/dri\n"); ret = -1; goto err_p3; } diff --git a/drivers/gpu/drm/drm_stub.c b/drivers/gpu/drm/drm_stub.c index 89050684fe0..387a8de1bc7 100644 --- a/drivers/gpu/drm/drm_stub.c +++ b/drivers/gpu/drm/drm_stub.c @@ -343,7 +343,7 @@ static int drm_get_minor(struct drm_device *dev, struct drm_minor **minor, int t #if defined(CONFIG_DEBUG_FS) ret = drm_debugfs_init(new_minor, minor_id, drm_debugfs_root); if (ret) { - DRM_ERROR("DRM: Failed to initialize /debugfs/dri.\n"); + DRM_ERROR("DRM: Failed to initialize /sys/kernel/debug/dri.\n"); goto err_g2; } #endif diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 01f282cd098..3b6383168c6 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2206,7 +2206,7 @@ config SKGE_DEBUG depends on SKGE && DEBUG_FS help This option adds the ability to dump driver state for debugging. - The file debugfs/skge/ethX displays the state of the internal + The file /sys/kernel/debug/skge/ethX displays the state of the internal transmit and receive rings. If unsure, say N. @@ -2232,7 +2232,7 @@ config SKY2_DEBUG depends on SKY2 && DEBUG_FS help This option adds the ability to dump driver state for debugging. - The file debugfs/sky2/ethX displays the state of the internal + The file /sys/kernel/debug/sky2/ethX displays the state of the internal transmit and receive rings. If unsure, say N. diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h index 1fe5da4cf0a..60330f313f2 100644 --- a/drivers/net/wimax/i2400m/i2400m.h +++ b/drivers/net/wimax/i2400m/i2400m.h @@ -432,7 +432,7 @@ struct i2400m { unsigned ready:1; /* all probing steps done */ unsigned rx_reorder:1; /* RX reorder is enabled */ u8 trace_msg_from_user; /* echo rx msgs to 'trace' pipe */ - /* typed u8 so debugfs/u8 can tweak */ + /* typed u8 so /sys/kernel/debug/u8 can tweak */ enum i2400m_system_state state; wait_queue_head_t state_wq; /* Woken up when on state updates */ diff --git a/drivers/net/wireless/ath/ath5k/Kconfig b/drivers/net/wireless/ath/ath5k/Kconfig index 509b6f94f73..daf0c83527d 100644 --- a/drivers/net/wireless/ath/ath5k/Kconfig +++ b/drivers/net/wireless/ath/ath5k/Kconfig @@ -28,11 +28,10 @@ config ATH5K_DEBUG Say Y, if and you will get debug options for ath5k. 
To use this, you need to mount debugfs: - mkdir /debug/ - mount -t debugfs debug /debug/ + mount -t debugfs debug /sys/kernel/debug You will get access to files under: - /debug/ath5k/phy0/ + /sys/kernel/debug/ath5k/phy0/ To enable debug, pass the debug level to the debug module parameter. For example: diff --git a/drivers/net/wireless/libertas/README b/drivers/net/wireless/libertas/README index d860fc37575..ab6a2d518af 100644 --- a/drivers/net/wireless/libertas/README +++ b/drivers/net/wireless/libertas/README @@ -72,7 +72,7 @@ rdrf location that is to be read. This parameter must be specified in hexadecimal (its possible to preceed the number with a "0x"). - Path: /debugfs/libertas_wireless/ethX/registers/ + Path: /sys/kernel/debug/libertas_wireless/ethX/registers/ Usage: echo "0xa123" > rdmac ; cat rdmac @@ -95,7 +95,7 @@ wrrf sleepparams This command is used to set the sleepclock configurations - Path: /debugfs/libertas_wireless/ethX/ + Path: /sys/kernel/debug/libertas_wireless/ethX/ Usage: cat sleepparams: reads the current sleepclock configuration @@ -115,7 +115,7 @@ subscribed_events The subscribed_events directory contains the interface for the subscribed events API. - Path: /debugfs/libertas_wireless/ethX/subscribed_events/ + Path: /sys/kernel/debug/libertas_wireless/ethX/subscribed_events/ Each event is represented by a filename. Each filename consists of the following three fields: @@ -165,7 +165,7 @@ subscribed_events extscan This command is used to do a specific scan. - Path: /debugfs/libertas_wireless/ethX/ + Path: /sys/kernel/debug/libertas_wireless/ethX/ Usage: echo "SSID" > extscan @@ -179,7 +179,7 @@ getscantable Display the current contents of the driver scan table (ie. get the scan results). - Path: /debugfs/libertas_wireless/ethX/ + Path: /sys/kernel/debug/libertas_wireless/ethX/ Usage: cat getscantable @@ -188,7 +188,7 @@ setuserscan Initiate a customized scan and retrieve the results - Path: /debugfs/libertas_wireless/ethX/ + Path: /sys/kernel/debug/libertas_wireless/ethX/ Usage: echo "[ARGS]" > setuserscan diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 2b02b1fb39a..8d0f0de76b6 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -53,8 +53,7 @@ * debugfs interface * * To access this interface the user should: - * # mkdir /debug - * # mount -t debugfs none /debug + * # mount -t debugfs none /sys/kernel/debug * * The lpfc debugfs directory hierarchy is: * lpfc/lpfcX/vportY diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 883cd44ff76..99b7aada28d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -406,7 +406,7 @@ static inline char *pack_hex_byte(char *buf, u8 byte) * * Use tracing_on/tracing_off when you want to quickly turn on or off * tracing. It simply enables or disables the recording of the trace events. - * This also corresponds to the user space debugfs/tracing/tracing_on + * This also corresponds to the user space /sys/kernel/debug/tracing/tracing_on * file, which gives a means for the kernel and userspace to interact. * Place a tracing_off() in the kernel where you want tracing to end.
* From user space, examine the trace, and then echo 1 > tracing_on diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 14df7e635d4..b9dc4ca0246 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -198,7 +198,7 @@ static inline void tracepoint_synchronize_unregister(void) * * This is how the trace record is structured and will * * be saved into the ring buffer. These are the fields * * that will be exposed to user-space in - * * /debug/tracing/events/<*>/format. + * * /sys/kernel/debug/tracing/events/<*>/format. * * * * The declared 'local variable' is called '__entry' * * @@ -258,7 +258,7 @@ static inline void tracepoint_synchronize_unregister(void) * tracepoint callback (this is used by programmatic plugins and * can also by used by generic instrumentation like SystemTap), and * it is also used to expose a structured trace record in - * /debug/tracing/events/. + * /sys/kernel/debug/tracing/events/. */ #define TRACE_EVENT(name, proto, args, struct, assign, print) \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4a13e5a01ce..61071fecc82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -147,7 +147,7 @@ config IRQSOFF_TRACER disabled by default and can be runtime (re-)started via: - echo 0 > /debugfs/tracing/tracing_max_latency + echo 0 > /sys/kernel/debug/tracing/tracing_max_latency (Note that kernel size and overhead increases with this option enabled. This option and the preempt-off timing option can be @@ -168,7 +168,7 @@ config PREEMPT_TRACER disabled by default and can be runtime (re-)started via: - echo 0 > /debugfs/tracing/tracing_max_latency + echo 0 > /sys/kernel/debug/tracing/tracing_max_latency (Note that kernel size and overhead increases with this option enabled. This option and the irqs-off timing option can be @@ -261,7 +261,7 @@ config PROFILE_ANNOTATED_BRANCHES This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: - /debugfs/tracing/profile_annotated_branch + /sys/kernel/debug/tracing/profile_annotated_branch Note: this will add a significant overhead, only turn this on if you need to profile the system's use of these macros. @@ -274,7 +274,7 @@ config PROFILE_ALL_BRANCHES taken in the kernel is recorded whether it hit or miss. The results will be displayed in: - /debugfs/tracing/profile_branch + /sys/kernel/debug/tracing/profile_branch This option also enables the likely/unlikely profiler. @@ -323,7 +323,7 @@ config STACK_TRACER select KALLSYMS help This special tracer records the maximum stack footprint of the - kernel and displays it in debugfs/tracing/stack_trace. + kernel and displays it in /sys/kernel/debug/tracing/stack_trace. This tracer works by hooking into every function call that the kernel executes, and keeping a maximum stack depth value and diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8acd9b81a5d..c1878bfb2e1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -344,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock = /* * Copy the new maximum trace into the separate maximum-trace * structure. 
(this way the maximum trace is permanently saved, - * for later retrieval via /debugfs/tracing/latency_trace) + * for later retrieval via /sys/kernel/debug/tracing/latency_trace) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) @@ -2414,21 +2414,20 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" - "# mkdir /debug\n" - "# mount -t debugfs nodev /debug\n\n" - "# cat /debug/tracing/available_tracers\n" + "# mount -t debugfs nodev /sys/kernel/debug\n\n" + "# cat /sys/kernel/debug/tracing/available_tracers\n" "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" - "# cat /debug/tracing/current_tracer\n" + "# cat /sys/kernel/debug/tracing/current_tracer\n" "nop\n" - "# echo sched_switch > /debug/tracing/current_tracer\n" - "# cat /debug/tracing/current_tracer\n" + "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" + "# cat /sys/kernel/debug/tracing/current_tracer\n" "sched_switch\n" - "# cat /debug/tracing/trace_options\n" + "# cat /sys/kernel/debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /debug/tracing/trace_options\n" - "# echo 1 > /debug/tracing/tracing_enabled\n" - "# cat /debug/tracing/trace > /tmp/trace.txt\n" - "# echo 0 > /debug/tracing/tracing_enabled\n" + "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" + "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" + "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" + "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" ; static ssize_t diff --git a/scripts/tracing/draw_functrace.py b/scripts/tracing/draw_functrace.py index 902f9a99262..db40fa04cd5 100644 --- a/scripts/tracing/draw_functrace.py +++ b/scripts/tracing/draw_functrace.py @@ -12,10 +12,9 @@ calls. Only the functions's names and the the call time are provided. Usage: Be sure that you have CONFIG_FUNCTION_TRACER - # mkdir /debugfs - # mount -t debug debug /debug - # echo function > /debug/tracing/current_tracer - $ cat /debug/tracing/trace_pipe > ~/raw_trace_func + # mount -t debugfs nodev /sys/kernel/debug + # echo function > /sys/kernel/debug/tracing/current_tracer + $ cat /sys/kernel/debug/tracing/trace_pipe > ~/raw_trace_func Wait some times but not too much, the script is a bit slow. Break the pipe (Ctrl + Z) $ scripts/draw_functrace.py < raw_trace_func > draw_functrace -- cgit v1.2.3-70-g09d2 From cc835e321a9f3fa5e083436872e198095f4805b9 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 31 Mar 2009 12:28:31 -0700 Subject: USB: nop-usb-xceiv: behave when linked as a module The NOP OTG transceiver driver needs to be usable from modules. Make sure its symbols are always accessible at both compile and link time, and make sure the device instance is allocated from the heap so that device lifetime rules are obeyed. Signed-off-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- drivers/usb/otg/nop-usb-xceiv.c | 25 ++++++++++--------------- include/linux/usb/otg.h | 4 ++-- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/otg/nop-usb-xceiv.c b/drivers/usb/otg/nop-usb-xceiv.c index c567168f89a..9ed5ea56867 100644 --- a/drivers/usb/otg/nop-usb-xceiv.c +++ b/drivers/usb/otg/nop-usb-xceiv.c @@ -22,8 +22,8 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Current status: - * this is to add "nop" transceiver for all those phy which is - * autonomous such as isp1504 etc. 
+ * This provides a "nop" transceiver for PHYs which are + * autonomous such as isp1504, isp1707, etc. */ #include @@ -36,30 +36,25 @@ struct nop_usb_xceiv { struct device *dev; }; -static u64 nop_xceiv_dmamask = DMA_BIT_MASK(32); - -static struct platform_device nop_xceiv_device = { - .name = "nop_usb_xceiv", - .id = -1, - .dev = { - .dma_mask = &nop_xceiv_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(32), - .platform_data = NULL, - }, -}; +static struct platform_device *pd; void usb_nop_xceiv_register(void) { - if (platform_device_register(&nop_xceiv_device) < 0) { + if (pd) + return; + pd = platform_device_register_simple("nop_usb_xceiv", -1, NULL, 0); + if (!pd) { printk(KERN_ERR "Unable to register usb nop transceiver\n"); return; } } +EXPORT_SYMBOL(usb_nop_xceiv_register); void usb_nop_xceiv_unregister(void) { - platform_device_unregister(&nop_xceiv_device); + platform_device_unregister(pd); } +EXPORT_SYMBOL(usb_nop_xceiv_unregister); static inline struct nop_usb_xceiv *xceiv_to_nop(struct otg_transceiver *x) { diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h index 1aaa826396a..2443c0e7a80 100644 --- a/include/linux/usb/otg.h +++ b/include/linux/usb/otg.h @@ -80,10 +80,10 @@ struct otg_transceiver { /* for board-specific init logic */ extern int otg_set_transceiver(struct otg_transceiver *); -#ifdef CONFIG_NOP_USB_XCEIV + +/* sometimes transceivers are accessed only through e.g. ULPI */ extern void usb_nop_xceiv_register(void); extern void usb_nop_xceiv_unregister(void); -#endif /* for usb host and peripheral controller drivers */ -- cgit v1.2.3-70-g09d2 From 00048b8bde5a6cbd9c3a76f272cc9ddb55705e37 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 24 Apr 2009 14:56:26 -0700 Subject: USB: add usb debugfs directory Add a common usb directory in debugfs that the usb subsystem can use. Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/usb.c | 22 ++++++++++++++++++++++ include/linux/usb.h | 3 +++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 7eee400d3e3..5f6873f5f26 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1001,6 +1002,22 @@ static struct notifier_block usb_bus_nb = { .notifier_call = usb_bus_notify, }; +struct dentry *usb_debug_root; +EXPORT_SYMBOL_GPL(usb_debug_root); + +static int usb_debugfs_init(void) +{ + usb_debug_root = debugfs_create_dir("usb", NULL); + if (!usb_debug_root) + return -ENOENT; + return 0; +} + +static void usb_debugfs_cleanup(void) +{ + debugfs_remove(usb_debug_root); +} + /* * Init */ @@ -1012,6 +1029,10 @@ static int __init usb_init(void) return 0; } + retval = usb_debugfs_init(); + if (retval) + goto out; + retval = ksuspend_usb_init(); if (retval) goto out; @@ -1083,6 +1104,7 @@ static void __exit usb_exit(void) bus_unregister_notifier(&usb_bus_type, &usb_bus_nb); bus_unregister(&usb_bus_type); ksuspend_usb_cleanup(); + usb_debugfs_cleanup(); } subsys_initcall(usb_init); diff --git a/include/linux/usb.h b/include/linux/usb.h index 3aa2cd1f8d0..29060dad81e 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1558,6 +1558,9 @@ extern void usb_unregister_notify(struct notifier_block *nb); #define err(format, arg...) 
printk(KERN_ERR KBUILD_MODNAME ": " \ format "\n" , ## arg) +/* debugfs stuff */ +extern struct dentry *usb_debug_root; + #endif /* __KERNEL__ */ #endif -- cgit v1.2.3-70-g09d2 From 74675a58507e769beee7d949dbed788af3c4139d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 30 Apr 2009 10:08:18 -0400 Subject: NLS: update handling of Unicode This patch (as1239) updates the kernel's treatment of Unicode. The character-set conversion routines are well behind the current state of the Unicode specification: they don't recognize the existence of code points beyond plane 0 or of surrogate pairs in the UTF-16 encoding. The old 16-bit wchar_t type is retained because it's still used in lots of places. This shouldn't cause any new problems; if a conversion now results in an invalid 16-bit code, then before it must have yielded an undefined code. Difficult-to-read names like "utf8_mbstowcs" are replaced with more transparent names like "utf8s_to_utf16s", and the ordering of the parameters is rationalized (buffer lengths come immediately after the pointers they refer to, and the inputs precede the outputs). Fortunately the low-level conversion routines are used in only a few places; the interfaces to the higher-level uni2char and char2uni methods have been left unchanged. Signed-off-by: Alan Stern Acked-by: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 10 +-- fs/befs/linuxvfs.c | 20 +++--- fs/fat/dir.c | 29 ++++---- fs/fat/namei_vfat.c | 4 +- fs/isofs/joliet.c | 36 +--------- fs/ncpfs/ncplib_kernel.c | 8 ++- fs/nls/nls_base.c | 164 +++++++++++++++++++++++++++++---------------- fs/nls/nls_utf8.c | 13 +++- include/linux/nls.h | 35 ++++++++-- 9 files changed, 182 insertions(+), 137 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index e98f928c08e..9bd26dec759 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -780,14 +780,13 @@ int usb_string(struct usb_device *dev, int index, char *buf, size_t size) { unsigned char *tbuf; int err; - unsigned int u; if (dev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; if (size <= 0 || !buf || !index) return -EINVAL; buf[0] = 0; - tbuf = kmalloc(256 + 2, GFP_NOIO); + tbuf = kmalloc(256, GFP_NOIO); if (!tbuf) return -ENOMEM; @@ -814,12 +813,9 @@ int usb_string(struct usb_device *dev, int index, char *buf, size_t size) if (err < 0) goto errout; - for (u = 2; u < err; u += 2) - le16_to_cpus((u16 *)&tbuf[u]); - tbuf[u] = 0; - tbuf[u + 1] = 0; size--; /* leave room for trailing NULL char in output buffer */ - err = utf8_wcstombs(buf, (u16 *)&tbuf[2], size); + err = utf16s_to_utf8s((wchar_t *) &tbuf[2], (err - 2) / 2, + UTF16_LITTLE_ENDIAN, buf, size); buf[err] = 0; if (tbuf[1] != USB_DT_STRING) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 9367b6297d8..89cd2deeb4a 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in, { struct nls_table *nls = BEFS_SB(sb)->nls; int i, o; - wchar_t uni; + unicode_t uni; int unilen, utflen; char *result; /* The utf8->nls conversion won't make the final nls string bigger @@ -539,16 +539,16 @@ for (i = o = 0; i < in_len; i += utflen, o += unilen) { /* convert from UTF-8 to Unicode */ - utflen = utf8_mbtowc(&uni, &in[i], in_len - i); - if (utflen < 0) { + utflen = utf8_to_utf32(&in[i], in_len - i, &uni); + if (utflen < 0) goto conv_err; - } /* convert from Unicode to nls */ + if (uni
> MAX_WCHAR_T) + goto conv_err; unilen = nls->uni2char(uni, &result[o], in_len - o); - if (unilen < 0) { + if (unilen < 0) goto conv_err; - } } result[o] = '\0'; *out_len = o; @@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in, /* convert from nls to unicode */ unilen = nls->char2uni(&in[i], in_len - i, &uni); - if (unilen < 0) { + if (unilen < 0) goto conv_err; - } /* convert from unicode to UTF-8 */ - utflen = utf8_wctomb(&result[o], uni, 3); - if (utflen <= 0) { + utflen = utf32_to_utf8(uni, &result[o], 3); + if (utflen <= 0) goto conv_err; - } } result[o] = '\0'; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index f3500294eec..7c14c8cbbab 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -22,6 +22,19 @@ #include #include "fat.h" +/* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + static inline loff_t fat_make_i_pos(struct super_block *sb, struct buffer_head *bh, struct msdos_dir_entry *de) @@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, unsigned char *buf, int size) { if (sbi->options.utf8) - return utf8_wcstombs(buf, uni, size); + return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, + UTF16_HOST_ENDIAN, buf, size); else return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, sbi->nls_io); @@ -324,19 +338,6 @@ parse_long: return 0; } -/* - * Maximum buffer size of short name. - * [(MSDOS_NAME + '.') * max one char + nul] - * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] - */ -#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) -/* - * Maximum buffer size of unicode chars from slots. - * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] - */ -#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) -#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) - /* * Return values: negative -> error, 0 -> not found, positive -> found, * value is the total amount of slots, including the shortname entry. 
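To make the surrogate-pair handling described in the changelog concrete (the mechanics appear in the fs/nls/nls_base.c hunks further below), here is a minimal user-space sketch. It is an illustration only, not part of the patch; encode_utf16() and the demo value are invented, while the constants mirror the ones the patch adds. A code point beyond plane 0 is reduced by 0x10000, and the remaining 20 bits are split between a high and a low surrogate unit, exactly as the new utf8s_to_utf16s() does:

	#include <stdio.h>
	#include <stdint.h>

	/* same constants the patch adds to fs/nls/nls_base.c */
	#define PLANE_SIZE     0x00010000
	#define SURROGATE_PAIR 0x0000d800
	#define SURROGATE_LOW  0x00000400
	#define SURROGATE_BITS 0x000003ff

	/* Encode one Unicode code point as one or two UTF-16 code units;
	 * returns the number of units written.  Assumes a valid,
	 * non-surrogate code point no larger than 0x10ffff. */
	static int encode_utf16(uint32_t u, uint16_t out[2])
	{
		if (u < PLANE_SIZE) {
			out[0] = (uint16_t) u;	/* plane 0: a single unit */
			return 1;
		}
		u -= PLANE_SIZE;		/* 20 significant bits remain */
		out[0] = SURROGATE_PAIR | ((u >> 10) & SURROGATE_BITS);
		out[1] = SURROGATE_PAIR | SURROGATE_LOW | (u & SURROGATE_BITS);
		return 2;
	}

	int main(void)
	{
		uint16_t w[2];
		int i, n = encode_utf16(0x1d11e, w); /* U+1D11E, musical G clef */

		for (i = 0; i < n; i++)
			printf("0x%04x\n", w[i]);    /* prints 0xd834, then 0xdd1e */
		return 0;
	}

Decoding reverses the same steps, which is why the new utf16s_to_utf8s() requires a low surrogate to follow a high one and ignores anything unpaired.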
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index b50ecbe97f8..f92ad999535 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, if (utf8) { int name_len = strlen(name); - *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX); + *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); /* * We stripped '.'s before and set len appropriately, - * but utf8_mbstowcs doesn't care about len + * but utf8s_to_utf16s doesn't care about len */ *outlen -= (name_len - len); diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index 92c14b850e9..a048de81c09 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) return (op - ascii); } -/* Convert big endian wide character string to utf8 */ -static int -wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen) -{ - const __u8 *ip; - __u8 *op; - int size; - __u16 c; - - op = s; - ip = pwcs; - while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) { - c = (*ip << 8) | ip[1]; - if (c > 0x7f) { - size = utf8_wctomb(op, c, maxlen); - if (size == -1) { - /* Ignore character and move on */ - maxlen--; - } else { - op += size; - maxlen -= size; - } - } else { - *op++ = (__u8) c; - } - ip += 2; - inlen--; - } - return (op - s); -} - int get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) { @@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; if (utf8) { - len = wcsntombs_be(outname, de->name, - de->name_len[0] >> 1, PAGE_SIZE); + len = utf16s_to_utf8s((const wchar_t *) de->name, + de->name_len[0] >> 1, UTF16_BIG_ENDIAN, + outname, PAGE_SIZE); } else { len = uni16_to_x8(outname, (__be16 *) de->name, de->name_len[0] >> 1, nls); diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 97645f11211..0ec6237a597 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen, if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { int k; + unicode_t u; - k = utf8_mbtowc(&ec, iname, iname_end - iname); - if (k < 0) + k = utf8_to_utf32(iname, iname_end - iname, &u); + if (k < 0 || u > MAX_WCHAR_T) return -EINVAL; iname += k; + ec = u; } else { if (*iname == NCP_ESC) { int k; @@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen, if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { int k; - k = utf8_wctomb(iname, ec, iname_end - iname); + k = utf32_to_utf8(ec, iname, iname_end - iname); if (k < 0) { err = -ENAMETOOLONG; goto quit; diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 750abf211e2..477d37d83b3 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -15,6 +15,7 @@ #include #include #include +#include static struct nls_table default_table; static struct nls_table *tables = &default_table; @@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] = {0, /* end of table */} }; -int -utf8_mbtowc(wchar_t *p, const __u8 *s, int n) +#define UNICODE_MAX 0x0010ffff +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) { - long l; + unsigned long l; int c0, c, nc; const struct 
utf8_table *t; @@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) nc++; if ((c0 & t->cmask) == t->cval) { l &= t->lmask; - if (l < t->lval) + if (l < t->lval || l > UNICODE_MAX || + (l & SURROGATE_MASK) == SURROGATE_PAIR) return -1; - *p = l; + *pu = (unicode_t) l; return nc; } - if (n <= nc) + if (len <= nc) return -1; s++; c = (*s ^ 0x80) & 0xFF; @@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) } return -1; } +EXPORT_SYMBOL(utf8_to_utf32); -int -utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n) +int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) { - __u16 *op; - const __u8 *ip; - int size; - - op = pwcs; - ip = s; - while (*ip && n > 0) { - if (*ip & 0x80) { - size = utf8_mbtowc(op, ip, n); - if (size == -1) { - /* Ignore character and move on */ - ip++; - n--; - } else { - op++; - ip += size; - n -= size; - } - } else { - *op++ = *ip++; - n--; - } - } - return (op - pwcs); -} - -int -utf8_wctomb(__u8 *s, wchar_t wc, int maxlen) -{ - long l; + unsigned long l; int c, nc; const struct utf8_table *t; - + if (!s) return 0; - - l = wc; + + l = u; + if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + nc = 0; for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { nc++; if (l <= t->lmask) { c = t->shift; - *s = t->cval | (l >> c); + *s = (u8) (t->cval | (l >> c)); while (c > 0) { c -= 6; s++; - *s = 0x80 | ((l >> c) & 0x3F); + *s = (u8) (0x80 | ((l >> c) & 0x3F)); } return nc; } } return -1; } +EXPORT_SYMBOL(utf32_to_utf8); -int -utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) +int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) { - const __u16 *ip; - __u8 *op; + u16 *op; int size; + unicode_t u; + + op = pwcs; + while (*s && len > 0) { + if (*s & 0x80) { + size = utf8_to_utf32(s, len, &u); + if (size < 0) { + /* Ignore character and move on */ + size = 1; + } else if (u >= PLANE_SIZE) { + u -= PLANE_SIZE; + *op++ = (wchar_t) (SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS)); + *op++ = (wchar_t) (SURROGATE_PAIR | + SURROGATE_LOW | + (u & SURROGATE_BITS)); + } else { + *op++ = (wchar_t) u; + } + s += size; + len -= size; + } else { + *op++ = *s++; + len--; + } + } + return op - pwcs; +} +EXPORT_SYMBOL(utf8s_to_utf16s); + +static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + return c; + case UTF16_LITTLE_ENDIAN: + return __le16_to_cpu(c); + case UTF16_BIG_ENDIAN: + return __be16_to_cpu(c); + } +} + +int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, + u8 *s, int maxlen) +{ + u8 *op; + int size; + unsigned long u, v; op = s; - ip = pwcs; - while (*ip && maxlen > 0) { - if (*ip > 0x7f) { - size = utf8_wctomb(op, *ip, maxlen); + while (len > 0 && maxlen > 0) { + u = get_utf16(*pwcs, endian); + if (!u) + break; + pwcs++; + len--; + if (u > 0x7f) { + if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { + if (u & SURROGATE_LOW) { + /* Ignore character and move on */ + continue; + } + if (len <= 0) + break; + v = get_utf16(*pwcs, endian); + if ((v & SURROGATE_MASK) != SURROGATE_PAIR || + !(v & SURROGATE_LOW)) { + /* Ignore character and move on */ + continue; + } + u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + + (v & SURROGATE_BITS); + pwcs++; + len--; + } + size = utf32_to_utf8(u, op, maxlen); if (size == -1) { /* Ignore character and move on */ } else { @@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) maxlen -= size; } } else { - *op++ = (__u8) *ip; + *op++ = (u8) u; maxlen--; } - ip++; } - return (op - s); + return op - 
s; } +EXPORT_SYMBOL(utf16s_to_utf8s); int register_nls(struct nls_table * nls) { @@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); EXPORT_SYMBOL(load_nls_default); -EXPORT_SYMBOL(utf8_mbtowc); -EXPORT_SYMBOL(utf8_mbstowcs); -EXPORT_SYMBOL(utf8_wctomb); -EXPORT_SYMBOL(utf8_wcstombs); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index aa2c42fdd97..0d60a44acac 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { int n; - if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) { + if (boundlen <= 0) + return -ENAMETOOLONG; + + n = utf32_to_utf8(uni, out, boundlen); + if (n < 0) { *out = '?'; return -EINVAL; } @@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { int n; + unicode_t u; - if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) { + n = utf8_to_utf32(rawstring, boundlen, &u); + if (n < 0 || u > MAX_WCHAR_T) { *uni = 0x003f; /* ? */ - n = -EINVAL; + return -EINVAL; } + *uni = (wchar_t) u; return n; } diff --git a/include/linux/nls.h b/include/linux/nls.h index 52b1a76c1b4..d47beef08df 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -3,8 +3,23 @@ #include -/* unicode character */ -typedef __u16 wchar_t; +/* Unicode has changed over the years. Unicode code points no longer + * fit into 16 bits; as of Unicode 5 valid code points range from 0 + * to 0x10ffff (17 planes, where each plane holds 65536 code points). + * + * The original decision to represent Unicode characters as 16-bit + * wchar_t values is now outdated. But plane 0 still includes the + * most commonly used characters, so we will retain it. The newer + * 32-bit unicode_t type can be used when it is necessary to + * represent the full Unicode character set. 
+ */ + +/* Plane-0 Unicode character */ +typedef u16 wchar_t; +#define MAX_WCHAR_T 0xffff + +/* Arbitrary Unicode character */ +typedef u32 unicode_t; struct nls_table { const char *charset; @@ -21,6 +36,13 @@ struct nls_table { /* this value hold the maximum octet of charset */ #define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */ +/* Byte order for UTF-16 strings */ +enum utf16_endian { + UTF16_HOST_ENDIAN, + UTF16_LITTLE_ENDIAN, + UTF16_BIG_ENDIAN +}; + /* nls.c */ extern int register_nls(struct nls_table *); extern int unregister_nls(struct nls_table *); @@ -28,10 +50,11 @@ extern struct nls_table *load_nls(char *); extern void unload_nls(struct nls_table *); extern struct nls_table *load_nls_default(void); -extern int utf8_mbtowc(wchar_t *, const __u8 *, int); -extern int utf8_mbstowcs(wchar_t *, const __u8 *, int); -extern int utf8_wctomb(__u8 *, wchar_t, int); -extern int utf8_wcstombs(__u8 *, const wchar_t *, int); +extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu); +extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen); +extern int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs); +extern int utf16s_to_utf8s(const wchar_t *pwcs, int len, + enum utf16_endian endian, u8 *s, int maxlen); static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c) { -- cgit v1.2.3-70-g09d2 From 820d7a253c5e59a786d5b608f6e8d0419fdc2f6e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 27 Apr 2009 13:17:21 -0700 Subject: USB: remove unused usb_host class The usb_host class isn't used for anything anymore (it was used for debug files, but they have moved to debugfs a few kernel releases ago), so let's delete it before someone accidentally puts a file in it. Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 27 --------------------------- drivers/usb/core/usb.c | 6 ------ drivers/usb/core/usb.h | 2 -- include/linux/usb.h | 1 - 4 files changed, 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 42b93da1085..c65d9560402 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -755,23 +755,6 @@ static struct attribute_group usb_bus_attr_group = { /*-------------------------------------------------------------------------*/ -static struct class *usb_host_class; - -int usb_host_init(void) -{ - int retval = 0; - - usb_host_class = class_create(THIS_MODULE, "usb_host"); - if (IS_ERR(usb_host_class)) - retval = PTR_ERR(usb_host_class); - return retval; -} - -void usb_host_cleanup(void) -{ - class_destroy(usb_host_class); -} - /** * usb_bus_init - shared initialization code * @bus: the bus structure being initialized @@ -818,12 +801,6 @@ static int usb_register_bus(struct usb_bus *bus) set_bit (busnum, busmap.busmap); bus->busnum = busnum; - bus->dev = device_create(usb_host_class, bus->controller, MKDEV(0, 0), - bus, "usb_host%d", busnum); - result = PTR_ERR(bus->dev); - if (IS_ERR(bus->dev)) - goto error_create_class_dev; - /* Add it to the local list of buses */ list_add (&bus->bus_list, &usb_bus_list); mutex_unlock(&usb_bus_list_lock); @@ -834,8 +811,6 @@ static int usb_register_bus(struct usb_bus *bus) "number %d\n", bus->busnum); return 0; -error_create_class_dev: - clear_bit(busnum, busmap.busmap); error_find_busnum: mutex_unlock(&usb_bus_list_lock); return result; @@ -865,8 +840,6 @@ static void usb_deregister_bus (struct usb_bus *bus) usb_notify_remove_bus(bus); clear_bit (bus->busnum, busmap.busmap); - - device_unregister(bus->dev); } /** diff --git a/drivers/usb/core/usb.c 
b/drivers/usb/core/usb.c index c71590666ad..eb810bbe7bb 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -1055,9 +1055,6 @@ static int __init usb_init(void) retval = bus_register_notifier(&usb_bus_type, &usb_bus_nb); if (retval) goto bus_notifier_failed; - retval = usb_host_init(); - if (retval) - goto host_init_failed; retval = usb_major_init(); if (retval) goto major_init_failed; @@ -1087,8 +1084,6 @@ usb_devio_init_failed: driver_register_failed: usb_major_cleanup(); major_init_failed: - usb_host_cleanup(); -host_init_failed: bus_unregister_notifier(&usb_bus_type, &usb_bus_nb); bus_notifier_failed: bus_unregister(&usb_bus_type); @@ -1113,7 +1108,6 @@ static void __exit usb_exit(void) usb_deregister(&usbfs_driver); usb_devio_cleanup(); usb_hub_cleanup(); - usb_host_cleanup(); bus_unregister_notifier(&usb_bus_type, &usb_bus_nb); bus_unregister(&usb_bus_type); ksuspend_usb_cleanup(); diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h index 79d8a9ea559..dabf9255a10 100644 --- a/drivers/usb/core/usb.h +++ b/drivers/usb/core/usb.h @@ -41,8 +41,6 @@ extern int usb_hub_init(void); extern void usb_hub_cleanup(void); extern int usb_major_init(void); extern void usb_major_cleanup(void); -extern int usb_host_init(void); -extern void usb_host_cleanup(void); #ifdef CONFIG_PM diff --git a/include/linux/usb.h b/include/linux/usb.h index 29060dad81e..606e0aa5bbe 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -336,7 +336,6 @@ struct usb_bus { #ifdef CONFIG_USB_DEVICEFS struct dentry *usbfs_dentry; /* usbfs dentry entry for the bus */ #endif - struct device *dev; /* device for this bus */ #if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) struct mon_bus *mon_bus; /* non-null when associated */ -- cgit v1.2.3-70-g09d2 From 00240c3839d843ccf07abd52806f421f7b87bbdc Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 27 Apr 2009 13:33:16 -0400 Subject: PCI: add power-state name strings This patch (as1235) adds an array of PCI power-state names, together with a simple inline accessor routine. Signed-off-by: Alan Stern Acked-by: Rafael J. Wysocki Acked-by: Jesse Barnes Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci.c | 5 +++++ include/linux/pci.h | 8 ++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 1a91bf9687a..07bbb9b3b93 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -24,6 +24,11 @@ #include #include "pci.h" +const char *pci_power_names[] = { + "error", "D0", "D1", "D2", "D3hot", "D3cold", "unknown", +}; +EXPORT_SYMBOL_GPL(pci_power_names); + unsigned int pci_pm_d3_delay = PCI_PM_D3_WAIT; #ifdef CONFIG_PCI_DOMAINS diff --git a/include/linux/pci.h b/include/linux/pci.h index 72698d89e76..8e366bb0705 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -124,6 +124,14 @@ typedef int __bitwise pci_power_t; #define PCI_UNKNOWN ((pci_power_t __force) 5) #define PCI_POWER_ERROR ((pci_power_t __force) -1) +/* Remember to update this when the list above changes! 
*/ +extern const char *pci_power_names[]; + +static inline const char *pci_power_name(pci_power_t state) +{ + return pci_power_names[1 + (int) state]; +} + #define PCI_PM_D2_DELAY 200 #define PCI_PM_D3_WAIT 10 #define PCI_PM_BUS_WAIT 50 -- cgit v1.2.3-70-g09d2 From cac85a8b4e8e7c51bc0ce2980bba0e35cfec5c2e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 29 Apr 2009 21:04:19 -0700 Subject: USB: composite.h: mark private struct members as private: Mark internal struct members as /* private: */ so that kernel-doc won't produce warnings about missing descriptions for them. Signed-off-by: Randy Dunlap Acked-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/composite.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index acd7b0f06c8..4f6bb3d2160 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -124,6 +124,7 @@ struct usb_function { void (*suspend)(struct usb_function *); void (*resume)(struct usb_function *); + /* private: */ /* internals */ struct list_head list; }; @@ -219,6 +220,7 @@ struct usb_configuration { struct usb_composite_dev *cdev; + /* private: */ /* internals */ struct list_head list; struct list_head functions; @@ -321,6 +323,7 @@ struct usb_composite_dev { struct usb_configuration *config; + /* private: */ /* internals */ struct usb_device_descriptor desc; struct list_head configs; -- cgit v1.2.3-70-g09d2 From bf92c1906e4f294a48fafc15755c65af636195e0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 29 Apr 2009 21:02:49 -0700 Subject: USB: usb.h: change private: kernel-doc for new format requirement Use "/* private:" to mark struct members as private so that scripts/kernel-doc will handle them correctly. Signed-off-by: Randy Dunlap Acked-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 606e0aa5bbe..fb1f2a32ae0 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1421,8 +1421,8 @@ struct usb_sg_request { int status; size_t bytes; - /* - * members below are private: to usbcore, + /* private: + * members below are private to usbcore, * and are not provided for driver access! */ spinlock_t lock; -- cgit v1.2.3-70-g09d2 From 715b1dc01fe44537e8fce9566e4bb48d6821d84b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 11 May 2009 15:24:07 -0500 Subject: USB: usb_debug, usb_generic_serial: implement multi urb write The usb_debug driver, when used as the console, will always fail to insert the carriage return and newline sequence and will randomly drop console output. This is a result of having only the single write_urb, and of the tty layer holding a lock that prevents back-to-back urb requests from being processed. The solution is to allow more than one urb to be outstanding and to keep a slightly deeper transmit queue. The idea and some code are borrowed from the ftdi_sio usb driver. The generic usb serial driver was modified to allow either the classic method of one write urb or a multi-urb write scheme with N allowed outstanding urbs, where N is controlled by max_in_flight_urbs. When max_in_flight_urbs in a "struct usb_serial_driver" is non-zero, the multi-urb write scheme is used.
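For example, a driver would opt in along these lines (a minimal sketch, not taken from the patch; the "example" name and the value 64 are invented, and id_table and the remaining fields are assumed to be set up as usual):

	static struct usb_serial_driver example_device = {
		.driver = {
			.owner	= THIS_MODULE,
			.name	= "example",
		},
		.id_table		= id_table,
		.num_ports		= 1,
		/* non-zero selects the new multi-urb write path in
		 * generic.c; zero keeps the classic single write_urb */
		.max_in_flight_urbs	= 64,
	};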
The size of 4000 was selected for the usb_debug driver so that it lowers the possibility of losing queued console messages during kernel startup. Signed-off-by: Jason Wessel Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/generic.c | 121 ++++++++++++++++++++++++++++++++++++++--- drivers/usb/serial/usb_debug.c | 2 + include/linux/usb/serial.h | 4 ++ 3 files changed, 120 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index be82ea95672..c919686521f 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -190,6 +190,88 @@ void usb_serial_generic_close(struct usb_serial_port *port) generic_cleanup(port); } +static int usb_serial_multi_urb_write(struct tty_struct *tty, + struct usb_serial_port *port, const unsigned char *buf, int count) +{ + unsigned long flags; + struct urb *urb; + unsigned char *buffer; + int status; + int towrite; + int bwrite = 0; + + dbg("%s - port %d", __func__, port->number); + + if (count == 0) + dbg("%s - write request of 0 bytes", __func__); + + while (count > 0) { + towrite = (count > port->bulk_out_size) ? + port->bulk_out_size : count; + spin_lock_irqsave(&port->lock, flags); + if (port->urbs_in_flight > + port->serial->type->max_in_flight_urbs) { + spin_unlock_irqrestore(&port->lock, flags); + dbg("%s - write limit hit\n", __func__); + return bwrite; + } + port->tx_bytes_flight += towrite; + port->urbs_in_flight++; + spin_unlock_irqrestore(&port->lock, flags); + + buffer = kmalloc(towrite, GFP_ATOMIC); + if (!buffer) { + dev_err(&port->dev, + "%s ran out of kernel memory for urb ...\n", __func__); + goto error_no_buffer; + } + + urb = usb_alloc_urb(0, GFP_ATOMIC); + if (!urb) { + dev_err(&port->dev, "%s - no more free urbs\n", + __func__); + goto error_no_urb; + } + + /* Copy data */ + memcpy(buffer, buf + bwrite, towrite); + usb_serial_debug_data(debug, &port->dev, __func__, + towrite, buffer); + /* fill the buffer and send it */ + usb_fill_bulk_urb(urb, port->serial->dev, + usb_sndbulkpipe(port->serial->dev, + port->bulk_out_endpointAddress), + buffer, towrite, + usb_serial_generic_write_bulk_callback, port); + + status = usb_submit_urb(urb, GFP_ATOMIC); + if (status) { + dev_err(&port->dev, + "%s - failed submitting write urb, error %d\n", + __func__, status); + goto error; + } + + /* This urb is the responsibility of the host driver now */ + usb_free_urb(urb); + dbg("%s write: %d", __func__, towrite); + count -= towrite; + bwrite += towrite; + } + return bwrite; + +error: + usb_free_urb(urb); +error_no_urb: + kfree(buffer); +error_no_buffer: + spin_lock_irqsave(&port->lock, flags); + port->urbs_in_flight--; + port->tx_bytes_flight -= towrite; + spin_unlock_irqrestore(&port->lock, flags); + return bwrite; +} + int usb_serial_generic_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { @@ -207,6 +289,11 @@ int usb_serial_generic_write(struct tty_struct *tty, /* only do something if we have a bulk out endpoint */ if (serial->num_bulk_out) { unsigned long flags; + + if (serial->type->max_in_flight_urbs) + return usb_serial_multi_urb_write(tty, port, + buf, count); + spin_lock_irqsave(&port->lock, flags); if (port->write_urb_busy) { spin_unlock_irqrestore(&port->lock, flags); @@ -257,15 +344,18 @@ int usb_serial_generic_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; + unsigned long flags; int room = 0; dbg("%s -
port %d", __func__, port->number); - - /* FIXME: Locking */ - if (serial->num_bulk_out) { - if (!(port->write_urb_busy)) + spin_lock_irqsave(&port->lock, flags); + if (serial->type->max_in_flight_urbs) { + if (port->urbs_in_flight < serial->type->max_in_flight_urbs) room = port->bulk_out_size; + } else if (serial->num_bulk_out && !(port->write_urb_busy)) { + room = port->bulk_out_size; } + spin_unlock_irqrestore(&port->lock, flags); dbg("%s - returns %d", __func__, room); return room; @@ -276,11 +366,16 @@ int usb_serial_generic_chars_in_buffer(struct tty_struct *tty) struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; int chars = 0; + unsigned long flags; dbg("%s - port %d", __func__, port->number); - /* FIXME: Locking */ - if (serial->num_bulk_out) { + if (serial->type->max_in_flight_urbs) { + spin_lock_irqsave(&port->lock, flags); + chars = port->tx_bytes_flight; + spin_unlock_irqrestore(&port->lock, flags); + } else if (serial->num_bulk_out) { + /* FIXME: Locking */ if (port->write_urb_busy) chars = port->write_urb->transfer_buffer_length; } @@ -363,12 +458,24 @@ EXPORT_SYMBOL_GPL(usb_serial_generic_read_bulk_callback); void usb_serial_generic_write_bulk_callback(struct urb *urb) { + unsigned long flags; struct usb_serial_port *port = urb->context; int status = urb->status; dbg("%s - port %d", __func__, port->number); - port->write_urb_busy = 0; + if (port->serial->type->max_in_flight_urbs) { + spin_lock_irqsave(&port->lock, flags); + --port->urbs_in_flight; + port->tx_bytes_flight -= urb->transfer_buffer_length; + if (port->urbs_in_flight < 0) + port->urbs_in_flight = 0; + spin_unlock_irqrestore(&port->lock, flags); + } else { + /* Handle the case for single urb mode */ + port->write_urb_busy = 0; + } + if (status) { dbg("%s - nonzero write bulk status received: %d", __func__, status); diff --git a/drivers/usb/serial/usb_debug.c b/drivers/usb/serial/usb_debug.c index 6c9cbb59552..a9427a8b867 100644 --- a/drivers/usb/serial/usb_debug.c +++ b/drivers/usb/serial/usb_debug.c @@ -15,6 +15,7 @@ #include #include +#define URB_DEBUG_MAX_IN_FLIGHT_URBS 4000 #define USB_DEBUG_MAX_PACKET_SIZE 8 static struct usb_device_id id_table [] = { @@ -46,6 +47,7 @@ static struct usb_serial_driver debug_device = { .id_table = id_table, .num_ports = 1, .open = usb_debug_open, + .max_in_flight_urbs = URB_DEBUG_MAX_IN_FLIGHT_URBS, }; static int __init debug_init(void) diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 8cdfed738fe..e2938fd179e 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -91,6 +91,9 @@ struct usb_serial_port { int write_urb_busy; __u8 bulk_out_endpointAddress; + int tx_bytes_flight; + int urbs_in_flight; + wait_queue_head_t write_wait; struct work_struct work; char throttled; @@ -207,6 +210,7 @@ struct usb_serial_driver { struct device_driver driver; struct usb_driver *usb_driver; struct usb_dynids dynids; + int max_in_flight_urbs; int (*probe)(struct usb_serial *serial, const struct usb_device_id *id); int (*attach)(struct usb_serial *serial); -- cgit v1.2.3-70-g09d2 From 98fcb5f78165b8a3d93870ad7afd4d9ebbb8b43a Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 11 May 2009 15:24:09 -0500 Subject: USB: serial: usb_debug,usb_generic_serial: implement sysrq and serial break The usb_debug driver was modified to implement serial break handling by using a "magic" data packet comprised of the sequence: 0x00 0xff 0x01 0xfe 0x00 0xfe 0x01 0xff When the tty layer requests a serial break the usb_debug 
driver sends the magic packet. On the receiving side, the magic packet is either thrown away or a sysrq is activated, depending on which kernel .config options have been set. The generic serial driver and the usb serial headers were modified to implement sysrq processing generically, in the same way the non-USB UART-based drivers implement it. This will allow other usb serial devices to implement sysrq handling as desired. The new usb serial functions are named and implemented similarly to the uart functions, as follows: usb_serial_handle_break <-> uart_handle_break usb_serial_handle_sysrq_char <-> uart_handle_sysrq_char Signed-off-by: Jason Wessel Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/generic.c | 56 +++++++++++++++++++++++++++++++++--- drivers/usb/serial/usb_debug.c | 39 +++++++++++++++++++++++++++++ include/linux/usb/serial.h | 8 ++++++ 3 files changed, 91 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index c919686521f..9fccc26235a 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -339,6 +339,7 @@ int usb_serial_generic_write(struct tty_struct *tty, /* no bulk out, so return 0 bytes written */ return 0; } +EXPORT_SYMBOL_GPL(usb_serial_generic_write); int usb_serial_generic_write_room(struct tty_struct *tty) { @@ -351,7 +352,9 @@ int usb_serial_generic_write_room(struct tty_struct *tty) spin_lock_irqsave(&port->lock, flags); if (serial->type->max_in_flight_urbs) { if (port->urbs_in_flight < serial->type->max_in_flight_urbs) - room = port->bulk_out_size; + room = port->bulk_out_size * + (serial->type->max_in_flight_urbs - + port->urbs_in_flight); } else if (serial->num_bulk_out && !(port->write_urb_busy)) { room = port->bulk_out_size; } @@ -385,7 +388,8 @@ int usb_serial_generic_chars_in_buffer(struct tty_struct *tty) } -static void resubmit_read_urb(struct usb_serial_port *port, gfp_t mem_flags) +void usb_serial_generic_resubmit_read_urb(struct usb_serial_port *port, + gfp_t mem_flags) { struct urb *urb = port->read_urb; struct usb_serial *serial = port->serial; @@ -406,25 +410,28 @@ static void resubmit_read_urb(struct usb_serial_port *port, gfp_t mem_flags) "%s - failed resubmitting read urb, error %d\n", __func__, result); } +EXPORT_SYMBOL_GPL(usb_serial_generic_resubmit_read_urb); /* Push data to tty layer and resubmit the bulk read URB */ static void flush_and_resubmit_read_urb(struct usb_serial_port *port) { struct urb *urb = port->read_urb; struct tty_struct *tty = tty_port_tty_get(&port->port); - int room; + char *ch = (char *)urb->transfer_buffer; + int i; + + if (!tty) + goto done; /* Push data to tty */ - if (tty && urb->actual_length) { - room = tty_buffer_request_room(tty, urb->actual_length); - if (room) { - tty_insert_flip_string(tty, urb->transfer_buffer, room); - tty_flip_buffer_push(tty); - } + for (i = 0; i < urb->actual_length; i++, ch++) { + if (!usb_serial_handle_sysrq_char(port, *ch)) + tty_insert_flip_char(tty, *ch, TTY_NORMAL); } + tty_flip_buffer_push(tty); tty_kref_put(tty); - - resubmit_read_urb(port, GFP_ATOMIC); +done: + usb_serial_generic_resubmit_read_urb(port, GFP_ATOMIC); } void usb_serial_generic_read_bulk_callback(struct urb *urb) @@ -515,10 +522,35 @@ void usb_serial_generic_unthrottle(struct tty_struct *tty) if (was_throttled) { /* Resume reading from device */ - resubmit_read_urb(port, GFP_KERNEL); + usb_serial_generic_resubmit_read_urb(port, GFP_KERNEL); } } +int
usb_serial_handle_sysrq_char(struct usb_serial_port *port, unsigned int ch) +{ + if (port->sysrq) { + if (ch && time_before(jiffies, port->sysrq)) { + handle_sysrq(ch, tty_port_tty_get(&port->port)); + port->sysrq = 0; + return 1; + } + port->sysrq = 0; + } + return 0; +} +EXPORT_SYMBOL_GPL(usb_serial_handle_sysrq_char); + +int usb_serial_handle_break(struct usb_serial_port *port) +{ + if (!port->sysrq) { + port->sysrq = jiffies + HZ*5; + return 1; + } + port->sysrq = 0; + return 0; +} +EXPORT_SYMBOL_GPL(usb_serial_handle_break); + void usb_serial_generic_shutdown(struct usb_serial *serial) { int i; diff --git a/drivers/usb/serial/usb_debug.c b/drivers/usb/serial/usb_debug.c index a9427a8b867..614800972dc 100644 --- a/drivers/usb/serial/usb_debug.c +++ b/drivers/usb/serial/usb_debug.c @@ -17,6 +17,17 @@ #define URB_DEBUG_MAX_IN_FLIGHT_URBS 4000 #define USB_DEBUG_MAX_PACKET_SIZE 8 +#define USB_DEBUG_BRK_SIZE 8 +static char USB_DEBUG_BRK[USB_DEBUG_BRK_SIZE] = { + 0x00, + 0xff, + 0x01, + 0xfe, + 0x00, + 0xfe, + 0x01, + 0xff, +}; static struct usb_device_id id_table [] = { { USB_DEVICE(0x0525, 0x127a) }, @@ -39,6 +50,32 @@ static int usb_debug_open(struct tty_struct *tty, struct usb_serial_port *port, return usb_serial_generic_open(tty, port, filp); } +/* This HW really does not support a serial break, so one will be + * emulated when ever the break state is set to true. + */ +static void usb_debug_break_ctl(struct tty_struct *tty, int break_state) +{ + struct usb_serial_port *port = tty->driver_data; + if (!break_state) + return; + usb_serial_generic_write(tty, port, USB_DEBUG_BRK, USB_DEBUG_BRK_SIZE); +} + +static void usb_debug_read_bulk_callback(struct urb *urb) +{ + struct usb_serial_port *port = urb->context; + + if (urb->actual_length == USB_DEBUG_BRK_SIZE && + memcmp(urb->transfer_buffer, USB_DEBUG_BRK, + USB_DEBUG_BRK_SIZE) == 0) { + usb_serial_handle_break(port); + usb_serial_generic_resubmit_read_urb(port, GFP_ATOMIC); + return; + } + + usb_serial_generic_read_bulk_callback(urb); +} + static struct usb_serial_driver debug_device = { .driver = { .owner = THIS_MODULE, @@ -48,6 +85,8 @@ static struct usb_serial_driver debug_device = { .num_ports = 1, .open = usb_debug_open, .max_in_flight_urbs = URB_DEBUG_MAX_IN_FLIGHT_URBS, + .break_ctl = usb_debug_break_ctl, + .read_bulk_callback = usb_debug_read_bulk_callback, }; static int __init debug_init(void) diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index e2938fd179e..e29ebcf3287 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -15,6 +15,7 @@ #include #include +#include #define SERIAL_TTY_MAJOR 188 /* Nice legal number now */ #define SERIAL_TTY_MINORS 254 /* loads of devices :) */ @@ -99,6 +100,7 @@ struct usb_serial_port { char throttled; char throttle_req; char console; + unsigned long sysrq; /* sysrq timeout */ struct device dev; }; #define to_usb_serial_port(d) container_of(d, struct usb_serial_port, dev) @@ -301,6 +303,12 @@ extern void usb_serial_generic_unthrottle(struct tty_struct *tty); extern void usb_serial_generic_shutdown(struct usb_serial *serial); extern int usb_serial_generic_register(int debug); extern void usb_serial_generic_deregister(void); +extern void usb_serial_generic_resubmit_read_urb(struct usb_serial_port *port, + gfp_t mem_flags); +extern int usb_serial_handle_sysrq_char(struct usb_serial_port *port, + unsigned int ch); +extern int usb_serial_handle_break(struct usb_serial_port *port); + extern int usb_serial_bus_register(struct usb_serial_driver *device); extern 
void usb_serial_bus_deregister(struct usb_serial_driver *device); -- cgit v1.2.3-70-g09d2 From 5effabbe9e6e0089f7afdde35cb51e8c8b4cf6bc Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 26 May 2009 18:24:34 +0900 Subject: USB: r8a66597-hcd: use platform_data instead of module_param CPU/board specific parameters (PLL clock, vif etc...) can be set by platform_data instead of module_param. v2: remove irq_sense member in platform_data because it can OR in IRQF_TRIGGER_LOW or IRQF_TRIGGER_FALLING against IORESOURCE_IRQ in the struct resource. Signed-off-by: Yoshihiro Shimoda Reviewed-by: Paul Mundt Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/r8a66597-hcd.c | 60 ++++++++++------------------------------- drivers/usb/host/r8a66597.h | 38 +++++++++++++++++++++++--- include/linux/usb/r8a66597.h | 44 ++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 50 deletions(-) create mode 100644 include/linux/usb/r8a66597.h (limited to 'include/linux') diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c index 3e1216ad86b..56976cc0352 100644 --- a/drivers/usb/host/r8a66597-hcd.c +++ b/drivers/usb/host/r8a66597-hcd.c @@ -46,31 +46,10 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yoshihiro Shimoda"); MODULE_ALIAS("platform:r8a66597_hcd"); -#define DRIVER_VERSION "10 Apr 2008" +#define DRIVER_VERSION "2009-05-26" static const char hcd_name[] = "r8a66597_hcd"; -/* module parameters */ -#if !defined(CONFIG_SUPERH_ON_CHIP_R8A66597) -static unsigned short clock = XTAL12; -module_param(clock, ushort, 0644); -MODULE_PARM_DESC(clock, "input clock: 48MHz=32768, 24MHz=16384, 12MHz=0 " - "(default=0)"); -#endif - -static unsigned short vif = LDRV; -module_param(vif, ushort, 0644); -MODULE_PARM_DESC(vif, "input VIF: 3.3V=32768, 1.5V=0(default=32768)"); - -static unsigned short endian; -module_param(endian, ushort, 0644); -MODULE_PARM_DESC(endian, "data endian: big=256, little=0 (default=0)"); - -static unsigned short irq_sense = 0xff; -module_param(irq_sense, ushort, 0644); -MODULE_PARM_DESC(irq_sense, "IRQ sense: low level=32, falling edge=0 " - "(default=32)"); - static void packet_write(struct r8a66597 *r8a66597, u16 pipenum); static int r8a66597_get_frame(struct usb_hcd *hcd); @@ -136,7 +115,8 @@ static int r8a66597_clock_enable(struct r8a66597 *r8a66597) } } while ((tmp & USBE) != USBE); r8a66597_bclr(r8a66597, USBE, SYSCFG0); - r8a66597_mdfy(r8a66597, clock, XTAL, SYSCFG0); + r8a66597_mdfy(r8a66597, get_xtal_from_pdata(r8a66597->pdata), XTAL, + SYSCFG0); i = 0; r8a66597_bset(r8a66597, XCKE, SYSCFG0); @@ -203,6 +183,9 @@ static void r8a66597_disable_port(struct r8a66597 *r8a66597, int port) static int enable_controller(struct r8a66597 *r8a66597) { int ret, port; + u16 vif = r8a66597->pdata->vif ? LDRV : 0; + u16 irq_sense = r8a66597->irq_sense_low ? INTL : 0; + u16 endian = r8a66597->pdata->endian ? 
BIGEND : 0; ret = r8a66597_clock_enable(r8a66597); if (ret < 0) @@ -2418,6 +2401,12 @@ static int __devinit r8a66597_probe(struct platform_device *pdev) goto clean_up; } + if (pdev->dev.platform_data == NULL) { + dev_err(&pdev->dev, "no platform data\n"); + ret = -ENODEV; + goto clean_up; + } + /* initialize hcd */ hcd = usb_create_hcd(&r8a66597_hc_driver, &pdev->dev, (char *)hcd_name); if (!hcd) { @@ -2428,6 +2417,8 @@ static int __devinit r8a66597_probe(struct platform_device *pdev) r8a66597 = hcd_to_r8a66597(hcd); memset(r8a66597, 0, sizeof(struct r8a66597)); dev_set_drvdata(&pdev->dev, r8a66597); + r8a66597->pdata = pdev->dev.platform_data; + r8a66597->irq_sense_low = irq_trigger == IRQF_TRIGGER_LOW; #if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK) snprintf(clk_name, sizeof(clk_name), "usb%d", pdev->id); @@ -2458,29 +2449,6 @@ static int __devinit r8a66597_probe(struct platform_device *pdev) hcd->rsrc_start = res->start; - /* irq_sense setting on cmdline takes precedence over resource - * settings, so the introduction of irqflags in IRQ resourse - * won't disturb existing setups */ - switch (irq_sense) { - case INTL: - irq_trigger = IRQF_TRIGGER_LOW; - break; - case 0: - irq_trigger = IRQF_TRIGGER_FALLING; - break; - case 0xff: - if (irq_trigger) - irq_sense = (irq_trigger & IRQF_TRIGGER_LOW) ? - INTL : 0; - else { - irq_sense = INTL; - irq_trigger = IRQF_TRIGGER_LOW; - } - break; - default: - dev_err(&pdev->dev, "Unknown irq_sense value.\n"); - } - ret = usb_add_hcd(hcd, irq, IRQF_DISABLED | irq_trigger); if (ret != 0) { dev_err(&pdev->dev, "Failed to add hcd\n"); diff --git a/drivers/usb/host/r8a66597.h b/drivers/usb/host/r8a66597.h index f49208f1bb7..d72680b433f 100644 --- a/drivers/usb/host/r8a66597.h +++ b/drivers/usb/host/r8a66597.h @@ -30,6 +30,8 @@ #include #endif +#include + #define SYSCFG0 0x00 #define SYSCFG1 0x02 #define SYSSTS0 0x04 @@ -488,6 +490,7 @@ struct r8a66597 { #if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK) struct clk *clk; #endif + struct r8a66597_platdata *pdata; struct r8a66597_device device0; struct r8a66597_root_hub root_hub[R8A66597_MAX_ROOT_HUB]; struct list_head pipe_queue[R8A66597_MAX_NUM_PIPE]; @@ -506,6 +509,7 @@ struct r8a66597 { unsigned long child_connect_map[4]; unsigned bus_suspended:1; + unsigned irq_sense_low:1; }; static inline struct r8a66597 *hcd_to_r8a66597(struct usb_hcd *hcd) @@ -660,10 +664,36 @@ static inline void r8a66597_port_power(struct r8a66597 *r8a66597, int port, { unsigned long dvstctr_reg = get_dvstctr_reg(port); - if (power) - r8a66597_bset(r8a66597, VBOUT, dvstctr_reg); - else - r8a66597_bclr(r8a66597, VBOUT, dvstctr_reg); + if (r8a66597->pdata->port_power) { + r8a66597->pdata->port_power(port, power); + } else { + if (power) + r8a66597_bset(r8a66597, VBOUT, dvstctr_reg); + else + r8a66597_bclr(r8a66597, VBOUT, dvstctr_reg); + } +} + +static inline u16 get_xtal_from_pdata(struct r8a66597_platdata *pdata) +{ + u16 clock = 0; + + switch (pdata->xtal) { + case R8A66597_PLATDATA_XTAL_12MHZ: + clock = XTAL12; + break; + case R8A66597_PLATDATA_XTAL_24MHZ: + clock = XTAL24; + break; + case R8A66597_PLATDATA_XTAL_48MHZ: + clock = XTAL48; + break; + default: + printk(KERN_ERR "r8a66597: platdata clock is wrong.\n"); + break; + } + + return clock; } #define get_pipectr_addr(pipenum) (PIPE1CTR + (pipenum - 1) * 2) diff --git a/include/linux/usb/r8a66597.h b/include/linux/usb/r8a66597.h new file mode 100644 index 00000000000..e9f0384fa20 --- /dev/null +++ b/include/linux/usb/r8a66597.h @@ 
-0,0 +1,44 @@ +/* + * R8A66597 driver platform data + * + * Copyright (C) 2009 Renesas Solutions Corp. + * + * Author : Yoshihiro Shimoda + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef __LINUX_USB_R8A66597_H +#define __LINUX_USB_R8A66597_H + +#define R8A66597_PLATDATA_XTAL_12MHZ 0x01 +#define R8A66597_PLATDATA_XTAL_24MHZ 0x02 +#define R8A66597_PLATDATA_XTAL_48MHZ 0x03 + +struct r8a66597_platdata { + /* This ops can controll port power instead of DVSTCTR register. */ + void (*port_power)(int port, int power); + + /* (external controller only) set R8A66597_PLATDATA_XTAL_nnMHZ */ + unsigned xtal:2; + + /* set one = 3.3V, set zero = 1.5V */ + unsigned vif:1; + + /* set one = big endian, set zero = little endian */ + unsigned endian:1; +}; +#endif + -- cgit v1.2.3-70-g09d2 From c47d7b09891abb4f8b222317c89c7469bed8db3a Mon Sep 17 00:00:00 2001 From: Bryan Wu Date: Wed, 3 Jun 2009 09:17:57 -0400 Subject: USB: audio: add USB audio class definitions Signed-off-by: Bryan Wu Signed-off-by: Mike Frysinger Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/audio.h | 265 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 263 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h index 8cb025fef63..b5744bc218a 100644 --- a/include/linux/usb/audio.h +++ b/include/linux/usb/audio.h @@ -24,10 +24,75 @@ #define USB_SUBCLASS_AUDIOCONTROL 0x01 #define USB_SUBCLASS_AUDIOSTREAMING 0x02 #define USB_SUBCLASS_MIDISTREAMING 0x03 +#define USB_SUBCLASS_VENDOR_SPEC 0xff +/* A.5 Audio Class-Specific AC interface Descriptor Subtypes*/ +#define HEADER 0x01 +#define INPUT_TERMINAL 0x02 +#define OUTPUT_TERMINAL 0x03 +#define MIXER_UNIT 0x04 +#define SELECTOR_UNIT 0x05 +#define FEATURE_UNIT 0x06 +#define PROCESSING_UNIT 0x07 +#define EXTENSION_UNIT 0x08 + +#define AS_GENERAL 0x01 +#define FORMAT_TYPE 0x02 +#define FORMAT_SPECIFIC 0x03 + +#define EP_GENERAL 0x01 + +#define MS_GENERAL 0x01 +#define MIDI_IN_JACK 0x02 +#define MIDI_OUT_JACK 0x03 + +/* endpoint attributes */ +#define EP_ATTR_MASK 0x0c +#define EP_ATTR_ASYNC 0x04 +#define EP_ATTR_ADAPTIVE 0x08 +#define EP_ATTR_SYNC 0x0c + +/* cs endpoint attributes */ +#define EP_CS_ATTR_SAMPLE_RATE 0x01 +#define EP_CS_ATTR_PITCH_CONTROL 0x02 +#define EP_CS_ATTR_FILL_MAX 0x80 + +/* Audio Class specific Request Codes */ +#define USB_AUDIO_SET_INTF 0x21 +#define USB_AUDIO_SET_ENDPOINT 0x22 +#define USB_AUDIO_GET_INTF 0xa1 +#define USB_AUDIO_GET_ENDPOINT 0xa2 + +#define SET_ 0x00 +#define GET_ 0x80 + +#define _CUR 0x1 +#define _MIN 0x2 +#define _MAX 0x3 +#define _RES 0x4 +#define _MEM 0x5 + +#define SET_CUR (SET_ | _CUR) +#define GET_CUR (GET_ | _CUR) +#define SET_MIN (SET_ | _MIN) +#define GET_MIN (GET_ | _MIN) +#define SET_MAX (SET_ | _MAX) +#define GET_MAX (GET_ | _MAX) +#define SET_RES (SET_ | _RES) +#define GET_RES (GET_ | _RES) +#define SET_MEM (SET_ | 
_MEM) +#define GET_MEM (GET_ | _MEM) + +#define GET_STAT 0xff + +#define USB_AC_TERMINAL_UNDEFINED 0x100 +#define USB_AC_TERMINAL_STREAMING 0x101 +#define USB_AC_TERMINAL_VENDOR_SPEC 0x1FF + +/* Terminal Control Selectors */ /* 4.3.2 Class-Specific AC Interface Descriptor */ struct usb_ac_header_descriptor { - __u8 bLength; /* 8+n */ + __u8 bLength; /* 8 + n */ __u8 bDescriptorType; /* USB_DT_CS_INTERFACE */ __u8 bDescriptorSubtype; /* USB_MS_HEADER */ __le16 bcdADC; /* 0x0100 */ @@ -36,7 +101,7 @@ struct usb_ac_header_descriptor { __u8 baInterfaceNr[]; /* [n] */ } __attribute__ ((packed)); -#define USB_DT_AC_HEADER_SIZE(n) (8+(n)) +#define USB_DT_AC_HEADER_SIZE(n) (8 + (n)) /* As above, but more useful for defining your own descriptors: */ #define DECLARE_USB_AC_HEADER_DESCRIPTOR(n) \ @@ -50,4 +115,200 @@ struct usb_ac_header_descriptor_##n { \ __u8 baInterfaceNr[n]; \ } __attribute__ ((packed)) +/* 4.3.2.1 Input Terminal Descriptor */ +struct usb_input_terminal_descriptor { + __u8 bLength; /* in bytes: 12 */ + __u8 bDescriptorType; /* CS_INTERFACE descriptor type */ + __u8 bDescriptorSubtype; /* INPUT_TERMINAL descriptor subtype */ + __u8 bTerminalID; /* Constant uniquely terminal ID */ + __le16 wTerminalType; /* USB Audio Terminal Types */ + __u8 bAssocTerminal; /* ID of the Output Terminal associated */ + __u8 bNrChannels; /* Number of logical output channels */ + __le16 wChannelConfig; + __u8 iChannelNames; + __u8 iTerminal; +} __attribute__ ((packed)); + +#define USB_DT_AC_INPUT_TERMINAL_SIZE 12 + +#define USB_AC_INPUT_TERMINAL_UNDEFINED 0x200 +#define USB_AC_INPUT_TERMINAL_MICROPHONE 0x201 +#define USB_AC_INPUT_TERMINAL_DESKTOP_MICROPHONE 0x202 +#define USB_AC_INPUT_TERMINAL_PERSONAL_MICROPHONE 0x203 +#define USB_AC_INPUT_TERMINAL_OMNI_DIR_MICROPHONE 0x204 +#define USB_AC_INPUT_TERMINAL_MICROPHONE_ARRAY 0x205 +#define USB_AC_INPUT_TERMINAL_PROC_MICROPHONE_ARRAY 0x206 + +/* 4.3.2.2 Output Terminal Descriptor */ +struct usb_output_terminal_descriptor { + __u8 bLength; /* in bytes: 9 */ + __u8 bDescriptorType; /* CS_INTERFACE descriptor type */ + __u8 bDescriptorSubtype; /* OUTPUT_TERMINAL descriptor subtype */ + __u8 bTerminalID; /* Constant uniquely terminal ID */ + __le16 wTerminalType; /* USB Audio Terminal Types */ + __u8 bAssocTerminal; /* ID of the Input Terminal associated */ + __u8 bSourceID; /* ID of the connected Unit or Terminal*/ + __u8 iTerminal; +} __attribute__ ((packed)); + +#define USB_DT_AC_OUTPUT_TERMINAL_SIZE 9 + +#define USB_AC_OUTPUT_TERMINAL_UNDEFINED 0x300 +#define USB_AC_OUTPUT_TERMINAL_SPEAKER 0x301 +#define USB_AC_OUTPUT_TERMINAL_HEADPHONES 0x302 +#define USB_AC_OUTPUT_TERMINAL_HEAD_MOUNTED_DISPLAY_AUDIO 0x303 +#define USB_AC_OUTPUT_TERMINAL_DESKTOP_SPEAKER 0x304 +#define USB_AC_OUTPUT_TERMINAL_ROOM_SPEAKER 0x305 +#define USB_AC_OUTPUT_TERMINAL_COMMUNICATION_SPEAKER 0x306 +#define USB_AC_OUTPUT_TERMINAL_LOW_FREQ_EFFECTS_SPEAKER 0x307 + +/* Set bControlSize = 2 as default setting */ +#define USB_DT_AC_FEATURE_UNIT_SIZE(ch) (7 + ((ch) + 1) * 2) + +/* As above, but more useful for defining your own descriptors: */ +#define DECLARE_USB_AC_FEATURE_UNIT_DESCRIPTOR(ch) \ +struct usb_ac_feature_unit_descriptor_##ch { \ + __u8 bLength; \ + __u8 bDescriptorType; \ + __u8 bDescriptorSubtype; \ + __u8 bUnitID; \ + __u8 bSourceID; \ + __u8 bControlSize; \ + __le16 bmaControls[ch + 1]; \ + __u8 iFeature; \ +} __attribute__ ((packed)) + +/* 4.5.2 Class-Specific AS Interface Descriptor */ +struct usb_as_header_descriptor { + __u8 bLength; /* in bytes: 7 */ + __u8 
bDescriptorType; /* USB_DT_CS_INTERFACE */ + __u8 bDescriptorSubtype; /* AS_GENERAL */ + __u8 bTerminalLink; /* Terminal ID of connected Terminal */ + __u8 bDelay; /* Delay introduced by the data path */ + __le16 wFormatTag; /* The Audio Data Format */ +} __attribute__ ((packed)); + +#define USB_DT_AS_HEADER_SIZE 7 + +#define USB_AS_AUDIO_FORMAT_TYPE_I_UNDEFINED 0x0 +#define USB_AS_AUDIO_FORMAT_TYPE_I_PCM 0x1 +#define USB_AS_AUDIO_FORMAT_TYPE_I_PCM8 0x2 +#define USB_AS_AUDIO_FORMAT_TYPE_I_IEEE_FLOAT 0x3 +#define USB_AS_AUDIO_FORMAT_TYPE_I_ALAW 0x4 +#define USB_AS_AUDIO_FORMAT_TYPE_I_MULAW 0x5 + +struct usb_as_format_type_i_continuous_descriptor { + __u8 bLength; /* in bytes: 8 + (ns * 3) */ + __u8 bDescriptorType; /* USB_DT_CS_INTERFACE */ + __u8 bDescriptorSubtype; /* FORMAT_TYPE */ + __u8 bFormatType; /* FORMAT_TYPE_1 */ + __u8 bNrChannels; /* physical channels in the stream */ + __u8 bSubframeSize; /* */ + __u8 bBitResolution; + __u8 bSamFreqType; + __u8 tLowerSamFreq[3]; + __u8 tUpperSamFreq[3]; +} __attribute__ ((packed)); + +#define USB_AS_FORMAT_TYPE_I_CONTINUOUS_DESC_SIZE 14 + +struct usb_as_formate_type_i_discrete_descriptor { + __u8 bLength; /* in bytes: 8 + (ns * 3) */ + __u8 bDescriptorType; /* USB_DT_CS_INTERFACE */ + __u8 bDescriptorSubtype; /* FORMAT_TYPE */ + __u8 bFormatType; /* FORMAT_TYPE_1 */ + __u8 bNrChannels; /* physical channels in the stream */ + __u8 bSubframeSize; /* */ + __u8 bBitResolution; + __u8 bSamFreqType; + __u8 tSamFreq[][3]; +} __attribute__ ((packed)); + +#define DECLARE_USB_AS_FORMAT_TYPE_I_DISCRETE_DESC(n) \ +struct usb_as_formate_type_i_discrete_descriptor_##n { \ + __u8 bLength; \ + __u8 bDescriptorType; \ + __u8 bDescriptorSubtype; \ + __u8 bFormatType; \ + __u8 bNrChannels; \ + __u8 bSubframeSize; \ + __u8 bBitResolution; \ + __u8 bSamFreqType; \ + __u8 tSamFreq[n][3]; \ +} __attribute__ ((packed)) + +#define USB_AS_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(n) (8 + (n * 3)) + +#define USB_AS_FORMAT_TYPE_UNDEFINED 0x0 +#define USB_AS_FORMAT_TYPE_I 0x1 +#define USB_AS_FORMAT_TYPE_II 0x2 +#define USB_AS_FORMAT_TYPE_III 0x3 + +#define USB_AS_ENDPOINT_ASYNC (1 << 2) +#define USB_AS_ENDPOINT_ADAPTIVE (2 << 2) +#define USB_AS_ENDPOINT_SYNC (3 << 2) + +struct usb_as_iso_endpoint_descriptor { + __u8 bLength; /* in bytes: 7 */ + __u8 bDescriptorType; /* USB_DT_CS_ENDPOINT */ + __u8 bDescriptorSubtype; /* EP_GENERAL */ + __u8 bmAttributes; + __u8 bLockDelayUnits; + __le16 wLockDelay; +}; +#define USB_AS_ISO_ENDPOINT_DESC_SIZE 7 + +#define FU_CONTROL_UNDEFINED 0x00 +#define MUTE_CONTROL 0x01 +#define VOLUME_CONTROL 0x02 +#define BASS_CONTROL 0x03 +#define MID_CONTROL 0x04 +#define TREBLE_CONTROL 0x05 +#define GRAPHIC_EQUALIZER_CONTROL 0x06 +#define AUTOMATIC_GAIN_CONTROL 0x07 +#define DELAY_CONTROL 0x08 +#define BASS_BOOST_CONTROL 0x09 +#define LOUDNESS_CONTROL 0x0a + +#define FU_MUTE (1 << (MUTE_CONTROL - 1)) +#define FU_VOLUME (1 << (VOLUME_CONTROL - 1)) +#define FU_BASS (1 << (BASS_CONTROL - 1)) +#define FU_MID (1 << (MID_CONTROL - 1)) +#define FU_TREBLE (1 << (TREBLE_CONTROL - 1)) +#define FU_GRAPHIC_EQ (1 << (GRAPHIC_EQUALIZER_CONTROL - 1)) +#define FU_AUTO_GAIN (1 << (AUTOMATIC_GAIN_CONTROL - 1)) +#define FU_DELAY (1 << (DELAY_CONTROL - 1)) +#define FU_BASS_BOOST (1 << (BASS_BOOST_CONTROL - 1)) +#define FU_LOUDNESS (1 << (LOUDNESS_CONTROL - 1)) + +struct usb_audio_control { + struct list_head list; + const char *name; + u8 type; + int data[5]; + int (*set)(struct usb_audio_control *con, u8 cmd, int value); + int (*get)(struct usb_audio_control *con, u8 cmd); 
+}; + +static inline int generic_set_cmd(struct usb_audio_control *con, u8 cmd, int value) +{ + con->data[cmd] = value; + + return 0; +} + +static inline int generic_get_cmd(struct usb_audio_control *con, u8 cmd) +{ + return con->data[cmd]; +} + +struct usb_audio_control_selector { + struct list_head list; + struct list_head control; + u8 id; + const char *name; + u8 type; + struct usb_descriptor_header *desc; +}; + #endif /* __LINUX_USB_AUDIO_H */ -- cgit v1.2.3-70-g09d2 From c706ebdfc8955b850e477255a8c0f93f9f14712d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 2 Jun 2009 11:54:11 -0400 Subject: USB: usb-serial: call port_probe and port_remove at the right times This patch (as1253) prevents the usb-serial core from calling a driver's port_probe and port_remove methods more than once per port. It also removes some unnecessary try_module_get() calls and adds a missing port_remove method call in a failure path. Signed-off-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/bus.c | 27 +++++++++++---------------- drivers/usb/serial/usb-serial.c | 24 ++++++++++++++++++++++-- include/linux/usb/serial.h | 8 ++++++++ 3 files changed, 41 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/bus.c b/drivers/usb/serial/bus.c index 83bbb5bca2e..ba555c528cc 100644 --- a/drivers/usb/serial/bus.c +++ b/drivers/usb/serial/bus.c @@ -59,23 +59,22 @@ static int usb_serial_device_probe(struct device *dev) retval = -ENODEV; goto exit; } + if (port->dev_state != PORT_REGISTERING) + goto exit; driver = port->serial->type; if (driver->port_probe) { - if (!try_module_get(driver->driver.owner)) { - dev_err(dev, "module get failed, exiting\n"); - retval = -EIO; - goto exit; - } retval = driver->port_probe(port); - module_put(driver->driver.owner); if (retval) goto exit; } retval = device_create_file(dev, &dev_attr_port_number); - if (retval) + if (retval) { + if (driver->port_remove) + retval = driver->port_remove(port); goto exit; + } minor = port->number; tty_register_device(usb_serial_tty_driver, minor, dev); @@ -98,19 +97,15 @@ static int usb_serial_device_remove(struct device *dev) if (!port) return -ENODEV; + if (port->dev_state != PORT_UNREGISTERING) + return retval; + device_remove_file(&port->dev, &dev_attr_port_number); driver = port->serial->type; - if (driver->port_remove) { - if (!try_module_get(driver->driver.owner)) { - dev_err(dev, "module get failed, exiting\n"); - retval = -EIO; - goto exit; - } + if (driver->port_remove) retval = driver->port_remove(port); - module_put(driver->driver.owner); - } -exit: + minor = port->number; tty_unregister_device(usb_serial_tty_driver, minor); dev_info(dev, "%s converter now disconnected from ttyUSB%d\n", diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 1967a7edc10..da890f030fa 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -1046,10 +1046,15 @@ int usb_serial_probe(struct usb_interface *interface, dev_set_name(&port->dev, "ttyUSB%d", port->number); dbg ("%s - registering %s", __func__, dev_name(&port->dev)); + port->dev_state = PORT_REGISTERING; retval = device_register(&port->dev); - if (retval) + if (retval) { dev_err(&port->dev, "Error registering port device, " "continuing\n"); + port->dev_state = PORT_UNREGISTERED; + } else { + port->dev_state = PORT_REGISTERED; + } } usb_serial_console_init(debug, minor); @@ -1130,7 +1135,22 @@ void usb_serial_disconnect(struct usb_interface *interface) } kill_traffic(port); 
cancel_work_sync(&port->work); - device_del(&port->dev); + if (port->dev_state == PORT_REGISTERED) { + + /* Make sure the port is bound so that the + * driver's port_remove method is called. + */ + if (!port->dev.driver) { + int rc; + + port->dev.driver = + &serial->type->driver; + rc = device_bind_driver(&port->dev); + } + port->dev_state = PORT_UNREGISTERING; + device_del(&port->dev); + port->dev_state = PORT_UNREGISTERED; + } } } serial->type->shutdown(serial); diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index e29ebcf3287..ed4aa0fa7ed 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -27,6 +27,13 @@ /* parity check flag */ #define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) +enum port_dev_state { + PORT_UNREGISTERED, + PORT_REGISTERING, + PORT_REGISTERED, + PORT_UNREGISTERING, +}; + /** * usb_serial_port: structure for the specific ports of a device. * @serial: pointer back to the struct usb_serial owner of this port. @@ -102,6 +109,7 @@ struct usb_serial_port { char console; unsigned long sysrq; /* sysrq timeout */ struct device dev; + enum port_dev_state dev_state; }; #define to_usb_serial_port(d) container_of(d, struct usb_serial_port, dev) -- cgit v1.2.3-70-g09d2 From f9c99bb8b3a1ec81af68d484a551307326c2e933 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 2 Jun 2009 11:53:55 -0400 Subject: USB: usb-serial: replace shutdown with disconnect, release This patch (as1254) splits up the shutdown method of usb_serial_driver into a disconnect and a release method. The problem is that the usb-serial core was calling shutdown during disconnect handling, but drivers didn't expect it to be called until after all the open file references had been closed. The result was an oops when the close method tried to use memory that had been deallocated by shutdown. 
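As a rough illustration of the new convention (this sketch is not taken from the patch itself; the example_* names are invented), a converted driver kills its URBs in its disconnect method, while per-port memory is freed only in release:

	#include <linux/usb/serial.h>	/* struct usb_serial_driver and helpers */

	static void example_disconnect(struct usb_serial *serial)
	{
		int i;

		/* Quiesce I/O: the device is gone, but ttys may still be open. */
		for (i = 0; i < serial->num_ports; ++i)
			usb_kill_urb(serial->port[i]->interrupt_in_urb);
	}

	static void example_release(struct usb_serial *serial)
	{
		int i;

		/* All file references are closed now; freeing memory is safe. */
		for (i = 0; i < serial->num_ports; ++i)
			kfree(usb_get_serial_port_data(serial->port[i]));
	}

	static struct usb_serial_driver example_device = {
		/* ... */
		.disconnect	= example_disconnect,
		.release	= example_release,
	};

The core calls release only when the last reference to the usb_serial structure is dropped, so by then every open tty on the device has been closed.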
Signed-off-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/staging/uc2322/aten2011.c | 4 ++-- drivers/usb/serial/aircable.c | 5 ++--- drivers/usb/serial/belkin_sa.c | 7 +++--- drivers/usb/serial/cp210x.c | 6 ++--- drivers/usb/serial/cyberjack.c | 20 ++++++++++++----- drivers/usb/serial/cypress_m8.c | 11 +++++---- drivers/usb/serial/digi_acceleport.c | 20 ++++++++++++----- drivers/usb/serial/empeg.c | 8 ------- drivers/usb/serial/ftdi_sio.c | 14 ------------ drivers/usb/serial/garmin_gps.c | 16 ++++++++++--- drivers/usb/serial/generic.c | 9 ++++++-- drivers/usb/serial/io_edgeport.c | 29 ++++++++++++++++-------- drivers/usb/serial/io_tables.h | 12 ++++++---- drivers/usb/serial/io_ti.c | 22 +++++++++++++----- drivers/usb/serial/ipaq.c | 7 ------ drivers/usb/serial/iuu_phoenix.c | 6 ++--- drivers/usb/serial/keyspan.c | 13 ++++++++++- drivers/usb/serial/keyspan.h | 12 ++++++---- drivers/usb/serial/keyspan_pda.c | 4 ++-- drivers/usb/serial/kl5kusb105.c | 39 ++++++++++++++++++-------------- drivers/usb/serial/kobil_sct.c | 12 ++++------ drivers/usb/serial/mct_u232.c | 13 +++++------ drivers/usb/serial/mos7720.c | 9 +++----- drivers/usb/serial/mos7840.c | 42 ++++++++++++++++++++++++++++++----- drivers/usb/serial/omninet.c | 19 ++++++++++++---- drivers/usb/serial/opticon.c | 14 +++++++++--- drivers/usb/serial/option.c | 17 +++++++++----- drivers/usb/serial/oti6858.c | 7 +++--- drivers/usb/serial/pl2303.c | 5 ++--- drivers/usb/serial/sierra.c | 4 ++-- drivers/usb/serial/spcp8x5.c | 5 ++--- drivers/usb/serial/symbolserial.c | 14 +++++++++--- drivers/usb/serial/ti_usb_3410_5052.c | 10 ++++----- drivers/usb/serial/usb-serial.c | 29 +++++++++++------------- drivers/usb/serial/visor.c | 13 +++++------ drivers/usb/serial/whiteheat.c | 6 ++--- include/linux/usb/serial.h | 12 ++++++---- 37 files changed, 297 insertions(+), 198 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/uc2322/aten2011.c b/drivers/staging/uc2322/aten2011.c index 9c62f787cc9..39d0926d1a9 100644 --- a/drivers/staging/uc2322/aten2011.c +++ b/drivers/staging/uc2322/aten2011.c @@ -2336,7 +2336,7 @@ static int ATEN2011_startup(struct usb_serial *serial) return 0; } -static void ATEN2011_shutdown(struct usb_serial *serial) +static void ATEN2011_release(struct usb_serial *serial) { int i; struct ATENINTL_port *ATEN2011_port; @@ -2382,7 +2382,7 @@ static struct usb_serial_driver aten_serial_driver = { .tiocmget = ATEN2011_tiocmget, .tiocmset = ATEN2011_tiocmset, .attach = ATEN2011_startup, - .shutdown = ATEN2011_shutdown, + .release = ATEN2011_release, .read_bulk_callback = ATEN2011_bulk_in_callback, .read_int_callback = ATEN2011_interrupt_callback, }; diff --git a/drivers/usb/serial/aircable.c b/drivers/usb/serial/aircable.c index 6d106e74265..2cbfab3716e 100644 --- a/drivers/usb/serial/aircable.c +++ b/drivers/usb/serial/aircable.c @@ -364,7 +364,7 @@ static int aircable_attach(struct usb_serial *serial) return 0; } -static void aircable_shutdown(struct usb_serial *serial) +static void aircable_release(struct usb_serial *serial) { struct usb_serial_port *port = serial->port[0]; @@ -375,7 +375,6 @@ static void aircable_shutdown(struct usb_serial *serial) if (priv) { serial_buf_free(priv->tx_buf); serial_buf_free(priv->rx_buf); - usb_set_serial_port_data(port, NULL); kfree(priv); } } @@ -601,7 +600,7 @@ static struct usb_serial_driver aircable_device = { .num_ports = 1, .attach = aircable_attach, .probe = aircable_probe, - .shutdown = aircable_shutdown, + .release = aircable_release, .write = aircable_write, 
.write_room = aircable_write_room, .write_bulk_callback = aircable_write_bulk_callback, diff --git a/drivers/usb/serial/belkin_sa.c b/drivers/usb/serial/belkin_sa.c index 2bfd6dd85b5..7033b031b44 100644 --- a/drivers/usb/serial/belkin_sa.c +++ b/drivers/usb/serial/belkin_sa.c @@ -90,7 +90,7 @@ static int debug; /* function prototypes for a Belkin USB Serial Adapter F5U103 */ static int belkin_sa_startup(struct usb_serial *serial); -static void belkin_sa_shutdown(struct usb_serial *serial); +static void belkin_sa_release(struct usb_serial *serial); static int belkin_sa_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); static void belkin_sa_close(struct usb_serial_port *port); @@ -142,7 +142,7 @@ static struct usb_serial_driver belkin_device = { .tiocmget = belkin_sa_tiocmget, .tiocmset = belkin_sa_tiocmset, .attach = belkin_sa_startup, - .shutdown = belkin_sa_shutdown, + .release = belkin_sa_release, }; @@ -197,14 +197,13 @@ static int belkin_sa_startup(struct usb_serial *serial) } -static void belkin_sa_shutdown(struct usb_serial *serial) +static void belkin_sa_release(struct usb_serial *serial) { struct belkin_sa_private *priv; int i; dbg("%s", __func__); - /* stop reads and writes on all ports */ for (i = 0; i < serial->num_ports; ++i) { /* My special items, the standard routines free my urbs */ priv = usb_get_serial_port_data(serial->port[i]); diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 16a154d3b2f..2b9eeda62bf 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -50,7 +50,7 @@ static int cp210x_tiocmset_port(struct usb_serial_port *port, struct file *, unsigned int, unsigned int); static void cp210x_break_ctl(struct tty_struct *, int); static int cp210x_startup(struct usb_serial *); -static void cp210x_shutdown(struct usb_serial *); +static void cp210x_disconnect(struct usb_serial *); static int debug; @@ -137,7 +137,7 @@ static struct usb_serial_driver cp210x_device = { .tiocmget = cp210x_tiocmget, .tiocmset = cp210x_tiocmset, .attach = cp210x_startup, - .shutdown = cp210x_shutdown, + .disconnect = cp210x_disconnect, }; /* Config request types */ @@ -792,7 +792,7 @@ static int cp210x_startup(struct usb_serial *serial) return 0; } -static void cp210x_shutdown(struct usb_serial *serial) +static void cp210x_disconnect(struct usb_serial *serial) { int i; diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c index 933ba913e66..336523fd736 100644 --- a/drivers/usb/serial/cyberjack.c +++ b/drivers/usb/serial/cyberjack.c @@ -58,7 +58,8 @@ static int debug; /* Function prototypes */ static int cyberjack_startup(struct usb_serial *serial); -static void cyberjack_shutdown(struct usb_serial *serial); +static void cyberjack_disconnect(struct usb_serial *serial); +static void cyberjack_release(struct usb_serial *serial); static int cyberjack_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); static void cyberjack_close(struct usb_serial_port *port); @@ -94,7 +95,8 @@ static struct usb_serial_driver cyberjack_device = { .id_table = id_table, .num_ports = 1, .attach = cyberjack_startup, - .shutdown = cyberjack_shutdown, + .disconnect = cyberjack_disconnect, + .release = cyberjack_release, .open = cyberjack_open, .close = cyberjack_close, .write = cyberjack_write, @@ -148,17 +150,25 @@ static int cyberjack_startup(struct usb_serial *serial) return 0; } -static void cyberjack_shutdown(struct usb_serial *serial) +static void cyberjack_disconnect(struct usb_serial 
*serial) { int i; dbg("%s", __func__); - for (i = 0; i < serial->num_ports; ++i) { + for (i = 0; i < serial->num_ports; ++i) usb_kill_urb(serial->port[i]->interrupt_in_urb); +} + +static void cyberjack_release(struct usb_serial *serial) +{ + int i; + + dbg("%s", __func__); + + for (i = 0; i < serial->num_ports; ++i) { /* My special items, the standard routines free my urbs */ kfree(usb_get_serial_port_data(serial->port[i])); - usb_set_serial_port_data(serial->port[i], NULL); } } diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c index 669f9384853..9734085fd2f 100644 --- a/drivers/usb/serial/cypress_m8.c +++ b/drivers/usb/serial/cypress_m8.c @@ -171,7 +171,7 @@ struct cypress_buf { static int cypress_earthmate_startup(struct usb_serial *serial); static int cypress_hidcom_startup(struct usb_serial *serial); static int cypress_ca42v2_startup(struct usb_serial *serial); -static void cypress_shutdown(struct usb_serial *serial); +static void cypress_release(struct usb_serial *serial); static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); static void cypress_close(struct usb_serial_port *port); @@ -215,7 +215,7 @@ static struct usb_serial_driver cypress_earthmate_device = { .id_table = id_table_earthmate, .num_ports = 1, .attach = cypress_earthmate_startup, - .shutdown = cypress_shutdown, + .release = cypress_release, .open = cypress_open, .close = cypress_close, .dtr_rts = cypress_dtr_rts, @@ -242,7 +242,7 @@ static struct usb_serial_driver cypress_hidcom_device = { .id_table = id_table_cyphidcomrs232, .num_ports = 1, .attach = cypress_hidcom_startup, - .shutdown = cypress_shutdown, + .release = cypress_release, .open = cypress_open, .close = cypress_close, .dtr_rts = cypress_dtr_rts, @@ -269,7 +269,7 @@ static struct usb_serial_driver cypress_ca42v2_device = { .id_table = id_table_nokiaca42v2, .num_ports = 1, .attach = cypress_ca42v2_startup, - .shutdown = cypress_shutdown, + .release = cypress_release, .open = cypress_open, .close = cypress_close, .dtr_rts = cypress_dtr_rts, @@ -616,7 +616,7 @@ static int cypress_ca42v2_startup(struct usb_serial *serial) } /* cypress_ca42v2_startup */ -static void cypress_shutdown(struct usb_serial *serial) +static void cypress_release(struct usb_serial *serial) { struct cypress_private *priv; @@ -629,7 +629,6 @@ static void cypress_shutdown(struct usb_serial *serial) if (priv) { cypress_buf_free(priv->buf); kfree(priv); - usb_set_serial_port_data(serial->port[0], NULL); } } diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c index 30f5140eff0..f4808091c47 100644 --- a/drivers/usb/serial/digi_acceleport.c +++ b/drivers/usb/serial/digi_acceleport.c @@ -460,7 +460,8 @@ static int digi_carrier_raised(struct usb_serial_port *port); static void digi_dtr_rts(struct usb_serial_port *port, int on); static int digi_startup_device(struct usb_serial *serial); static int digi_startup(struct usb_serial *serial); -static void digi_shutdown(struct usb_serial *serial); +static void digi_disconnect(struct usb_serial *serial); +static void digi_release(struct usb_serial *serial); static void digi_read_bulk_callback(struct urb *urb); static int digi_read_inb_callback(struct urb *urb); static int digi_read_oob_callback(struct urb *urb); @@ -524,7 +525,8 @@ static struct usb_serial_driver digi_acceleport_2_device = { .tiocmget = digi_tiocmget, .tiocmset = digi_tiocmset, .attach = digi_startup, - .shutdown = digi_shutdown, + .disconnect = digi_disconnect, + .release = 
digi_release, }; static struct usb_serial_driver digi_acceleport_4_device = { @@ -550,7 +552,8 @@ static struct usb_serial_driver digi_acceleport_4_device = { .tiocmget = digi_tiocmget, .tiocmset = digi_tiocmset, .attach = digi_startup, - .shutdown = digi_shutdown, + .disconnect = digi_disconnect, + .release = digi_release, }; @@ -1556,16 +1559,23 @@ static int digi_startup(struct usb_serial *serial) } -static void digi_shutdown(struct usb_serial *serial) +static void digi_disconnect(struct usb_serial *serial) { int i; - dbg("digi_shutdown: TOP, in_interrupt()=%ld", in_interrupt()); + dbg("digi_disconnect: TOP, in_interrupt()=%ld", in_interrupt()); /* stop reads and writes on all ports */ for (i = 0; i < serial->type->num_ports + 1; i++) { usb_kill_urb(serial->port[i]->read_urb); usb_kill_urb(serial->port[i]->write_urb); } +} + + +static void digi_release(struct usb_serial *serial) +{ + int i; + dbg("digi_release: TOP, in_interrupt()=%ld", in_interrupt()); /* free the private data structures for all ports */ /* number of regular ports + 1 for the out-of-band port */ diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c index 2b141ccb0cd..80cb3471adb 100644 --- a/drivers/usb/serial/empeg.c +++ b/drivers/usb/serial/empeg.c @@ -90,7 +90,6 @@ static int empeg_chars_in_buffer(struct tty_struct *tty); static void empeg_throttle(struct tty_struct *tty); static void empeg_unthrottle(struct tty_struct *tty); static int empeg_startup(struct usb_serial *serial); -static void empeg_shutdown(struct usb_serial *serial); static void empeg_set_termios(struct tty_struct *tty, struct usb_serial_port *port, struct ktermios *old_termios); static void empeg_write_bulk_callback(struct urb *urb); @@ -124,7 +123,6 @@ static struct usb_serial_driver empeg_device = { .throttle = empeg_throttle, .unthrottle = empeg_unthrottle, .attach = empeg_startup, - .shutdown = empeg_shutdown, .set_termios = empeg_set_termios, .write = empeg_write, .write_room = empeg_write_room, @@ -427,12 +425,6 @@ static int empeg_startup(struct usb_serial *serial) } -static void empeg_shutdown(struct usb_serial *serial) -{ - dbg("%s", __func__); -} - - static void empeg_set_termios(struct tty_struct *tty, struct usb_serial_port *port, struct ktermios *old_termios) { diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index fc423583eed..3dc3768ca71 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -720,7 +720,6 @@ static const char *ftdi_chip_name[] = { /* function prototypes for a FTDI serial converter */ static int ftdi_sio_probe(struct usb_serial *serial, const struct usb_device_id *id); -static void ftdi_shutdown(struct usb_serial *serial); static int ftdi_sio_port_probe(struct usb_serial_port *port); static int ftdi_sio_port_remove(struct usb_serial_port *port); static int ftdi_open(struct tty_struct *tty, @@ -779,7 +778,6 @@ static struct usb_serial_driver ftdi_sio_device = { .ioctl = ftdi_ioctl, .set_termios = ftdi_set_termios, .break_ctl = ftdi_break_ctl, - .shutdown = ftdi_shutdown, }; @@ -1594,18 +1592,6 @@ static int ftdi_mtxorb_hack_setup(struct usb_serial *serial) return 0; } -/* ftdi_shutdown is called from usbserial:usb_serial_disconnect - * it is called when the usb device is disconnected - * - * usbserial:usb_serial_disconnect - * calls __serial_close for each open of the port - * shutdown is called then (ie ftdi_shutdown) - */ -static void ftdi_shutdown(struct usb_serial *serial) -{ - dbg("%s", __func__); -} - static void ftdi_sio_priv_release(struct kref 
*k) { struct ftdi_private *priv = container_of(k, struct ftdi_private, kref); diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index 5092d6aba65..8839f1c70b7 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -1475,7 +1475,7 @@ static int garmin_attach(struct usb_serial *serial) } -static void garmin_shutdown(struct usb_serial *serial) +static void garmin_disconnect(struct usb_serial *serial) { struct usb_serial_port *port = serial->port[0]; struct garmin_data *garmin_data_p = usb_get_serial_port_data(port); @@ -1484,8 +1484,17 @@ static void garmin_shutdown(struct usb_serial *serial) usb_kill_urb(port->interrupt_in_urb); del_timer_sync(&garmin_data_p->timer); +} + + +static void garmin_release(struct usb_serial *serial) +{ + struct usb_serial_port *port = serial->port[0]; + struct garmin_data *garmin_data_p = usb_get_serial_port_data(port); + + dbg("%s", __func__); + kfree(garmin_data_p); - usb_set_serial_port_data(port, NULL); } @@ -1504,7 +1513,8 @@ static struct usb_serial_driver garmin_device = { .throttle = garmin_throttle, .unthrottle = garmin_unthrottle, .attach = garmin_attach, - .shutdown = garmin_shutdown, + .disconnect = garmin_disconnect, + .release = garmin_release, .write = garmin_write, .write_room = garmin_write_room, .write_bulk_callback = garmin_write_bulk_callback, diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 8dabac1929b..932d6241b78 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -63,7 +63,8 @@ struct usb_serial_driver usb_serial_generic_device = { .id_table = generic_device_ids, .usb_driver = &generic_driver, .num_ports = 1, - .shutdown = usb_serial_generic_shutdown, + .disconnect = usb_serial_generic_disconnect, + .release = usb_serial_generic_release, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .resume = usb_serial_generic_resume, @@ -551,7 +552,7 @@ int usb_serial_handle_break(struct usb_serial_port *port) } EXPORT_SYMBOL_GPL(usb_serial_handle_break); -void usb_serial_generic_shutdown(struct usb_serial *serial) +void usb_serial_generic_disconnect(struct usb_serial *serial) { int i; @@ -562,3 +563,7 @@ void usb_serial_generic_shutdown(struct usb_serial *serial) generic_cleanup(serial->port[i]); } +void usb_serial_generic_release(struct usb_serial *serial) +{ + dbg("%s", __func__); +} diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c index 53ef5996e33..0191693625d 100644 --- a/drivers/usb/serial/io_edgeport.c +++ b/drivers/usb/serial/io_edgeport.c @@ -224,7 +224,8 @@ static int edge_tiocmget(struct tty_struct *tty, struct file *file); static int edge_tiocmset(struct tty_struct *tty, struct file *file, unsigned int set, unsigned int clear); static int edge_startup(struct usb_serial *serial); -static void edge_shutdown(struct usb_serial *serial); +static void edge_disconnect(struct usb_serial *serial); +static void edge_release(struct usb_serial *serial); #include "io_tables.h" /* all of the devices that this driver supports */ @@ -3193,21 +3194,16 @@ static int edge_startup(struct usb_serial *serial) /**************************************************************************** - * edge_shutdown + * edge_disconnect * This function is called whenever the device is removed from the usb bus. 
****************************************************************************/ -static void edge_shutdown(struct usb_serial *serial) +static void edge_disconnect(struct usb_serial *serial) { struct edgeport_serial *edge_serial = usb_get_serial_data(serial); - int i; dbg("%s", __func__); /* stop reads and writes on all ports */ - for (i = 0; i < serial->num_ports; ++i) { - kfree(usb_get_serial_port_data(serial->port[i])); - usb_set_serial_port_data(serial->port[i], NULL); - } /* free up our endpoint stuff */ if (edge_serial->is_epic) { usb_kill_urb(edge_serial->interrupt_read_urb); @@ -3218,9 +3214,24 @@ static void edge_shutdown(struct usb_serial *serial) usb_free_urb(edge_serial->read_urb); kfree(edge_serial->bulk_in_buffer); } +} + + +/**************************************************************************** + * edge_release + * This function is called when the device structure is deallocated. + ****************************************************************************/ +static void edge_release(struct usb_serial *serial) +{ + struct edgeport_serial *edge_serial = usb_get_serial_data(serial); + int i; + + dbg("%s", __func__); + + for (i = 0; i < serial->num_ports; ++i) + kfree(usb_get_serial_port_data(serial->port[i])); kfree(edge_serial); - usb_set_serial_data(serial, NULL); } diff --git a/drivers/usb/serial/io_tables.h b/drivers/usb/serial/io_tables.h index 7eb9d67b81b..9241d314751 100644 --- a/drivers/usb/serial/io_tables.h +++ b/drivers/usb/serial/io_tables.h @@ -117,7 +117,8 @@ static struct usb_serial_driver edgeport_2port_device = { .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, - .shutdown = edge_shutdown, + .disconnect = edge_disconnect, + .release = edge_release, .ioctl = edge_ioctl, .set_termios = edge_set_termios, .tiocmget = edge_tiocmget, @@ -145,7 +146,8 @@ static struct usb_serial_driver edgeport_4port_device = { .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, - .shutdown = edge_shutdown, + .disconnect = edge_disconnect, + .release = edge_release, .ioctl = edge_ioctl, .set_termios = edge_set_termios, .tiocmget = edge_tiocmget, @@ -173,7 +175,8 @@ static struct usb_serial_driver edgeport_8port_device = { .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, - .shutdown = edge_shutdown, + .disconnect = edge_disconnect, + .release = edge_release, .ioctl = edge_ioctl, .set_termios = edge_set_termios, .tiocmget = edge_tiocmget, @@ -200,7 +203,8 @@ static struct usb_serial_driver epic_device = { .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, - .shutdown = edge_shutdown, + .disconnect = edge_disconnect, + .release = edge_release, .ioctl = edge_ioctl, .set_termios = edge_set_termios, .tiocmget = edge_tiocmget, diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c index db964db42d3..e8bc42f92e7 100644 --- a/drivers/usb/serial/io_ti.c +++ b/drivers/usb/serial/io_ti.c @@ -2663,7 +2663,7 @@ cleanup: return -ENOMEM; } -static void edge_shutdown(struct usb_serial *serial) +static void edge_disconnect(struct usb_serial *serial) { int i; struct edgeport_port *edge_port; @@ -2673,12 +2673,22 @@ static void edge_shutdown(struct usb_serial *serial) for (i = 0; i < serial->num_ports; ++i) { edge_port = usb_get_serial_port_data(serial->port[i]); edge_remove_sysfs_attrs(edge_port->port); + } +} + +static void edge_release(struct usb_serial *serial) +{ + int i; + struct edgeport_port *edge_port; + + dbg("%s", __func__); + + for (i = 0; i < 
serial->num_ports; ++i) { + edge_port = usb_get_serial_port_data(serial->port[i]); edge_buf_free(edge_port->ep_out_buf); kfree(edge_port); - usb_set_serial_port_data(serial->port[i], NULL); } kfree(usb_get_serial_data(serial)); - usb_set_serial_data(serial, NULL); } @@ -2915,7 +2925,8 @@ static struct usb_serial_driver edgeport_1port_device = { .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, - .shutdown = edge_shutdown, + .disconnect = edge_disconnect, + .release = edge_release, .port_probe = edge_create_sysfs_attrs, .ioctl = edge_ioctl, .set_termios = edge_set_termios, @@ -2944,7 +2955,8 @@ static struct usb_serial_driver edgeport_2port_device = { .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, - .shutdown = edge_shutdown, + .disconnect = edge_disconnect, + .release = edge_release, .port_probe = edge_create_sysfs_attrs, .ioctl = edge_ioctl, .set_termios = edge_set_termios, diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c index c610a99fa47..2545d45ce16 100644 --- a/drivers/usb/serial/ipaq.c +++ b/drivers/usb/serial/ipaq.c @@ -79,7 +79,6 @@ static int ipaq_open(struct tty_struct *tty, static void ipaq_close(struct usb_serial_port *port); static int ipaq_calc_num_ports(struct usb_serial *serial); static int ipaq_startup(struct usb_serial *serial); -static void ipaq_shutdown(struct usb_serial *serial); static int ipaq_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static int ipaq_write_bulk(struct usb_serial_port *port, @@ -576,7 +575,6 @@ static struct usb_serial_driver ipaq_device = { .close = ipaq_close, .attach = ipaq_startup, .calc_num_ports = ipaq_calc_num_ports, - .shutdown = ipaq_shutdown, .write = ipaq_write, .write_room = ipaq_write_room, .chars_in_buffer = ipaq_chars_in_buffer, @@ -990,11 +988,6 @@ static int ipaq_startup(struct usb_serial *serial) return usb_reset_configuration(serial->dev); } -static void ipaq_shutdown(struct usb_serial *serial) -{ - dbg("%s", __func__); -} - static int __init ipaq_init(void) { int retval; diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c index 76a3cc327bb..96873a7a32b 100644 --- a/drivers/usb/serial/iuu_phoenix.c +++ b/drivers/usb/serial/iuu_phoenix.c @@ -121,8 +121,8 @@ static int iuu_startup(struct usb_serial *serial) return 0; } -/* Shutdown function */ -static void iuu_shutdown(struct usb_serial *serial) +/* Release function */ +static void iuu_release(struct usb_serial *serial) { struct usb_serial_port *port = serial->port[0]; struct iuu_private *priv = usb_get_serial_port_data(port); @@ -1202,7 +1202,7 @@ static struct usb_serial_driver iuu_device = { .tiocmset = iuu_tiocmset, .set_termios = iuu_set_termios, .attach = iuu_startup, - .shutdown = iuu_shutdown, + .release = iuu_release, }; static int __init iuu_init(void) diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c index f1195a98f31..2594b8743d3 100644 --- a/drivers/usb/serial/keyspan.c +++ b/drivers/usb/serial/keyspan.c @@ -2689,7 +2689,7 @@ static int keyspan_startup(struct usb_serial *serial) return 0; } -static void keyspan_shutdown(struct usb_serial *serial) +static void keyspan_disconnect(struct usb_serial *serial) { int i, j; struct usb_serial_port *port; @@ -2729,6 +2729,17 @@ static void keyspan_shutdown(struct usb_serial *serial) usb_free_urb(p_priv->out_urbs[j]); } } +} + +static void keyspan_release(struct usb_serial *serial) +{ + int i; + struct usb_serial_port *port; + struct 
keyspan_serial_private *s_priv; + + dbg("%s", __func__); + + s_priv = usb_get_serial_data(serial); /* dbg("Freeing serial->private."); */ kfree(s_priv); diff --git a/drivers/usb/serial/keyspan.h b/drivers/usb/serial/keyspan.h index 0d4569b6076..3107ed15af6 100644 --- a/drivers/usb/serial/keyspan.h +++ b/drivers/usb/serial/keyspan.h @@ -41,7 +41,8 @@ static int keyspan_open (struct tty_struct *tty, static void keyspan_close (struct usb_serial_port *port); static void keyspan_dtr_rts (struct usb_serial_port *port, int on); static int keyspan_startup (struct usb_serial *serial); -static void keyspan_shutdown (struct usb_serial *serial); +static void keyspan_disconnect (struct usb_serial *serial); +static void keyspan_release (struct usb_serial *serial); static int keyspan_write_room (struct tty_struct *tty); static int keyspan_write (struct tty_struct *tty, @@ -569,7 +570,8 @@ static struct usb_serial_driver keyspan_1port_device = { .tiocmget = keyspan_tiocmget, .tiocmset = keyspan_tiocmset, .attach = keyspan_startup, - .shutdown = keyspan_shutdown, + .disconnect = keyspan_disconnect, + .release = keyspan_release, }; static struct usb_serial_driver keyspan_2port_device = { @@ -590,7 +592,8 @@ static struct usb_serial_driver keyspan_2port_device = { .tiocmget = keyspan_tiocmget, .tiocmset = keyspan_tiocmset, .attach = keyspan_startup, - .shutdown = keyspan_shutdown, + .disconnect = keyspan_disconnect, + .release = keyspan_release, }; static struct usb_serial_driver keyspan_4port_device = { @@ -611,7 +614,8 @@ static struct usb_serial_driver keyspan_4port_device = { .tiocmget = keyspan_tiocmget, .tiocmset = keyspan_tiocmset, .attach = keyspan_startup, - .shutdown = keyspan_shutdown, + .disconnect = keyspan_disconnect, + .release = keyspan_release, }; #endif diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c index ab769dbea1b..d0b12e40c2b 100644 --- a/drivers/usb/serial/keyspan_pda.c +++ b/drivers/usb/serial/keyspan_pda.c @@ -809,7 +809,7 @@ static int keyspan_pda_startup(struct usb_serial *serial) return 0; } -static void keyspan_pda_shutdown(struct usb_serial *serial) +static void keyspan_pda_release(struct usb_serial *serial) { dbg("%s", __func__); @@ -869,7 +869,7 @@ static struct usb_serial_driver keyspan_pda_device = { .tiocmget = keyspan_pda_tiocmget, .tiocmset = keyspan_pda_tiocmset, .attach = keyspan_pda_startup, - .shutdown = keyspan_pda_shutdown, + .release = keyspan_pda_release, }; diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c index fa817c66b3e..0f44bb8e8d4 100644 --- a/drivers/usb/serial/kl5kusb105.c +++ b/drivers/usb/serial/kl5kusb105.c @@ -73,7 +73,8 @@ static int debug; * Function prototypes */ static int klsi_105_startup(struct usb_serial *serial); -static void klsi_105_shutdown(struct usb_serial *serial); +static void klsi_105_disconnect(struct usb_serial *serial); +static void klsi_105_release(struct usb_serial *serial); static int klsi_105_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); static void klsi_105_close(struct usb_serial_port *port); @@ -131,7 +132,8 @@ static struct usb_serial_driver kl5kusb105d_device = { .tiocmget = klsi_105_tiocmget, .tiocmset = klsi_105_tiocmset, .attach = klsi_105_startup, - .shutdown = klsi_105_shutdown, + .disconnect = klsi_105_disconnect, + .release = klsi_105_release, .throttle = klsi_105_throttle, .unthrottle = klsi_105_unthrottle, }; @@ -315,7 +317,7 @@ err_cleanup: } /* klsi_105_startup */ -static void klsi_105_shutdown(struct usb_serial 
*serial) +static void klsi_105_disconnect(struct usb_serial *serial) { int i; @@ -325,33 +327,36 @@ static void klsi_105_shutdown(struct usb_serial *serial) for (i = 0; i < serial->num_ports; ++i) { struct klsi_105_private *priv = usb_get_serial_port_data(serial->port[i]); - unsigned long flags; if (priv) { /* kill our write urb pool */ int j; struct urb **write_urbs = priv->write_urb_pool; - spin_lock_irqsave(&priv->lock, flags); for (j = 0; j < NUM_URBS; j++) { if (write_urbs[j]) { - /* FIXME - uncomment the following - * usb_kill_urb call when the host - * controllers get fixed to set - * urb->dev = NULL after the urb is - * finished. Otherwise this call - * oopses. */ - /* usb_kill_urb(write_urbs[j]); */ - kfree(write_urbs[j]->transfer_buffer); + usb_kill_urb(write_urbs[j]); usb_free_urb(write_urbs[j]); } } - spin_unlock_irqrestore(&priv->lock, flags); - kfree(priv); - usb_set_serial_port_data(serial->port[i], NULL); } } -} /* klsi_105_shutdown */ +} /* klsi_105_disconnect */ + + +static void klsi_105_release(struct usb_serial *serial) +{ + int i; + + dbg("%s", __func__); + + for (i = 0; i < serial->num_ports; ++i) { + struct klsi_105_private *priv = + usb_get_serial_port_data(serial->port[i]); + + kfree(priv); + } +} /* klsi_105_release */ static int klsi_105_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp) diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c index 6b570498287..6db0e561f68 100644 --- a/drivers/usb/serial/kobil_sct.c +++ b/drivers/usb/serial/kobil_sct.c @@ -69,7 +69,7 @@ static int debug; /* Function prototypes */ static int kobil_startup(struct usb_serial *serial); -static void kobil_shutdown(struct usb_serial *serial); +static void kobil_release(struct usb_serial *serial); static int kobil_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); static void kobil_close(struct usb_serial_port *port); @@ -117,7 +117,7 @@ static struct usb_serial_driver kobil_device = { .id_table = id_table, .num_ports = 1, .attach = kobil_startup, - .shutdown = kobil_shutdown, + .release = kobil_release, .ioctl = kobil_ioctl, .set_termios = kobil_set_termios, .tiocmget = kobil_tiocmget, @@ -201,17 +201,13 @@ static int kobil_startup(struct usb_serial *serial) } -static void kobil_shutdown(struct usb_serial *serial) +static void kobil_release(struct usb_serial *serial) { int i; dbg("%s - port %d", __func__, serial->port[0]->number); - for (i = 0; i < serial->num_ports; ++i) { - while (serial->port[i]->port.count > 0) - kobil_close(serial->port[i]); + for (i = 0; i < serial->num_ports; ++i) kfree(usb_get_serial_port_data(serial->port[i])); - usb_set_serial_port_data(serial->port[i], NULL); - } } diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c index 873795548fc..d8825e159aa 100644 --- a/drivers/usb/serial/mct_u232.c +++ b/drivers/usb/serial/mct_u232.c @@ -92,7 +92,7 @@ static int debug; * Function prototypes */ static int mct_u232_startup(struct usb_serial *serial); -static void mct_u232_shutdown(struct usb_serial *serial); +static void mct_u232_release(struct usb_serial *serial); static int mct_u232_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); static void mct_u232_close(struct usb_serial_port *port); @@ -149,7 +149,7 @@ static struct usb_serial_driver mct_u232_device = { .tiocmget = mct_u232_tiocmget, .tiocmset = mct_u232_tiocmset, .attach = mct_u232_startup, - .shutdown = mct_u232_shutdown, + .release = mct_u232_release, }; @@ -407,7 +407,7 @@ static 
int mct_u232_startup(struct usb_serial *serial) } /* mct_u232_startup */ -static void mct_u232_shutdown(struct usb_serial *serial) +static void mct_u232_release(struct usb_serial *serial) { struct mct_u232_private *priv; int i; @@ -417,12 +417,9 @@ static void mct_u232_shutdown(struct usb_serial *serial) for (i = 0; i < serial->num_ports; ++i) { /* My special items, the standard routines free my urbs */ priv = usb_get_serial_port_data(serial->port[i]); - if (priv) { - usb_set_serial_port_data(serial->port[i], NULL); - kfree(priv); - } + kfree(priv); } -} /* mct_u232_shutdown */ +} /* mct_u232_release */ static int mct_u232_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp) diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index 9e1a013ee7f..bfc5ce000ef 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -1521,19 +1521,16 @@ static int mos7720_startup(struct usb_serial *serial) return 0; } -static void mos7720_shutdown(struct usb_serial *serial) +static void mos7720_release(struct usb_serial *serial) { int i; /* free private structure allocated for serial port */ - for (i = 0; i < serial->num_ports; ++i) { + for (i = 0; i < serial->num_ports; ++i) kfree(usb_get_serial_port_data(serial->port[i])); - usb_set_serial_port_data(serial->port[i], NULL); - } /* free private structure allocated for serial device */ kfree(usb_get_serial_data(serial)); - usb_set_serial_data(serial, NULL); } static struct usb_driver usb_driver = { @@ -1558,7 +1555,7 @@ static struct usb_serial_driver moschip7720_2port_driver = { .throttle = mos7720_throttle, .unthrottle = mos7720_unthrottle, .attach = mos7720_startup, - .shutdown = mos7720_shutdown, + .release = mos7720_release, .ioctl = mos7720_ioctl, .set_termios = mos7720_set_termios, .write = mos7720_write, diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 5fe9fe3df77..c40f95c1951 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -2633,16 +2633,16 @@ error: } /**************************************************************************** - * mos7840_shutdown + * mos7840_disconnect * This function is called whenever the device is removed from the usb bus. ****************************************************************************/ -static void mos7840_shutdown(struct usb_serial *serial) +static void mos7840_disconnect(struct usb_serial *serial) { int i; unsigned long flags; struct moschip_port *mos7840_port; - dbg("%s", " shutdown :entering.........."); + dbg("%s", " disconnect :entering.........."); if (!serial) { dbg("%s", "Invalid Handler"); @@ -2662,11 +2662,42 @@ static void mos7840_shutdown(struct usb_serial *serial) mos7840_port->zombie = 1; spin_unlock_irqrestore(&mos7840_port->pool_lock, flags); usb_kill_urb(mos7840_port->control_urb); + } + } + + dbg("%s", "Thank u :: "); + +} + +/**************************************************************************** + * mos7840_release + * This function is called when the usb_serial structure is freed. 
+ ****************************************************************************/ + +static void mos7840_release(struct usb_serial *serial) +{ + int i; + struct moschip_port *mos7840_port; + dbg("%s", " release :entering.........."); + + if (!serial) { + dbg("%s", "Invalid Handler"); + return; + } + + /* check for the ports to be closed,close the ports and disconnect */ + + /* free private structure allocated for serial port * + * stop reads and writes on all ports */ + + for (i = 0; i < serial->num_ports; ++i) { + mos7840_port = mos7840_get_port_private(serial->port[i]); + dbg("mos7840_port %d = %p", i, mos7840_port); + if (mos7840_port) { kfree(mos7840_port->ctrl_buf); kfree(mos7840_port->dr); kfree(mos7840_port); } - mos7840_set_port_private(serial->port[i], NULL); } dbg("%s", "Thank u :: "); @@ -2707,7 +2738,8 @@ static struct usb_serial_driver moschip7840_4port_device = { .tiocmget = mos7840_tiocmget, .tiocmset = mos7840_tiocmset, .attach = mos7840_startup, - .shutdown = mos7840_shutdown, + .disconnect = mos7840_disconnect, + .release = mos7840_release, .read_bulk_callback = mos7840_bulk_in_callback, .read_int_callback = mos7840_interrupt_callback, }; diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c index 1104617334f..56857ddbd70 100644 --- a/drivers/usb/serial/omninet.c +++ b/drivers/usb/serial/omninet.c @@ -72,7 +72,8 @@ static void omninet_write_bulk_callback(struct urb *urb); static int omninet_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static int omninet_write_room(struct tty_struct *tty); -static void omninet_shutdown(struct usb_serial *serial); +static void omninet_disconnect(struct usb_serial *serial); +static void omninet_release(struct usb_serial *serial); static int omninet_attach(struct usb_serial *serial); static struct usb_device_id id_table[] = { @@ -108,7 +109,8 @@ static struct usb_serial_driver zyxel_omninet_device = { .write_room = omninet_write_room, .read_bulk_callback = omninet_read_bulk_callback, .write_bulk_callback = omninet_write_bulk_callback, - .shutdown = omninet_shutdown, + .disconnect = omninet_disconnect, + .release = omninet_release, }; @@ -345,13 +347,22 @@ static void omninet_write_bulk_callback(struct urb *urb) } -static void omninet_shutdown(struct usb_serial *serial) +static void omninet_disconnect(struct usb_serial *serial) { struct usb_serial_port *wport = serial->port[1]; - struct usb_serial_port *port = serial->port[0]; + dbg("%s", __func__); usb_kill_urb(wport->write_urb); +} + + +static void omninet_release(struct usb_serial *serial) +{ + struct usb_serial_port *port = serial->port[0]; + + dbg("%s", __func__); + kfree(usb_get_serial_port_data(port)); } diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c index c20480aa975..336bba79ad3 100644 --- a/drivers/usb/serial/opticon.c +++ b/drivers/usb/serial/opticon.c @@ -463,7 +463,7 @@ error: return retval; } -static void opticon_shutdown(struct usb_serial *serial) +static void opticon_disconnect(struct usb_serial *serial) { struct opticon_private *priv = usb_get_serial_data(serial); @@ -471,9 +471,16 @@ static void opticon_shutdown(struct usb_serial *serial) usb_kill_urb(priv->bulk_read_urb); usb_free_urb(priv->bulk_read_urb); +} + +static void opticon_release(struct usb_serial *serial) +{ + struct opticon_private *priv = usb_get_serial_data(serial); + + dbg("%s", __func__); + kfree(priv->bulk_in_buffer); kfree(priv); - usb_set_serial_data(serial, NULL); } static int opticon_suspend(struct 
usb_interface *intf, pm_message_t message) @@ -524,7 +531,8 @@ static struct usb_serial_driver opticon_device = { .close = opticon_close, .write = opticon_write, .write_room = opticon_write_room, - .shutdown = opticon_shutdown, + .disconnect = opticon_disconnect, + .release = opticon_release, .throttle = opticon_throttle, .unthrottle = opticon_unthrottle, .ioctl = opticon_ioctl, diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index a38971cc373..575816e6ba3 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -51,7 +51,8 @@ static void option_close(struct usb_serial_port *port); static void option_dtr_rts(struct usb_serial_port *port, int on); static int option_startup(struct usb_serial *serial); -static void option_shutdown(struct usb_serial *serial); +static void option_disconnect(struct usb_serial *serial); +static void option_release(struct usb_serial *serial); static int option_write_room(struct tty_struct *tty); static void option_instat_callback(struct urb *urb); @@ -568,7 +569,8 @@ static struct usb_serial_driver option_1port_device = { .tiocmget = option_tiocmget, .tiocmset = option_tiocmset, .attach = option_startup, - .shutdown = option_shutdown, + .disconnect = option_disconnect, + .release = option_release, .read_int_callback = option_instat_callback, .suspend = option_suspend, .resume = option_resume, @@ -1149,7 +1151,14 @@ static void stop_read_write_urbs(struct usb_serial *serial) } } -static void option_shutdown(struct usb_serial *serial) +static void option_disconnect(struct usb_serial *serial) +{ + dbg("%s", __func__); + + stop_read_write_urbs(serial); +} + +static void option_release(struct usb_serial *serial) { int i, j; struct usb_serial_port *port; @@ -1157,8 +1166,6 @@ static void option_shutdown(struct usb_serial *serial) dbg("%s", __func__); - stop_read_write_urbs(serial); - /* Now free them */ for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c index 7de54781fe6..3cece27325e 100644 --- a/drivers/usb/serial/oti6858.c +++ b/drivers/usb/serial/oti6858.c @@ -159,7 +159,7 @@ static int oti6858_tiocmget(struct tty_struct *tty, struct file *file); static int oti6858_tiocmset(struct tty_struct *tty, struct file *file, unsigned int set, unsigned int clear); static int oti6858_startup(struct usb_serial *serial); -static void oti6858_shutdown(struct usb_serial *serial); +static void oti6858_release(struct usb_serial *serial); /* functions operating on buffers */ static struct oti6858_buf *oti6858_buf_alloc(unsigned int size); @@ -194,7 +194,7 @@ static struct usb_serial_driver oti6858_device = { .write_room = oti6858_write_room, .chars_in_buffer = oti6858_chars_in_buffer, .attach = oti6858_startup, - .shutdown = oti6858_shutdown, + .release = oti6858_release, }; struct oti6858_private { @@ -782,7 +782,7 @@ static int oti6858_ioctl(struct tty_struct *tty, struct file *file, } -static void oti6858_shutdown(struct usb_serial *serial) +static void oti6858_release(struct usb_serial *serial) { struct oti6858_private *priv; int i; @@ -794,7 +794,6 @@ static void oti6858_shutdown(struct usb_serial *serial) if (priv) { oti6858_buf_free(priv->buf); kfree(priv); - usb_set_serial_port_data(serial->port[i], NULL); } } } diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c index 6357b57f628..ec6c132a25b 100644 --- a/drivers/usb/serial/pl2303.c +++ b/drivers/usb/serial/pl2303.c @@ -878,7 +878,7 @@ static void pl2303_break_ctl(struct 
tty_struct *tty, int break_state) dbg("%s - error sending break = %d", __func__, result); } -static void pl2303_shutdown(struct usb_serial *serial) +static void pl2303_release(struct usb_serial *serial) { int i; struct pl2303_private *priv; @@ -890,7 +890,6 @@ static void pl2303_shutdown(struct usb_serial *serial) if (priv) { pl2303_buf_free(priv->buf); kfree(priv); - usb_set_serial_port_data(serial->port[i], NULL); } } } @@ -1123,7 +1122,7 @@ static struct usb_serial_driver pl2303_device = { .write_room = pl2303_write_room, .chars_in_buffer = pl2303_chars_in_buffer, .attach = pl2303_startup, - .shutdown = pl2303_shutdown, + .release = pl2303_release, }; static int __init pl2303_init(void) diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c index a88cde95016..032f7aeb40a 100644 --- a/drivers/usb/serial/sierra.c +++ b/drivers/usb/serial/sierra.c @@ -814,7 +814,7 @@ static int sierra_startup(struct usb_serial *serial) return 0; } -static void sierra_shutdown(struct usb_serial *serial) +static void sierra_disconnect(struct usb_serial *serial) { int i; struct usb_serial_port *port; @@ -853,7 +853,7 @@ static struct usb_serial_driver sierra_device = { .tiocmget = sierra_tiocmget, .tiocmset = sierra_tiocmset, .attach = sierra_startup, - .shutdown = sierra_shutdown, + .disconnect = sierra_disconnect, .read_int_callback = sierra_instat_callback, }; diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c index 8f7ed8f1399..3c249d8e8b8 100644 --- a/drivers/usb/serial/spcp8x5.c +++ b/drivers/usb/serial/spcp8x5.c @@ -356,7 +356,7 @@ cleanup: } /* call when the device plug out. free all the memory alloced by probe */ -static void spcp8x5_shutdown(struct usb_serial *serial) +static void spcp8x5_release(struct usb_serial *serial) { int i; struct spcp8x5_private *priv; @@ -366,7 +366,6 @@ static void spcp8x5_shutdown(struct usb_serial *serial) if (priv) { free_ringbuf(priv->buf); kfree(priv); - usb_set_serial_port_data(serial->port[i] , NULL); } } } @@ -1020,7 +1019,7 @@ static struct usb_serial_driver spcp8x5_device = { .write_bulk_callback = spcp8x5_write_bulk_callback, .chars_in_buffer = spcp8x5_chars_in_buffer, .attach = spcp8x5_startup, - .shutdown = spcp8x5_shutdown, + .release = spcp8x5_release, }; static int __init spcp8x5_init(void) diff --git a/drivers/usb/serial/symbolserial.c b/drivers/usb/serial/symbolserial.c index 8b07ebc6bae..6157fac9366 100644 --- a/drivers/usb/serial/symbolserial.c +++ b/drivers/usb/serial/symbolserial.c @@ -267,7 +267,7 @@ error: return retval; } -static void symbol_shutdown(struct usb_serial *serial) +static void symbol_disconnect(struct usb_serial *serial) { struct symbol_private *priv = usb_get_serial_data(serial); @@ -275,9 +275,16 @@ static void symbol_shutdown(struct usb_serial *serial) usb_kill_urb(priv->int_urb); usb_free_urb(priv->int_urb); +} + +static void symbol_release(struct usb_serial *serial) +{ + struct symbol_private *priv = usb_get_serial_data(serial); + + dbg("%s", __func__); + kfree(priv->int_buffer); kfree(priv); - usb_set_serial_data(serial, NULL); } static struct usb_driver symbol_driver = { @@ -299,7 +306,8 @@ static struct usb_serial_driver symbol_device = { .attach = symbol_startup, .open = symbol_open, .close = symbol_close, - .shutdown = symbol_shutdown, + .disconnect = symbol_disconnect, + .release = symbol_release, .throttle = symbol_throttle, .unthrottle = symbol_unthrottle, }; diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c index 42cb04c403b..991d8232e37 
100644 --- a/drivers/usb/serial/ti_usb_3410_5052.c +++ b/drivers/usb/serial/ti_usb_3410_5052.c @@ -97,7 +97,7 @@ struct ti_device { /* Function Declarations */ static int ti_startup(struct usb_serial *serial); -static void ti_shutdown(struct usb_serial *serial); +static void ti_release(struct usb_serial *serial); static int ti_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *file); static void ti_close(struct usb_serial_port *port); @@ -230,7 +230,7 @@ static struct usb_serial_driver ti_1port_device = { .id_table = ti_id_table_3410, .num_ports = 1, .attach = ti_startup, - .shutdown = ti_shutdown, + .release = ti_release, .open = ti_open, .close = ti_close, .write = ti_write, @@ -258,7 +258,7 @@ static struct usb_serial_driver ti_2port_device = { .id_table = ti_id_table_5052, .num_ports = 2, .attach = ti_startup, - .shutdown = ti_shutdown, + .release = ti_release, .open = ti_open, .close = ti_close, .write = ti_write, @@ -473,7 +473,7 @@ free_tdev: } -static void ti_shutdown(struct usb_serial *serial) +static void ti_release(struct usb_serial *serial) { int i; struct ti_device *tdev = usb_get_serial_data(serial); @@ -486,12 +486,10 @@ static void ti_shutdown(struct usb_serial *serial) if (tport) { ti_buf_free(tport->tp_write_buf); kfree(tport); - usb_set_serial_port_data(serial->port[i], NULL); } } kfree(tdev); - usb_set_serial_data(serial, NULL); } diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index da890f030fa..d595aa5586a 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -141,6 +141,14 @@ static void destroy_serial(struct kref *kref) if (serial->minor != SERIAL_TTY_NO_MINOR) return_serial(serial); + serial->type->release(serial); + + for (i = 0; i < serial->num_ports; ++i) { + port = serial->port[i]; + if (port) + put_device(&port->dev); + } + /* If this is a "fake" port, we have to clean it up here, as it will * not get cleaned up in port_release() as it was never registered with * the driver core */ @@ -148,9 +156,8 @@ static void destroy_serial(struct kref *kref) for (i = serial->num_ports; i < serial->num_port_pointers; ++i) { port = serial->port[i]; - if (!port) - continue; - port_free(port); + if (port) + port_free(port); } } @@ -1118,10 +1125,6 @@ void usb_serial_disconnect(struct usb_interface *interface) serial->disconnected = 1; mutex_unlock(&serial->disc_mutex); - /* Unfortunately, many of the sub-drivers expect the port structures - * to exist when their shutdown method is called, so we have to go - * through this awkward two-step unregistration procedure. 
- */ for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; if (port) { @@ -1153,14 +1156,7 @@ void usb_serial_disconnect(struct usb_interface *interface) } } } - serial->type->shutdown(serial); - for (i = 0; i < serial->num_ports; ++i) { - port = serial->port[i]; - if (port) { - put_device(&port->dev); - serial->port[i] = NULL; - } - } + serial->type->disconnect(serial); /* let the last holder of this object * cause it to be cleaned up */ @@ -1338,7 +1334,8 @@ static void fixup_generic(struct usb_serial_driver *device) set_to_generic_if_null(device, chars_in_buffer); set_to_generic_if_null(device, read_bulk_callback); set_to_generic_if_null(device, write_bulk_callback); - set_to_generic_if_null(device, shutdown); + set_to_generic_if_null(device, disconnect); + set_to_generic_if_null(device, release); } int usb_serial_register(struct usb_serial_driver *driver) diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c index b15f1c0e1d4..f5d0f64dcc5 100644 --- a/drivers/usb/serial/visor.c +++ b/drivers/usb/serial/visor.c @@ -47,7 +47,7 @@ static void visor_unthrottle(struct tty_struct *tty); static int visor_probe(struct usb_serial *serial, const struct usb_device_id *id); static int visor_calc_num_ports(struct usb_serial *serial); -static void visor_shutdown(struct usb_serial *serial); +static void visor_release(struct usb_serial *serial); static void visor_write_bulk_callback(struct urb *urb); static void visor_read_bulk_callback(struct urb *urb); static void visor_read_int_callback(struct urb *urb); @@ -202,7 +202,7 @@ static struct usb_serial_driver handspring_device = { .attach = treo_attach, .probe = visor_probe, .calc_num_ports = visor_calc_num_ports, - .shutdown = visor_shutdown, + .release = visor_release, .write = visor_write, .write_room = visor_write_room, .write_bulk_callback = visor_write_bulk_callback, @@ -227,7 +227,7 @@ static struct usb_serial_driver clie_5_device = { .attach = clie_5_attach, .probe = visor_probe, .calc_num_ports = visor_calc_num_ports, - .shutdown = visor_shutdown, + .release = visor_release, .write = visor_write, .write_room = visor_write_room, .write_bulk_callback = visor_write_bulk_callback, @@ -918,7 +918,7 @@ static int clie_5_attach(struct usb_serial *serial) return generic_startup(serial); } -static void visor_shutdown(struct usb_serial *serial) +static void visor_release(struct usb_serial *serial) { struct visor_private *priv; int i; @@ -927,10 +927,7 @@ static void visor_shutdown(struct usb_serial *serial) for (i = 0; i < serial->num_ports; i++) { priv = usb_get_serial_port_data(serial->port[i]); - if (priv) { - usb_set_serial_port_data(serial->port[i], NULL); - kfree(priv); - } + kfree(priv); } } diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c index 7c7295d09f3..8d126dd7a02 100644 --- a/drivers/usb/serial/whiteheat.c +++ b/drivers/usb/serial/whiteheat.c @@ -144,7 +144,7 @@ static int whiteheat_firmware_attach(struct usb_serial *serial); /* function prototypes for the Connect Tech WhiteHEAT serial converter */ static int whiteheat_attach(struct usb_serial *serial); -static void whiteheat_shutdown(struct usb_serial *serial); +static void whiteheat_release(struct usb_serial *serial); static int whiteheat_open(struct tty_struct *tty, struct usb_serial_port *port, struct file *filp); static void whiteheat_close(struct usb_serial_port *port); @@ -189,7 +189,7 @@ static struct usb_serial_driver whiteheat_device = { .id_table = id_table_std, .num_ports = 4, .attach = whiteheat_attach, - .shutdown = 
whiteheat_shutdown, + .release = whiteheat_release, .open = whiteheat_open, .close = whiteheat_close, .write = whiteheat_write, @@ -617,7 +617,7 @@ no_command_buffer: } -static void whiteheat_shutdown(struct usb_serial *serial) +static void whiteheat_release(struct usb_serial *serial) { struct usb_serial_port *command_port; struct usb_serial_port *port; diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index ed4aa0fa7ed..44801d26a37 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -194,8 +194,10 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data) * This will be called when the struct usb_serial structure is fully set * set up. Do any local initialization of the device, or any private * memory structure allocation at this point in time. - * @shutdown: pointer to the driver's shutdown function. This will be - * called when the device is removed from the system. + * @disconnect: pointer to the driver's disconnect function. This will be + * called when the device is unplugged or unbound from the driver. + * @release: pointer to the driver's release function. This will be called + * when the usb_serial data structure is about to be destroyed. * @usb_driver: pointer to the struct usb_driver that controls this * device. This is necessary to allow dynamic ids to be added to * the driver from sysfs. @@ -226,7 +228,8 @@ struct usb_serial_driver { int (*attach)(struct usb_serial *serial); int (*calc_num_ports) (struct usb_serial *serial); - void (*shutdown)(struct usb_serial *serial); + void (*disconnect)(struct usb_serial *serial); + void (*release)(struct usb_serial *serial); int (*port_probe)(struct usb_serial_port *port); int (*port_remove)(struct usb_serial_port *port); @@ -308,7 +311,8 @@ extern void usb_serial_generic_read_bulk_callback(struct urb *urb); extern void usb_serial_generic_write_bulk_callback(struct urb *urb); extern void usb_serial_generic_throttle(struct tty_struct *tty); extern void usb_serial_generic_unthrottle(struct tty_struct *tty); -extern void usb_serial_generic_shutdown(struct usb_serial *serial); +extern void usb_serial_generic_disconnect(struct usb_serial *serial); +extern void usb_serial_generic_release(struct usb_serial *serial); extern int usb_serial_generic_register(int debug); extern void usb_serial_generic_deregister(void); extern void usb_serial_generic_resubmit_read_urb(struct usb_serial_port *port, -- cgit v1.2.3-70-g09d2 From 5be19a9daa2df2507adf5b4676a7db8d131cf56e Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Thu, 4 Jun 2009 15:34:49 +0800 Subject: USB: Add Intel Langwell USB Device Controller driver Intel Langwell USB Device Controller is a High-Speed USB OTG device controller in Intel Moorestown platform. It can work in OTG device mode with Intel Langwell USB OTG transceiver driver as well as device-only mode. The number of programmable endpoints is different through controller revision. NOTE: This patch is the first version Intel Langwell USB OTG device controller driver. The bug fixing is on going for some hardware and software issues. Intel Langwell USB OTG transceiver driver and EHCI driver patches will be submitted later. 
Supported features: - USB OTG protocol support with Intel Langwell USB OTG transceiver driver (turn on CONFIG_USB_LANGWELL_OTG) - Support control, bulk, interrupt and isochronous endpoints (isochronous not tested) - PCI D0/D3 power management support - Link Power Management (LPM) support Tested gadget drivers: - g_file_storage - g_ether - g_zero The passed tests: - g_file_storage: USBCV Chapter 9 tests - g_file_storage: USBCV MSC tests - g_file_storage: from/to host files copying - g_ether: ping, ftp and scp files from/to host - Hotplug, with and without hubs Known issues: - g_ether: failed part of USBCV chap9 tests - LPM support not fully tested TODO: - g_ether: pass all USBCV chap9 tests - g_zero: pass usbtest tests - Stress tests on different gadget drivers - On-chip private SRAM caching support Signed-off-by: Xiaochen Shen Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/Kconfig | 21 + drivers/usb/gadget/Makefile | 1 + drivers/usb/gadget/gadget_chips.h | 8 + drivers/usb/gadget/langwell_udc.c | 3373 +++++++++++++++++++++++++++++++++++++ drivers/usb/gadget/langwell_udc.h | 228 +++ include/linux/usb/langwell_udc.h | 310 ++++ 6 files changed, 3941 insertions(+) create mode 100644 drivers/usb/gadget/langwell_udc.c create mode 100644 drivers/usb/gadget/langwell_udc.h create mode 100644 include/linux/usb/langwell_udc.h (limited to 'include/linux') diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index ef9d9086cdd..5d1ddf485d1 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -474,6 +474,27 @@ config USB_GOKU default USB_GADGET select USB_GADGET_SELECTED +config USB_GADGET_LANGWELL + boolean "Intel Langwell USB Device Controller" + depends on PCI + select USB_GADGET_DUALSPEED + help + Intel Langwell USB Device Controller is a High-Speed USB + On-The-Go device controller. + + The number of programmable endpoints is different through + controller revision. + + Say "y" to link the driver statically, or "m" to build a + dynamically linked module called "langwell_udc" and force all + gadget drivers to also be dynamically linked. + +config USB_LANGWELL + tristate + depends on USB_GADGET_LANGWELL + default USB_GADGET + select USB_GADGET_SELECTED + # # LAST -- dummy/emulated controller diff --git a/drivers/usb/gadget/Makefile b/drivers/usb/gadget/Makefile index fc9a7dc0104..e6017e6bf6d 100644 --- a/drivers/usb/gadget/Makefile +++ b/drivers/usb/gadget/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_USB_M66592) += m66592-udc.o obj-$(CONFIG_USB_FSL_QE) += fsl_qe_udc.o obj-$(CONFIG_USB_CI13XXX) += ci13xxx_udc.o obj-$(CONFIG_USB_S3C_HSOTG) += s3c-hsotg.o +obj-$(CONFIG_USB_LANGWELL) += langwell_udc.o # # USB gadget drivers diff --git a/drivers/usb/gadget/gadget_chips.h b/drivers/usb/gadget/gadget_chips.h index ec6d439a2aa..8e0e9a0b736 100644 --- a/drivers/usb/gadget/gadget_chips.h +++ b/drivers/usb/gadget/gadget_chips.h @@ -137,6 +137,12 @@ #define gadget_is_musbhdrc(g) 0 #endif +#ifdef CONFIG_USB_GADGET_LANGWELL +#define gadget_is_langwell(g) (!strcmp("langwell_udc", (g)->name)) +#else +#define gadget_is_langwell(g) 0 +#endif + /* from Montavista kernel (?) 
*/ #ifdef CONFIG_USB_GADGET_MPC8272 #define gadget_is_mpc8272(g) !strcmp("mpc8272_udc", (g)->name) @@ -231,6 +237,8 @@ static inline int usb_gadget_controller_number(struct usb_gadget *gadget) return 0x22; else if (gadget_is_ci13xxx(gadget)) return 0x23; + else if (gadget_is_langwell(gadget)) + return 0x24; return -ENOENT; } diff --git a/drivers/usb/gadget/langwell_udc.c b/drivers/usb/gadget/langwell_udc.c new file mode 100644 index 00000000000..6829d596135 --- /dev/null +++ b/drivers/usb/gadget/langwell_udc.c @@ -0,0 +1,3373 @@ +/* + * Intel Langwell USB Device Controller driver + * Copyright (C) 2008-2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + + +/* #undef DEBUG */ +/* #undef VERBOSE */ + +#if defined(CONFIG_USB_LANGWELL_OTG) +#define OTG_TRANSCEIVER +#endif + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "langwell_udc.h" + + +#define DRIVER_DESC "Intel Langwell USB Device Controller driver" +#define DRIVER_VERSION "16 May 2009" + +static const char driver_name[] = "langwell_udc"; +static const char driver_desc[] = DRIVER_DESC; + + +/* controller device global variable */ +static struct langwell_udc *the_controller; + +/* for endpoint 0 operations */ +static const struct usb_endpoint_descriptor +langwell_ep0_desc = { + .bLength = USB_DT_ENDPOINT_SIZE, + .bDescriptorType = USB_DT_ENDPOINT, + .bEndpointAddress = 0, + .bmAttributes = USB_ENDPOINT_XFER_CONTROL, + .wMaxPacketSize = EP0_MAX_PKT_SIZE, +}; + + +/*-------------------------------------------------------------------------*/ +/* debugging */ + +#ifdef DEBUG +#define DBG(dev, fmt, args...) \ + pr_debug("%s %s: " fmt , driver_name, \ + pci_name(dev->pdev), ## args) +#else +#define DBG(dev, fmt, args...) \ + do { } while (0) +#endif /* DEBUG */ + + +#ifdef VERBOSE +#define VDBG DBG +#else +#define VDBG(dev, fmt, args...) \ + do { } while (0) +#endif /* VERBOSE */ + + +#define ERROR(dev, fmt, args...) \ + pr_err("%s %s: " fmt , driver_name, \ + pci_name(dev->pdev), ## args) + +#define WARNING(dev, fmt, args...) \ + pr_warning("%s %s: " fmt , driver_name, \ + pci_name(dev->pdev), ## args) + +#define INFO(dev, fmt, args...) 
\ + pr_info("%s %s: " fmt , driver_name, \ + pci_name(dev->pdev), ## args) + + +#ifdef VERBOSE +static inline void print_all_registers(struct langwell_udc *dev) +{ + int i; + + /* Capability Registers */ + printk(KERN_DEBUG "Capability Registers (offset: " + "0x%04x, length: 0x%08x)\n", + CAP_REG_OFFSET, + (u32)sizeof(struct langwell_cap_regs)); + printk(KERN_DEBUG "caplength=0x%02x\n", + readb(&dev->cap_regs->caplength)); + printk(KERN_DEBUG "hciversion=0x%04x\n", + readw(&dev->cap_regs->hciversion)); + printk(KERN_DEBUG "hcsparams=0x%08x\n", + readl(&dev->cap_regs->hcsparams)); + printk(KERN_DEBUG "hccparams=0x%08x\n", + readl(&dev->cap_regs->hccparams)); + printk(KERN_DEBUG "dciversion=0x%04x\n", + readw(&dev->cap_regs->dciversion)); + printk(KERN_DEBUG "dccparams=0x%08x\n", + readl(&dev->cap_regs->dccparams)); + + /* Operational Registers */ + printk(KERN_DEBUG "Operational Registers (offset: " + "0x%04x, length: 0x%08x)\n", + OP_REG_OFFSET, + (u32)sizeof(struct langwell_op_regs)); + printk(KERN_DEBUG "extsts=0x%08x\n", + readl(&dev->op_regs->extsts)); + printk(KERN_DEBUG "extintr=0x%08x\n", + readl(&dev->op_regs->extintr)); + printk(KERN_DEBUG "usbcmd=0x%08x\n", + readl(&dev->op_regs->usbcmd)); + printk(KERN_DEBUG "usbsts=0x%08x\n", + readl(&dev->op_regs->usbsts)); + printk(KERN_DEBUG "usbintr=0x%08x\n", + readl(&dev->op_regs->usbintr)); + printk(KERN_DEBUG "frindex=0x%08x\n", + readl(&dev->op_regs->frindex)); + printk(KERN_DEBUG "ctrldssegment=0x%08x\n", + readl(&dev->op_regs->ctrldssegment)); + printk(KERN_DEBUG "deviceaddr=0x%08x\n", + readl(&dev->op_regs->deviceaddr)); + printk(KERN_DEBUG "endpointlistaddr=0x%08x\n", + readl(&dev->op_regs->endpointlistaddr)); + printk(KERN_DEBUG "ttctrl=0x%08x\n", + readl(&dev->op_regs->ttctrl)); + printk(KERN_DEBUG "burstsize=0x%08x\n", + readl(&dev->op_regs->burstsize)); + printk(KERN_DEBUG "txfilltuning=0x%08x\n", + readl(&dev->op_regs->txfilltuning)); + printk(KERN_DEBUG "txttfilltuning=0x%08x\n", + readl(&dev->op_regs->txttfilltuning)); + printk(KERN_DEBUG "ic_usb=0x%08x\n", + readl(&dev->op_regs->ic_usb)); + printk(KERN_DEBUG "ulpi_viewport=0x%08x\n", + readl(&dev->op_regs->ulpi_viewport)); + printk(KERN_DEBUG "configflag=0x%08x\n", + readl(&dev->op_regs->configflag)); + printk(KERN_DEBUG "portsc1=0x%08x\n", + readl(&dev->op_regs->portsc1)); + printk(KERN_DEBUG "devlc=0x%08x\n", + readl(&dev->op_regs->devlc)); + printk(KERN_DEBUG "otgsc=0x%08x\n", + readl(&dev->op_regs->otgsc)); + printk(KERN_DEBUG "usbmode=0x%08x\n", + readl(&dev->op_regs->usbmode)); + printk(KERN_DEBUG "endptnak=0x%08x\n", + readl(&dev->op_regs->endptnak)); + printk(KERN_DEBUG "endptnaken=0x%08x\n", + readl(&dev->op_regs->endptnaken)); + printk(KERN_DEBUG "endptsetupstat=0x%08x\n", + readl(&dev->op_regs->endptsetupstat)); + printk(KERN_DEBUG "endptprime=0x%08x\n", + readl(&dev->op_regs->endptprime)); + printk(KERN_DEBUG "endptflush=0x%08x\n", + readl(&dev->op_regs->endptflush)); + printk(KERN_DEBUG "endptstat=0x%08x\n", + readl(&dev->op_regs->endptstat)); + printk(KERN_DEBUG "endptcomplete=0x%08x\n", + readl(&dev->op_regs->endptcomplete)); + + for (i = 0; i < dev->ep_max / 2; i++) { + printk(KERN_DEBUG "endptctrl[%d]=0x%08x\n", + i, readl(&dev->op_regs->endptctrl[i])); + } +} +#endif /* VERBOSE */ + + +/*-------------------------------------------------------------------------*/ + +#define DIR_STRING(bAddress) (((bAddress) & USB_DIR_IN) ? "in" : "out") + +#define is_in(ep) (((ep)->ep_num == 0) ? 
((ep)->dev->ep0_dir == \ + USB_DIR_IN) : ((ep)->desc->bEndpointAddress \ + & USB_DIR_IN) == USB_DIR_IN) + + +#ifdef DEBUG +static char *type_string(u8 bmAttributes) +{ + switch ((bmAttributes) & USB_ENDPOINT_XFERTYPE_MASK) { + case USB_ENDPOINT_XFER_BULK: + return "bulk"; + case USB_ENDPOINT_XFER_ISOC: + return "iso"; + case USB_ENDPOINT_XFER_INT: + return "int"; + }; + + return "control"; +} +#endif + + +/* configure endpoint control registers */ +static void ep_reset(struct langwell_ep *ep, unsigned char ep_num, + unsigned char is_in, unsigned char ep_type) +{ + struct langwell_udc *dev; + u32 endptctrl; + + dev = ep->dev; + VDBG(dev, "---> %s()\n", __func__); + + endptctrl = readl(&dev->op_regs->endptctrl[ep_num]); + if (is_in) { /* TX */ + if (ep_num) + endptctrl |= EPCTRL_TXR; + endptctrl |= EPCTRL_TXE; + endptctrl |= ep_type << EPCTRL_TXT_SHIFT; + } else { /* RX */ + if (ep_num) + endptctrl |= EPCTRL_RXR; + endptctrl |= EPCTRL_RXE; + endptctrl |= ep_type << EPCTRL_RXT_SHIFT; + } + + writel(endptctrl, &dev->op_regs->endptctrl[ep_num]); + + VDBG(dev, "<--- %s()\n", __func__); +} + + +/* reset ep0 dQH and endptctrl */ +static void ep0_reset(struct langwell_udc *dev) +{ + struct langwell_ep *ep; + int i; + + VDBG(dev, "---> %s()\n", __func__); + + /* ep0 in and out */ + for (i = 0; i < 2; i++) { + ep = &dev->ep[i]; + ep->dev = dev; + + /* ep0 dQH */ + ep->dqh = &dev->ep_dqh[i]; + + /* configure ep0 endpoint capabilities in dQH */ + ep->dqh->dqh_ios = 1; + ep->dqh->dqh_mpl = EP0_MAX_PKT_SIZE; + + /* FIXME: enable ep0-in HW zero length termination select */ + if (is_in(ep)) + ep->dqh->dqh_zlt = 0; + ep->dqh->dqh_mult = 0; + + /* configure ep0 control registers */ + ep_reset(&dev->ep[0], 0, i, USB_ENDPOINT_XFER_CONTROL); + } + + VDBG(dev, "<--- %s()\n", __func__); + return; +} + + +/*-------------------------------------------------------------------------*/ + +/* endpoints operations */ + +/* configure endpoint, making it usable */ +static int langwell_ep_enable(struct usb_ep *_ep, + const struct usb_endpoint_descriptor *desc) +{ + struct langwell_udc *dev; + struct langwell_ep *ep; + u16 max = 0; + unsigned long flags; + int retval = 0; + unsigned char zlt, ios = 0, mult = 0; + + ep = container_of(_ep, struct langwell_ep, ep); + dev = ep->dev; + VDBG(dev, "---> %s()\n", __func__); + + if (!_ep || !desc || ep->desc + || desc->bDescriptorType != USB_DT_ENDPOINT) + return -EINVAL; + + if (!dev->driver || dev->gadget.speed == USB_SPEED_UNKNOWN) + return -ESHUTDOWN; + + max = le16_to_cpu(desc->wMaxPacketSize); + + /* + * disable HW zero length termination select + * driver handles zero length packet through req->req.zero + */ + zlt = 1; + + /* + * sanity check type, direction, address, and then + * initialize the endpoint capabilities fields in dQH + */ + switch (desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) { + case USB_ENDPOINT_XFER_CONTROL: + ios = 1; + break; + case USB_ENDPOINT_XFER_BULK: + if ((dev->gadget.speed == USB_SPEED_HIGH + && max != 512) + || (dev->gadget.speed == USB_SPEED_FULL + && max > 64)) { + goto done; + } + break; + case USB_ENDPOINT_XFER_INT: + if (strstr(ep->ep.name, "-iso")) /* bulk is ok */ + goto done; + + switch (dev->gadget.speed) { + case USB_SPEED_HIGH: + if (max <= 1024) + break; + case USB_SPEED_FULL: + if (max <= 64) + break; + default: + if (max <= 8) + break; + goto done; + } + break; + case USB_ENDPOINT_XFER_ISOC: + if (strstr(ep->ep.name, "-bulk") + || strstr(ep->ep.name, "-int")) + goto done; + + switch (dev->gadget.speed) { + case USB_SPEED_HIGH: + 
if (max <= 1024) + break; + case USB_SPEED_FULL: + if (max <= 1023) + break; + default: + goto done; + } + /* + * FIXME: + * calculate transactions needed for high bandwidth iso + */ + mult = (unsigned char)(1 + ((max >> 11) & 0x03)); + max = max & 0x7ff; /* bits 0~10 */ + /* 3 transactions at most */ + if (mult > 3) + goto done; + break; + default: + goto done; + } + + spin_lock_irqsave(&dev->lock, flags); + + /* configure endpoint capabilities in dQH */ + ep->dqh->dqh_ios = ios; + ep->dqh->dqh_mpl = cpu_to_le16(max); + ep->dqh->dqh_zlt = zlt; + ep->dqh->dqh_mult = mult; + + ep->ep.maxpacket = max; + ep->desc = desc; + ep->stopped = 0; + ep->ep_num = desc->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; + + /* ep_type */ + ep->ep_type = desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK; + + /* configure endpoint control registers */ + ep_reset(ep, ep->ep_num, is_in(ep), ep->ep_type); + + DBG(dev, "enabled %s (ep%d%s-%s), max %04x\n", + _ep->name, + ep->ep_num, + DIR_STRING(desc->bEndpointAddress), + type_string(desc->bmAttributes), + max); + + spin_unlock_irqrestore(&dev->lock, flags); +done: + VDBG(dev, "<--- %s()\n", __func__); + return retval; +} + + +/*-------------------------------------------------------------------------*/ + +/* retire a request */ +static void done(struct langwell_ep *ep, struct langwell_request *req, + int status) +{ + struct langwell_udc *dev = ep->dev; + unsigned stopped = ep->stopped; + struct langwell_dtd *curr_dtd, *next_dtd; + int i; + + VDBG(dev, "---> %s()\n", __func__); + + /* remove the req from ep->queue */ + list_del_init(&req->queue); + + if (req->req.status == -EINPROGRESS) + req->req.status = status; + else + status = req->req.status; + + /* free dTD for the request */ + next_dtd = req->head; + for (i = 0; i < req->dtd_count; i++) { + curr_dtd = next_dtd; + if (i != req->dtd_count - 1) + next_dtd = curr_dtd->next_dtd_virt; + dma_pool_free(dev->dtd_pool, curr_dtd, curr_dtd->dtd_dma); + } + + if (req->mapped) { + dma_unmap_single(&dev->pdev->dev, req->req.dma, req->req.length, + is_in(ep) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + req->req.dma = DMA_ADDR_INVALID; + req->mapped = 0; + } else + dma_sync_single_for_cpu(&dev->pdev->dev, req->req.dma, + req->req.length, + is_in(ep) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE); + + if (status != -ESHUTDOWN) + DBG(dev, "complete %s, req %p, stat %d, len %u/%u\n", + ep->ep.name, &req->req, status, + req->req.actual, req->req.length); + + /* don't modify queue heads during completion callback */ + ep->stopped = 1; + + spin_unlock(&dev->lock); + /* complete routine from gadget driver */ + if (req->req.complete) + req->req.complete(&ep->ep, &req->req); + + spin_lock(&dev->lock); + ep->stopped = stopped; + + VDBG(dev, "<--- %s()\n", __func__); +} + + +static void langwell_ep_fifo_flush(struct usb_ep *_ep); + +/* delete all endpoint requests, called with spinlock held */ +static void nuke(struct langwell_ep *ep, int status) +{ + /* called with spinlock held */ + ep->stopped = 1; + + /* endpoint fifo flush */ + if (&ep->ep && ep->desc) + langwell_ep_fifo_flush(&ep->ep); + + while (!list_empty(&ep->queue)) { + struct langwell_request *req = NULL; + req = list_entry(ep->queue.next, struct langwell_request, + queue); + done(ep, req, status); + } +} + + +/*-------------------------------------------------------------------------*/ + +/* endpoint is no longer usable */ +static int langwell_ep_disable(struct usb_ep *_ep) +{ + struct langwell_ep *ep; + unsigned long flags; + struct langwell_udc *dev; + int ep_num; + u32 endptctrl; + + ep = container_of(_ep, struct langwell_ep, ep); + dev = ep->dev; + VDBG(dev, "---> %s()\n", __func__); + + if (!_ep || !ep->desc) + return -EINVAL; + + spin_lock_irqsave(&dev->lock, flags); + + /* disable endpoint control register */ + ep_num = ep->ep_num; + endptctrl = readl(&dev->op_regs->endptctrl[ep_num]); + if (is_in(ep)) + endptctrl &= ~EPCTRL_TXE; + else + endptctrl &= ~EPCTRL_RXE; + writel(endptctrl, &dev->op_regs->endptctrl[ep_num]); + + /* nuke all pending requests (does flush) */ + nuke(ep, -ESHUTDOWN); + + ep->desc = NULL; + ep->stopped = 1; + + spin_unlock_irqrestore(&dev->lock, flags); + + DBG(dev, "disabled %s\n", _ep->name); + VDBG(dev, "<--- %s()\n", __func__); + + return 0; +} + + +/* allocate a request object to use with this endpoint */ +static struct usb_request *langwell_alloc_request(struct usb_ep *_ep, + gfp_t gfp_flags) +{ + struct langwell_ep *ep; + struct langwell_udc *dev; + struct langwell_request *req = NULL; + + if (!_ep) + return NULL; + + ep = container_of(_ep, struct langwell_ep, ep); + dev = ep->dev; + VDBG(dev, "---> %s()\n", __func__); + + req = kzalloc(sizeof(*req), gfp_flags); + if (!req) + return NULL; + + req->req.dma = DMA_ADDR_INVALID; + INIT_LIST_HEAD(&req->queue); + + VDBG(dev, "alloc request for %s\n", _ep->name); + VDBG(dev, "<--- %s()\n", __func__); + return &req->req; +} + + +/* free a request object */ +static void langwell_free_request(struct usb_ep *_ep, + struct usb_request *_req) +{ + struct langwell_ep *ep; + struct langwell_udc *dev; + struct langwell_request *req = NULL; + + ep = container_of(_ep, struct langwell_ep, ep); + dev = ep->dev; + VDBG(dev, "---> %s()\n", __func__); + + if (!_ep || !_req) + return; + + req = container_of(_req, struct langwell_request, req); + WARN_ON(!list_empty(&req->queue)); + + if (_req) + kfree(req); + + VDBG(dev, "free request for %s\n", _ep->name); + VDBG(dev, "<--- %s()\n", __func__); +} + + +/*-------------------------------------------------------------------------*/ + +/* queue dTD and PRIME endpoint */ +static int queue_dtd(struct langwell_ep *ep, struct langwell_request *req) +{ + u32 bit_mask, usbcmd, endptstat, dtd_dma; + u8 dtd_status; + int i; + struct langwell_dqh *dqh; + struct langwell_udc *dev; + + dev = 
ep->dev; + VDBG(dev, "---> %s()\n", __func__); + + i = ep->ep_num * 2 + is_in(ep); + dqh = &dev->ep_dqh[i]; + + if (ep->ep_num) + VDBG(dev, "%s\n", ep->name); + else + /* ep0 */ + VDBG(dev, "%s-%s\n", ep->name, is_in(ep) ? "in" : "out"); + + VDBG(dev, "ep_dqh[%d] addr: 0x%08x\n", i, (u32)&(dev->ep_dqh[i])); + + bit_mask = is_in(ep) ? + (1 << (ep->ep_num + 16)) : (1 << (ep->ep_num)); + + VDBG(dev, "bit_mask = 0x%08x\n", bit_mask); + + /* check if the pipe is empty */ + if (!(list_empty(&ep->queue))) { + /* add dTD to the end of linked list */ + struct langwell_request *lastreq; + lastreq = list_entry(ep->queue.prev, + struct langwell_request, queue); + + lastreq->tail->dtd_next = + cpu_to_le32(req->head->dtd_dma & DTD_NEXT_MASK); + + /* read prime bit, if 1 goto out */ + if (readl(&dev->op_regs->endptprime) & bit_mask) + goto out; + + do { + /* set ATDTW bit in USBCMD */ + usbcmd = readl(&dev->op_regs->usbcmd); + writel(usbcmd | CMD_ATDTW, &dev->op_regs->usbcmd); + + /* read correct status bit */ + endptstat = readl(&dev->op_regs->endptstat) & bit_mask; + + } while (!(readl(&dev->op_regs->usbcmd) & CMD_ATDTW)); + + /* write ATDTW bit to 0 */ + usbcmd = readl(&dev->op_regs->usbcmd); + writel(usbcmd & ~CMD_ATDTW, &dev->op_regs->usbcmd); + + if (endptstat) + goto out; + } + + /* write dQH next pointer and terminate bit to 0 */ + dtd_dma = req->head->dtd_dma & DTD_NEXT_MASK; + dqh->dtd_next = cpu_to_le32(dtd_dma); + + /* clear active and halt bit */ + dtd_status = (u8) ~(DTD_STS_ACTIVE | DTD_STS_HALTED); + dqh->dtd_status &= dtd_status; + VDBG(dev, "dqh->dtd_status = 0x%x\n", dqh->dtd_status); + + /* write 1 to endptprime register to PRIME endpoint */ + bit_mask = is_in(ep) ? (1 << (ep->ep_num + 16)) : (1 << ep->ep_num); + VDBG(dev, "endprime bit_mask = 0x%08x\n", bit_mask); + writel(bit_mask, &dev->op_regs->endptprime); +out: + VDBG(dev, "<--- %s()\n", __func__); + return 0; +} + + +/* fill in the dTD structure to build a transfer descriptor */ +static struct langwell_dtd *build_dtd(struct langwell_request *req, + unsigned *length, dma_addr_t *dma, int *is_last) +{ + u32 buf_ptr; + struct langwell_dtd *dtd; + struct langwell_udc *dev; + int i; + + dev = req->ep->dev; + VDBG(dev, "---> %s()\n", __func__); + + /* the maximum transfer length, up to 16k bytes */ + *length = min(req->req.length - req->req.actual, + (unsigned)DTD_MAX_TRANSFER_LENGTH); + + /* create dTD dma_pool resource */ + dtd = dma_pool_alloc(dev->dtd_pool, GFP_KERNEL, dma); + if (dtd == NULL) + return dtd; + dtd->dtd_dma = *dma; + + /* initialize buffer page pointers */ + buf_ptr = (u32)(req->req.dma + req->req.actual); + for (i = 0; i < 5; i++) + dtd->dtd_buf[i] = cpu_to_le32(buf_ptr + i * PAGE_SIZE); + + req->req.actual += *length; + + /* fill in total bytes with transfer size */ + dtd->dtd_total = cpu_to_le16(*length); + VDBG(dev, "dtd->dtd_total = %d\n", dtd->dtd_total); + + /* set is_last flag if req->req.zero is set or not */ + if (req->req.zero) { + if (*length == 0 || (*length % req->ep->ep.maxpacket) != 0) + *is_last = 1; + else + *is_last = 0; + } else if (req->req.length == req->req.actual) { + *is_last = 1; + } else + *is_last = 0; + + if (*is_last == 0) + VDBG(dev, "multi-dtd request!\n"); + + /* set interrupt on complete bit for the last dTD */ + if (*is_last && !req->req.no_interrupt) + dtd->dtd_ioc = 1; + + /* set multiplier override 0 for non-ISO and non-TX endpoint */ + dtd->dtd_multo = 0; + + /* set the active bit of status field to 1 */ + dtd->dtd_status = DTD_STS_ACTIVE; + VDBG(dev, "dtd->dtd_status = 
0x%02x\n", dtd->dtd_status); + + VDBG(dev, "length = %d, dma addr= 0x%08x\n", *length, (int)*dma); + VDBG(dev, "<--- %s()\n", __func__); + return dtd; +} + + +/* generate dTD linked list for a request */ +static int req_to_dtd(struct langwell_request *req) +{ + unsigned count; + int is_last, is_first = 1; + struct langwell_dtd *dtd, *last_dtd = NULL; + struct langwell_udc *dev; + dma_addr_t dma; + + dev = req->ep->dev; + VDBG(dev, "---> %s()\n", __func__); + do { + dtd = build_dtd(req, &count, &dma, &is_last); + if (dtd == NULL) + return -ENOMEM; + + if (is_first) { + is_first = 0; + req->head = dtd; + } else { + last_dtd->dtd_next = cpu_to_le32(dma); + last_dtd->next_dtd_virt = dtd; + } + last_dtd = dtd; + req->dtd_count++; + } while (!is_last); + + /* set terminate bit to 1 for the last dTD */ + dtd->dtd_next = DTD_TERM; + + req->tail = dtd; + + VDBG(dev, "<--- %s()\n", __func__); + return 0; +} + +/*-------------------------------------------------------------------------*/ + +/* queue (submits) an I/O requests to an endpoint */ +static int langwell_ep_queue(struct usb_ep *_ep, struct usb_request *_req, + gfp_t gfp_flags) +{ + struct langwell_request *req; + struct langwell_ep *ep; + struct langwell_udc *dev; + unsigned long flags; + int is_iso = 0, zlflag = 0; + + /* always require a cpu-view buffer */ + req = container_of(_req, struct langwell_request, req); + ep = container_of(_ep, struct langwell_ep, ep); + + if (!_req || !_req->complete || !_req->buf + || !list_empty(&req->queue)) { + return -EINVAL; + } + + if (unlikely(!_ep || !ep->desc)) + return -EINVAL; + + dev = ep->dev; + req->ep = ep; + VDBG(dev, "---> %s()\n", __func__); + + if (ep->desc->bmAttributes == USB_ENDPOINT_XFER_ISOC) { + if (req->req.length > ep->ep.maxpacket) + return -EMSGSIZE; + is_iso = 1; + } + + if (unlikely(!dev->driver || dev->gadget.speed == USB_SPEED_UNKNOWN)) + return -ESHUTDOWN; + + /* set up dma mapping in case the caller didn't */ + if (_req->dma == DMA_ADDR_INVALID) { + /* WORKAROUND: WARN_ON(size == 0) */ + if (_req->length == 0) { + VDBG(dev, "req->length: 0->1\n"); + zlflag = 1; + _req->length++; + } + + _req->dma = dma_map_single(&dev->pdev->dev, + _req->buf, _req->length, + is_in(ep) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (zlflag && (_req->length == 1)) { + VDBG(dev, "req->length: 1->0\n"); + zlflag = 0; + _req->length = 0; + } + + req->mapped = 1; + VDBG(dev, "req->mapped = 1\n"); + } else { + dma_sync_single_for_device(&dev->pdev->dev, + _req->dma, _req->length, + is_in(ep) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); + req->mapped = 0; + VDBG(dev, "req->mapped = 0\n"); + } + + DBG(dev, "%s queue req %p, len %u, buf %p, dma 0x%08x\n", + _ep->name, + _req, _req->length, _req->buf, _req->dma); + + _req->status = -EINPROGRESS; + _req->actual = 0; + req->dtd_count = 0; + + spin_lock_irqsave(&dev->lock, flags); + + /* build and put dTDs to endpoint queue */ + if (!req_to_dtd(req)) { + queue_dtd(ep, req); + } else { + spin_unlock_irqrestore(&dev->lock, flags); + return -ENOMEM; + } + + /* update ep0 state */ + if (ep->ep_num == 0) + dev->ep0_state = DATA_STATE_XMIT; + + if (likely(req != NULL)) { + list_add_tail(&req->queue, &ep->queue); + VDBG(dev, "list_add_tail() \n"); + } + + spin_unlock_irqrestore(&dev->lock, flags); + + VDBG(dev, "<--- %s()\n", __func__); + return 0; +} + + +/* dequeue (cancels, unlinks) an I/O request from an endpoint */ +static int langwell_ep_dequeue(struct usb_ep *_ep, struct usb_request *_req) +{ + struct langwell_ep *ep; + struct langwell_udc *dev; + struct langwell_request *req; + unsigned long flags; + int stopped, ep_num, retval = 0; + u32 endptctrl; + + ep = container_of(_ep, struct langwell_ep, ep); + dev = ep->dev; + VDBG(dev, "---> %s()\n", __func__); + + if (!_ep || !ep->desc || !_req) + return -EINVAL; + + if (!dev->driver) + return -ESHUTDOWN; + + spin_lock_irqsave(&dev->lock, flags); + stopped = ep->stopped; + + /* quiesce dma while we patch the queue */ + ep->stopped = 1; + ep_num = ep->ep_num; + + /* disable endpoint control register */ + endptctrl = readl(&dev->op_regs->endptctrl[ep_num]); + if (is_in(ep)) + endptctrl &= ~EPCTRL_TXE; + else + endptctrl &= ~EPCTRL_RXE; + writel(endptctrl, &dev->op_regs->endptctrl[ep_num]); + + /* make sure it's still queued on this endpoint */ + list_for_each_entry(req, &ep->queue, queue) { + if (&req->req == _req) + break; + } + + if (&req->req != _req) { + retval = -EINVAL; + goto done; + } + + /* queue head may be partially complete. 
*/ + if (ep->queue.next == &req->queue) { + DBG(dev, "unlink (%s) dma\n", _ep->name); + _req->status = -ECONNRESET; + langwell_ep_fifo_flush(&ep->ep); + + /* not the last request in endpoint queue */ + if (likely(ep->queue.next == &req->queue)) { + struct langwell_dqh *dqh; + struct langwell_request *next_req; + + dqh = ep->dqh; + next_req = list_entry(req->queue.next, + struct langwell_request, queue); + + /* point the dQH to the first dTD of next request */ + writel((u32) next_req->head, &dqh->dqh_current); + } + } else { + struct langwell_request *prev_req; + + prev_req = list_entry(req->queue.prev, + struct langwell_request, queue); + writel(readl(&req->tail->dtd_next), + &prev_req->tail->dtd_next); + } + + done(ep, req, -ECONNRESET); + +done: + /* enable endpoint again */ + endptctrl = readl(&dev->op_regs->endptctrl[ep_num]); + if (is_in(ep)) + endptctrl |= EPCTRL_TXE; + else + endptctrl |= EPCTRL_RXE; + writel(endptctrl, &dev->op_regs->endptctrl[ep_num]); + + ep->stopped = stopped; + spin_unlock_irqrestore(&dev->lock, flags); + + VDBG(dev, "<--- %s()\n", __func__); + return retval; +} + + +/*-------------------------------------------------------------------------*/ + +/* endpoint set/clear halt */ +static void ep_set_halt(struct langwell_ep *ep, int value) +{ + u32 endptctrl = 0; + int ep_num; + struct langwell_udc *dev = ep->dev; + VDBG(dev, "---> %s()\n", __func__); + + ep_num = ep->ep_num; + endptctrl = readl(&dev->op_regs->endptctrl[ep_num]); + + /* value: 1 - set halt, 0 - clear halt */ + if (value) { + /* set the stall bit */ + if (is_in(ep)) + endptctrl |= EPCTRL_TXS; + else + endptctrl |= EPCTRL_RXS; + } else { + /* clear the stall bit and reset data toggle */ + if (is_in(ep)) { + endptctrl &= ~EPCTRL_TXS; + endptctrl |= EPCTRL_TXR; + } else { + endptctrl &= ~EPCTRL_RXS; + endptctrl |= EPCTRL_RXR; + } + } + + writel(endptctrl, &dev->op_regs->endptctrl[ep_num]); + + VDBG(dev, "<--- %s()\n", __func__); +} + + +/* set the endpoint halt feature */ +static int langwell_ep_set_halt(struct usb_ep *_ep, int value) +{ + struct langwell_ep *ep; + struct langwell_udc *dev; + unsigned long flags; + int retval = 0; + + ep = container_of(_ep, struct langwell_ep, ep); + dev = ep->dev; + + VDBG(dev, "---> %s()\n", __func__); + + if (!_ep || !ep->desc) + return -EINVAL; + + if (!dev->driver || dev->gadget.speed == USB_SPEED_UNKNOWN) + return -ESHUTDOWN; + + if (ep->desc && (ep->desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) + == USB_ENDPOINT_XFER_ISOC) + return -EOPNOTSUPP; + + spin_lock_irqsave(&dev->lock, flags); + + /* + * an attempt to halt an IN ep will fail if any transfer requests + * are still queued + */ + if (!list_empty(&ep->queue) && is_in(ep) && value) { + /* IN endpoint FIFO holds bytes */ + DBG(dev, "%s FIFO holds bytes\n", _ep->name); + retval = -EAGAIN; + goto done; + } + + /* endpoint set/clear halt */ + if (ep->ep_num) { + ep_set_halt(ep, value); + } else { /* endpoint 0 */ + dev->ep0_state = WAIT_FOR_SETUP; + dev->ep0_dir = USB_DIR_OUT; + } +done: + spin_unlock_irqrestore(&dev->lock, flags); + DBG(dev, "%s %s halt\n", _ep->name, value ? "set" : "clear"); + VDBG(dev, "<--- %s()\n", __func__); + return retval; +} + + +/* set the halt feature and ignore clear requests */ +static int langwell_ep_set_wedge(struct usb_ep *_ep) +{ + struct langwell_ep *ep; + struct langwell_udc *dev; + + ep = container_of(_ep, struct langwell_ep, ep); + dev = ep->dev; + + VDBG(dev, "---> %s()\n", __func__); + + if (!_ep || !ep->desc) + return -EINVAL; + + VDBG(dev, "<--- %s()\n", __func__); + return usb_ep_set_halt(_ep); +} + + +/* flush contents of a fifo */ +static void langwell_ep_fifo_flush(struct usb_ep *_ep) +{ + struct langwell_ep *ep; + struct langwell_udc *dev; + u32 flush_bit; + unsigned long timeout; + + ep = container_of(_ep, struct langwell_ep, ep); + dev = ep->dev; + + VDBG(dev, "---> %s()\n", __func__); + + if (!_ep || !ep->desc) { + VDBG(dev, "ep or ep->desc is NULL\n"); + VDBG(dev, "<--- %s()\n", __func__); + return; + } + + VDBG(dev, "%s-%s fifo flush\n", _ep->name, is_in(ep) ? "in" : "out"); + + /* flush endpoint buffer */ + if (ep->ep_num == 0) + flush_bit = (1 << 16) | 1; + else if (is_in(ep)) + flush_bit = 1 << (ep->ep_num + 16); /* TX */ + else + flush_bit = 1 << ep->ep_num; /* RX */ + + /* wait until flush complete */ + timeout = jiffies + FLUSH_TIMEOUT; + do { + writel(flush_bit, &dev->op_regs->endptflush); + while (readl(&dev->op_regs->endptflush)) { + if (time_after(jiffies, timeout)) { + ERROR(dev, "ep flush timeout\n"); + goto done; + } + cpu_relax(); + } + } while (readl(&dev->op_regs->endptstat) & flush_bit); +done: + VDBG(dev, "<--- %s()\n", __func__); +} + + +/* endpoints operations structure */ +static const struct usb_ep_ops langwell_ep_ops = { + + /* configure endpoint, making it usable */ + .enable = langwell_ep_enable, + + /* endpoint is no longer usable */ + .disable = langwell_ep_disable, + + /* allocate a request object to use with this endpoint */ + .alloc_request = langwell_alloc_request, + + /* free a request object */ + .free_request = langwell_free_request, + + /* queue (submits) an I/O request to an endpoint */ + .queue = langwell_ep_queue, + + /* dequeue (cancels, unlinks) an I/O request from an endpoint */ + .dequeue = langwell_ep_dequeue, + + /* set the endpoint halt feature */ + .set_halt = langwell_ep_set_halt, + + /* set the halt feature and ignore clear requests */ + .set_wedge = langwell_ep_set_wedge, + + /* flush contents of a fifo */ + .fifo_flush = langwell_ep_fifo_flush, +}; + + +/*-------------------------------------------------------------------------*/ + +/* device controller usb_gadget_ops structure */ + +/* returns the current frame number */ +static int langwell_get_frame(struct usb_gadget *_gadget) +{ + struct langwell_udc *dev; + u16 retval; + + if (!_gadget) + return -ENODEV; + + dev = container_of(_gadget, struct langwell_udc, gadget); + VDBG(dev, "---> %s()\n", __func__); + + retval = readl(&dev->op_regs->frindex) & FRINDEX_MASK; + + VDBG(dev, "<--- %s()\n", __func__); + return retval; +} + + +/* tries to wake up the host connected to this gadget */ +static int langwell_wakeup(struct usb_gadget *_gadget) +{ + struct langwell_udc *dev; + u32 portsc1, devlc; + unsigned long flags; + + if (!_gadget) + return 0; + + dev = container_of(_gadget, struct langwell_udc, gadget); + VDBG(dev, "---> %s()\n", __func__); + + /* Remote Wakeup feature not enabled by host */ + if (!dev->remote_wakeup) + return -ENOTSUPP; + + spin_lock_irqsave(&dev->lock, flags); + + portsc1 = readl(&dev->op_regs->portsc1); + if (!(portsc1 & PORTS_SUSP)) { + spin_unlock_irqrestore(&dev->lock, flags); +
return 0; + } + + /* LPM L1 to L0, remote wakeup */ + if (dev->lpm && dev->lpm_state == LPM_L1) { + portsc1 |= PORTS_SLP; + writel(portsc1, &dev->op_regs->portsc1); + } + + /* force port resume */ + if (dev->usb_state == USB_STATE_SUSPENDED) { + portsc1 |= PORTS_FPR; + writel(portsc1, &dev->op_regs->portsc1); + } + + /* exit PHY low power suspend */ + devlc = readl(&dev->op_regs->devlc); + VDBG(dev, "devlc = 0x%08x\n", devlc); + devlc &= ~LPM_PHCD; + writel(devlc, &dev->op_regs->devlc); + + spin_unlock_irqrestore(&dev->lock, flags); + + VDBG(dev, "<--- %s()\n", __func__); + return 0; +} + + +/* notify controller that VBUS is powered or not */ +static int langwell_vbus_session(struct usb_gadget *_gadget, int is_active) +{ + struct langwell_udc *dev; + unsigned long flags; + u32 usbcmd; + + if (!_gadget) + return -ENODEV; + + dev = container_of(_gadget, struct langwell_udc, gadget); + VDBG(dev, "---> %s()\n", __func__); + + spin_lock_irqsave(&dev->lock, flags); + VDBG(dev, "VBUS status: %s\n", is_active ? "on" : "off"); + + dev->vbus_active = (is_active != 0); + if (dev->driver && dev->softconnected && dev->vbus_active) { + usbcmd = readl(&dev->op_regs->usbcmd); + usbcmd |= CMD_RUNSTOP; + writel(usbcmd, &dev->op_regs->usbcmd); + } else { + usbcmd = readl(&dev->op_regs->usbcmd); + usbcmd &= ~CMD_RUNSTOP; + writel(usbcmd, &dev->op_regs->usbcmd); + } + + spin_unlock_irqrestore(&dev->lock, flags); + + VDBG(dev, "<--- %s()\n", __func__); + return 0; +} + + +/* constrain controller's VBUS power usage */ +static int langwell_vbus_draw(struct usb_gadget *_gadget, unsigned mA) +{ + struct langwell_udc *dev; + + if (!_gadget) + return -ENODEV; + + dev = container_of(_gadget, struct langwell_udc, gadget); + VDBG(dev, "---> %s()\n", __func__); + + if (dev->transceiver) { + VDBG(dev, "otg_set_power\n"); + VDBG(dev, "<--- %s()\n", __func__); + return otg_set_power(dev->transceiver, mA); + } + + VDBG(dev, "<--- %s()\n", __func__); + return -ENOTSUPP; +} + + +/* D+ pullup, software-controlled connect/disconnect to USB host */ +static int langwell_pullup(struct usb_gadget *_gadget, int is_on) +{ + struct langwell_udc *dev; + u32 usbcmd; + unsigned long flags; + + if (!_gadget) + return -ENODEV; + + dev = container_of(_gadget, struct langwell_udc, gadget); + + VDBG(dev, "---> %s()\n", __func__); + + spin_lock_irqsave(&dev->lock, flags); + dev->softconnected = (is_on != 0); + + if (dev->driver && dev->softconnected && dev->vbus_active) { + usbcmd = readl(&dev->op_regs->usbcmd); + usbcmd |= CMD_RUNSTOP; + writel(usbcmd, &dev->op_regs->usbcmd); + } else { + usbcmd = readl(&dev->op_regs->usbcmd); + usbcmd &= ~CMD_RUNSTOP; + writel(usbcmd, &dev->op_regs->usbcmd); + } + spin_unlock_irqrestore(&dev->lock, flags); + + VDBG(dev, "<--- %s()\n", __func__); + return 0; +} + + +/* device controller usb_gadget_ops structure */ +static const struct usb_gadget_ops langwell_ops = { + + /* returns the current frame number */ + .get_frame = langwell_get_frame, + + /* tries to wake up the host connected to this gadget */ + .wakeup = langwell_wakeup, + + /* set the device selfpowered feature, always selfpowered */ + /* .set_selfpowered = langwell_set_selfpowered, */ + + /* notify controller that VBUS is powered or not */ + .vbus_session = langwell_vbus_session, + + /* constrain controller's VBUS power usage */ + .vbus_draw = langwell_vbus_draw, + + /* D+ pullup, software-controlled connect/disconnect to USB host */ + .pullup = langwell_pullup, +}; + + +/*-------------------------------------------------------------------------*/ 
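/*
 * Editorial aside, not part of the patch: gadget drivers never call
 * langwell_ops directly; they reach it through the usb_gadget_* inline
 * wrappers in <linux/usb/gadget.h>, which in this era reduce to a
 * NULL-check plus an indirect call. A representative sketch (the
 * example_* name is hypothetical):
 */
static inline int example_gadget_wakeup(struct usb_gadget *gadget)
{
	if (!gadget->ops->wakeup)
		return -EOPNOTSUPP;
	return gadget->ops->wakeup(gadget);	/* dispatches to langwell_wakeup() */
}
/*
 * Likewise usb_gadget_connect() and usb_gadget_disconnect() end up in
 * langwell_pullup() above with is_on set to 1 or 0 respectively.
 */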
+ +/* device controller operations */ + +/* reset device controller */ +static int langwell_udc_reset(struct langwell_udc *dev) +{ + u32 usbcmd, usbmode, devlc, endpointlistaddr; + unsigned long timeout; + + if (!dev) + return -EINVAL; + + DBG(dev, "---> %s()\n", __func__); + + /* set controller to stop state */ + usbcmd = readl(&dev->op_regs->usbcmd); + usbcmd &= ~CMD_RUNSTOP; + writel(usbcmd, &dev->op_regs->usbcmd); + + /* reset device controller */ + usbcmd = readl(&dev->op_regs->usbcmd); + usbcmd |= CMD_RST; + writel(usbcmd, &dev->op_regs->usbcmd); + + /* wait for reset to complete */ + timeout = jiffies + RESET_TIMEOUT; + while (readl(&dev->op_regs->usbcmd) & CMD_RST) { + if (time_after(jiffies, timeout)) { + ERROR(dev, "device reset timeout\n"); + return -ETIMEDOUT; + } + cpu_relax(); + } + + /* set controller to device mode */ + usbmode = readl(&dev->op_regs->usbmode); + usbmode |= MODE_DEVICE; + + /* turn setup lockout off, require setup tripwire in usbcmd */ + usbmode |= MODE_SLOM; + + writel(usbmode, &dev->op_regs->usbmode); + usbmode = readl(&dev->op_regs->usbmode); + VDBG(dev, "usbmode=0x%08x\n", usbmode); + + /* Write-Clear setup status */ + writel(0, &dev->op_regs->usbsts); + + /* if support USB LPM, ACK all LPM token */ + if (dev->lpm) { + devlc = readl(&dev->op_regs->devlc); + devlc &= ~LPM_STL; /* don't STALL LPM token */ + devlc &= ~LPM_NYT_ACK; /* ACK LPM token */ + writel(devlc, &dev->op_regs->devlc); + } + + /* fill endpointlistaddr register */ + endpointlistaddr = dev->ep_dqh_dma; + endpointlistaddr &= ENDPOINTLISTADDR_MASK; + writel(endpointlistaddr, &dev->op_regs->endpointlistaddr); + + VDBG(dev, "dQH base (vir: %p, phy: 0x%08x), endpointlistaddr=0x%08x\n", + dev->ep_dqh, endpointlistaddr, + readl(&dev->op_regs->endpointlistaddr)); + DBG(dev, "<--- %s()\n", __func__); + return 0; +} + + +/* reinitialize device controller endpoints */ +static int eps_reinit(struct langwell_udc *dev) +{ + struct langwell_ep *ep; + char name[14]; + int i; + + VDBG(dev, "---> %s()\n", __func__); + + /* initialize ep0 */ + ep = &dev->ep[0]; + ep->dev = dev; + strncpy(ep->name, "ep0", sizeof(ep->name)); + ep->ep.name = ep->name; + ep->ep.ops = &langwell_ep_ops; + ep->stopped = 0; + ep->ep.maxpacket = EP0_MAX_PKT_SIZE; + ep->ep_num = 0; + ep->desc = &langwell_ep0_desc; + INIT_LIST_HEAD(&ep->queue); + + ep->ep_type = USB_ENDPOINT_XFER_CONTROL; + + /* initialize other endpoints */ + for (i = 2; i < dev->ep_max; i++) { + ep = &dev->ep[i]; + if (i % 2) + snprintf(name, sizeof(name), "ep%din", i / 2); + else + snprintf(name, sizeof(name), "ep%dout", i / 2); + ep->dev = dev; + strncpy(ep->name, name, sizeof(ep->name)); + ep->ep.name = ep->name; + + ep->ep.ops = &langwell_ep_ops; + ep->stopped = 0; + ep->ep.maxpacket = (unsigned short) ~0; + ep->ep_num = i / 2; + + INIT_LIST_HEAD(&ep->queue); + list_add_tail(&ep->ep.ep_list, &dev->gadget.ep_list); + + ep->dqh = &dev->ep_dqh[i]; + } + + VDBG(dev, "<--- %s()\n", __func__); + return 0; +} + + +/* enable interrupt and set controller to run state */ +static void langwell_udc_start(struct langwell_udc *dev) +{ + u32 usbintr, usbcmd; + DBG(dev, "---> %s()\n", __func__); + + /* enable interrupts */ + usbintr = INTR_ULPIE /* ULPI */ + | INTR_SLE /* suspend */ + /* | INTR_SRE SOF received */ + | INTR_URE /* USB reset */ + | INTR_AAE /* async advance */ + | INTR_SEE /* system error */ + | INTR_FRE /* frame list rollover */ + | INTR_PCE /* port change detect */ + | INTR_UEE /* USB error interrupt */ + | INTR_UE; /* USB interrupt */ + writel(usbintr, 
&dev->op_regs->usbintr); + + /* clear stopped bit */ + dev->stopped = 0; + + /* set controller to run */ + usbcmd = readl(&dev->op_regs->usbcmd); + usbcmd |= CMD_RUNSTOP; + writel(usbcmd, &dev->op_regs->usbcmd); + + DBG(dev, "<--- %s()\n", __func__); + return; +} + + +/* disable interrupt and set controller to stop state */ +static void langwell_udc_stop(struct langwell_udc *dev) +{ + u32 usbcmd; + + DBG(dev, "---> %s()\n", __func__); + + /* disable all interrupts */ + writel(0, &dev->op_regs->usbintr); + + /* set stopped bit */ + dev->stopped = 1; + + /* set controller to stop state */ + usbcmd = readl(&dev->op_regs->usbcmd); + usbcmd &= ~CMD_RUNSTOP; + writel(usbcmd, &dev->op_regs->usbcmd); + + DBG(dev, "<--- %s()\n", __func__); + return; +} + + +/* stop all USB activities */ +static void stop_activity(struct langwell_udc *dev, + struct usb_gadget_driver *driver) +{ + struct langwell_ep *ep; + DBG(dev, "---> %s()\n", __func__); + + nuke(&dev->ep[0], -ESHUTDOWN); + + list_for_each_entry(ep, &dev->gadget.ep_list, ep.ep_list) { + nuke(ep, -ESHUTDOWN); + } + + /* report disconnect; the driver is already quiesced */ + if (driver) { + spin_unlock(&dev->lock); + driver->disconnect(&dev->gadget); + spin_lock(&dev->lock); + } + + DBG(dev, "<--- %s()\n", __func__); +} + + +/*-------------------------------------------------------------------------*/ + +/* device "function" sysfs attribute file */ +static ssize_t show_function(struct device *_dev, + struct device_attribute *attr, char *buf) +{ + struct langwell_udc *dev = the_controller; + + if (!dev->driver || !dev->driver->function + || strlen(dev->driver->function) > PAGE_SIZE) + return 0; + + return scnprintf(buf, PAGE_SIZE, "%s\n", dev->driver->function); +} +static DEVICE_ATTR(function, S_IRUGO, show_function, NULL); + + +/* device "langwell_udc" sysfs attribute file */ +static ssize_t show_langwell_udc(struct device *_dev, + struct device_attribute *attr, char *buf) +{ + struct langwell_udc *dev = the_controller; + struct langwell_request *req; + struct langwell_ep *ep = NULL; + char *next; + unsigned size; + unsigned t; + unsigned i; + unsigned long flags; + u32 tmp_reg; + + next = buf; + size = PAGE_SIZE; + spin_lock_irqsave(&dev->lock, flags); + + /* driver basic information */ + t = scnprintf(next, size, + DRIVER_DESC "\n" + "%s version: %s\n" + "Gadget driver: %s\n\n", + driver_name, DRIVER_VERSION, + dev->driver ? dev->driver->driver.name : "(none)"); + size -= t; + next += t; + + /* device registers */ + tmp_reg = readl(&dev->op_regs->usbcmd); + t = scnprintf(next, size, + "USBCMD reg:\n" + "SetupTW: %d\n" + "Run/Stop: %s\n\n", + (tmp_reg & CMD_SUTW) ? 1 : 0, + (tmp_reg & CMD_RUNSTOP) ? "Run" : "Stop"); + size -= t; + next += t; + + tmp_reg = readl(&dev->op_regs->usbsts); + t = scnprintf(next, size, + "USB Status Reg:\n" + "Device Suspend: %d\n" + "Reset Received: %d\n" + "System Error: %s\n" + "USB Error Interrupt: %s\n\n", + (tmp_reg & STS_SLI) ? 1 : 0, + (tmp_reg & STS_URI) ? 1 : 0, + (tmp_reg & STS_SEI) ? "Error" : "No error", + (tmp_reg & STS_UEI) ? "Error detected" : "No error"); + size -= t; + next += t; + + tmp_reg = readl(&dev->op_regs->usbintr); + t = scnprintf(next, size, + "USB Interrupt Enable Reg:\n" + "Sleep Enable: %d\n" + "SOF Received Enable: %d\n" + "Reset Enable: %d\n" + "System Error Enable: %d\n" + "Port Change Detected Enable: %d\n" + "USB Error Intr Enable: %d\n" + "USB Intr Enable: %d\n\n", + (tmp_reg & INTR_SLE) ? 1 : 0, + (tmp_reg & INTR_SRE) ? 1 : 0, + (tmp_reg & INTR_URE) ? 1 : 0, + (tmp_reg & INTR_SEE) ?
1 : 0, + (tmp_reg & INTR_PCE) ? 1 : 0, + (tmp_reg & INTR_UEE) ? 1 : 0, + (tmp_reg & INTR_UE) ? 1 : 0); + size -= t; + next += t; + + tmp_reg = readl(&dev->op_regs->frindex); + t = scnprintf(next, size, + "USB Frame Index Reg:\n" + "Frame Number is 0x%08x\n\n", + (tmp_reg & FRINDEX_MASK)); + size -= t; + next += t; + + tmp_reg = readl(&dev->op_regs->deviceaddr); + t = scnprintf(next, size, + "USB Device Address Reg:\n" + "Device Addr is 0x%x\n\n", + USBADR(tmp_reg)); + size -= t; + next += t; + + tmp_reg = readl(&dev->op_regs->endpointlistaddr); + t = scnprintf(next, size, + "USB Endpoint List Address Reg:\n" + "Endpoint List Pointer is 0x%x\n\n", + EPBASE(tmp_reg)); + size -= t; + next += t; + + tmp_reg = readl(&dev->op_regs->portsc1); + t = scnprintf(next, size, + "USB Port Status & Control Reg:\n" + "Port Reset: %s\n" + "Port Suspend Mode: %s\n" + "Over-current Change: %s\n" + "Port Enable/Disable Change: %s\n" + "Port Enabled/Disabled: %s\n" + "Current Connect Status: %s\n\n", + (tmp_reg & PORTS_PR) ? "Reset" : "Not Reset", + (tmp_reg & PORTS_SUSP) ? "Suspended" : "Not Suspended", + (tmp_reg & PORTS_OCC) ? "Detected" : "No", + (tmp_reg & PORTS_PEC) ? "Changed" : "Not Changed", + (tmp_reg & PORTS_PE) ? "Enabled" : "Disabled", + (tmp_reg & PORTS_CCS) ? "Attached" : "Not Attached"); + size -= t; + next += t; + + tmp_reg = readl(&dev->op_regs->devlc); + t = scnprintf(next, size, + "Device LPM Control Reg:\n" + "Parallel Transceiver: %d\n" + "Serial Transceiver: %d\n" + "Port Speed: %s\n" + "Port Force Full Speed Connect: %s\n" + "PHY Low Power Suspend Clock Disable: %s\n" + "BmAttributes: %d\n\n", + LPM_PTS(tmp_reg), + (tmp_reg & LPM_STS) ? 1 : 0, + ({ + char *s; + switch (LPM_PSPD(tmp_reg)) { + case LPM_SPEED_FULL: + s = "Full Speed"; break; + case LPM_SPEED_LOW: + s = "Low Speed"; break; + case LPM_SPEED_HIGH: + s = "High Speed"; break; + default: + s = "Unknown Speed"; break; + } + s; + }), + (tmp_reg & LPM_PFSC) ? "Force Full Speed" : "Not Force", + (tmp_reg & LPM_PHCD) ?
"Disabled" : "Enabled", + LPM_BA(tmp_reg)); + size -= t; + next += t; + + tmp_reg = readl(&dev->op_regs->usbmode); + t = scnprintf(next, size, + "USB Mode Reg:\n" + "Controller Mode is : %s\n\n", ({ + char *s; + switch (MODE_CM(tmp_reg)) { + case MODE_IDLE: + s = "Idle"; break; + case MODE_DEVICE: + s = "Device Controller"; break; + case MODE_HOST: + s = "Host Controller"; break; + default: + s = "None"; break; + } + s; + })); + size -= t; + next += t; + + tmp_reg = readl(&dev->op_regs->endptsetupstat); + t = scnprintf(next, size, + "Endpoint Setup Status Reg:\n" + "SETUP on ep 0x%04x\n\n", + tmp_reg & SETUPSTAT_MASK); + size -= t; + next += t; + + for (i = 0; i < dev->ep_max / 2; i++) { + tmp_reg = readl(&dev->op_regs->endptctrl[i]); + t = scnprintf(next, size, "EP Ctrl Reg [%d]: 0x%08x\n", + i, tmp_reg); + size -= t; + next += t; + } + tmp_reg = readl(&dev->op_regs->endptprime); + t = scnprintf(next, size, "EP Prime Reg: 0x%08x\n\n", tmp_reg); + size -= t; + next += t; + + /* langwell_udc, langwell_ep, langwell_request structure information */ + ep = &dev->ep[0]; + t = scnprintf(next, size, "%s MaxPacketSize: 0x%x, ep_num: %d\n", + ep->ep.name, ep->ep.maxpacket, ep->ep_num); + size -= t; + next += t; + + if (list_empty(&ep->queue)) { + t = scnprintf(next, size, "its req queue is empty\n\n"); + size -= t; + next += t; + } else { + list_for_each_entry(req, &ep->queue, queue) { + t = scnprintf(next, size, + "req %p actual 0x%x length 0x%x buf %p\n", + &req->req, req->req.actual, + req->req.length, req->req.buf); + size -= t; + next += t; + } + } + /* other gadget->eplist ep */ + list_for_each_entry(ep, &dev->gadget.ep_list, ep.ep_list) { + if (ep->desc) { + t = scnprintf(next, size, + "\n%s MaxPacketSize: 0x%x, " + "ep_num: %d\n", + ep->ep.name, ep->ep.maxpacket, + ep->ep_num); + size -= t; + next += t; + + if (list_empty(&ep->queue)) { + t = scnprintf(next, size, + "its req queue is empty\n\n"); + size -= t; + next += t; + } else { + list_for_each_entry(req, &ep->queue, queue) { + t = scnprintf(next, size, + "req %p actual 0x%x length " + "0x%x buf %p\n", + &req->req, req->req.actual, + req->req.length, req->req.buf); + size -= t; + next += t; + } + } + } + } + + spin_unlock_irqrestore(&dev->lock, flags); + return PAGE_SIZE - size; +} +static DEVICE_ATTR(langwell_udc, S_IRUGO, show_langwell_udc, NULL); + + +/*-------------------------------------------------------------------------*/ + +/* + * when a driver is successfully registered, it will receive + * control requests including set_configuration(), which enables + * non-control requests. then usb traffic follows until a + * disconnect is reported. then a host may connect again, or + * the driver might get unbound. + */ + +int usb_gadget_register_driver(struct usb_gadget_driver *driver) +{ + struct langwell_udc *dev = the_controller; + unsigned long flags; + int retval; + + if (!dev) + return -ENODEV; + + DBG(dev, "---> %s()\n", __func__); + + if (dev->driver) + return -EBUSY; + + spin_lock_irqsave(&dev->lock, flags); + + /* hook up the driver ... 
*/ + driver->driver.bus = NULL; + dev->driver = driver; + dev->gadget.dev.driver = &driver->driver; + + spin_unlock_irqrestore(&dev->lock, flags); + + retval = driver->bind(&dev->gadget); + if (retval) { + DBG(dev, "bind to driver %s --> %d\n", + driver->driver.name, retval); + dev->driver = NULL; + dev->gadget.dev.driver = NULL; + return retval; + } + + retval = device_create_file(&dev->pdev->dev, &dev_attr_function); + if (retval) + goto err_unbind; + + dev->usb_state = USB_STATE_ATTACHED; + dev->ep0_state = WAIT_FOR_SETUP; + dev->ep0_dir = USB_DIR_OUT; + + /* enable interrupt and set controller to run state */ + if (dev->got_irq) + langwell_udc_start(dev); + + VDBG(dev, "After langwell_udc_start(), print all registers:\n"); +#ifdef VERBOSE + print_all_registers(dev); +#endif + + INFO(dev, "register driver: %s\n", driver->driver.name); + VDBG(dev, "<--- %s()\n", __func__); + return 0; + +err_unbind: + driver->unbind(&dev->gadget); + dev->gadget.dev.driver = NULL; + dev->driver = NULL; + + DBG(dev, "<--- %s()\n", __func__); + return retval; +} +EXPORT_SYMBOL(usb_gadget_register_driver); + + +/* unregister gadget driver */ +int usb_gadget_unregister_driver(struct usb_gadget_driver *driver) +{ + struct langwell_udc *dev = the_controller; + unsigned long flags; + + if (!dev) + return -ENODEV; + + DBG(dev, "---> %s()\n", __func__); + + if (unlikely(!driver || !driver->bind || !driver->unbind)) + return -EINVAL; + + /* unbind OTG transceiver */ + if (dev->transceiver) + (void)otg_set_peripheral(dev->transceiver, 0); + + /* disable interrupt and set controller to stop state */ + langwell_udc_stop(dev); + + dev->usb_state = USB_STATE_ATTACHED; + dev->ep0_state = WAIT_FOR_SETUP; + dev->ep0_dir = USB_DIR_OUT; + + spin_lock_irqsave(&dev->lock, flags); + + /* stop all usb activities */ + dev->gadget.speed = USB_SPEED_UNKNOWN; + stop_activity(dev, driver); + spin_unlock_irqrestore(&dev->lock, flags); + + /* unbind gadget driver */ + driver->unbind(&dev->gadget); + dev->gadget.dev.driver = NULL; + dev->driver = NULL; + + device_remove_file(&dev->pdev->dev, &dev_attr_function); + + INFO(dev, "unregistered driver '%s'\n", driver->driver.name); + DBG(dev, "<--- %s()\n", __func__); + return 0; +} +EXPORT_SYMBOL(usb_gadget_unregister_driver); + + +/*-------------------------------------------------------------------------*/ + +/* + * setup tripwire is used as a semaphore to ensure that the setup data + * payload is extracted from a dQH without being corrupted + */ +static void setup_tripwire(struct langwell_udc *dev) +{ + u32 usbcmd, + endptsetupstat; + unsigned long timeout; + struct langwell_dqh *dqh; + + VDBG(dev, "---> %s()\n", __func__); + + /* ep0 OUT dQH */ + dqh = &dev->ep_dqh[EP_DIR_OUT]; + + /* Write-Clear endptsetupstat */ + endptsetupstat = readl(&dev->op_regs->endptsetupstat); + writel(endptsetupstat, &dev->op_regs->endptsetupstat); + + /* wait until endptsetupstat is cleared */ + timeout = jiffies + SETUPSTAT_TIMEOUT; + while (readl(&dev->op_regs->endptsetupstat)) { + if (time_after(jiffies, timeout)) { + ERROR(dev, "setup_tripwire timeout\n"); + break; + } + cpu_relax(); + } + + /* while a hazard exists when setup packet arrives */ + do { + /* set setup tripwire bit */ + usbcmd = readl(&dev->op_regs->usbcmd); + writel(usbcmd | CMD_SUTW, &dev->op_regs->usbcmd); + + /* copy the setup packet to local buffer */ + memcpy(&dev->local_setup_buff, &dqh->dqh_setup, 8); + } while (!(readl(&dev->op_regs->usbcmd) & CMD_SUTW)); + + /* Write-Clear setup tripwire bit */ + usbcmd = 
readl(&dev->op_regs->usbcmd); + writel(usbcmd & ~CMD_SUTW, &dev->op_regs->usbcmd); + + VDBG(dev, "<--- %s()\n", __func__); +} + + +/* protocol ep0 stall, will automatically be cleared on new transaction */ +static void ep0_stall(struct langwell_udc *dev) +{ + u32 endptctrl; + + VDBG(dev, "---> %s()\n", __func__); + + /* set TX and RX to stall */ + endptctrl = readl(&dev->op_regs->endptctrl[0]); + endptctrl |= EPCTRL_TXS | EPCTRL_RXS; + writel(endptctrl, &dev->op_regs->endptctrl[0]); + + /* update ep0 state */ + dev->ep0_state = WAIT_FOR_SETUP; + dev->ep0_dir = USB_DIR_OUT; + + VDBG(dev, "<--- %s()\n", __func__); +} + + +/* PRIME a status phase for ep0 */ +static int prime_status_phase(struct langwell_udc *dev, int dir) +{ + struct langwell_request *req; + struct langwell_ep *ep; + int status = 0; + + VDBG(dev, "---> %s()\n", __func__); + + if (dir == EP_DIR_IN) + dev->ep0_dir = USB_DIR_IN; + else + dev->ep0_dir = USB_DIR_OUT; + + ep = &dev->ep[0]; + dev->ep0_state = WAIT_FOR_OUT_STATUS; + + req = dev->status_req; + + req->ep = ep; + req->req.length = 0; + req->req.status = -EINPROGRESS; + req->req.actual = 0; + req->req.complete = NULL; + req->dtd_count = 0; + + if (!req_to_dtd(req)) + status = queue_dtd(ep, req); + else + return -ENOMEM; + + if (status) + ERROR(dev, "can't queue ep0 status request\n"); + + list_add_tail(&req->queue, &ep->queue); + + VDBG(dev, "<--- %s()\n", __func__); + return status; +} + + +/* SET_ADDRESS request routine */ +static void set_address(struct langwell_udc *dev, u16 value, + u16 index, u16 length) +{ + VDBG(dev, "---> %s()\n", __func__); + + /* save the new address to device struct */ + dev->dev_addr = (u8) value; + VDBG(dev, "dev->dev_addr = %d\n", dev->dev_addr); + + /* update usb state */ + dev->usb_state = USB_STATE_ADDRESS; + + /* STATUS phase */ + if (prime_status_phase(dev, EP_DIR_IN)) + ep0_stall(dev); + + VDBG(dev, "<--- %s()\n", __func__); +} + + +/* return endpoint by windex */ +static struct langwell_ep *get_ep_by_windex(struct langwell_udc *dev, + u16 wIndex) +{ + struct langwell_ep *ep; + VDBG(dev, "---> %s()\n", __func__); + + if ((wIndex & USB_ENDPOINT_NUMBER_MASK) == 0) + return &dev->ep[0]; + + list_for_each_entry(ep, &dev->gadget.ep_list, ep.ep_list) { + u8 bEndpointAddress; + if (!ep->desc) + continue; + + bEndpointAddress = ep->desc->bEndpointAddress; + if ((wIndex ^ bEndpointAddress) & USB_DIR_IN) + continue; + + if ((wIndex & USB_ENDPOINT_NUMBER_MASK) + == (bEndpointAddress & USB_ENDPOINT_NUMBER_MASK)) + return ep; + } + + VDBG(dev, "<--- %s()\n", __func__); + return NULL; +} + + +/* return whether endpoint is stalled, 0: not stalled; 1: stalled */ +static int ep_is_stall(struct langwell_ep *ep) +{ + struct langwell_udc *dev = ep->dev; + u32 endptctrl; + int retval; + + VDBG(dev, "---> %s()\n", __func__); + + endptctrl = readl(&dev->op_regs->endptctrl[ep->ep_num]); + if (is_in(ep)) + retval = endptctrl & EPCTRL_TXS ? 1 : 0; + else + retval = endptctrl & EPCTRL_RXS ? 
1 : 0; + + VDBG(dev, "<--- %s()\n", __func__); + return retval; +} + + +/* GET_STATUS request routine */ +static void get_status(struct langwell_udc *dev, u8 request_type, u16 value, + u16 index, u16 length) +{ + struct langwell_request *req; + struct langwell_ep *ep; + u16 status_data = 0; /* 16-bit CPU-view status data */ + int status = 0; + + VDBG(dev, "---> %s()\n", __func__); + + ep = &dev->ep[0]; + + if ((request_type & USB_RECIP_MASK) == USB_RECIP_DEVICE) { + /* get device status */ + status_data = 1 << USB_DEVICE_SELF_POWERED; + status_data |= dev->remote_wakeup << USB_DEVICE_REMOTE_WAKEUP; + } else if ((request_type & USB_RECIP_MASK) == USB_RECIP_INTERFACE) { + /* get interface status */ + status_data = 0; + } else if ((request_type & USB_RECIP_MASK) == USB_RECIP_ENDPOINT) { + /* get endpoint status */ + struct langwell_ep *epn; + epn = get_ep_by_windex(dev, index); + /* stall if endpoint doesn't exist */ + if (!epn) + goto stall; + + status_data = ep_is_stall(epn) << USB_ENDPOINT_HALT; + } + + dev->ep0_dir = USB_DIR_IN; + + /* borrow the per device status_req */ + req = dev->status_req; + + /* fill in the request structure */ + *((u16 *) req->req.buf) = cpu_to_le16(status_data); + req->ep = ep; + req->req.length = 2; + req->req.status = -EINPROGRESS; + req->req.actual = 0; + req->req.complete = NULL; + req->dtd_count = 0; + + /* prime the data phase */ + if (!req_to_dtd(req)) + status = queue_dtd(ep, req); + else /* no mem */ + goto stall; + + if (status) { + ERROR(dev, "response error on GET_STATUS request\n"); + goto stall; + } + + list_add_tail(&req->queue, &ep->queue); + dev->ep0_state = DATA_STATE_XMIT; + + VDBG(dev, "<--- %s()\n", __func__); + return; +stall: + ep0_stall(dev); + VDBG(dev, "<--- %s()\n", __func__); +} + + +/* setup packet interrupt handler */ +static void handle_setup_packet(struct langwell_udc *dev, + struct usb_ctrlrequest *setup) +{ + u16 wValue = le16_to_cpu(setup->wValue); + u16 wIndex = le16_to_cpu(setup->wIndex); + u16 wLength = le16_to_cpu(setup->wLength); + + VDBG(dev, "---> %s()\n", __func__); + + /* ep0 fifo flush */ + nuke(&dev->ep[0], -ESHUTDOWN); + + DBG(dev, "SETUP %02x.%02x v%04x i%04x l%04x\n", + setup->bRequestType, setup->bRequest, + wValue, wIndex, wLength); + + /* RNDIS gadget delegate */ + if ((setup->bRequestType == 0x21) && (setup->bRequest == 0x00)) { + /* USB_CDC_SEND_ENCAPSULATED_COMMAND */ + goto delegate; + } + + /* USB_CDC_GET_ENCAPSULATED_RESPONSE */ + if ((setup->bRequestType == 0xa1) && (setup->bRequest == 0x01)) { + /* USB_CDC_GET_ENCAPSULATED_RESPONSE */ + goto delegate; + } + + /* we process some standard setup requests here */ + switch (setup->bRequest) { + case USB_REQ_GET_STATUS: + DBG(dev, "SETUP: USB_REQ_GET_STATUS\n"); + /* get status, DATA and STATUS phase */ + if ((setup->bRequestType & (USB_DIR_IN | USB_TYPE_MASK)) + != (USB_DIR_IN | USB_TYPE_STANDARD)) + break; + get_status(dev, setup->bRequestType, wValue, wIndex, wLength); + goto end; + + case USB_REQ_SET_ADDRESS: + DBG(dev, "SETUP: USB_REQ_SET_ADDRESS\n"); + /* STATUS phase */ + if (setup->bRequestType != (USB_DIR_OUT | USB_TYPE_STANDARD + | USB_RECIP_DEVICE)) + break; + set_address(dev, wValue, wIndex, wLength); + goto end; + + case USB_REQ_CLEAR_FEATURE: + case USB_REQ_SET_FEATURE: + /* STATUS phase */ + { + int rc = -EOPNOTSUPP; + if (setup->bRequest == USB_REQ_SET_FEATURE) + DBG(dev, "SETUP: USB_REQ_SET_FEATURE\n"); + else if (setup->bRequest == USB_REQ_CLEAR_FEATURE) + DBG(dev, "SETUP: USB_REQ_CLEAR_FEATURE\n"); + + if ((setup->bRequestType &
(USB_RECIP_MASK | USB_TYPE_MASK)) + == (USB_RECIP_ENDPOINT | USB_TYPE_STANDARD)) { + struct langwell_ep *epn; + epn = get_ep_by_windex(dev, wIndex); + /* stall if endpoint doesn't exist */ + if (!epn) { + ep0_stall(dev); + goto end; + } + + if (wValue != 0 || wLength != 0 + || epn->ep_num > dev->ep_max) + break; + + spin_unlock(&dev->lock); + rc = langwell_ep_set_halt(&epn->ep, + (setup->bRequest == USB_REQ_SET_FEATURE) + ? 1 : 0); + spin_lock(&dev->lock); + + } else if ((setup->bRequestType & (USB_RECIP_MASK + | USB_TYPE_MASK)) == (USB_RECIP_DEVICE + | USB_TYPE_STANDARD)) { + if (!gadget_is_otg(&dev->gadget)) + break; + else if (setup->bRequest == USB_DEVICE_B_HNP_ENABLE) { + dev->gadget.b_hnp_enable = 1; +#ifdef OTG_TRANSCEIVER + if (!dev->lotg->otg.default_a) + dev->lotg->hsm.b_hnp_enable = 1; +#endif + } else if (setup->bRequest == USB_DEVICE_A_HNP_SUPPORT) + dev->gadget.a_hnp_support = 1; + else if (setup->bRequest == + USB_DEVICE_A_ALT_HNP_SUPPORT) + dev->gadget.a_alt_hnp_support = 1; + else + break; + rc = 0; + } else + break; + + if (rc == 0) { + if (prime_status_phase(dev, EP_DIR_IN)) + ep0_stall(dev); + } + goto end; + } + + case USB_REQ_GET_DESCRIPTOR: + DBG(dev, "SETUP: USB_REQ_GET_DESCRIPTOR\n"); + goto delegate; + + case USB_REQ_SET_DESCRIPTOR: + DBG(dev, "SETUP: USB_REQ_SET_DESCRIPTOR unsupported\n"); + goto delegate; + + case USB_REQ_GET_CONFIGURATION: + DBG(dev, "SETUP: USB_REQ_GET_CONFIGURATION\n"); + goto delegate; + + case USB_REQ_SET_CONFIGURATION: + DBG(dev, "SETUP: USB_REQ_SET_CONFIGURATION\n"); + goto delegate; + + case USB_REQ_GET_INTERFACE: + DBG(dev, "SETUP: USB_REQ_GET_INTERFACE\n"); + goto delegate; + + case USB_REQ_SET_INTERFACE: + DBG(dev, "SETUP: USB_REQ_SET_INTERFACE\n"); + goto delegate; + + case USB_REQ_SYNCH_FRAME: + DBG(dev, "SETUP: USB_REQ_SYNCH_FRAME unsupported\n"); + goto delegate; + + default: + /* delegate USB standard requests to the gadget driver */ + goto delegate; +delegate: + /* USB requests handled by gadget */ + if (wLength) { + /* DATA phase from gadget, STATUS phase from udc */ + dev->ep0_dir = (setup->bRequestType & USB_DIR_IN) + ? USB_DIR_IN : USB_DIR_OUT; + VDBG(dev, "dev->ep0_dir = 0x%x, wLength = %d\n", + dev->ep0_dir, wLength); + spin_unlock(&dev->lock); + if (dev->driver->setup(&dev->gadget, + &dev->local_setup_buff) < 0) + ep0_stall(dev); + spin_lock(&dev->lock); + dev->ep0_state = (setup->bRequestType & USB_DIR_IN) + ? 
DATA_STATE_XMIT : DATA_STATE_RECV;
+ } else {
+ /* no DATA phase, IN STATUS phase from gadget */
+ dev->ep0_dir = USB_DIR_IN;
+ VDBG(dev, "dev->ep0_dir = 0x%x, wLength = %d\n",
+ dev->ep0_dir, wLength);
+ spin_unlock(&dev->lock);
+ if (dev->driver->setup(&dev->gadget,
+ &dev->local_setup_buff) < 0)
+ ep0_stall(dev);
+ spin_lock(&dev->lock);
+ dev->ep0_state = WAIT_FOR_OUT_STATUS;
+ }
+ break;
+ }
+end:
+ VDBG(dev, "<--- %s()\n", __func__);
+ return;
+}
+
+
+/* transfer completion: process endpoint request and free the completed dTDs
+ * for this request
+ */
+static int process_ep_req(struct langwell_udc *dev, int index,
+ struct langwell_request *curr_req)
+{
+ struct langwell_dtd *curr_dtd;
+ struct langwell_dqh *curr_dqh;
+ int td_complete, actual, remaining_length;
+ int i, dir;
+ u8 dtd_status = 0;
+ int retval = 0;
+
+ curr_dqh = &dev->ep_dqh[index];
+ dir = index % 2;
+
+ curr_dtd = curr_req->head;
+ td_complete = 0;
+ actual = curr_req->req.length;
+
+ VDBG(dev, "---> %s()\n", __func__);
+
+ for (i = 0; i < curr_req->dtd_count; i++) {
+ remaining_length = le16_to_cpu(curr_dtd->dtd_total);
+ actual -= remaining_length;
+
+ /* command execution states by dTD */
+ dtd_status = curr_dtd->dtd_status;
+
+ if (!dtd_status) {
+ /* transfers completed successfully */
+ if (!remaining_length) {
+ td_complete++;
+ VDBG(dev, "dTD transmitted successfully\n");
+ } else {
+ if (dir) {
+ VDBG(dev, "TX dTD has remaining data\n");
+ retval = -EPROTO;
+ break;
+
+ } else {
+ td_complete++;
+ break;
+ }
+ }
+ } else {
+ /* transfers completed with errors */
+ if (dtd_status & DTD_STS_ACTIVE) {
+ DBG(dev, "request not completed\n");
+ retval = 1;
+ return retval;
+ } else if (dtd_status & DTD_STS_HALTED) {
+ ERROR(dev, "dTD error %08x dQH[%d]\n",
+ dtd_status, index);
+ /* clear the errors and halt condition */
+ curr_dqh->dtd_status = 0;
+ retval = -EPIPE;
+ break;
+ } else if (dtd_status & DTD_STS_DBE) {
+ DBG(dev, "data buffer (overflow) error\n");
+ retval = -EPROTO;
+ break;
+ } else if (dtd_status & DTD_STS_TRE) {
+ DBG(dev, "transaction (ISO) error\n");
+ retval = -EILSEQ;
+ break;
+ } else
+ ERROR(dev, "unknown error (0x%x)!\n",
+ dtd_status);
+ }
+
+ if (i != curr_req->dtd_count - 1)
+ curr_dtd = (struct langwell_dtd *)
+ curr_dtd->next_dtd_virt;
+ }
+
+ if (retval)
+ return retval;
+
+ curr_req->req.actual = actual;
+
+ VDBG(dev, "<--- %s()\n", __func__);
+ return 0;
+}
+
+
+/* complete the DATA or STATUS phase of ep0; prime the status phase if needed */
+static void ep0_req_complete(struct langwell_udc *dev,
+ struct langwell_ep *ep0, struct langwell_request *req)
+{
+ u32 new_addr;
+ VDBG(dev, "---> %s()\n", __func__);
+
+ if (dev->usb_state == USB_STATE_ADDRESS) {
+ /* set the new address */
+ new_addr = (u32)dev->dev_addr;
+ writel(new_addr << USBADR_SHIFT, &dev->op_regs->deviceaddr);
+
+ new_addr = USBADR(readl(&dev->op_regs->deviceaddr));
+ VDBG(dev, "new_addr = %d\n", new_addr);
+ }
+
+ done(ep0, req, 0);
+
+ switch (dev->ep0_state) {
+ case DATA_STATE_XMIT:
+ /* receive status phase */
+ if (prime_status_phase(dev, EP_DIR_OUT))
+ ep0_stall(dev);
+ break;
+ case DATA_STATE_RECV:
+ /* send status phase */
+ if (prime_status_phase(dev, EP_DIR_IN))
+ ep0_stall(dev);
+ break;
+ case WAIT_FOR_OUT_STATUS:
+ dev->ep0_state = WAIT_FOR_SETUP;
+ break;
+ case WAIT_FOR_SETUP:
+ ERROR(dev, "unexpected ep0 packet\n");
+ break;
+ default:
+ ep0_stall(dev);
+ break;
+ }
+
+ VDBG(dev, "<--- %s()\n", __func__);
+}
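The completion handler that follows walks the endptcomplete register. As a reading aid, here is a minimal sketch of the bit math it relies on, derived from the ERCE/ETCE definitions in langwell_udc.h; the helper name is hypothetical and not part of this patch:

	/* bits 15:0 flag RX (OUT) completions, bits 31:16 flag TX (IN) ones */
	static inline u32 ep_complete_mask(int ep_num, int is_in)
	{
		return 1u << (ep_num + 16 * is_in);
	}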
+
+
+/* USB transfer completion interrupt */
+static void handle_trans_complete(struct langwell_udc *dev)
+{
+ u32 complete_bits;
+ int i, ep_num, dir, bit_mask, status;
+ struct langwell_ep *epn;
+ struct langwell_request *curr_req, *temp_req;
+
+ VDBG(dev, "---> %s()\n", __func__);
+
+ complete_bits = readl(&dev->op_regs->endptcomplete);
+ VDBG(dev, "endptcomplete register: 0x%08x\n", complete_bits);
+
+ /* Write-Clear the bits in endptcomplete register */
+ writel(complete_bits, &dev->op_regs->endptcomplete);
+
+ if (!complete_bits) {
+ DBG(dev, "complete_bits = 0\n");
+ goto done;
+ }
+
+ for (i = 0; i < dev->ep_max; i++) {
+ ep_num = i / 2;
+ dir = i % 2;
+
+ bit_mask = 1 << (ep_num + 16 * dir);
+
+ if (!(complete_bits & bit_mask))
+ continue;
+
+ /* ep0 */
+ if (i == 1)
+ epn = &dev->ep[0];
+ else
+ epn = &dev->ep[i];
+
+ if (epn->name == NULL) {
+ WARNING(dev, "invalid endpoint\n");
+ continue;
+ }
+
+ if (i < 2)
+ /* ep0 in and out */
+ DBG(dev, "%s-%s transfer completed\n",
+ epn->name,
+ is_in(epn) ? "in" : "out");
+ else
+ DBG(dev, "%s transfer completed\n", epn->name);
+
+ /* process the req queue until the first incomplete request */
+ list_for_each_entry_safe(curr_req, temp_req,
+ &epn->queue, queue) {
+ status = process_ep_req(dev, i, curr_req);
+ VDBG(dev, "%s req status: %d\n", epn->name, status);
+
+ if (status)
+ break;
+
+ /* write back status to req */
+ curr_req->req.status = status;
+
+ /* ep0 request completion */
+ if (ep_num == 0) {
+ ep0_req_complete(dev, epn, curr_req);
+ break;
+ } else {
+ done(epn, curr_req, status);
+ }
+ }
+ }
+done:
+ VDBG(dev, "<--- %s()\n", __func__);
+ return;
+}
+
+
+/* port change detect interrupt handler */
+static void handle_port_change(struct langwell_udc *dev)
+{
+ u32 portsc1, devlc;
+ u32 speed;
+
+ VDBG(dev, "---> %s()\n", __func__);
+
+ if (dev->bus_reset)
+ dev->bus_reset = 0;
+
+ portsc1 = readl(&dev->op_regs->portsc1);
+ devlc = readl(&dev->op_regs->devlc);
+ VDBG(dev, "portsc1 = 0x%08x, devlc = 0x%08x\n",
+ portsc1, devlc);
+
+ /* bus reset is finished */
+ if (!(portsc1 & PORTS_PR)) {
+ /* get the speed */
+ speed = LPM_PSPD(devlc);
+ switch (speed) {
+ case LPM_SPEED_HIGH:
+ dev->gadget.speed = USB_SPEED_HIGH;
+ break;
+ case LPM_SPEED_FULL:
+ dev->gadget.speed = USB_SPEED_FULL;
+ break;
+ case LPM_SPEED_LOW:
+ dev->gadget.speed = USB_SPEED_LOW;
+ break;
+ default:
+ dev->gadget.speed = USB_SPEED_UNKNOWN;
+ break;
+ }
+ VDBG(dev, "speed = %d, dev->gadget.speed = %d\n",
+ speed, dev->gadget.speed);
+ }
+
+ /* LPM L0 to L1 */
+ if (dev->lpm && dev->lpm_state == LPM_L0)
+ if (portsc1 & PORTS_SUSP && portsc1 & PORTS_SLP) {
+ INFO(dev, "LPM L0 to L1\n");
+ dev->lpm_state = LPM_L1;
+ }
+
+ /* LPM L1 to L0, force resume or remote wakeup finished */
+ if (dev->lpm && dev->lpm_state == LPM_L1)
+ if (!(portsc1 & PORTS_SUSP)) {
+ if (portsc1 & PORTS_SLP)
+ INFO(dev, "LPM L1 to L0, force resume\n");
+ else
+ INFO(dev, "LPM L1 to L0, remote wakeup\n");
+
+ dev->lpm_state = LPM_L0;
+ }
+
+ /* update USB state */
+ if (!dev->resume_state)
+ dev->usb_state = USB_STATE_DEFAULT;
+
+ VDBG(dev, "<--- %s()\n", __func__);
+}
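handle_port_change() above decodes the port speed from the devlc register. The same decode, factored as a standalone helper purely for illustration (hypothetical; LPM_PSPD and the LPM_SPEED_* values come from langwell_udc.h):

	static enum usb_device_speed devlc_to_speed(u32 devlc)
	{
		switch (LPM_PSPD(devlc)) {	/* bits 26:25 of devlc */
		case LPM_SPEED_HIGH:		/* 2 */
			return USB_SPEED_HIGH;
		case LPM_SPEED_FULL:		/* 0 */
			return USB_SPEED_FULL;
		case LPM_SPEED_LOW:		/* 1 */
			return USB_SPEED_LOW;
		default:
			return USB_SPEED_UNKNOWN;
		}
	}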
+
+
+/* USB reset interrupt handler */
+static void handle_usb_reset(struct langwell_udc *dev)
+{
+ u32 deviceaddr,
+ endptsetupstat,
+ endptcomplete;
+ unsigned long timeout;
+
+ VDBG(dev, "---> %s()\n", __func__);
+
+ /* Write-Clear the device address */
+ deviceaddr = readl(&dev->op_regs->deviceaddr);
+ writel(deviceaddr & ~USBADR_MASK, &dev->op_regs->deviceaddr);
+
+ dev->dev_addr = 0;
+
+ /* clear usb state */
+ dev->resume_state = 0;
+
+ /* LPM L1 to L0, reset */
+ if (dev->lpm)
+ dev->lpm_state = LPM_L0;
+
+ dev->ep0_dir = USB_DIR_OUT;
+ dev->ep0_state = WAIT_FOR_SETUP;
+ dev->remote_wakeup = 0; /* default to 0 on reset */
+ dev->gadget.b_hnp_enable = 0;
+ dev->gadget.a_hnp_support = 0;
+ dev->gadget.a_alt_hnp_support = 0;
+
+ /* Write-Clear all the setup token semaphores */
+ endptsetupstat = readl(&dev->op_regs->endptsetupstat);
+ writel(endptsetupstat, &dev->op_regs->endptsetupstat);
+
+ /* Write-Clear all the endpoint complete status bits */
+ endptcomplete = readl(&dev->op_regs->endptcomplete);
+ writel(endptcomplete, &dev->op_regs->endptcomplete);
+
+ /* wait until all endptprime bits cleared */
+ timeout = jiffies + PRIME_TIMEOUT;
+ while (readl(&dev->op_regs->endptprime)) {
+ if (time_after(jiffies, timeout)) {
+ ERROR(dev, "USB reset timeout\n");
+ break;
+ }
+ cpu_relax();
+ }
+
+ /* write 1s to endptflush register to clear any primed buffers */
+ writel((u32) ~0, &dev->op_regs->endptflush);
+
+ if (readl(&dev->op_regs->portsc1) & PORTS_PR) {
+ VDBG(dev, "USB bus reset\n");
+ /* bus is resetting */
+ dev->bus_reset = 1;
+
+ /* reset all the queues, stop all USB activities */
+ stop_activity(dev, dev->driver);
+ dev->usb_state = USB_STATE_DEFAULT;
+ } else {
+ VDBG(dev, "device controller reset\n");
+ /* controller reset */
+ langwell_udc_reset(dev);
+
+ /* reset all the queues, stop all USB activities */
+ stop_activity(dev, dev->driver);
+
+ /* reset ep0 dQH and endptctrl */
+ ep0_reset(dev);
+
+ /* enable interrupt and set controller to run state */
+ langwell_udc_start(dev);
+
+ dev->usb_state = USB_STATE_ATTACHED;
+ }
+
+#ifdef OTG_TRANSCEIVER
+ /* refer to USB OTG 6.6.2.3: b_hnp_en is cleared */
+ if (!dev->lotg->otg.default_a)
+ dev->lotg->hsm.b_hnp_enable = 0;
+#endif
+
+ VDBG(dev, "<--- %s()\n", __func__);
+}
+
+
+/* USB bus suspend/resume interrupt */
+static void handle_bus_suspend(struct langwell_udc *dev)
+{
+ u32 devlc;
+ DBG(dev, "---> %s()\n", __func__);
+
+ dev->resume_state = dev->usb_state;
+ dev->usb_state = USB_STATE_SUSPENDED;
+
+#ifdef OTG_TRANSCEIVER
+ if (dev->lotg->otg.default_a) {
+ if (dev->lotg->hsm.b_bus_suspend_vld == 1) {
+ dev->lotg->hsm.b_bus_suspend = 1;
+ /* notify transceiver the state changes */
+ if (spin_trylock(&dev->lotg->wq_lock)) {
+ langwell_update_transceiver();
+ spin_unlock(&dev->lotg->wq_lock);
+ }
+ }
+ dev->lotg->hsm.b_bus_suspend_vld++;
+ } else {
+ if (!dev->lotg->hsm.a_bus_suspend) {
+ dev->lotg->hsm.a_bus_suspend = 1;
+ /* notify transceiver the state changes */
+ if (spin_trylock(&dev->lotg->wq_lock)) {
+ langwell_update_transceiver();
+ spin_unlock(&dev->lotg->wq_lock);
+ }
+ }
+ }
+#endif
+
+ /* report suspend to the driver */
+ if (dev->driver) {
+ if (dev->driver->suspend) {
+ spin_unlock(&dev->lock);
+ dev->driver->suspend(&dev->gadget);
+ spin_lock(&dev->lock);
+ DBG(dev, "suspend %s\n", dev->driver->driver.name);
+ }
+ }
+
+ /* enter PHY low power suspend */
+ devlc = readl(&dev->op_regs->devlc);
+ VDBG(dev, "devlc = 0x%08x\n", devlc);
+ devlc |= LPM_PHCD;
+ writel(devlc, &dev->op_regs->devlc);
+
+ DBG(dev, "<--- %s()\n", __func__);
+}
+
+
+static void handle_bus_resume(struct langwell_udc *dev)
+{
+ u32 devlc;
+ DBG(dev, "---> %s()\n", __func__);
+
+ dev->usb_state = dev->resume_state;
+ dev->resume_state = 0;
+
+ /* exit PHY low power suspend */
+ devlc = readl(&dev->op_regs->devlc);
+ VDBG(dev, "devlc = 0x%08x\n", devlc);
+ devlc &= ~LPM_PHCD;
+ writel(devlc, &dev->op_regs->devlc);
+
+#ifdef OTG_TRANSCEIVER
+ if (dev->lotg->otg.default_a == 0)
+ dev->lotg->hsm.a_bus_suspend = 0;
+#endif
+
+ /* report resume to the driver */
+ if (dev->driver) {
+ if (dev->driver->resume) {
+ spin_unlock(&dev->lock);
+ dev->driver->resume(&dev->gadget);
+ spin_lock(&dev->lock);
+ DBG(dev, "resume %s\n", dev->driver->driver.name);
+ }
+ }
+
+ DBG(dev, "<--- %s()\n", __func__);
+}
+
+
+/* USB device controller interrupt handler */
+static irqreturn_t langwell_irq(int irq, void *_dev)
+{
+ struct langwell_udc *dev = _dev;
+ u32 usbsts,
+ usbintr,
+ irq_sts,
+ portsc1;
+
+ VDBG(dev, "---> %s()\n", __func__);
+
+ if (dev->stopped) {
+ VDBG(dev, "handle IRQ_NONE\n");
+ VDBG(dev, "<--- %s()\n", __func__);
+ return IRQ_NONE;
+ }
+
+ spin_lock(&dev->lock);
+
+ /* USB status */
+ usbsts = readl(&dev->op_regs->usbsts);
+
+ /* USB interrupt enable */
+ usbintr = readl(&dev->op_regs->usbintr);
+
+ irq_sts = usbsts & usbintr;
+ VDBG(dev, "usbsts = 0x%08x, usbintr = 0x%08x, irq_sts = 0x%08x\n",
+ usbsts, usbintr, irq_sts);
+
+ if (!irq_sts) {
+ VDBG(dev, "handle IRQ_NONE\n");
+ VDBG(dev, "<--- %s()\n", __func__);
+ spin_unlock(&dev->lock);
+ return IRQ_NONE;
+ }
+
+ /* Write-Clear interrupt status bits */
+ writel(irq_sts, &dev->op_regs->usbsts);
+
+ /* resume from suspend */
+ portsc1 = readl(&dev->op_regs->portsc1);
+ if (dev->usb_state == USB_STATE_SUSPENDED)
+ if (!(portsc1 & PORTS_SUSP))
+ handle_bus_resume(dev);
+
+ /* USB interrupt */
+ if (irq_sts & STS_UI) {
+ VDBG(dev, "USB interrupt\n");
+
+ /* setup packet received from ep0 */
+ if (readl(&dev->op_regs->endptsetupstat)
+ & EP0SETUPSTAT_MASK) {
+ VDBG(dev, "USB SETUP packet received interrupt\n");
+ /* setup tripwire semaphore */
+ setup_tripwire(dev);
+ handle_setup_packet(dev, &dev->local_setup_buff);
+ }
+
+ /* USB transfer completion */
+ if (readl(&dev->op_regs->endptcomplete)) {
+ VDBG(dev, "USB transfer completion interrupt\n");
+ handle_trans_complete(dev);
+ }
+ }
+
+ /* SOF received interrupt (for ISO transfer) */
+ if (irq_sts & STS_SRI) {
+ /* FIXME */
+ /* VDBG(dev, "SOF received interrupt\n"); */
+ }
+
+ /* port change detect interrupt */
+ if (irq_sts & STS_PCI) {
+ VDBG(dev, "port change detect interrupt\n");
+ handle_port_change(dev);
+ }
+
+ /* suspend interrupt */
+ if (irq_sts & STS_SLI) {
+ VDBG(dev, "suspend interrupt\n");
+ handle_bus_suspend(dev);
+ }
+
+ /* USB reset interrupt */
+ if (irq_sts & STS_URI) {
+ VDBG(dev, "USB reset interrupt\n");
+ handle_usb_reset(dev);
+ }
+
+ /* USB error or system error interrupt */
+ if (irq_sts & (STS_UEI | STS_SEI)) {
+ /* FIXME */
+ WARNING(dev, "error IRQ, irq_sts: %x\n", irq_sts);
+ }
+
+ spin_unlock(&dev->lock);
+
+ VDBG(dev, "<--- %s()\n", __func__);
+ return IRQ_HANDLED;
+}
+
+
+/*-------------------------------------------------------------------------*/
+
+/* release device structure */
+static void gadget_release(struct device *_dev)
+{
+ struct langwell_udc *dev = the_controller;
+
+ DBG(dev, "---> %s()\n", __func__);
+
+ complete(dev->done);
+
+ DBG(dev, "<--- %s()\n", __func__);
+ kfree(dev);
+}
+
+
+/* tear down the binding between this driver and the pci device */
+static void langwell_udc_remove(struct pci_dev *pdev)
+{
+ struct langwell_udc *dev = the_controller;
+
+ DECLARE_COMPLETION(done);
+
+ BUG_ON(dev->driver);
+ DBG(dev, "---> %s()\n", __func__);
+
+ dev->done = &done;
+
+ /* free memory allocated in probe */
+ if (dev->dtd_pool)
+ dma_pool_destroy(dev->dtd_pool);
+
+ if (dev->status_req) {
+ kfree(dev->status_req->req.buf);
+ kfree(dev->status_req);
+ }
+
+ if (dev->ep_dqh)
+ dma_free_coherent(&pdev->dev, dev->ep_dqh_size,
+ dev->ep_dqh, dev->ep_dqh_dma);
+
+ kfree(dev->ep);
+
+ /* disable IRQ
handler */ + if (dev->got_irq) + free_irq(pdev->irq, dev); + +#ifndef OTG_TRANSCEIVER + if (dev->cap_regs) + iounmap(dev->cap_regs); + + if (dev->region) + release_mem_region(pci_resource_start(pdev, 0), + pci_resource_len(pdev, 0)); + + if (dev->enabled) + pci_disable_device(pdev); +#else + if (dev->transceiver) { + otg_put_transceiver(dev->transceiver); + dev->transceiver = NULL; + dev->lotg = NULL; + } +#endif + + dev->cap_regs = NULL; + + INFO(dev, "unbind\n"); + DBG(dev, "<--- %s()\n", __func__); + + device_unregister(&dev->gadget.dev); + device_remove_file(&pdev->dev, &dev_attr_langwell_udc); + +#ifndef OTG_TRANSCEIVER + pci_set_drvdata(pdev, NULL); +#endif + + /* free dev, wait for the release() finished */ + wait_for_completion(&done); + + the_controller = NULL; +} + + +/* + * wrap this driver around the specified device, but + * don't respond over USB until a gadget driver binds to us. + */ +static int langwell_udc_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct langwell_udc *dev; +#ifndef OTG_TRANSCEIVER + unsigned long resource, len; +#endif + void __iomem *base = NULL; + size_t size; + int retval; + + if (the_controller) { + dev_warn(&pdev->dev, "ignoring\n"); + return -EBUSY; + } + + /* alloc, and start init */ + dev = kzalloc(sizeof *dev, GFP_KERNEL); + if (dev == NULL) { + retval = -ENOMEM; + goto error; + } + + /* initialize device spinlock */ + spin_lock_init(&dev->lock); + + dev->pdev = pdev; + DBG(dev, "---> %s()\n", __func__); + +#ifdef OTG_TRANSCEIVER + /* PCI device is already enabled by otg_transceiver driver */ + dev->enabled = 1; + + /* mem region and register base */ + dev->region = 1; + dev->transceiver = otg_get_transceiver(); + dev->lotg = otg_to_langwell(dev->transceiver); + base = dev->lotg->regs; +#else + pci_set_drvdata(pdev, dev); + + /* now all the pci goodies ... */ + if (pci_enable_device(pdev) < 0) { + retval = -ENODEV; + goto error; + } + dev->enabled = 1; + + /* control register: BAR 0 */ + resource = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + if (!request_mem_region(resource, len, driver_name)) { + ERROR(dev, "controller already in use\n"); + retval = -EBUSY; + goto error; + } + dev->region = 1; + + base = ioremap_nocache(resource, len); +#endif + if (base == NULL) { + ERROR(dev, "can't map memory\n"); + retval = -EFAULT; + goto error; + } + + dev->cap_regs = (struct langwell_cap_regs __iomem *) base; + VDBG(dev, "dev->cap_regs: %p\n", dev->cap_regs); + dev->op_regs = (struct langwell_op_regs __iomem *) + (base + OP_REG_OFFSET); + VDBG(dev, "dev->op_regs: %p\n", dev->op_regs); + + /* irq setup after old hardware is cleaned up */ + if (!pdev->irq) { + ERROR(dev, "No IRQ. Check PCI setup!\n"); + retval = -ENODEV; + goto error; + } + +#ifndef OTG_TRANSCEIVER + INFO(dev, "irq %d, io mem: 0x%08lx, len: 0x%08lx, pci mem 0x%p\n", + pdev->irq, resource, len, base); + /* enables bus-mastering for device dev */ + pci_set_master(pdev); + + if (request_irq(pdev->irq, langwell_irq, IRQF_SHARED, + driver_name, dev) != 0) { + ERROR(dev, "request interrupt %d failed\n", pdev->irq); + retval = -EBUSY; + goto error; + } + dev->got_irq = 1; +#endif + + /* set stopped bit */ + dev->stopped = 1; + + /* capabilities and endpoint number */ + dev->lpm = (readl(&dev->cap_regs->hccparams) & HCC_LEN) ? 1 : 0; + dev->dciversion = readw(&dev->cap_regs->dciversion); + dev->devcap = (readl(&dev->cap_regs->dccparams) & DEVCAP) ? 
1 : 0; + VDBG(dev, "dev->lpm: %d\n", dev->lpm); + VDBG(dev, "dev->dciversion: 0x%04x\n", dev->dciversion); + VDBG(dev, "dccparams: 0x%08x\n", readl(&dev->cap_regs->dccparams)); + VDBG(dev, "dev->devcap: %d\n", dev->devcap); + if (!dev->devcap) { + ERROR(dev, "can't support device mode\n"); + retval = -ENODEV; + goto error; + } + + /* a pair of endpoints (out/in) for each address */ + dev->ep_max = DEN(readl(&dev->cap_regs->dccparams)) * 2; + VDBG(dev, "dev->ep_max: %d\n", dev->ep_max); + + /* allocate endpoints memory */ + dev->ep = kzalloc(sizeof(struct langwell_ep) * dev->ep_max, + GFP_KERNEL); + if (!dev->ep) { + ERROR(dev, "allocate endpoints memory failed\n"); + retval = -ENOMEM; + goto error; + } + + /* allocate device dQH memory */ + size = dev->ep_max * sizeof(struct langwell_dqh); + VDBG(dev, "orig size = %d\n", size); + if (size < DQH_ALIGNMENT) + size = DQH_ALIGNMENT; + else if ((size % DQH_ALIGNMENT) != 0) { + size += DQH_ALIGNMENT + 1; + size &= ~(DQH_ALIGNMENT - 1); + } + dev->ep_dqh = dma_alloc_coherent(&pdev->dev, size, + &dev->ep_dqh_dma, GFP_KERNEL); + if (!dev->ep_dqh) { + ERROR(dev, "allocate dQH memory failed\n"); + retval = -ENOMEM; + goto error; + } + dev->ep_dqh_size = size; + VDBG(dev, "ep_dqh_size = %d\n", dev->ep_dqh_size); + + /* initialize ep0 status request structure */ + dev->status_req = kzalloc(sizeof(struct langwell_request), GFP_KERNEL); + if (!dev->status_req) { + ERROR(dev, "allocate status_req memory failed\n"); + retval = -ENOMEM; + goto error; + } + INIT_LIST_HEAD(&dev->status_req->queue); + + /* allocate a small amount of memory to get valid address */ + dev->status_req->req.buf = kmalloc(8, GFP_KERNEL); + dev->status_req->req.dma = virt_to_phys(dev->status_req->req.buf); + + dev->resume_state = USB_STATE_NOTATTACHED; + dev->usb_state = USB_STATE_POWERED; + dev->ep0_dir = USB_DIR_OUT; + dev->remote_wakeup = 0; /* default to 0 on reset */ + +#ifndef OTG_TRANSCEIVER + /* reset device controller */ + langwell_udc_reset(dev); +#endif + + /* initialize gadget structure */ + dev->gadget.ops = &langwell_ops; /* usb_gadget_ops */ + dev->gadget.ep0 = &dev->ep[0].ep; /* gadget ep0 */ + INIT_LIST_HEAD(&dev->gadget.ep_list); /* ep_list */ + dev->gadget.speed = USB_SPEED_UNKNOWN; /* speed */ + dev->gadget.is_dualspeed = 1; /* support dual speed */ +#ifdef OTG_TRANSCEIVER + dev->gadget.is_otg = 1; /* support otg mode */ +#endif + + /* the "gadget" abstracts/virtualizes the controller */ + dev_set_name(&dev->gadget.dev, "gadget"); + dev->gadget.dev.parent = &pdev->dev; + dev->gadget.dev.dma_mask = pdev->dev.dma_mask; + dev->gadget.dev.release = gadget_release; + dev->gadget.name = driver_name; /* gadget name */ + + /* controller endpoints reinit */ + eps_reinit(dev); + +#ifndef OTG_TRANSCEIVER + /* reset ep0 dQH and endptctrl */ + ep0_reset(dev); +#endif + + /* create dTD dma_pool resource */ + dev->dtd_pool = dma_pool_create("langwell_dtd", + &dev->pdev->dev, + sizeof(struct langwell_dtd), + DTD_ALIGNMENT, + DMA_BOUNDARY); + + if (!dev->dtd_pool) { + retval = -ENOMEM; + goto error; + } + + /* done */ + INFO(dev, "%s\n", driver_desc); + INFO(dev, "irq %d, pci mem %p\n", pdev->irq, base); + INFO(dev, "Driver version: " DRIVER_VERSION "\n"); + INFO(dev, "Support (max) %d endpoints\n", dev->ep_max); + INFO(dev, "Device interface version: 0x%04x\n", dev->dciversion); + INFO(dev, "Controller mode: %s\n", dev->devcap ? "Device" : "Host"); + INFO(dev, "Support USB LPM: %s\n", dev->lpm ? 
"Yes" : "No"); + + VDBG(dev, "After langwell_udc_probe(), print all registers:\n"); +#ifdef VERBOSE + print_all_registers(dev); +#endif + + the_controller = dev; + + retval = device_register(&dev->gadget.dev); + if (retval) + goto error; + + retval = device_create_file(&pdev->dev, &dev_attr_langwell_udc); + if (retval) + goto error; + + VDBG(dev, "<--- %s()\n", __func__); + return 0; + +error: + if (dev) { + DBG(dev, "<--- %s()\n", __func__); + langwell_udc_remove(pdev); + } + + return retval; +} + + +/* device controller suspend */ +static int langwell_udc_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct langwell_udc *dev = the_controller; + u32 devlc; + + DBG(dev, "---> %s()\n", __func__); + + /* disable interrupt and set controller to stop state */ + langwell_udc_stop(dev); + + /* diable IRQ handler */ + if (dev->got_irq) + free_irq(pdev->irq, dev); + dev->got_irq = 0; + + + /* save PCI state */ + pci_save_state(pdev); + + /* set device power state */ + pci_set_power_state(pdev, PCI_D3hot); + + /* enter PHY low power suspend */ + devlc = readl(&dev->op_regs->devlc); + VDBG(dev, "devlc = 0x%08x\n", devlc); + devlc |= LPM_PHCD; + writel(devlc, &dev->op_regs->devlc); + + DBG(dev, "<--- %s()\n", __func__); + return 0; +} + + +/* device controller resume */ +static int langwell_udc_resume(struct pci_dev *pdev) +{ + struct langwell_udc *dev = the_controller; + u32 devlc; + + DBG(dev, "---> %s()\n", __func__); + + /* exit PHY low power suspend */ + devlc = readl(&dev->op_regs->devlc); + VDBG(dev, "devlc = 0x%08x\n", devlc); + devlc &= ~LPM_PHCD; + writel(devlc, &dev->op_regs->devlc); + + /* set device D0 power state */ + pci_set_power_state(pdev, PCI_D0); + + /* restore PCI state */ + pci_restore_state(pdev); + + /* enable IRQ handler */ + if (request_irq(pdev->irq, langwell_irq, IRQF_SHARED, driver_name, dev) + != 0) { + ERROR(dev, "request interrupt %d failed\n", pdev->irq); + return -1; + } + dev->got_irq = 1; + + /* reset and start controller to run state */ + if (dev->stopped) { + /* reset device controller */ + langwell_udc_reset(dev); + + /* reset ep0 dQH and endptctrl */ + ep0_reset(dev); + + /* start device if gadget is loaded */ + if (dev->driver) + langwell_udc_start(dev); + } + + /* reset USB status */ + dev->usb_state = USB_STATE_ATTACHED; + dev->ep0_state = WAIT_FOR_SETUP; + dev->ep0_dir = USB_DIR_OUT; + + DBG(dev, "<--- %s()\n", __func__); + return 0; +} + + +/* pci driver shutdown */ +static void langwell_udc_shutdown(struct pci_dev *pdev) +{ + struct langwell_udc *dev = the_controller; + u32 usbmode; + + DBG(dev, "---> %s()\n", __func__); + + /* reset controller mode to IDLE */ + usbmode = readl(&dev->op_regs->usbmode); + DBG(dev, "usbmode = 0x%08x\n", usbmode); + usbmode &= (~3 | MODE_IDLE); + writel(usbmode, &dev->op_regs->usbmode); + + DBG(dev, "<--- %s()\n", __func__); +} + +/*-------------------------------------------------------------------------*/ + +static const struct pci_device_id pci_ids[] = { { + .class = ((PCI_CLASS_SERIAL_USB << 8) | 0xfe), + .class_mask = ~0, + .vendor = 0x8086, + .device = 0x0811, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, +}, { /* end: all zeroes */ } +}; + + +MODULE_DEVICE_TABLE(pci, pci_ids); + + +static struct pci_driver langwell_pci_driver = { + .name = (char *) driver_name, + .id_table = pci_ids, + + .probe = langwell_udc_probe, + .remove = langwell_udc_remove, + + /* device controller suspend/resume */ + .suspend = langwell_udc_suspend, + .resume = langwell_udc_resume, + + .shutdown = langwell_udc_shutdown, +}; + 
+ +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_AUTHOR("Xiaochen Shen "); +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL"); + + +static int __init init(void) +{ +#ifdef OTG_TRANSCEIVER + return langwell_register_peripheral(&langwell_pci_driver); +#else + return pci_register_driver(&langwell_pci_driver); +#endif +} +module_init(init); + + +static void __exit cleanup(void) +{ +#ifdef OTG_TRANSCEIVER + return langwell_unregister_peripheral(&langwell_pci_driver); +#else + pci_unregister_driver(&langwell_pci_driver); +#endif +} +module_exit(cleanup); + diff --git a/drivers/usb/gadget/langwell_udc.h b/drivers/usb/gadget/langwell_udc.h new file mode 100644 index 00000000000..9719934e1c0 --- /dev/null +++ b/drivers/usb/gadget/langwell_udc.h @@ -0,0 +1,228 @@ +/* + * Intel Langwell USB Device Controller driver + * Copyright (C) 2008-2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include + +#if defined(CONFIG_USB_LANGWELL_OTG) +#include +#endif + + +/*-------------------------------------------------------------------------*/ + +/* driver data structures and utilities */ + +/* + * dTD: Device Endpoint Transfer Descriptor + * describe to the device controller the location and quantity of + * data to be send/received for given transfer + */ +struct langwell_dtd { + u32 dtd_next; +/* bits 31:5, next transfer element pointer */ +#define DTD_NEXT(d) (((d)>>5)&0x7ffffff) +#define DTD_NEXT_MASK (0x7ffffff << 5) +/* terminate */ +#define DTD_TERM BIT(0) + /* bits 7:0, execution back states */ + u32 dtd_status:8; +#define DTD_STATUS(d) (((d)>>0)&0xff) +#define DTD_STS_ACTIVE BIT(7) /* active */ +#define DTD_STS_HALTED BIT(6) /* halted */ +#define DTD_STS_DBE BIT(5) /* data buffer error */ +#define DTD_STS_TRE BIT(3) /* transaction error */ + /* bits 9:8 */ + u32 dtd_res0:2; + /* bits 11:10, multipier override */ + u32 dtd_multo:2; +#define DTD_MULTO (BIT(11) | BIT(10)) + /* bits 14:12 */ + u32 dtd_res1:3; + /* bit 15, interrupt on complete */ + u32 dtd_ioc:1; +#define DTD_IOC BIT(15) + /* bits 30:16, total bytes */ + u32 dtd_total:15; +#define DTD_TOTAL(d) (((d)>>16)&0x7fff) +#define DTD_MAX_TRANSFER_LENGTH 0x4000 + /* bit 31 */ + u32 dtd_res2:1; + /* dTD buffer pointer page 0 to 4 */ + u32 dtd_buf[5]; +#define DTD_OFFSET_MASK 0xfff +/* bits 31:12, buffer pointer */ +#define DTD_BUFFER(d) (((d)>>12)&0x3ff) +/* bits 11:0, current offset */ +#define DTD_C_OFFSET(d) (((d)>>0)&0xfff) +/* bits 10:0, frame number */ +#define DTD_FRAME(d) (((d)>>0)&0x7ff) + + /* driver-private parts */ + + /* dtd dma address */ + dma_addr_t dtd_dma; + /* next dtd virtual address */ + struct langwell_dtd *next_dtd_virt; +}; + + +/* + * dQH: Device Endpoint Queue Head + * describe where all transfers are managed + * 48-byte data structure, aligned on 64-byte boundary + * + * These are associated with dTD structure + */ +struct langwell_dqh { + /* endpoint capabilities and 
characteristics */ + u32 dqh_res0:15; /* bits 14:0 */ + u32 dqh_ios:1; /* bit 15, interrupt on setup */ +#define DQH_IOS BIT(15) + u32 dqh_mpl:11; /* bits 26:16, maximum packet length */ +#define DQH_MPL (0x7ff << 16) + u32 dqh_res1:2; /* bits 28:27 */ + u32 dqh_zlt:1; /* bit 29, zero length termination */ +#define DQH_ZLT BIT(29) + u32 dqh_mult:2; /* bits 31:30 */ +#define DQH_MULT (BIT(30) | BIT(31)) + + /* current dTD pointer */ + u32 dqh_current; /* locate the transfer in progress */ +#define DQH_C_DTD(e) \ + (((e)>>5)&0x7ffffff) /* bits 31:5, current dTD pointer */ + + /* transfer overlay, hardware parts of a struct langwell_dtd */ + u32 dtd_next; + u32 dtd_status:8; /* bits 7:0, execution back states */ + u32 dtd_res0:2; /* bits 9:8 */ + u32 dtd_multo:2; /* bits 11:10, multipier override */ + u32 dtd_res1:3; /* bits 14:12 */ + u32 dtd_ioc:1; /* bit 15, interrupt on complete */ + u32 dtd_total:15; /* bits 30:16, total bytes */ + u32 dtd_res2:1; /* bit 31 */ + u32 dtd_buf[5]; /* dTD buffer pointer page 0 to 4 */ + + u32 dqh_res2; + struct usb_ctrlrequest dqh_setup; /* setup packet buffer */ +} __attribute__ ((aligned(64))); + + +/* endpoint data structure */ +struct langwell_ep { + struct usb_ep ep; + dma_addr_t dma; + struct langwell_udc *dev; + unsigned long irqs; + struct list_head queue; + struct langwell_dqh *dqh; + const struct usb_endpoint_descriptor *desc; + char name[14]; + unsigned stopped:1, + ep_type:2, + ep_num:8; +}; + + +/* request data structure */ +struct langwell_request { + struct usb_request req; + struct langwell_dtd *dtd, *head, *tail; + struct langwell_ep *ep; + dma_addr_t dtd_dma; + struct list_head queue; + unsigned dtd_count; + unsigned mapped:1; +}; + + +/* ep0 transfer state */ +enum ep0_state { + WAIT_FOR_SETUP, + DATA_STATE_XMIT, + DATA_STATE_NEED_ZLP, + WAIT_FOR_OUT_STATUS, + DATA_STATE_RECV, +}; + + +/* device suspend state */ +enum lpm_state { + LPM_L0, /* on */ + LPM_L1, /* LPM L1 sleep */ + LPM_L2, /* suspend */ + LPM_L3, /* off */ +}; + + +/* device data structure */ +struct langwell_udc { + /* each pci device provides one gadget, several endpoints */ + struct usb_gadget gadget; + spinlock_t lock; /* device lock */ + struct langwell_ep *ep; + struct usb_gadget_driver *driver; + struct otg_transceiver *transceiver; + u8 dev_addr; + u32 usb_state; + u32 resume_state; + u32 bus_reset; + enum lpm_state lpm_state; + enum ep0_state ep0_state; + u32 ep0_dir; + u16 dciversion; + unsigned ep_max; + unsigned devcap:1, + enabled:1, + region:1, + got_irq:1, + powered:1, + remote_wakeup:1, + rate:1, + is_reset:1, + softconnected:1, + vbus_active:1, + suspended:1, + stopped:1, + lpm:1; /* LPM capability */ + + /* pci state used to access those endpoints */ + struct pci_dev *pdev; + + /* Langwell otg transceiver */ + struct langwell_otg *lotg; + + /* control registers */ + struct langwell_cap_regs __iomem *cap_regs; + struct langwell_op_regs __iomem *op_regs; + + struct usb_ctrlrequest local_setup_buff; + struct langwell_dqh *ep_dqh; + size_t ep_dqh_size; + dma_addr_t ep_dqh_dma; + + /* ep0 status request */ + struct langwell_request *status_req; + + /* dma pool */ + struct dma_pool *dtd_pool; + + /* make sure release() is done */ + struct completion *done; +}; + diff --git a/include/linux/usb/langwell_udc.h b/include/linux/usb/langwell_udc.h new file mode 100644 index 00000000000..c949178a653 --- /dev/null +++ b/include/linux/usb/langwell_udc.h @@ -0,0 +1,310 @@ +/* + * Intel Langwell USB Device Controller driver + * Copyright (C) 2008-2009, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef __LANGWELL_UDC_H +#define __LANGWELL_UDC_H + + +/* MACRO defines */ +#define CAP_REG_OFFSET 0x0 +#define OP_REG_OFFSET 0x28 + +#define DMA_ADDR_INVALID (~(dma_addr_t)0) + +#define DQH_ALIGNMENT 2048 +#define DTD_ALIGNMENT 64 +#define DMA_BOUNDARY 4096 + +#define EP0_MAX_PKT_SIZE 64 +#define EP_DIR_IN 1 +#define EP_DIR_OUT 0 + +#define FLUSH_TIMEOUT 1000 +#define RESET_TIMEOUT 1000 +#define SETUPSTAT_TIMEOUT 100 +#define PRIME_TIMEOUT 100 + + +/* device memory space registers */ + +/* Capability Registers, BAR0 + CAP_REG_OFFSET */ +struct langwell_cap_regs { + /* offset: 0x0 */ + u8 caplength; /* offset of Operational Register */ + u8 _reserved3; + u16 hciversion; /* H: BCD encoding of host version */ + u32 hcsparams; /* H: host port steering logic capability */ + u32 hccparams; /* H: host multiple mode control capability */ +#define HCC_LEN BIT(17) /* Link power management (LPM) capability */ + u8 _reserved4[0x20-0xc]; + /* offset: 0x20 */ + u16 dciversion; /* BCD encoding of device version */ + u8 _reserved5[0x24-0x22]; + u32 dccparams; /* overall device controller capability */ +#define HOSTCAP BIT(8) /* host capable */ +#define DEVCAP BIT(7) /* device capable */ +#define DEN(d) \ + (((d)>>0)&0x1f) /* bits 4:0, device endpoint number */ +} __attribute__ ((packed)); + + +/* Operational Registers, BAR0 + OP_REG_OFFSET */ +struct langwell_op_regs { + /* offset: 0x28 */ + u32 extsts; +#define EXTS_TI1 BIT(4) /* general purpose timer interrupt 1 */ +#define EXTS_TI1TI0 BIT(3) /* general purpose timer interrupt 0 */ +#define EXTS_TI1UPI BIT(2) /* USB host periodic interrupt */ +#define EXTS_TI1UAI BIT(1) /* USB host asynchronous interrupt */ +#define EXTS_TI1NAKI BIT(0) /* NAK interrupt */ + u32 extintr; +#define EXTI_TIE1 BIT(4) /* general purpose timer interrupt enable 1 */ +#define EXTI_TIE0 BIT(3) /* general purpose timer interrupt enable 0 */ +#define EXTI_UPIE BIT(2) /* USB host periodic interrupt enable */ +#define EXTI_UAIE BIT(1) /* USB host asynchronous interrupt enable */ +#define EXTI_NAKE BIT(0) /* NAK interrupt enable */ + /* offset: 0x30 */ + u32 usbcmd; +#define CMD_HIRD(u) \ + (((u)>>24)&0xf) /* bits 27:24, host init resume duration */ +#define CMD_ITC(u) \ + (((u)>>16)&0xff) /* bits 23:16, interrupt threshold control */ +#define CMD_PPE BIT(15) /* per-port change events enable */ +#define CMD_ATDTW BIT(14) /* add dTD tripwire */ +#define CMD_SUTW BIT(13) /* setup tripwire */ +#define CMD_ASPE BIT(11) /* asynchronous schedule park mode enable */ +#define CMD_FS2 BIT(10) /* frame list size */ +#define CMD_ASP1 BIT(9) /* asynchronous schedule park mode count */ +#define CMD_ASP0 BIT(8) +#define CMD_LR BIT(7) /* light host/device controller reset */ +#define CMD_IAA BIT(6) /* interrupt on async advance doorbell */ +#define CMD_ASE BIT(5) /* asynchronous schedule enable */ +#define CMD_PSE BIT(4) 
/* periodic schedule enable */ +#define CMD_FS1 BIT(3) +#define CMD_FS0 BIT(2) +#define CMD_RST BIT(1) /* controller reset */ +#define CMD_RUNSTOP BIT(0) /* run/stop */ + u32 usbsts; +#define STS_PPCI(u) \ + (((u)>>16)&0xffff) /* bits 31:16, port-n change detect */ +#define STS_AS BIT(15) /* asynchronous schedule status */ +#define STS_PS BIT(14) /* periodic schedule status */ +#define STS_RCL BIT(13) /* reclamation */ +#define STS_HCH BIT(12) /* HC halted */ +#define STS_ULPII BIT(10) /* ULPI interrupt */ +#define STS_SLI BIT(8) /* DC suspend */ +#define STS_SRI BIT(7) /* SOF received */ +#define STS_URI BIT(6) /* USB reset received */ +#define STS_AAI BIT(5) /* interrupt on async advance */ +#define STS_SEI BIT(4) /* system error */ +#define STS_FRI BIT(3) /* frame list rollover */ +#define STS_PCI BIT(2) /* port change detect */ +#define STS_UEI BIT(1) /* USB error interrupt */ +#define STS_UI BIT(0) /* USB interrupt */ + u32 usbintr; +/* bits 31:16, per-port interrupt enable */ +#define INTR_PPCE(u) (((u)>>16)&0xffff) +#define INTR_ULPIE BIT(10) /* ULPI enable */ +#define INTR_SLE BIT(8) /* DC sleep/suspend enable */ +#define INTR_SRE BIT(7) /* SOF received enable */ +#define INTR_URE BIT(6) /* USB reset enable */ +#define INTR_AAE BIT(5) /* interrupt on async advance enable */ +#define INTR_SEE BIT(4) /* system error enable */ +#define INTR_FRE BIT(3) /* frame list rollover enable */ +#define INTR_PCE BIT(2) /* port change detect enable */ +#define INTR_UEE BIT(1) /* USB error interrupt enable */ +#define INTR_UE BIT(0) /* USB interrupt enable */ + u32 frindex; /* frame index */ +#define FRINDEX_MASK (0x3fff << 0) + u32 ctrldssegment; /* not used */ + u32 deviceaddr; +#define USBADR_SHIFT 25 +#define USBADR(d) \ + (((d)>>25)&0x7f) /* bits 31:25, device address */ +#define USBADR_MASK (0x7f << 25) +#define USBADRA BIT(24) /* device address advance */ + u32 endpointlistaddr;/* endpoint list top memory address */ +/* bits 31:11, endpoint list pointer */ +#define EPBASE(d) (((d)>>11)&0x1fffff) +#define ENDPOINTLISTADDR_MASK (0x1fffff << 11) + u32 ttctrl; /* H: TT operatin, not used */ + /* offset: 0x50 */ + u32 burstsize; /* burst size of data movement */ +#define TXPBURST(b) \ + (((b)>>8)&0xff) /* bits 15:8, TX burst length */ +#define RXPBURST(b) \ + (((b)>>0)&0xff) /* bits 7:0, RX burst length */ + u32 txfilltuning; /* TX tuning */ + u32 txttfilltuning; /* H: TX TT tuning */ + u32 ic_usb; /* control the IC_USB FS/LS transceiver */ + /* offset: 0x60 */ + u32 ulpi_viewport; /* indirect access to ULPI PHY */ +#define ULPIWU BIT(31) /* ULPI wakeup */ +#define ULPIRUN BIT(30) /* ULPI read/write run */ +#define ULPIRW BIT(29) /* ULPI read/write control */ +#define ULPISS BIT(27) /* ULPI sync state */ +#define ULPIPORT(u) \ + (((u)>>24)&7) /* bits 26:24, ULPI port number */ +#define ULPIADDR(u) \ + (((u)>>16)&0xff) /* bits 23:16, ULPI data address */ +#define ULPIDATRD(u) \ + (((u)>>8)&0xff) /* bits 15:8, ULPI data read */ +#define ULPIDATWR(u) \ + (((u)>>0)&0xff) /* bits 7:0, ULPI date write */ + u8 _reserved6[0x70-0x64]; + /* offset: 0x70 */ + u32 configflag; /* H: not used */ + u32 portsc1; /* port status */ +#define DA(p) \ + (((p)>>25)&0x7f) /* bits 31:25, device address */ +#define PORTS_SSTS (BIT(24) | BIT(23)) /* suspend status */ +#define PORTS_WKOC BIT(22) /* wake on over-current enable */ +#define PORTS_WKDS BIT(21) /* wake on disconnect enable */ +#define PORTS_WKCN BIT(20) /* wake on connect enable */ +#define PORTS_PTC(p) (((p)>>16)&0xf) /* bits 19:16, port test control */ 
+#define PORTS_PIC (BIT(15) | BIT(14)) /* port indicator control */ +#define PORTS_PO BIT(13) /* port owner */ +#define PORTS_PP BIT(12) /* port power */ +#define PORTS_LS (BIT(11) | BIT(10)) /* line status */ +#define PORTS_SLP BIT(9) /* suspend using L1 */ +#define PORTS_PR BIT(8) /* port reset */ +#define PORTS_SUSP BIT(7) /* suspend */ +#define PORTS_FPR BIT(6) /* force port resume */ +#define PORTS_OCC BIT(5) /* over-current change */ +#define PORTS_OCA BIT(4) /* over-current active */ +#define PORTS_PEC BIT(3) /* port enable/disable change */ +#define PORTS_PE BIT(2) /* port enable/disable */ +#define PORTS_CSC BIT(1) /* connect status change */ +#define PORTS_CCS BIT(0) /* current connect status */ + u8 _reserved7[0xb4-0x78]; + /* offset: 0xb4 */ + u32 devlc; /* control LPM and each USB port behavior */ +/* bits 31:29, parallel transceiver select */ +#define LPM_PTS(d) (((d)>>29)&7) +#define LPM_STS BIT(28) /* serial transceiver select */ +#define LPM_PTW BIT(27) /* parallel transceiver width */ +#define LPM_PSPD(d) (((d)>>25)&3) /* bits 26:25, port speed */ +#define LPM_PSPD_MASK (BIT(26) | BIT(25)) +#define LPM_SPEED_FULL 0 +#define LPM_SPEED_LOW 1 +#define LPM_SPEED_HIGH 2 +#define LPM_SRT BIT(24) /* shorten reset time */ +#define LPM_PFSC BIT(23) /* port force full speed connect */ +#define LPM_PHCD BIT(22) /* PHY low power suspend clock disable */ +#define LPM_STL BIT(16) /* STALL reply to LPM token */ +#define LPM_BA(d) \ + (((d)>>1)&0x7ff) /* bits 11:1, BmAttributes */ +#define LPM_NYT_ACK BIT(0) /* NYET/ACK reply to LPM token */ + u8 _reserved8[0xf4-0xb8]; + /* offset: 0xf4 */ + u32 otgsc; /* On-The-Go status and control */ +#define OTGSC_DPIE BIT(30) /* data pulse interrupt enable */ +#define OTGSC_MSE BIT(29) /* 1 ms timer interrupt enable */ +#define OTGSC_BSEIE BIT(28) /* B session end interrupt enable */ +#define OTGSC_BSVIE BIT(27) /* B session valid interrupt enable */ +#define OTGSC_ASVIE BIT(26) /* A session valid interrupt enable */ +#define OTGSC_AVVIE BIT(25) /* A VBUS valid interrupt enable */ +#define OTGSC_IDIE BIT(24) /* USB ID interrupt enable */ +#define OTGSC_DPIS BIT(22) /* data pulse interrupt status */ +#define OTGSC_MSS BIT(21) /* 1 ms timer interrupt status */ +#define OTGSC_BSEIS BIT(20) /* B session end interrupt status */ +#define OTGSC_BSVIS BIT(19) /* B session valid interrupt status */ +#define OTGSC_ASVIS BIT(18) /* A session valid interrupt status */ +#define OTGSC_AVVIS BIT(17) /* A VBUS valid interrupt status */ +#define OTGSC_IDIS BIT(16) /* USB ID interrupt status */ +#define OTGSC_DPS BIT(14) /* data bus pulsing status */ +#define OTGSC_MST BIT(13) /* 1 ms timer toggle */ +#define OTGSC_BSE BIT(12) /* B session end */ +#define OTGSC_BSV BIT(11) /* B session valid */ +#define OTGSC_ASV BIT(10) /* A session valid */ +#define OTGSC_AVV BIT(9) /* A VBUS valid */ +#define OTGSC_USBID BIT(8) /* USB ID */ +#define OTGSC_HABA BIT(7) /* hw assist B-disconnect to A-connect */ +#define OTGSC_HADP BIT(6) /* hw assist data pulse */ +#define OTGSC_IDPU BIT(5) /* ID pullup */ +#define OTGSC_DP BIT(4) /* data pulsing */ +#define OTGSC_OT BIT(3) /* OTG termination */ +#define OTGSC_HAAR BIT(2) /* hw assist auto reset */ +#define OTGSC_VC BIT(1) /* VBUS charge */ +#define OTGSC_VD BIT(0) /* VBUS discharge */ + u32 usbmode; +#define MODE_VBPS BIT(5) /* R/W VBUS power select */ +#define MODE_SDIS BIT(4) /* R/W stream disable mode */ +#define MODE_SLOM BIT(3) /* R/W setup lockout mode */ +#define MODE_ENSE BIT(2) /* endian select */ +#define MODE_CM(u) 
(((u)>>0)&3) /* bits 1:0, controller mode */
+#define MODE_IDLE 0
+#define MODE_DEVICE 2
+#define MODE_HOST 3
+ u8 _reserved9[0x100-0xfc];
+ /* offset: 0x100 */
+ u32 endptnak;
+#define EPTN(e) \
+ (((e)>>16)&0xffff) /* bits 31:16, TX endpoint NAK */
+#define EPRN(e) \
+ (((e)>>0)&0xffff) /* bits 15:0, RX endpoint NAK */
+ u32 endptnaken;
+#define EPTNE(e) \
+ (((e)>>16)&0xffff) /* bits 31:16, TX endpoint NAK enable */
+#define EPRNE(e) \
+ (((e)>>0)&0xffff) /* bits 15:0, RX endpoint NAK enable */
+ u32 endptsetupstat;
+#define SETUPSTAT_MASK (0xffff << 0) /* bits 15:0 */
+#define EP0SETUPSTAT_MASK 1
+ u32 endptprime;
+/* bits 31:16, prime endpoint transmit buffer */
+#define PETB(e) (((e)>>16)&0xffff)
+/* bits 15:0, prime endpoint receive buffer */
+#define PERB(e) (((e)>>0)&0xffff)
+ /* offset: 0x110 */
+ u32 endptflush;
+/* bits 31:16, flush endpoint transmit buffer */
+#define FETB(e) (((e)>>16)&0xffff)
+/* bits 15:0, flush endpoint receive buffer */
+#define FERB(e) (((e)>>0)&0xffff)
+ u32 endptstat;
+/* bits 31:16, endpoint transmit buffer ready */
+#define ETBR(e) (((e)>>16)&0xffff)
+/* bits 15:0, endpoint receive buffer ready */
+#define ERBR(e) (((e)>>0)&0xffff)
+ u32 endptcomplete;
+/* bits 31:16, endpoint transmit complete event */
+#define ETCE(e) (((e)>>16)&0xffff)
+/* bits 15:0, endpoint receive complete event */
+#define ERCE(e) (((e)>>0)&0xffff)
+ /* offset: 0x11c */
+ u32 endptctrl[16];
+#define EPCTRL_TXE BIT(23) /* TX endpoint enable */
+#define EPCTRL_TXR BIT(22) /* TX data toggle reset */
+#define EPCTRL_TXI BIT(21) /* TX data toggle inhibit */
+#define EPCTRL_TXT(e) (((e)>>18)&3) /* bits 19:18, TX endpoint type */
+#define EPCTRL_TXT_SHIFT 18
+#define EPCTRL_TXD BIT(17) /* TX endpoint data source */
+#define EPCTRL_TXS BIT(16) /* TX endpoint STALL */
+#define EPCTRL_RXE BIT(7) /* RX endpoint enable */
+#define EPCTRL_RXR BIT(6) /* RX data toggle reset */
+#define EPCTRL_RXI BIT(5) /* RX data toggle inhibit */
+#define EPCTRL_RXT(e) (((e)>>2)&3) /* bits 3:2, RX endpoint type */
+#define EPCTRL_RXT_SHIFT 2 /* bits 3:2, RX endpoint type */
+#define EPCTRL_RXD BIT(1) /* RX endpoint data sink */
+#define EPCTRL_RXS BIT(0) /* RX endpoint STALL */
+} __attribute__ ((packed));
+
+#endif /* __LANGWELL_UDC_H */
+
--
cgit v1.2.3-70-g09d2

From 453f77558810ffa669ed5a510a7173ec49def396 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Thu, 4 Jun 2009 16:06:50 +0800
Subject: USB: Add Intel Langwell USB OTG Transceiver Driver

Description:
This driver is used for the Intel Langwell* USB OTG controller in the
Intel Moorestown* platform. It tries to implement host/device role
switch according to the OTG spec. The actual host and device functions
are accomplished in the modified EHCI driver and the Intel Langwell USB
OTG client controller driver.

* Langwell and Moorestown are names used in development. They are not
approved official names.

Note:
This patch is the first version of the Intel Langwell USB OTG
Transceiver driver. The development is not finished, and bug fixing is
ongoing for some hardware and software issues. The main purpose of this
submission is code review.

Supported features:
- Data-line Pulsing SRP
- Support HNP to switch roles
- PCI D0/D3 power management support

Known issues:
- HNP is only tested with another Moorestown platform.
- PCI D0/D3 power management support is not fully tested.
- VBus Pulsing SRP is not supported in the current version.
Signed-off-by: Hao Wu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/otg/Kconfig | 14 + drivers/usb/otg/Makefile | 1 + drivers/usb/otg/langwell_otg.c | 1915 ++++++++++++++++++++++++++++++++++++++ include/linux/usb/langwell_otg.h | 177 ++++ 4 files changed, 2107 insertions(+) create mode 100644 drivers/usb/otg/langwell_otg.c create mode 100644 include/linux/usb/langwell_otg.h (limited to 'include/linux') diff --git a/drivers/usb/otg/Kconfig b/drivers/usb/otg/Kconfig index aa884d072f0..69feeec1628 100644 --- a/drivers/usb/otg/Kconfig +++ b/drivers/usb/otg/Kconfig @@ -59,4 +59,18 @@ config NOP_USB_XCEIV built-in with usb ip or which are autonomous and doesn't require any phy programming such as ISP1x04 etc. +config USB_LANGWELL_OTG + tristate "Intel Langwell USB OTG dual-role support" + depends on USB && MRST + select USB_OTG + select USB_OTG_UTILS + help + Say Y here if you want to build Intel Langwell USB OTG + transciever driver in kernel. This driver implements role + switch between EHCI host driver and Langwell USB OTG + client driver. + + To compile this driver as a module, choose M here: the + module will be called langwell_otg. + endif # USB || OTG diff --git a/drivers/usb/otg/Makefile b/drivers/usb/otg/Makefile index 20816785652..6d1abdd3c0a 100644 --- a/drivers/usb/otg/Makefile +++ b/drivers/usb/otg/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_USB_OTG_UTILS) += otg.o obj-$(CONFIG_USB_GPIO_VBUS) += gpio_vbus.o obj-$(CONFIG_ISP1301_OMAP) += isp1301_omap.o obj-$(CONFIG_TWL4030_USB) += twl4030-usb.o +obj-$(CONFIG_USB_LANGWELL_OTG) += langwell_otg.o obj-$(CONFIG_NOP_USB_XCEIV) += nop-usb-xceiv.o ccflags-$(CONFIG_USB_DEBUG) += -DDEBUG diff --git a/drivers/usb/otg/langwell_otg.c b/drivers/usb/otg/langwell_otg.c new file mode 100644 index 00000000000..6f628d0e9f3 --- /dev/null +++ b/drivers/usb/otg/langwell_otg.c @@ -0,0 +1,1915 @@ +/* + * Intel Langwell USB OTG transceiver driver + * Copyright (C) 2008 - 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +/* This driver helps to switch Langwell OTG controller function between host + * and peripheral. It works with EHCI driver and Langwell client controller + * driver together. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../core/hcd.h" + +#include + +#define DRIVER_DESC "Intel Langwell USB OTG transceiver driver" +#define DRIVER_VERSION "3.0.0.32L.0002" + +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_AUTHOR("Henry Yuan , Hao Wu "); +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL"); + +static const char driver_name[] = "langwell_otg"; + +static int langwell_otg_probe(struct pci_dev *pdev, + const struct pci_device_id *id); +static void langwell_otg_remove(struct pci_dev *pdev); +static int langwell_otg_suspend(struct pci_dev *pdev, pm_message_t message); +static int langwell_otg_resume(struct pci_dev *pdev); + +static int langwell_otg_set_host(struct otg_transceiver *otg, + struct usb_bus *host); +static int langwell_otg_set_peripheral(struct otg_transceiver *otg, + struct usb_gadget *gadget); +static int langwell_otg_start_srp(struct otg_transceiver *otg); + +static const struct pci_device_id pci_ids[] = {{ + .class = ((PCI_CLASS_SERIAL_USB << 8) | 0xfe), + .class_mask = ~0, + .vendor = 0x8086, + .device = 0x0811, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, +}, { /* end: all zeroes */ } +}; + +static struct pci_driver otg_pci_driver = { + .name = (char *) driver_name, + .id_table = pci_ids, + + .probe = langwell_otg_probe, + .remove = langwell_otg_remove, + + .suspend = langwell_otg_suspend, + .resume = langwell_otg_resume, +}; + +static const char *state_string(enum usb_otg_state state) +{ + switch (state) { + case OTG_STATE_A_IDLE: + return "a_idle"; + case OTG_STATE_A_WAIT_VRISE: + return "a_wait_vrise"; + case OTG_STATE_A_WAIT_BCON: + return "a_wait_bcon"; + case OTG_STATE_A_HOST: + return "a_host"; + case OTG_STATE_A_SUSPEND: + return "a_suspend"; + case OTG_STATE_A_PERIPHERAL: + return "a_peripheral"; + case OTG_STATE_A_WAIT_VFALL: + return "a_wait_vfall"; + case OTG_STATE_A_VBUS_ERR: + return "a_vbus_err"; + case OTG_STATE_B_IDLE: + return "b_idle"; + case OTG_STATE_B_SRP_INIT: + return "b_srp_init"; + case OTG_STATE_B_PERIPHERAL: + return "b_peripheral"; + case OTG_STATE_B_WAIT_ACON: + return "b_wait_acon"; + case OTG_STATE_B_HOST: + return "b_host"; + default: + return "UNDEFINED"; + } +} + +/* HSM timers */ +static inline struct langwell_otg_timer *otg_timer_initializer +(void (*function)(unsigned long), unsigned long expires, unsigned long data) +{ + struct langwell_otg_timer *timer; + timer = kmalloc(sizeof(struct langwell_otg_timer), GFP_KERNEL); + timer->function = function; + timer->expires = expires; + timer->data = data; + return timer; +} + +static struct langwell_otg_timer *a_wait_vrise_tmr, *a_wait_bcon_tmr, + *a_aidl_bdis_tmr, *b_ase0_brst_tmr, *b_se0_srp_tmr, *b_srp_res_tmr, + *b_bus_suspend_tmr; + +static struct list_head active_timers; + +static struct langwell_otg *the_transceiver; + +/* host/client notify transceiver when event affects HNP state */ +void langwell_update_transceiver() +{ + otg_dbg("transceiver driver is notified\n"); + queue_work(the_transceiver->qwork, &the_transceiver->work); +} +EXPORT_SYMBOL(langwell_update_transceiver); + +static int langwell_otg_set_host(struct otg_transceiver *otg, + struct usb_bus *host) +{ + otg->host = host; + + return 0; +} + +static int langwell_otg_set_peripheral(struct otg_transceiver *otg, + struct usb_gadget *gadget) +{ + otg->gadget = gadget; + + return 0; +} + +static int langwell_otg_set_power(struct otg_transceiver *otg, + unsigned mA) +{ + return 0; +} 
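These otg_transceiver callbacks are reached through the generic OTG wrappers of this kernel generation; a controller driver on the same platform would bind to the transceiver roughly as follows (a sketch under that assumption; dev->gadget stands in for the caller's own gadget):

	struct otg_transceiver *xceiv = otg_get_transceiver();

	if (xceiv)
		/* routed to langwell_otg_set_peripheral() above; the
		 * reference is kept until the controller unbinds */
		otg_set_peripheral(xceiv, &dev->gadget);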
+ +/* A-device drives vbus, controlled through PMIC CHRGCNTL register*/ +static void langwell_otg_drv_vbus(int on) +{ + struct ipc_pmic_reg_data pmic_data = {0}; + struct ipc_pmic_reg_data battery_data; + + /* Check if battery is attached or not */ + battery_data.pmic_reg_data[0].register_address = 0xd2; + battery_data.ioc = 0; + battery_data.num_entries = 1; + if (ipc_pmic_register_read(&battery_data)) { + otg_dbg("Failed to read PMIC register 0xd2.\n"); + return; + } + + if ((battery_data.pmic_reg_data[0].value & 0x20) == 0) { + otg_dbg("no battery attached\n"); + return; + } + + /* Workaround for battery attachment issue */ + if (battery_data.pmic_reg_data[0].value == 0x34) { + otg_dbg("battery \n"); + return; + } + + otg_dbg("battery attached\n"); + + pmic_data.ioc = 0; + pmic_data.pmic_reg_data[0].register_address = 0xD4; + pmic_data.num_entries = 1; + if (on) + pmic_data.pmic_reg_data[0].value = 0x20; + else + pmic_data.pmic_reg_data[0].value = 0xc0; + + if (ipc_pmic_register_write(&pmic_data, TRUE)) + otg_dbg("Failed to write PMIC.\n"); + +} + +/* charge vbus or discharge vbus through a resistor to ground */ +static void langwell_otg_chrg_vbus(int on) +{ + + u32 val; + + val = readl(the_transceiver->regs + CI_OTGSC); + + if (on) + writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_VC, + the_transceiver->regs + CI_OTGSC); + else + writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_VD, + the_transceiver->regs + CI_OTGSC); + +} + +/* Start SRP */ +static int langwell_otg_start_srp(struct otg_transceiver *otg) +{ + u32 val; + + otg_dbg("Start SRP ->\n"); + + val = readl(the_transceiver->regs + CI_OTGSC); + + writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_HADP, + the_transceiver->regs + CI_OTGSC); + + /* Check if the data plus is finished or not */ + msleep(8); + val = readl(the_transceiver->regs + CI_OTGSC); + if (val & (OTGSC_HADP | OTGSC_DP)) + otg_dbg("DataLine SRP Error\n"); + + /* FIXME: VBus SRP */ + + return 0; +} + + +/* stop SOF via bus_suspend */ +static void langwell_otg_loc_sof(int on) +{ + struct usb_hcd *hcd; + int err; + + otg_dbg("loc_sof -> %d\n", on); + + hcd = bus_to_hcd(the_transceiver->otg.host); + if (on) + err = hcd->driver->bus_resume(hcd); + else + err = hcd->driver->bus_suspend(hcd); + + if (err) + otg_dbg("Failed to resume/suspend bus - %d\n", err); +} + +static void langwell_otg_phy_low_power(int on) +{ + u32 val; + + otg_dbg("phy low power mode-> %d\n", on); + + val = readl(the_transceiver->regs + CI_HOSTPC1); + if (on) + writel(val | HOSTPC1_PHCD, the_transceiver->regs + CI_HOSTPC1); + else + writel(val & ~HOSTPC1_PHCD, the_transceiver->regs + CI_HOSTPC1); +} + +/* Enable/Disable OTG interrupt */ +static void langwell_otg_intr(int on) +{ + u32 val; + + otg_dbg("interrupt -> %d\n", on); + + val = readl(the_transceiver->regs + CI_OTGSC); + if (on) { + val = val | (OTGSC_INTEN_MASK | OTGSC_IDPU); + writel(val, the_transceiver->regs + CI_OTGSC); + } else { + val = val & ~(OTGSC_INTEN_MASK | OTGSC_IDPU); + writel(val, the_transceiver->regs + CI_OTGSC); + } +} + +/* set HAAR: Hardware Assist Auto-Reset */ +static void langwell_otg_HAAR(int on) +{ + u32 val; + + otg_dbg("HAAR -> %d\n", on); + + val = readl(the_transceiver->regs + CI_OTGSC); + if (on) + writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_HAAR, + the_transceiver->regs + CI_OTGSC); + else + writel((val & ~OTGSC_INTSTS_MASK) & ~OTGSC_HAAR, + the_transceiver->regs + CI_OTGSC); +} + +/* set HABA: Hardware Assist B-Disconnect to A-Connect */ +static void langwell_otg_HABA(int on) +{ + u32 val; + + otg_dbg("HABA -> %d\n", on); + + val = 
readl(the_transceiver->regs + CI_OTGSC);
+ if (on)
+ writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_HABA,
+ the_transceiver->regs + CI_OTGSC);
+ else
+ writel((val & ~OTGSC_INTSTS_MASK) & ~OTGSC_HABA,
+ the_transceiver->regs + CI_OTGSC);
+}
+
+static int langwell_otg_check_se0_srp(int on)
+{
+ u32 val;
+
+ int delay_time = TB_SE0_SRP * 10; /* step is 100us */
+
+ otg_dbg("check_se0_srp ->\n");
+
+ do {
+ udelay(100);
+ if (!delay_time--)
+ break;
+ val = readl(the_transceiver->regs + CI_PORTSC1);
+ val &= PORTSC_LS;
+ } while (!val);
+
+ otg_dbg("check_se0_srp <-\n");
+ return val;
+}
+
+/* The timeout callback function to set the timeout bit */
+static void set_tmout(unsigned long indicator)
+{
+ *(int *)indicator = 1;
+}
+
+void langwell_otg_nsf_msg(unsigned long indicator)
+{
+ switch (indicator) {
+ case 2:
+ case 4:
+ case 6:
+ case 7:
+ printk(KERN_ERR "OTG:NSF-%lu - device not responding\n",
+ indicator);
+ break;
+ case 3:
+ printk(KERN_ERR "OTG:NSF-%lu - device not supported\n",
+ indicator);
+ break;
+ default:
+ printk(KERN_ERR "Unknown NSF message\n");
+ break;
+ }
+}
+
+/* Initialize timers */
+static void langwell_otg_init_timers(struct otg_hsm *hsm)
+{
+ /* HSM used timers */
+ a_wait_vrise_tmr = otg_timer_initializer(&set_tmout, TA_WAIT_VRISE,
+ (unsigned long)&hsm->a_wait_vrise_tmout);
+ a_wait_bcon_tmr = otg_timer_initializer(&set_tmout, TA_WAIT_BCON,
+ (unsigned long)&hsm->a_wait_bcon_tmout);
+ a_aidl_bdis_tmr = otg_timer_initializer(&set_tmout, TA_AIDL_BDIS,
+ (unsigned long)&hsm->a_aidl_bdis_tmout);
+ b_ase0_brst_tmr = otg_timer_initializer(&set_tmout, TB_ASE0_BRST,
+ (unsigned long)&hsm->b_ase0_brst_tmout);
+ b_se0_srp_tmr = otg_timer_initializer(&set_tmout, TB_SE0_SRP,
+ (unsigned long)&hsm->b_se0_srp);
+ b_srp_res_tmr = otg_timer_initializer(&set_tmout, TB_SRP_RES,
+ (unsigned long)&hsm->b_srp_res_tmout);
+ b_bus_suspend_tmr = otg_timer_initializer(&set_tmout, TB_BUS_SUSPEND,
+ (unsigned long)&hsm->b_bus_suspend_tmout);
+}
+
+/* Free timers */
+static void langwell_otg_free_timers(void)
+{
+ kfree(a_wait_vrise_tmr);
+ kfree(a_wait_bcon_tmr);
+ kfree(a_aidl_bdis_tmr);
+ kfree(b_ase0_brst_tmr);
+ kfree(b_se0_srp_tmr);
+ kfree(b_srp_res_tmr);
+ kfree(b_bus_suspend_tmr);
+}
+
+/* Add timer to timer list */
+static void langwell_otg_add_timer(void *gtimer)
+{
+ struct langwell_otg_timer *timer = (struct langwell_otg_timer *)gtimer;
+ struct langwell_otg_timer *tmp_timer;
+ u32 val32;
+
+ /* Check if the timer is already in the active list;
+ * if so, update the timer count
+ */
+ list_for_each_entry(tmp_timer, &active_timers, list)
+ if (tmp_timer == timer) {
+ timer->count = timer->expires;
+ return;
+ }
+ timer->count = timer->expires;
+
+ if (list_empty(&active_timers)) {
+ val32 = readl(the_transceiver->regs + CI_OTGSC);
+ writel(val32 | OTGSC_1MSE, the_transceiver->regs + CI_OTGSC);
+ }
+
+ list_add_tail(&timer->list, &active_timers);
+}
+
+/* Remove timer from the timer list; clear timeout status */
+static void langwell_otg_del_timer(void *gtimer)
+{
+ struct langwell_otg_timer *timer = (struct langwell_otg_timer *)gtimer;
+ struct langwell_otg_timer *tmp_timer, *del_tmp;
+ u32 val32;
+
+ list_for_each_entry_safe(tmp_timer, del_tmp, &active_timers, list)
+ if (tmp_timer == timer)
+ list_del(&timer->list);
+
+ if (list_empty(&active_timers)) {
+ val32 = readl(the_transceiver->regs + CI_OTGSC);
+ writel(val32 & ~OTGSC_1MSE, the_transceiver->regs + CI_OTGSC);
+ }
+}
+
+/* Reduce timer count by 1, and find timeout conditions. */
*int_sts) +{ + struct langwell_otg_timer *tmp_timer, *del_tmp; + int expired = 0; + + list_for_each_entry_safe(tmp_timer, del_tmp, &active_timers, list) { + tmp_timer->count--; + /* check if timer expires */ + if (!tmp_timer->count) { + list_del(&tmp_timer->list); + tmp_timer->function(tmp_timer->data); + expired = 1; + } + } + + if (list_empty(&active_timers)) { + otg_dbg("tick timer: disable 1ms int\n"); + *int_sts = *int_sts & ~OTGSC_1MSE; + } + return expired; +} + +static void reset_otg(void) +{ + u32 val; + int delay_time = 1000; + + otg_dbg("resetting OTG controller ...\n"); + val = readl(the_transceiver->regs + CI_USBCMD); + writel(val | USBCMD_RST, the_transceiver->regs + CI_USBCMD); + do { + udelay(100); + if (!delay_time--) + otg_dbg("reset timeout\n"); + val = readl(the_transceiver->regs + CI_USBCMD); + val &= USBCMD_RST; + } while (val != 0); + otg_dbg("reset done.\n"); +} + +static void set_host_mode(void) +{ + u32 val; + + reset_otg(); + val = readl(the_transceiver->regs + CI_USBMODE); + val = (val & (~USBMODE_CM)) | USBMODE_HOST; + writel(val, the_transceiver->regs + CI_USBMODE); +} + +static void set_client_mode(void) +{ + u32 val; + + reset_otg(); + val = readl(the_transceiver->regs + CI_USBMODE); + val = (val & (~USBMODE_CM)) | USBMODE_DEVICE; + writel(val, the_transceiver->regs + CI_USBMODE); +} + +static void init_hsm(void) +{ + struct langwell_otg *langwell = the_transceiver; + u32 val32; + + /* read OTGSC after reset */ + val32 = readl(langwell->regs + CI_OTGSC); + otg_dbg("%s: OTGSC init value = 0x%x\n", __func__, val32); + + /* set init state */ + if (val32 & OTGSC_ID) { + langwell->hsm.id = 1; + langwell->otg.default_a = 0; + set_client_mode(); + langwell->otg.state = OTG_STATE_B_IDLE; + langwell_otg_drv_vbus(0); + } else { + langwell->hsm.id = 0; + langwell->otg.default_a = 1; + set_host_mode(); + langwell->otg.state = OTG_STATE_A_IDLE; + } + + /* set session indicator */ + if (val32 & OTGSC_BSE) + langwell->hsm.b_sess_end = 1; + if (val32 & OTGSC_BSV) + langwell->hsm.b_sess_vld = 1; + if (val32 & OTGSC_ASV) + langwell->hsm.a_sess_vld = 1; + if (val32 & OTGSC_AVV) + langwell->hsm.a_vbus_vld = 1; + + /* power the bus by default */ + langwell->hsm.a_bus_req = 1; + langwell->hsm.a_bus_drop = 0; + /* don't request the bus by default as B-device */ + langwell->hsm.b_bus_req = 0; + /* no system error */ + langwell->hsm.a_clr_err = 0; +} + +static irqreturn_t otg_dummy_irq(int irq, void *_dev) +{ + void __iomem *reg_base = _dev; + u32 val; + u32 int_mask = 0; + + val = readl(reg_base + CI_USBMODE); + if ((val & USBMODE_CM) != USBMODE_DEVICE) + return IRQ_NONE; + + val = readl(reg_base + CI_USBSTS); + int_mask = val & INTR_DUMMY_MASK; + + if (int_mask == 0) + return IRQ_NONE; + + /* clear hsm.b_conn here since the host driver can't detect it; + * otg_dummy_irq being called means a B-disconnect happened.
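+ * Note: this handler is only registered while HABA is set and the host + * driver is suspended (on the way into OTG_STATE_A_SUSPEND); once the + * hardware assistance switches the controller to device mode, these + * device-mode interrupts would otherwise go unhandled.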
+ */ + if (the_transceiver->hsm.b_conn) { + the_transceiver->hsm.b_conn = 0; + if (spin_trylock(&the_transceiver->wq_lock)) { + queue_work(the_transceiver->qwork, + &the_transceiver->work); + spin_unlock(&the_transceiver->wq_lock); + } + } + /* Clear interrupts */ + writel(int_mask, reg_base + CI_USBSTS); + return IRQ_HANDLED; +} + +static irqreturn_t otg_irq(int irq, void *_dev) +{ + struct langwell_otg *langwell = _dev; + u32 int_sts, int_en; + u32 int_mask = 0; + int flag = 0; + + int_sts = readl(langwell->regs + CI_OTGSC); + int_en = (int_sts & OTGSC_INTEN_MASK) >> 8; + int_mask = int_sts & int_en; + if (int_mask == 0) + return IRQ_NONE; + + if (int_mask & OTGSC_IDIS) { + otg_dbg("%s: id change int\n", __func__); + langwell->hsm.id = (int_sts & OTGSC_ID) ? 1 : 0; + flag = 1; + } + if (int_mask & OTGSC_DPIS) { + otg_dbg("%s: data pulse int\n", __func__); + langwell->hsm.a_srp_det = (int_sts & OTGSC_DPS) ? 1 : 0; + flag = 1; + } + if (int_mask & OTGSC_BSEIS) { + otg_dbg("%s: b session end int\n", __func__); + langwell->hsm.b_sess_end = (int_sts & OTGSC_BSE) ? 1 : 0; + flag = 1; + } + if (int_mask & OTGSC_BSVIS) { + otg_dbg("%s: b session valid int\n", __func__); + langwell->hsm.b_sess_vld = (int_sts & OTGSC_BSV) ? 1 : 0; + flag = 1; + } + if (int_mask & OTGSC_ASVIS) { + otg_dbg("%s: a session valid int\n", __func__); + langwell->hsm.a_sess_vld = (int_sts & OTGSC_ASV) ? 1 : 0; + flag = 1; + } + if (int_mask & OTGSC_AVVIS) { + otg_dbg("%s: a vbus valid int\n", __func__); + langwell->hsm.a_vbus_vld = (int_sts & OTGSC_AVV) ? 1 : 0; + flag = 1; + } + + if (int_mask & OTGSC_1MSS) { + /* need to schedule otg_work if any timer is expired */ + if (langwell_otg_tick_timer(&int_sts)) + flag = 1; + } + + writel((int_sts & ~OTGSC_INTSTS_MASK) | int_mask, + langwell->regs + CI_OTGSC); + if (flag) + queue_work(langwell->qwork, &langwell->work); + + return IRQ_HANDLED; +} + +static void langwell_otg_work(struct work_struct *work) +{ + struct langwell_otg *langwell = container_of(work, + struct langwell_otg, work); + int retval; + + otg_dbg("%s: old state = %s\n", __func__, + state_string(langwell->otg.state)); + + switch (langwell->otg.state) { + case OTG_STATE_UNDEFINED: + case OTG_STATE_B_IDLE: + if (!langwell->hsm.id) { + langwell_otg_del_timer(b_srp_res_tmr); + langwell->otg.default_a = 1; + langwell->hsm.a_srp_det = 0; + + langwell_otg_chrg_vbus(0); + langwell_otg_drv_vbus(0); + + set_host_mode(); + langwell->otg.state = OTG_STATE_A_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (langwell->hsm.b_srp_res_tmout) { + langwell->hsm.b_srp_res_tmout = 0; + langwell->hsm.b_bus_req = 0; + langwell_otg_nsf_msg(6); + } else if (langwell->hsm.b_sess_vld) { + langwell_otg_del_timer(b_srp_res_tmr); + langwell->hsm.b_sess_end = 0; + langwell->hsm.a_bus_suspend = 0; + + langwell_otg_chrg_vbus(0); + if (langwell->client_ops) { + langwell->client_ops->resume(langwell->pdev); + langwell->otg.state = OTG_STATE_B_PERIPHERAL; + } else + otg_dbg("client driver not loaded.\n"); + + } else if (langwell->hsm.b_bus_req && + (langwell->hsm.b_sess_end)) { + /* workaround for b_se0_srp detection */ + retval = langwell_otg_check_se0_srp(0); + if (retval) { + langwell->hsm.b_bus_req = 0; + otg_dbg("LS is not SE0, try again later\n"); + } else { + /* Start SRP */ + langwell_otg_start_srp(&langwell->otg); + langwell_otg_add_timer(b_srp_res_tmr); + } + } + break; + case OTG_STATE_B_SRP_INIT: + if (!langwell->hsm.id) { + langwell->otg.default_a = 1; + langwell->hsm.a_srp_det = 0; + + langwell_otg_drv_vbus(0); + 
langwell_otg_chrg_vbus(0); + + langwell->otg.state = OTG_STATE_A_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (langwell->hsm.b_sess_vld) { + langwell_otg_chrg_vbus(0); + if (langwell->client_ops) { + langwell->client_ops->resume(langwell->pdev); + langwell->otg.state = OTG_STATE_B_PERIPHERAL; + } else + otg_dbg("client driver not loaded.\n"); + } + break; + case OTG_STATE_B_PERIPHERAL: + if (!langwell->hsm.id) { + langwell->otg.default_a = 1; + langwell->hsm.a_srp_det = 0; + + langwell_otg_drv_vbus(0); + langwell_otg_chrg_vbus(0); + set_host_mode(); + + if (langwell->client_ops) { + langwell->client_ops->suspend(langwell->pdev, + PMSG_FREEZE); + } else + otg_dbg("client driver has been removed.\n"); + + langwell->otg.state = OTG_STATE_A_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (!langwell->hsm.b_sess_vld) { + langwell->hsm.b_hnp_enable = 0; + + if (langwell->client_ops) { + langwell->client_ops->suspend(langwell->pdev, + PMSG_FREEZE); + } else + otg_dbg("client driver has been removed.\n"); + + langwell->otg.state = OTG_STATE_B_IDLE; + } else if (langwell->hsm.b_bus_req && langwell->hsm.b_hnp_enable + && langwell->hsm.a_bus_suspend) { + + if (langwell->client_ops) { + langwell->client_ops->suspend(langwell->pdev, + PMSG_FREEZE); + } else + otg_dbg("client driver has been removed.\n"); + + langwell_otg_HAAR(1); + langwell->hsm.a_conn = 0; + + if (langwell->host_ops) { + langwell->host_ops->probe(langwell->pdev, + langwell->host_ops->id_table); + langwell->otg.state = OTG_STATE_B_WAIT_ACON; + } else + otg_dbg("host driver not loaded.\n"); + + langwell->hsm.a_bus_resume = 0; + langwell->hsm.b_ase0_brst_tmout = 0; + langwell_otg_add_timer(b_ase0_brst_tmr); + } + break; + + case OTG_STATE_B_WAIT_ACON: + if (!langwell->hsm.id) { + langwell_otg_del_timer(b_ase0_brst_tmr); + langwell->otg.default_a = 1; + langwell->hsm.a_srp_det = 0; + + langwell_otg_drv_vbus(0); + langwell_otg_chrg_vbus(0); + set_host_mode(); + + langwell_otg_HAAR(0); + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell->otg.state = OTG_STATE_A_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (!langwell->hsm.b_sess_vld) { + langwell_otg_del_timer(b_ase0_brst_tmr); + langwell->hsm.b_hnp_enable = 0; + langwell->hsm.b_bus_req = 0; + langwell_otg_chrg_vbus(0); + langwell_otg_HAAR(0); + + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell->otg.state = OTG_STATE_B_IDLE; + } else if (langwell->hsm.a_conn) { + langwell_otg_del_timer(b_ase0_brst_tmr); + langwell_otg_HAAR(0); + langwell->otg.state = OTG_STATE_B_HOST; + queue_work(langwell->qwork, &langwell->work); + } else if (langwell->hsm.a_bus_resume || + langwell->hsm.b_ase0_brst_tmout) { + langwell_otg_del_timer(b_ase0_brst_tmr); + langwell_otg_HAAR(0); + langwell_otg_nsf_msg(7); + + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + + langwell->hsm.a_bus_suspend = 0; + langwell->hsm.b_bus_req = 0; + + if (langwell->client_ops) + langwell->client_ops->resume(langwell->pdev); + else + otg_dbg("client driver not loaded.\n"); + + langwell->otg.state = OTG_STATE_B_PERIPHERAL; + } + break; + + case OTG_STATE_B_HOST: + if (!langwell->hsm.id) { + langwell->otg.default_a = 1; + langwell->hsm.a_srp_det = 0; + + langwell_otg_drv_vbus(0); + langwell_otg_chrg_vbus(0); + set_host_mode(); + if 
(langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell->otg.state = OTG_STATE_A_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (!langwell->hsm.b_sess_vld) { + langwell->hsm.b_hnp_enable = 0; + langwell->hsm.b_bus_req = 0; + langwell_otg_chrg_vbus(0); + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell->otg.state = OTG_STATE_B_IDLE; + } else if ((!langwell->hsm.b_bus_req) || + (!langwell->hsm.a_conn)) { + langwell->hsm.b_bus_req = 0; + langwell_otg_loc_sof(0); + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + + langwell->hsm.a_bus_suspend = 0; + + if (langwell->client_ops) + langwell->client_ops->resume(langwell->pdev); + else + otg_dbg("client driver not loaded.\n"); + + langwell->otg.state = OTG_STATE_B_PERIPHERAL; + } + break; + + case OTG_STATE_A_IDLE: + langwell->otg.default_a = 1; + if (langwell->hsm.id) { + langwell->otg.default_a = 0; + langwell->hsm.b_bus_req = 0; + langwell_otg_drv_vbus(0); + langwell_otg_chrg_vbus(0); + + langwell->otg.state = OTG_STATE_B_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (langwell->hsm.a_sess_vld) { + langwell_otg_drv_vbus(1); + langwell->hsm.a_srp_det = 1; + langwell->hsm.a_wait_vrise_tmout = 0; + langwell_otg_add_timer(a_wait_vrise_tmr); + langwell->otg.state = OTG_STATE_A_WAIT_VRISE; + queue_work(langwell->qwork, &langwell->work); + } else if (!langwell->hsm.a_bus_drop && + (langwell->hsm.a_srp_det || langwell->hsm.a_bus_req)) { + langwell_otg_drv_vbus(1); + langwell->hsm.a_wait_vrise_tmout = 0; + langwell_otg_add_timer(a_wait_vrise_tmr); + langwell->otg.state = OTG_STATE_A_WAIT_VRISE; + queue_work(langwell->qwork, &langwell->work); + } + break; + case OTG_STATE_A_WAIT_VRISE: + if (langwell->hsm.id) { + langwell_otg_del_timer(a_wait_vrise_tmr); + langwell->hsm.b_bus_req = 0; + langwell->otg.default_a = 0; + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_B_IDLE; + } else if (langwell->hsm.a_vbus_vld) { + langwell_otg_del_timer(a_wait_vrise_tmr); + if (langwell->host_ops) + langwell->host_ops->probe(langwell->pdev, + langwell->host_ops->id_table); + else + otg_dbg("host driver not loaded.\n"); + langwell->hsm.b_conn = 0; + langwell->hsm.a_set_b_hnp_en = 0; + langwell->hsm.a_wait_bcon_tmout = 0; + langwell_otg_add_timer(a_wait_bcon_tmr); + langwell->otg.state = OTG_STATE_A_WAIT_BCON; + } else if (langwell->hsm.a_wait_vrise_tmout) { + if (langwell->hsm.a_vbus_vld) { + if (langwell->host_ops) + langwell->host_ops->probe( + langwell->pdev, + langwell->host_ops->id_table); + else + otg_dbg("host driver not loaded.\n"); + langwell->hsm.b_conn = 0; + langwell->hsm.a_set_b_hnp_en = 0; + langwell->hsm.a_wait_bcon_tmout = 0; + langwell_otg_add_timer(a_wait_bcon_tmr); + langwell->otg.state = OTG_STATE_A_WAIT_BCON; + } else { + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_VBUS_ERR; + } + } + break; + case OTG_STATE_A_WAIT_BCON: + if (langwell->hsm.id) { + langwell_otg_del_timer(a_wait_bcon_tmr); + + langwell->otg.default_a = 0; + langwell->hsm.b_bus_req = 0; + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_B_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (!langwell->hsm.a_vbus_vld) { + 
langwell_otg_del_timer(a_wait_bcon_tmr); + + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_VBUS_ERR; + } else if (langwell->hsm.a_bus_drop || + (langwell->hsm.a_wait_bcon_tmout && + !langwell->hsm.a_bus_req)) { + langwell_otg_del_timer(a_wait_bcon_tmr); + + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_WAIT_VFALL; + } else if (langwell->hsm.b_conn) { + langwell_otg_del_timer(a_wait_bcon_tmr); + + langwell->hsm.a_suspend_req = 0; + langwell->otg.state = OTG_STATE_A_HOST; + if (!langwell->hsm.a_bus_req && + langwell->hsm.a_set_b_hnp_en) { + /* It is not safe enough to do a fast + * transition from A_WAIT_BCON to + * A_SUSPEND */ + msleep(10000); + if (langwell->hsm.a_bus_req) + break; + + if (request_irq(langwell->pdev->irq, + otg_dummy_irq, IRQF_SHARED, + driver_name, langwell->regs) != 0) { + otg_dbg("request interrupt %d failed\n", + langwell->pdev->irq); + } + + langwell_otg_HABA(1); + langwell->hsm.b_bus_resume = 0; + langwell->hsm.a_aidl_bdis_tmout = 0; + langwell_otg_add_timer(a_aidl_bdis_tmr); + + langwell_otg_loc_sof(0); + langwell->otg.state = OTG_STATE_A_SUSPEND; + } else if (!langwell->hsm.a_bus_req && + !langwell->hsm.a_set_b_hnp_en) { + struct pci_dev *pdev = langwell->pdev; + if (langwell->host_ops) + langwell->host_ops->remove(pdev); + else + otg_dbg("host driver removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_WAIT_VFALL; + } + } + break; + case OTG_STATE_A_HOST: + if (langwell->hsm.id) { + langwell->otg.default_a = 0; + langwell->hsm.b_bus_req = 0; + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_B_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (langwell->hsm.a_bus_drop || + (!langwell->hsm.a_set_b_hnp_en && !langwell->hsm.a_bus_req)) { + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_WAIT_VFALL; + } else if (!langwell->hsm.a_vbus_vld) { + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_VBUS_ERR; + } else if (langwell->hsm.a_set_b_hnp_en + && !langwell->hsm.a_bus_req) { + /* Set HABA to enable hardware assistance to signal + * A-connect after receiving B-disconnect. Hardware + * will then set client mode and enable URE, SLE and + * PCE after the assistance. otg_dummy_irq is used to + * clear these ints when the client driver is not resumed.
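+ * (URE, SLE and PCE are the USB Reset, Sleep and Port Change interrupt + * enables; the matching status bits USBSTS_URI, USBSTS_SLI and + * USBSTS_PCI make up INTR_DUMMY_MASK, which otg_dummy_irq filters on.)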
+ */ + if (request_irq(langwell->pdev->irq, + otg_dummy_irq, IRQF_SHARED, driver_name, + langwell->regs) != 0) { + otg_dbg("request interrupt %d failed\n", + langwell->pdev->irq); + } + + /* set HABA */ + langwell_otg_HABA(1); + langwell->hsm.b_bus_resume = 0; + langwell->hsm.a_aidl_bdis_tmout = 0; + langwell_otg_add_timer(a_aidl_bdis_tmr); + langwell_otg_loc_sof(0); + langwell->otg.state = OTG_STATE_A_SUSPEND; + } else if (!langwell->hsm.b_conn || !langwell->hsm.a_bus_req) { + langwell->hsm.a_wait_bcon_tmout = 0; + langwell->hsm.a_set_b_hnp_en = 0; + langwell_otg_add_timer(a_wait_bcon_tmr); + langwell->otg.state = OTG_STATE_A_WAIT_BCON; + } + break; + case OTG_STATE_A_SUSPEND: + if (langwell->hsm.id) { + langwell_otg_del_timer(a_aidl_bdis_tmr); + langwell_otg_HABA(0); + free_irq(langwell->pdev->irq, langwell->regs); + langwell->otg.default_a = 0; + langwell->hsm.b_bus_req = 0; + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_B_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (langwell->hsm.a_bus_req || + langwell->hsm.b_bus_resume) { + langwell_otg_del_timer(a_aidl_bdis_tmr); + langwell_otg_HABA(0); + free_irq(langwell->pdev->irq, langwell->regs); + langwell->hsm.a_suspend_req = 0; + langwell_otg_loc_sof(1); + langwell->otg.state = OTG_STATE_A_HOST; + } else if (langwell->hsm.a_aidl_bdis_tmout || + langwell->hsm.a_bus_drop) { + langwell_otg_del_timer(a_aidl_bdis_tmr); + langwell_otg_HABA(0); + free_irq(langwell->pdev->irq, langwell->regs); + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_WAIT_VFALL; + } else if (!langwell->hsm.b_conn && + langwell->hsm.a_set_b_hnp_en) { + langwell_otg_del_timer(a_aidl_bdis_tmr); + langwell_otg_HABA(0); + free_irq(langwell->pdev->irq, langwell->regs); + + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + + langwell->hsm.b_bus_suspend = 0; + langwell->hsm.b_bus_suspend_vld = 0; + langwell->hsm.b_bus_suspend_tmout = 0; + + /* msleep(200); */ + if (langwell->client_ops) + langwell->client_ops->resume(langwell->pdev); + else + otg_dbg("client driver not loaded.\n"); + + langwell_otg_add_timer(b_bus_suspend_tmr); + langwell->otg.state = OTG_STATE_A_PERIPHERAL; + break; + } else if (!langwell->hsm.a_vbus_vld) { + langwell_otg_del_timer(a_aidl_bdis_tmr); + langwell_otg_HABA(0); + free_irq(langwell->pdev->irq, langwell->regs); + if (langwell->host_ops) + langwell->host_ops->remove(langwell->pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_VBUS_ERR; + } + break; + case OTG_STATE_A_PERIPHERAL: + if (langwell->hsm.id) { + langwell_otg_del_timer(b_bus_suspend_tmr); + langwell->otg.default_a = 0; + langwell->hsm.b_bus_req = 0; + if (langwell->client_ops) + langwell->client_ops->suspend(langwell->pdev, + PMSG_FREEZE); + else + otg_dbg("client driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_B_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (!langwell->hsm.a_vbus_vld) { + langwell_otg_del_timer(b_bus_suspend_tmr); + if (langwell->client_ops) + langwell->client_ops->suspend(langwell->pdev, + PMSG_FREEZE); + else + otg_dbg("client driver has been removed.\n"); + 
langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_VBUS_ERR; + } else if (langwell->hsm.a_bus_drop) { + langwell_otg_del_timer(b_bus_suspend_tmr); + if (langwell->client_ops) + langwell->client_ops->suspend(langwell->pdev, + PMSG_FREEZE); + else + otg_dbg("client driver has been removed.\n"); + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_WAIT_VFALL; + } else if (langwell->hsm.b_bus_suspend) { + langwell_otg_del_timer(b_bus_suspend_tmr); + if (langwell->client_ops) + langwell->client_ops->suspend(langwell->pdev, + PMSG_FREEZE); + else + otg_dbg("client driver has been removed.\n"); + + if (langwell->host_ops) + langwell->host_ops->probe(langwell->pdev, + langwell->host_ops->id_table); + else + otg_dbg("host driver not loaded.\n"); + langwell->hsm.a_set_b_hnp_en = 0; + langwell->hsm.a_wait_bcon_tmout = 0; + langwell_otg_add_timer(a_wait_bcon_tmr); + langwell->otg.state = OTG_STATE_A_WAIT_BCON; + } else if (langwell->hsm.b_bus_suspend_tmout) { + u32 val; + val = readl(langwell->regs + CI_PORTSC1); + if (!(val & PORTSC_SUSP)) + break; + if (langwell->client_ops) + langwell->client_ops->suspend(langwell->pdev, + PMSG_FREEZE); + else + otg_dbg("client driver has been removed.\n"); + if (langwell->host_ops) + langwell->host_ops->probe(langwell->pdev, + langwell->host_ops->id_table); + else + otg_dbg("host driver not loaded.\n"); + langwell->hsm.a_set_b_hnp_en = 0; + langwell->hsm.a_wait_bcon_tmout = 0; + langwell_otg_add_timer(a_wait_bcon_tmr); + langwell->otg.state = OTG_STATE_A_WAIT_BCON; + } + break; + case OTG_STATE_A_VBUS_ERR: + if (langwell->hsm.id) { + langwell->otg.default_a = 0; + langwell->hsm.a_clr_err = 0; + langwell->hsm.a_srp_det = 0; + langwell->otg.state = OTG_STATE_B_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (langwell->hsm.a_clr_err) { + langwell->hsm.a_clr_err = 0; + langwell->hsm.a_srp_det = 0; + reset_otg(); + init_hsm(); + if (langwell->otg.state == OTG_STATE_A_IDLE) + queue_work(langwell->qwork, &langwell->work); + } + break; + case OTG_STATE_A_WAIT_VFALL: + if (langwell->hsm.id) { + langwell->otg.default_a = 0; + langwell->otg.state = OTG_STATE_B_IDLE; + queue_work(langwell->qwork, &langwell->work); + } else if (langwell->hsm.a_bus_req) { + langwell_otg_drv_vbus(1); + langwell->hsm.a_wait_vrise_tmout = 0; + langwell_otg_add_timer(a_wait_vrise_tmr); + langwell->otg.state = OTG_STATE_A_WAIT_VRISE; + } else if (!langwell->hsm.a_sess_vld) { + langwell->hsm.a_srp_det = 0; + langwell_otg_drv_vbus(0); + set_host_mode(); + langwell->otg.state = OTG_STATE_A_IDLE; + } + break; + default: + ; + } + + otg_dbg("%s: new state = %s\n", __func__, + state_string(langwell->otg.state)); +} + + static ssize_t +show_registers(struct device *_dev, struct device_attribute *attr, char *buf) +{ + struct langwell_otg *langwell; + char *next; + unsigned size; + unsigned t; + + langwell = the_transceiver; + next = buf; + size = PAGE_SIZE; + + t = scnprintf(next, size, + "\n" + "USBCMD = 0x%08x \n" + "USBSTS = 0x%08x \n" + "USBINTR = 0x%08x \n" + "ASYNCLISTADDR = 0x%08x \n" + "PORTSC1 = 0x%08x \n" + "HOSTPC1 = 0x%08x \n" + "OTGSC = 0x%08x \n" + "USBMODE = 0x%08x \n", + readl(langwell->regs + 0x30), + readl(langwell->regs + 0x34), + readl(langwell->regs + 0x38), + readl(langwell->regs + 0x48), + readl(langwell->regs + 0x74), + readl(langwell->regs + 0xb4), + readl(langwell->regs + 0xf4), + readl(langwell->regs + 0xf8) + ); + size -= t; + next += t; + + return PAGE_SIZE - size; +} +static DEVICE_ATTR(registers, S_IRUGO, show_registers, NULL); + +static 
ssize_t +show_hsm(struct device *_dev, struct device_attribute *attr, char *buf) +{ + struct langwell_otg *langwell; + char *next; + unsigned size; + unsigned t; + + langwell = the_transceiver; + next = buf; + size = PAGE_SIZE; + + t = scnprintf(next, size, + "\n" + "current state = %s\n" + "a_bus_resume = \t%d\n" + "a_bus_suspend = \t%d\n" + "a_conn = \t%d\n" + "a_sess_vld = \t%d\n" + "a_srp_det = \t%d\n" + "a_vbus_vld = \t%d\n" + "b_bus_resume = \t%d\n" + "b_bus_suspend = \t%d\n" + "b_conn = \t%d\n" + "b_se0_srp = \t%d\n" + "b_sess_end = \t%d\n" + "b_sess_vld = \t%d\n" + "id = \t%d\n" + "a_set_b_hnp_en = \t%d\n" + "b_srp_done = \t%d\n" + "b_hnp_enable = \t%d\n" + "a_wait_vrise_tmout = \t%d\n" + "a_wait_bcon_tmout = \t%d\n" + "a_aidl_bdis_tmout = \t%d\n" + "b_ase0_brst_tmout = \t%d\n" + "a_bus_drop = \t%d\n" + "a_bus_req = \t%d\n" + "a_clr_err = \t%d\n" + "a_suspend_req = \t%d\n" + "b_bus_req = \t%d\n" + "b_bus_suspend_tmout = \t%d\n" + "b_bus_suspend_vld = \t%d\n", + state_string(langwell->otg.state), + langwell->hsm.a_bus_resume, + langwell->hsm.a_bus_suspend, + langwell->hsm.a_conn, + langwell->hsm.a_sess_vld, + langwell->hsm.a_srp_det, + langwell->hsm.a_vbus_vld, + langwell->hsm.b_bus_resume, + langwell->hsm.b_bus_suspend, + langwell->hsm.b_conn, + langwell->hsm.b_se0_srp, + langwell->hsm.b_sess_end, + langwell->hsm.b_sess_vld, + langwell->hsm.id, + langwell->hsm.a_set_b_hnp_en, + langwell->hsm.b_srp_done, + langwell->hsm.b_hnp_enable, + langwell->hsm.a_wait_vrise_tmout, + langwell->hsm.a_wait_bcon_tmout, + langwell->hsm.a_aidl_bdis_tmout, + langwell->hsm.b_ase0_brst_tmout, + langwell->hsm.a_bus_drop, + langwell->hsm.a_bus_req, + langwell->hsm.a_clr_err, + langwell->hsm.a_suspend_req, + langwell->hsm.b_bus_req, + langwell->hsm.b_bus_suspend_tmout, + langwell->hsm.b_bus_suspend_vld + ); + size -= t; + next += t; + + return PAGE_SIZE - size; +} +static DEVICE_ATTR(hsm, S_IRUGO, show_hsm, NULL); + +static ssize_t +get_a_bus_req(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct langwell_otg *langwell; + char *next; + unsigned size; + unsigned t; + + langwell = the_transceiver; + next = buf; + size = PAGE_SIZE; + + t = scnprintf(next, size, "%d", langwell->hsm.a_bus_req); + size -= t; + next += t; + + return PAGE_SIZE - size; +} + +static ssize_t +set_a_bus_req(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct langwell_otg *langwell; + langwell = the_transceiver; + if (!langwell->otg.default_a) + return -1; + if (count > 2) + return -1; + + if (buf[0] == '0') { + langwell->hsm.a_bus_req = 0; + otg_dbg("a_bus_req = 0\n"); + } else if (buf[0] == '1') { + /* If a_bus_drop is TRUE, a_bus_req can't be set */ + if (langwell->hsm.a_bus_drop) + return -1; + langwell->hsm.a_bus_req = 1; + otg_dbg("a_bus_req = 1\n"); + } + if (spin_trylock(&langwell->wq_lock)) { + queue_work(langwell->qwork, &langwell->work); + spin_unlock(&langwell->wq_lock); + } + return count; +} +static DEVICE_ATTR(a_bus_req, S_IRUGO | S_IWUGO, get_a_bus_req, set_a_bus_req); + +static ssize_t +get_a_bus_drop(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct langwell_otg *langwell; + char *next; + unsigned size; + unsigned t; + + langwell = the_transceiver; + next = buf; + size = PAGE_SIZE; + + t = scnprintf(next, size, "%d", langwell->hsm.a_bus_drop); + size -= t; + next += t; + + return PAGE_SIZE - size; +} + +static ssize_t +set_a_bus_drop(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct 
langwell_otg *langwell; + langwell = the_transceiver; + if (!langwell->otg.default_a) + return -1; + if (count > 2) + return -1; + + if (buf[0] == '0') { + langwell->hsm.a_bus_drop = 0; + otg_dbg("a_bus_drop = 0\n"); + } else if (buf[0] == '1') { + langwell->hsm.a_bus_drop = 1; + langwell->hsm.a_bus_req = 0; + otg_dbg("a_bus_drop = 1, then a_bus_req = 0\n"); + } + if (spin_trylock(&langwell->wq_lock)) { + queue_work(langwell->qwork, &langwell->work); + spin_unlock(&langwell->wq_lock); + } + return count; +} +static DEVICE_ATTR(a_bus_drop, S_IRUGO | S_IWUGO, + get_a_bus_drop, set_a_bus_drop); + +static ssize_t +get_b_bus_req(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct langwell_otg *langwell; + char *next; + unsigned size; + unsigned t; + + langwell = the_transceiver; + next = buf; + size = PAGE_SIZE; + + t = scnprintf(next, size, "%d", langwell->hsm.b_bus_req); + size -= t; + next += t; + + return PAGE_SIZE - size; +} + +static ssize_t +set_b_bus_req(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct langwell_otg *langwell; + langwell = the_transceiver; + + if (langwell->otg.default_a) + return -1; + + if (count > 2) + return -1; + + if (buf[0] == '0') { + langwell->hsm.b_bus_req = 0; + otg_dbg("b_bus_req = 0\n"); + } else if (buf[0] == '1') { + langwell->hsm.b_bus_req = 1; + otg_dbg("b_bus_req = 1\n"); + } + if (spin_trylock(&langwell->wq_lock)) { + queue_work(langwell->qwork, &langwell->work); + spin_unlock(&langwell->wq_lock); + } + return count; +} +static DEVICE_ATTR(b_bus_req, S_IRUGO | S_IWUGO, get_b_bus_req, set_b_bus_req); + +static ssize_t +set_a_clr_err(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct langwell_otg *langwell; + langwell = the_transceiver; + + if (!langwell->otg.default_a) + return -1; + if (count > 2) + return -1; + + if (buf[0] == '1') { + langwell->hsm.a_clr_err = 1; + otg_dbg("a_clr_err = 1\n"); + } + if (spin_trylock(&langwell->wq_lock)) { + queue_work(langwell->qwork, &langwell->work); + spin_unlock(&langwell->wq_lock); + } + return count; +} +static DEVICE_ATTR(a_clr_err, S_IWUGO, NULL, set_a_clr_err); + +static struct attribute *inputs_attrs[] = { + &dev_attr_a_bus_req.attr, + &dev_attr_a_bus_drop.attr, + &dev_attr_b_bus_req.attr, + &dev_attr_a_clr_err.attr, + NULL, +}; + +static struct attribute_group debug_dev_attr_group = { + .name = "inputs", + .attrs = inputs_attrs, +}; + +int langwell_register_host(struct pci_driver *host_driver) +{ + int ret = 0; + + the_transceiver->host_ops = host_driver; + queue_work(the_transceiver->qwork, &the_transceiver->work); + otg_dbg("host controller driver is registered\n"); + + return ret; +} +EXPORT_SYMBOL(langwell_register_host); + +void langwell_unregister_host(struct pci_driver *host_driver) +{ + if (the_transceiver->host_ops) + the_transceiver->host_ops->remove(the_transceiver->pdev); + the_transceiver->host_ops = NULL; + the_transceiver->hsm.a_bus_drop = 1; + queue_work(the_transceiver->qwork, &the_transceiver->work); + otg_dbg("host controller driver is unregistered\n"); +} +EXPORT_SYMBOL(langwell_unregister_host); + +int langwell_register_peripheral(struct pci_driver *client_driver) +{ + int ret = 0; + + if (client_driver) + ret = client_driver->probe(the_transceiver->pdev, + client_driver->id_table); + if (!ret) { + the_transceiver->client_ops = client_driver; + queue_work(the_transceiver->qwork, &the_transceiver->work); + otg_dbg("client controller driver is registered\n"); + } + + return 
ret; +} +EXPORT_SYMBOL(langwell_register_peripheral); + +void langwell_unregister_peripheral(struct pci_driver *client_driver) +{ + if (the_transceiver->client_ops) + the_transceiver->client_ops->remove(the_transceiver->pdev); + the_transceiver->client_ops = NULL; + the_transceiver->hsm.b_bus_req = 0; + queue_work(the_transceiver->qwork, &the_transceiver->work); + otg_dbg("client controller driver is unregistered\n"); +} +EXPORT_SYMBOL(langwell_unregister_peripheral); + +static int langwell_otg_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + unsigned long resource, len; + void __iomem *base = NULL; + int retval; + u32 val32; + struct langwell_otg *langwell; + char qname[] = "langwell_otg_queue"; + + retval = 0; + otg_dbg("\notg controller is detected.\n"); + if (pci_enable_device(pdev) < 0) { + retval = -ENODEV; + goto done; + } + + langwell = kzalloc(sizeof *langwell, GFP_KERNEL); + if (langwell == NULL) { + retval = -ENOMEM; + goto done; + } + the_transceiver = langwell; + + /* control register: BAR 0 */ + resource = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + if (!request_mem_region(resource, len, driver_name)) { + retval = -EBUSY; + goto err; + } + langwell->region = 1; + + base = ioremap_nocache(resource, len); + if (base == NULL) { + retval = -EFAULT; + goto err; + } + langwell->regs = base; + + if (!pdev->irq) { + otg_dbg("No IRQ.\n"); + retval = -ENODEV; + goto err; + } + + langwell->qwork = create_workqueue(qname); + if (!langwell->qwork) { + otg_dbg("cannot create workqueue %s\n", qname); + retval = -ENOMEM; + goto err; + } + INIT_WORK(&langwell->work, langwell_otg_work); + + /* OTG common part */ + langwell->pdev = pdev; + langwell->otg.dev = &pdev->dev; + langwell->otg.label = driver_name; + langwell->otg.set_host = langwell_otg_set_host; + langwell->otg.set_peripheral = langwell_otg_set_peripheral; + langwell->otg.set_power = langwell_otg_set_power; + langwell->otg.start_srp = langwell_otg_start_srp; + langwell->otg.state = OTG_STATE_UNDEFINED; + if (otg_set_transceiver(&langwell->otg)) { + otg_dbg("can't set transceiver\n"); + retval = -EBUSY; + goto err; + } + + reset_otg(); + init_hsm(); + + spin_lock_init(&langwell->lock); + spin_lock_init(&langwell->wq_lock); + INIT_LIST_HEAD(&active_timers); + langwell_otg_init_timers(&langwell->hsm); + + if (request_irq(pdev->irq, otg_irq, IRQF_SHARED, + driver_name, langwell) != 0) { + otg_dbg("request interrupt %d failed\n", pdev->irq); + retval = -EBUSY; + goto err; + } + + /* enable OTGSC int */ + val32 = OTGSC_DPIE | OTGSC_BSEIE | OTGSC_BSVIE | + OTGSC_ASVIE | OTGSC_AVVIE | OTGSC_IDIE | OTGSC_IDPU; + writel(val32, langwell->regs + CI_OTGSC); + + retval = device_create_file(&pdev->dev, &dev_attr_registers); + if (retval < 0) { + otg_dbg("Can't register sysfs attribute: %d\n", retval); + goto err; + } + + retval = device_create_file(&pdev->dev, &dev_attr_hsm); + if (retval < 0) { + otg_dbg("Can't register hsm sysfs attribute: %d\n", retval); + goto err; + } + + retval = sysfs_create_group(&pdev->dev.kobj, &debug_dev_attr_group); + if (retval < 0) { + otg_dbg("Can't register sysfs attr group: %d\n", retval); + goto err; + } + + if (langwell->otg.state == OTG_STATE_A_IDLE) + queue_work(langwell->qwork, &langwell->work); + + return 0; + +err: + if (the_transceiver) + langwell_otg_remove(pdev); +done: + return retval; +} + +static void langwell_otg_remove(struct pci_dev *pdev) +{ + struct langwell_otg *langwell; + + langwell = the_transceiver; + + if (langwell->qwork) { + flush_workqueue(langwell->qwork);
+ destroy_workqueue(langwell->qwork); + } + langwell_otg_free_timers(); + + /* disable OTGSC interrupt as OTGSC doesn't change in reset */ + writel(0, langwell->regs + CI_OTGSC); + + if (pdev->irq) + free_irq(pdev->irq, langwell); + if (langwell->regs) + iounmap(langwell->regs); + if (langwell->region) + release_mem_region(pci_resource_start(pdev, 0), + pci_resource_len(pdev, 0)); + + otg_set_transceiver(NULL); + pci_disable_device(pdev); + sysfs_remove_group(&pdev->dev.kobj, &debug_dev_attr_group); + device_remove_file(&pdev->dev, &dev_attr_hsm); + device_remove_file(&pdev->dev, &dev_attr_registers); + kfree(langwell); + langwell = NULL; +} + +static void transceiver_suspend(struct pci_dev *pdev) +{ + pci_save_state(pdev); + pci_set_power_state(pdev, PCI_D3hot); + langwell_otg_phy_low_power(1); +} + +static int langwell_otg_suspend(struct pci_dev *pdev, pm_message_t message) +{ + int ret = 0; + struct langwell_otg *langwell; + + langwell = the_transceiver; + + /* Disable OTG interrupts */ + langwell_otg_intr(0); + + if (pdev->irq) + free_irq(pdev->irq, langwell); + + /* Prevent more otg_work */ + flush_workqueue(langwell->qwork); + spin_lock(&langwell->wq_lock); + + /* start actions */ + switch (langwell->otg.state) { + case OTG_STATE_A_IDLE: + case OTG_STATE_B_IDLE: + case OTG_STATE_A_WAIT_VFALL: + case OTG_STATE_A_VBUS_ERR: + transceiver_suspend(pdev); + break; + case OTG_STATE_A_WAIT_VRISE: + langwell_otg_del_timer(a_wait_vrise_tmr); + langwell->hsm.a_srp_det = 0; + langwell_otg_drv_vbus(0); + langwell->otg.state = OTG_STATE_A_IDLE; + transceiver_suspend(pdev); + break; + case OTG_STATE_A_WAIT_BCON: + langwell_otg_del_timer(a_wait_bcon_tmr); + if (langwell->host_ops) + ret = langwell->host_ops->suspend(pdev, message); + langwell_otg_drv_vbus(0); + break; + case OTG_STATE_A_HOST: + if (langwell->host_ops) + ret = langwell->host_ops->suspend(pdev, message); + langwell_otg_drv_vbus(0); + langwell_otg_phy_low_power(1); + break; + case OTG_STATE_A_SUSPEND: + langwell_otg_del_timer(a_aidl_bdis_tmr); + langwell_otg_HABA(0); + if (langwell->host_ops) + langwell->host_ops->remove(pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell_otg_drv_vbus(0); + transceiver_suspend(pdev); + langwell->otg.state = OTG_STATE_A_WAIT_VFALL; + break; + case OTG_STATE_A_PERIPHERAL: + if (langwell->client_ops) + ret = langwell->client_ops->suspend(pdev, message); + else + otg_dbg("client driver has been removed.\n"); + langwell_otg_drv_vbus(0); + transceiver_suspend(pdev); + langwell->otg.state = OTG_STATE_A_WAIT_VFALL; + break; + case OTG_STATE_B_HOST: + if (langwell->host_ops) + langwell->host_ops->remove(pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell->hsm.b_bus_req = 0; + transceiver_suspend(pdev); + langwell->otg.state = OTG_STATE_B_IDLE; + break; + case OTG_STATE_B_PERIPHERAL: + if (langwell->client_ops) + ret = langwell->client_ops->suspend(pdev, message); + else + otg_dbg("client driver has been removed.\n"); + break; + case OTG_STATE_B_WAIT_ACON: + langwell_otg_del_timer(b_ase0_brst_tmr); + langwell_otg_HAAR(0); + if (langwell->host_ops) + langwell->host_ops->remove(pdev); + else + otg_dbg("host driver has been removed.\n"); + langwell->hsm.b_bus_req = 0; + langwell->otg.state = OTG_STATE_B_IDLE; + transceiver_suspend(pdev); + break; + default: + otg_dbg("error state before suspend\n"); + break; + } + spin_unlock(&langwell->wq_lock); + + return ret; +} + +static void transceiver_resume(struct pci_dev *pdev) +{ + pci_restore_state(pdev); + pci_set_power_state(pdev,
PCI_D0); + langwell_otg_phy_low_power(0); +} + +static int langwell_otg_resume(struct pci_dev *pdev) +{ + int ret = 0; + struct langwell_otg *langwell; + + langwell = the_transceiver; + + spin_lock(&langwell->wq_lock); + + switch (langwell->otg.state) { + case OTG_STATE_A_IDLE: + case OTG_STATE_B_IDLE: + case OTG_STATE_A_WAIT_VFALL: + case OTG_STATE_A_VBUS_ERR: + transceiver_resume(pdev); + break; + case OTG_STATE_A_WAIT_BCON: + langwell_otg_add_timer(a_wait_bcon_tmr); + langwell_otg_drv_vbus(1); + if (langwell->host_ops) + ret = langwell->host_ops->resume(pdev); + break; + case OTG_STATE_A_HOST: + langwell_otg_drv_vbus(1); + langwell_otg_phy_low_power(0); + if (langwell->host_ops) + ret = langwell->host_ops->resume(pdev); + break; + case OTG_STATE_B_PERIPHERAL: + if (langwell->client_ops) + ret = langwell->client_ops->resume(pdev); + else + otg_dbg("client driver not loaded.\n"); + break; + default: + otg_dbg("error state before resume\n"); + break; + } + + if (request_irq(pdev->irq, otg_irq, IRQF_SHARED, + driver_name, the_transceiver) != 0) { + otg_dbg("request interrupt %d failed\n", pdev->irq); + ret = -EBUSY; + } + + /* enable OTG interrupts */ + langwell_otg_intr(1); + + spin_unlock(&langwell->wq_lock); + + queue_work(langwell->qwork, &langwell->work); + + + return ret; +} + +static int __init langwell_otg_init(void) +{ + return pci_register_driver(&otg_pci_driver); +} +module_init(langwell_otg_init); + +static void __exit langwell_otg_cleanup(void) +{ + pci_unregister_driver(&otg_pci_driver); +} +module_exit(langwell_otg_cleanup); diff --git a/include/linux/usb/langwell_otg.h b/include/linux/usb/langwell_otg.h new file mode 100644 index 00000000000..e115ae6df1d --- /dev/null +++ b/include/linux/usb/langwell_otg.h @@ -0,0 +1,177 @@ +/* + * Intel Langwell USB OTG transceiver driver + * Copyright (C) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ * + */ + +#ifndef __LANGWELL_OTG_H__ +#define __LANGWELL_OTG_H__ + +/* notify transceiver driver about OTG events */ +extern void langwell_update_transceiver(void); +/* HCD register bus driver */ +extern int langwell_register_host(struct pci_driver *host_driver); +/* HCD unregister bus driver */ +extern void langwell_unregister_host(struct pci_driver *host_driver); +/* DCD register bus driver */ +extern int langwell_register_peripheral(struct pci_driver *client_driver); +/* DCD unregister bus driver */ +extern void langwell_unregister_peripheral(struct pci_driver *client_driver); +/* No silent failure, output warning message */ +extern void langwell_otg_nsf_msg(unsigned long message); + +#define CI_USBCMD 0x30 +# define USBCMD_RST BIT(1) +# define USBCMD_RS BIT(0) +#define CI_USBSTS 0x34 +# define USBSTS_SLI BIT(8) +# define USBSTS_URI BIT(6) +# define USBSTS_PCI BIT(2) +#define CI_PORTSC1 0x74 +# define PORTSC_PP BIT(12) +# define PORTSC_LS (BIT(11) | BIT(10)) +# define PORTSC_SUSP BIT(7) +# define PORTSC_CCS BIT(0) +#define CI_HOSTPC1 0xb4 +# define HOSTPC1_PHCD BIT(22) +#define CI_OTGSC 0xf4 +# define OTGSC_DPIE BIT(30) +# define OTGSC_1MSE BIT(29) +# define OTGSC_BSEIE BIT(28) +# define OTGSC_BSVIE BIT(27) +# define OTGSC_ASVIE BIT(26) +# define OTGSC_AVVIE BIT(25) +# define OTGSC_IDIE BIT(24) +# define OTGSC_DPIS BIT(22) +# define OTGSC_1MSS BIT(21) +# define OTGSC_BSEIS BIT(20) +# define OTGSC_BSVIS BIT(19) +# define OTGSC_ASVIS BIT(18) +# define OTGSC_AVVIS BIT(17) +# define OTGSC_IDIS BIT(16) +# define OTGSC_DPS BIT(14) +# define OTGSC_1MST BIT(13) +# define OTGSC_BSE BIT(12) +# define OTGSC_BSV BIT(11) +# define OTGSC_ASV BIT(10) +# define OTGSC_AVV BIT(9) +# define OTGSC_ID BIT(8) +# define OTGSC_HABA BIT(7) +# define OTGSC_HADP BIT(6) +# define OTGSC_IDPU BIT(5) +# define OTGSC_DP BIT(4) +# define OTGSC_OT BIT(3) +# define OTGSC_HAAR BIT(2) +# define OTGSC_VC BIT(1) +# define OTGSC_VD BIT(0) +# define OTGSC_INTEN_MASK (0x7f << 24) +# define OTGSC_INTSTS_MASK (0x7f << 16) +#define CI_USBMODE 0xf8 +# define USBMODE_CM (BIT(1) | BIT(0)) +# define USBMODE_IDLE 0 +# define USBMODE_DEVICE 0x2 +# define USBMODE_HOST 0x3 + +#define INTR_DUMMY_MASK (USBSTS_SLI | USBSTS_URI | USBSTS_PCI) + +struct otg_hsm { + /* Input */ + int a_bus_resume; + int a_bus_suspend; + int a_conn; + int a_sess_vld; + int a_srp_det; + int a_vbus_vld; + int b_bus_resume; + int b_bus_suspend; + int b_conn; + int b_se0_srp; + int b_sess_end; + int b_sess_vld; + int id; + + /* Internal variables */ + int a_set_b_hnp_en; + int b_srp_done; + int b_hnp_enable; + + /* Timeout indicator for timers */ + int a_wait_vrise_tmout; + int a_wait_bcon_tmout; + int a_aidl_bdis_tmout; + int b_ase0_brst_tmout; + int b_bus_suspend_tmout; + int b_srp_res_tmout; + + /* Informative variables */ + int a_bus_drop; + int a_bus_req; + int a_clr_err; + int a_suspend_req; + int b_bus_req; + + /* Output */ + int drv_vbus; + int loc_conn; + int loc_sof; + + /* Others */ + int b_bus_suspend_vld; +}; + +#define TA_WAIT_VRISE 100 +#define TA_WAIT_BCON 30000 +#define TA_AIDL_BDIS 15000 +#define TB_ASE0_BRST 5000 +#define TB_SE0_SRP 2 +#define TB_SRP_RES 100 +#define TB_BUS_SUSPEND 500 + +struct langwell_otg_timer { + unsigned long expires; /* Number of count increase to timeout */ + unsigned long count; /* Tick counter */ + void (*function)(unsigned long); /* Timeout function */ + unsigned long data; /* Data passed to function */ + struct list_head list; +}; + +struct langwell_otg { + struct otg_transceiver otg; + struct otg_hsm hsm; + void 
__iomem *regs; + unsigned region; + struct pci_driver *host_ops; + struct pci_driver *client_ops; + struct pci_dev *pdev; + struct work_struct work; + struct workqueue_struct *qwork; + spinlock_t lock; + spinlock_t wq_lock; +}; + +static inline struct langwell_otg *otg_to_langwell(struct otg_transceiver *otg) +{ + return container_of(otg, struct langwell_otg, otg); +} + +#ifdef DEBUG +#define otg_dbg(fmt, args...) \ + printk(KERN_DEBUG fmt , ## args) +#else +#define otg_dbg(fmt, args...) \ + do { } while (0) +#endif /* DEBUG */ +#endif /* __LANGWELL_OTG_H__ */ -- cgit v1.2.3-70-g09d2 From 66d4eadd8d067269ea8fead1a50fe87c2979a80d Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Mon, 27 Apr 2009 19:52:28 -0700 Subject: USB: xhci: BIOS handoff and HW initialization. Add PCI initialization code to take control of the xHCI host controller away from the BIOS, halt, and reset the host controller. The xHCI spec says that BIOSes must give up the host controller within 5 seconds. Add some host controller glue functions to handle hardware initialization and memory allocation for the host controller. The current xHCI prototypes use PCI interrupts, but the xHCI spec requires MSI-X interrupts. Add code to support MSI-X interrupts, but use the PCI interrupts for now. Signed-off-by: Sarah Sharp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.h | 1 + drivers/usb/host/pci-quirks.c | 123 ++++++++++++++ drivers/usb/host/xhci-hcd.c | 377 ++++++++++++++++++++++++++++++++++++++++++ drivers/usb/host/xhci-mem.c | 75 +++++++++ drivers/usb/host/xhci-pci.c | 150 +++++++++++++++++ drivers/usb/host/xhci.h | 63 ++++++- include/linux/pci_ids.h | 1 + 7 files changed, 788 insertions(+), 2 deletions(-) create mode 100644 drivers/usb/host/xhci-hcd.c create mode 100644 drivers/usb/host/xhci-mem.c create mode 100644 drivers/usb/host/xhci-pci.c (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h index 174170f14f0..7a47498b0aa 100644 --- a/drivers/usb/core/hcd.h +++ b/drivers/usb/core/hcd.h @@ -173,6 +173,7 @@ struct hc_driver { #define HCD_LOCAL_MEM 0x0002 /* HC needs local memory */ #define HCD_USB11 0x0010 /* USB 1.1 */ #define HCD_USB2 0x0020 /* USB 2.0 */ +#define HCD_USB3 0x0040 /* USB 3.0 */ /* called to init HCD and root hub */ int (*reset) (struct usb_hcd *hcd); diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 033c2846ce5..83b5f9cea85 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -15,6 +15,7 @@ #include #include #include "pci-quirks.h" +#include "xhci-ext-caps.h" #define UHCI_USBLEGSUP 0xc0 /* legacy support */ @@ -341,7 +342,127 @@ static void __devinit quirk_usb_disable_ehci(struct pci_dev *pdev) return; } +/* + * handshake - spin reading a register until handshake completes + * @ptr: address of hc register to be read + * @mask: bits to look at in result of read + * @done: value of those bits when handshake succeeds + * @wait_usec: timeout in microseconds + * @delay_usec: delay in microseconds to wait between polling + * + * Polls a register every delay_usec microseconds. + * Returns 0 when the mask bits have the value done. + * Returns -ETIMEDOUT if this condition is not true after + * wait_usec microseconds have passed. 
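+ * + * For example, the BIOS handoff below uses + * handshake(base + ext_cap_offset, XHCI_HC_BIOS_OWNED, 0, 5000000, 10) + * to poll the BIOS-owned semaphore every 10 microseconds for up to + * 5 seconds.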
+ */ +static int handshake(void __iomem *ptr, u32 mask, u32 done, + int wait_usec, int delay_usec) +{ + u32 result; + + do { + result = readl(ptr); + result &= mask; + if (result == done) + return 0; + udelay(delay_usec); + wait_usec -= delay_usec; + } while (wait_usec > 0); + return -ETIMEDOUT; +} + +/** + * PCI Quirks for xHCI. + * + * Takes care of the handoff between the Pre-OS (i.e. BIOS) and the OS. + * It signals to the BIOS that the OS wants control of the host controller, + * and then waits 5 seconds for the BIOS to hand over control. + * If we timeout, assume the BIOS is broken and take control anyway. + */ +static void __devinit quirk_usb_handoff_xhci(struct pci_dev *pdev) +{ + void __iomem *base; + int ext_cap_offset; + void __iomem *op_reg_base; + u32 val; + int timeout; + + if (!mmio_resource_enabled(pdev, 0)) + return; + + base = ioremap_nocache(pci_resource_start(pdev, 0), + pci_resource_len(pdev, 0)); + if (base == NULL) + return; + /* + * Find the Legacy Support Capability register - + * this is optional for xHCI host controllers. + */ + ext_cap_offset = xhci_find_next_cap_offset(base, XHCI_HCC_PARAMS_OFFSET); + do { + if (!ext_cap_offset) + /* We've reached the end of the extended capabilities */ + goto hc_init; + val = readl(base + ext_cap_offset); + if (XHCI_EXT_CAPS_ID(val) == XHCI_EXT_CAPS_LEGACY) + break; + ext_cap_offset = xhci_find_next_cap_offset(base, ext_cap_offset); + } while (1); + + /* If the BIOS owns the HC, signal that the OS wants it, and wait */ + if (val & XHCI_HC_BIOS_OWNED) { + writel(val | XHCI_HC_OS_OWNED, base + ext_cap_offset); + + /* Wait for 5 seconds with 10 microsecond polling interval */ + timeout = handshake(base + ext_cap_offset, XHCI_HC_BIOS_OWNED, + 0, 5000000, 10); + + /* Assume a buggy BIOS and take HC ownership anyway */ + if (timeout) { + dev_warn(&pdev->dev, "xHCI BIOS handoff failed" + " (BIOS bug?) %08x\n", val); + writel(val & ~XHCI_HC_BIOS_OWNED, base + ext_cap_offset); + } + } + + /* Disable any BIOS SMIs */ + writel(XHCI_LEGACY_DISABLE_SMI, + base + ext_cap_offset + XHCI_LEGACY_CONTROL_OFFSET); + +hc_init: + op_reg_base = base + XHCI_HC_LENGTH(readl(base)); + + /* Wait for the host controller to be ready before writing any + * operational or runtime registers. Wait 5 seconds and no more. + */ + timeout = handshake(op_reg_base + XHCI_STS_OFFSET, XHCI_STS_CNR, 0, + 5000000, 10); + /* Assume a buggy HC and start HC initialization anyway */ + if (timeout) { + val = readl(op_reg_base + XHCI_STS_OFFSET); + dev_warn(&pdev->dev, + "xHCI HW not ready after 5 sec (HC bug?) " + "status = 0x%x\n", val); + } + + /* Send the halt and disable interrupts command */ + val = readl(op_reg_base + XHCI_CMD_OFFSET); + val &= ~(XHCI_CMD_RUN | XHCI_IRQS); + writel(val, op_reg_base + XHCI_CMD_OFFSET); + + /* Wait for the HC to halt - poll every 125 usec (one microframe).
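+ * The xHCI spec requires the HC to halt within 16 microframes of the + * run/stop bit being cleared; XHCI_MAX_HALT_USEC (16 * 125 usec) encodes + * that bound.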
*/ + timeout = handshake(op_reg_base + XHCI_STS_OFFSET, XHCI_STS_HALT, 1, + XHCI_MAX_HALT_USEC, 125); + if (timeout) { + val = readl(op_reg_base + XHCI_STS_OFFSET); + dev_warn(&pdev->dev, + "xHCI HW did not halt within %d usec " + "status = 0x%x\n", XHCI_MAX_HALT_USEC, val); + } + + iounmap(base); +} static void __devinit quirk_usb_early_handoff(struct pci_dev *pdev) { @@ -351,5 +472,7 @@ static void __devinit quirk_usb_early_handoff(struct pci_dev *pdev) quirk_usb_handoff_ohci(pdev); else if (pdev->class == PCI_CLASS_SERIAL_USB_EHCI) quirk_usb_disable_ehci(pdev); + else if (pdev->class == PCI_CLASS_SERIAL_USB_XHCI) + quirk_usb_handoff_xhci(pdev); } DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, quirk_usb_early_handoff); diff --git a/drivers/usb/host/xhci-hcd.c b/drivers/usb/host/xhci-hcd.c new file mode 100644 index 00000000000..64fcc22e9d5 --- /dev/null +++ b/drivers/usb/host/xhci-hcd.c @@ -0,0 +1,377 @@ +/* + * xHCI host controller driver + * + * Copyright (C) 2008 Intel Corp. + * + * Author: Sarah Sharp + * Some code borrowed from the Linux EHCI driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include + +#include "xhci.h" + +#define DRIVER_AUTHOR "Sarah Sharp" +#define DRIVER_DESC "'eXtensible' Host Controller (xHC) Driver" + +/* TODO: copied from ehci-hcd.c - can this be refactored? */ +/* + * handshake - spin reading hc until handshake completes or fails + * @ptr: address of hc register to be read + * @mask: bits to look at in result of read + * @done: value of those bits when handshake succeeds + * @usec: timeout in microseconds + * + * Returns negative errno, or zero on success + * + * Success happens when the "mask" bits have the specified value (hardware + * handshake done). There are two failure modes: "usec" have passed (major + * hardware flakeout), or the register reads as all-ones (hardware removed). + */ +static int handshake(struct xhci_hcd *xhci, void __iomem *ptr, + u32 mask, u32 done, int usec) +{ + u32 result; + + do { + result = xhci_readl(xhci, ptr); + if (result == ~(u32)0) /* card removed */ + return -ENODEV; + result &= mask; + if (result == done) + return 0; + udelay(1); + usec--; + } while (usec > 0); + return -ETIMEDOUT; +} + +/* + * Force HC into halt state. + * + * Disable any IRQs and clear the run/stop bit. + * HC will complete any current and actively pipelined transactions, and + * should halt within 16 microframes of the run/stop bit being cleared. + * Read HC Halted bit in the status register to see when the HC is finished. + * XXX: shouldn't we set HC_STATE_HALT here somewhere? 
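+ * + * Note the mask handling below: the interrupt enable bits are always + * cleared, while CMD_RUN is cleared only when the HC is not already halted.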
+ */ +int xhci_halt(struct xhci_hcd *xhci) +{ + u32 halted; + u32 cmd; + u32 mask; + + xhci_dbg(xhci, "// Halt the HC\n"); + /* Disable all interrupts from the host controller */ + mask = ~(XHCI_IRQS); + halted = xhci_readl(xhci, &xhci->op_regs->status) & STS_HALT; + if (!halted) + mask &= ~CMD_RUN; + + cmd = xhci_readl(xhci, &xhci->op_regs->command); + cmd &= mask; + xhci_writel(xhci, cmd, &xhci->op_regs->command); + + return handshake(xhci, &xhci->op_regs->status, + STS_HALT, STS_HALT, XHCI_MAX_HALT_USEC); +} + +/* + * Reset a halted HC, and set the internal HC state to HC_STATE_HALT. + * + * This resets pipelines, timers, counters, state machines, etc. + * Transactions will be terminated immediately, and operational registers + * will be set to their defaults. + */ +int xhci_reset(struct xhci_hcd *xhci) +{ + u32 command; + u32 state; + + state = xhci_readl(xhci, &xhci->op_regs->status); + BUG_ON((state & STS_HALT) == 0); + + xhci_dbg(xhci, "// Reset the HC\n"); + command = xhci_readl(xhci, &xhci->op_regs->command); + command |= CMD_RESET; + xhci_writel(xhci, command, &xhci->op_regs->command); + /* XXX: Why does EHCI set this here? Shouldn't other code do this? */ + xhci_to_hcd(xhci)->state = HC_STATE_HALT; + + return handshake(xhci, &xhci->op_regs->command, CMD_RESET, 0, 250 * 1000); +} + +/* + * Stop the HC from processing the endpoint queues. + */ +static void xhci_quiesce(struct xhci_hcd *xhci) +{ + /* + * Queues are per endpoint, so we need to disable an endpoint or slot. + * + * To disable a slot, we need to insert a disable slot command on the + * command ring and ring the doorbell. This will also free any internal + * resources associated with the slot (which might not be what we want). + * + * A Release Endpoint command sounds better - doesn't free internal HC + * memory, but removes the endpoints from the schedule and releases the + * bandwidth, disables the doorbells, and clears the endpoint enable + * flag. Usually used prior to a set interface command. + * + * TODO: Implement after command ring code is done. + */ + BUG_ON(!HC_IS_RUNNING(xhci_to_hcd(xhci)->state)); + xhci_dbg(xhci, "Finished quiescing -- code not written yet\n"); +} + +#if 0 +/* Set up MSI-X table for entry 0 (may claim other entries later) */ +static int xhci_setup_msix(struct xhci_hcd *xhci) +{ + int ret; + struct pci_dev *pdev = to_pci_dev(xhci_to_hcd(xhci)->self.controller); + + xhci->msix_count = 0; + /* XXX: did I do this right? ixgbe does kcalloc for more than one */ + xhci->msix_entries = kmalloc(sizeof(struct msix_entry), GFP_KERNEL); + if (!xhci->msix_entries) { + xhci_err(xhci, "Failed to allocate MSI-X entries\n"); + return -ENOMEM; + } + xhci->msix_entries[0].entry = 0; + + ret = pci_enable_msix(pdev, xhci->msix_entries, xhci->msix_count); + if (ret) { + xhci_err(xhci, "Failed to enable MSI-X\n"); + goto free_entries; + } + + /* + * Pass the xhci pointer value as the request_irq "cookie". + * If more irqs are added, this will need to be unique for each one. + */ + ret = request_irq(xhci->msix_entries[0].vector, &xhci_irq, 0, + "xHCI", xhci_to_hcd(xhci)); + if (ret) { + xhci_err(xhci, "Failed to allocate MSI-X interrupt\n"); + goto disable_msix; + } + xhci_dbg(xhci, "Finished setting up MSI-X\n"); + return 0; + +disable_msix: + pci_disable_msix(pdev); +free_entries: + kfree(xhci->msix_entries); + xhci->msix_entries = NULL; + return ret; +} + +/* XXX: code duplication; can xhci_setup_msix call this? 
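+ * Teardown mirrors setup in reverse: free_irq() releases the vector before + * pci_disable_msix() tears down the MSI-X table, and the entries array is + * freed last.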
*/ +/* Free any IRQs and disable MSI-X */ +static void xhci_cleanup_msix(struct xhci_hcd *xhci) +{ + struct pci_dev *pdev = to_pci_dev(xhci_to_hcd(xhci)->self.controller); + if (!xhci->msix_entries) + return; + + free_irq(xhci->msix_entries[0].vector, xhci); + pci_disable_msix(pdev); + kfree(xhci->msix_entries); + xhci->msix_entries = NULL; + xhci_dbg(xhci, "Finished cleaning up MSI-X\n"); +} +#endif + +/* + * Initialize memory for HCD and xHC (one-time init). + * + * Program the PAGESIZE register, initialize the device context array, create + * device contexts (?), set up a command ring segment (or two?), create event + * ring (one for now). + */ +int xhci_init(struct usb_hcd *hcd) +{ + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + int retval = 0; + + xhci_dbg(xhci, "xhci_init\n"); + spin_lock_init(&xhci->lock); + retval = xhci_mem_init(xhci, GFP_KERNEL); + xhci_dbg(xhci, "Finished xhci_init\n"); + + return retval; +} + +/* + * Start the HC after it was halted. + * + * This function is called by the USB core when the HC driver is added. + * Its opposite is xhci_stop(). + * + * xhci_init() must be called once before this function can be called. + * Reset the HC, enable device slot contexts, program DCBAAP, and + * set command ring pointer and event ring pointer. + * + * Setup MSI-X vectors and enable interrupts. + */ +int xhci_run(struct usb_hcd *hcd) +{ + u32 temp; + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + xhci_dbg(xhci, "xhci_run\n"); + +#if 0 /* FIXME: MSI not setup yet */ + /* Do this at the very last minute */ + ret = xhci_setup_msix(xhci); + if (!ret) + return ret; + + return -ENOSYS; +#endif + xhci_dbg(xhci, "// Set the interrupt modulation register\n"); + temp = xhci_readl(xhci, &xhci->ir_set->irq_control); + temp &= 0xffff; + temp |= (u32) 160; + xhci_writel(xhci, temp, &xhci->ir_set->irq_control); + + /* Set the HCD state before we enable the irqs */ + hcd->state = HC_STATE_RUNNING; + temp = xhci_readl(xhci, &xhci->op_regs->command); + temp |= (CMD_EIE); + xhci_dbg(xhci, "// Enable interrupts, cmd = 0x%x.\n", + temp); + xhci_writel(xhci, temp, &xhci->op_regs->command); + + temp = xhci_readl(xhci, &xhci->ir_set->irq_pending); + xhci_dbg(xhci, "// Enabling event ring interrupter 0x%x" + " by writing 0x%x to irq_pending\n", + (unsigned int) xhci->ir_set, + (unsigned int) ER_IRQ_ENABLE(temp)); + xhci_writel(xhci, ER_IRQ_ENABLE(temp), + &xhci->ir_set->irq_pending); + xhci_print_ir_set(xhci, xhci->ir_set, 0); + + temp = xhci_readl(xhci, &xhci->op_regs->command); + temp |= (CMD_RUN); + xhci_dbg(xhci, "// Turn on HC, cmd = 0x%x.\n", + temp); + xhci_writel(xhci, temp, &xhci->op_regs->command); + /* Flush PCI posted writes */ + temp = xhci_readl(xhci, &xhci->op_regs->command); + xhci_dbg(xhci, "// @%x = 0x%x\n", + (unsigned int) &xhci->op_regs->command, temp); + + xhci_dbg(xhci, "Finished xhci_run\n"); + return 0; +} + +/* + * Stop xHCI driver. + * + * This function is called by the USB core when the HC driver is removed. + * Its opposite is xhci_run(). + * + * Disable device contexts, disable IRQs, and quiesce the HC. + * Reset the HC, finish any completed transactions, and cleanup memory. 
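+ * + * The quiesce/halt/reset sequence below runs under xhci->lock with + * interrupts disabled; the event ring interrupt is masked and memory is + * freed only after the lock has been dropped.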
+ */ +void xhci_stop(struct usb_hcd *hcd) +{ + u32 temp; + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + + spin_lock_irq(&xhci->lock); + if (HC_IS_RUNNING(hcd->state)) + xhci_quiesce(xhci); + xhci_halt(xhci); + xhci_reset(xhci); + spin_unlock_irq(&xhci->lock); + +#if 0 /* No MSI yet */ + xhci_cleanup_msix(xhci); +#endif + xhci_dbg(xhci, "// Disabling event ring interrupts\n"); + temp = xhci_readl(xhci, &xhci->op_regs->status); + xhci_writel(xhci, temp & ~STS_EINT, &xhci->op_regs->status); + temp = xhci_readl(xhci, &xhci->ir_set->irq_pending); + xhci_writel(xhci, ER_IRQ_DISABLE(temp), + &xhci->ir_set->irq_pending); + xhci_print_ir_set(xhci, xhci->ir_set, 0); + + xhci_dbg(xhci, "cleaning up memory\n"); + xhci_mem_cleanup(xhci); + xhci_dbg(xhci, "xhci_stop completed - status = %x\n", + xhci_readl(xhci, &xhci->op_regs->status)); +} + +/* + * Shutdown HC (not bus-specific) + * + * This is called when the machine is rebooting or halting. We assume that the + * machine will be powered off, and the HC's internal state will be reset. + * Don't bother to free memory. + */ +void xhci_shutdown(struct usb_hcd *hcd) +{ + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + + spin_lock_irq(&xhci->lock); + xhci_halt(xhci); + spin_unlock_irq(&xhci->lock); + +#if 0 + xhci_cleanup_msix(xhci); +#endif + + xhci_dbg(xhci, "xhci_shutdown completed - status = %x\n", + xhci_readl(xhci, &xhci->op_regs->status)); +} + +int xhci_get_frame(struct usb_hcd *hcd) +{ + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + /* EHCI mods by the periodic size. Why? */ + return xhci_readl(xhci, &xhci->run_regs->microframe_index) >> 3; +} + +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_LICENSE("GPL"); + +static int __init xhci_hcd_init(void) +{ +#ifdef CONFIG_PCI + int retval = 0; + + retval = xhci_register_pci(); + + if (retval < 0) { + printk(KERN_DEBUG "Problem registering PCI driver."); + return retval; + } +#endif + return 0; +} +module_init(xhci_hcd_init); + +static void __exit xhci_hcd_cleanup(void) +{ +#ifdef CONFIG_PCI + xhci_unregister_pci(); +#endif +} +module_exit(xhci_hcd_cleanup); diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c new file mode 100644 index 00000000000..0e383f9c380 --- /dev/null +++ b/drivers/usb/host/xhci-mem.c @@ -0,0 +1,75 @@ +/* + * xHCI host controller driver + * + * Copyright (C) 2008 Intel Corp. + * + * Author: Sarah Sharp + * Some code borrowed from the Linux EHCI driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include + +#include "xhci.h" + +void xhci_mem_cleanup(struct xhci_hcd *xhci) +{ + xhci->page_size = 0; + xhci->page_shift = 0; +} + +int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags) +{ + unsigned int val, val2; + u32 page_size; + int i; + + page_size = xhci_readl(xhci, &xhci->op_regs->page_size); + xhci_dbg(xhci, "Supported page size register = 0x%x\n", page_size); + for (i = 0; i < 16; i++) { + if ((0x1 & page_size) != 0) + break; + page_size = page_size >> 1; + } + if (i < 16) + xhci_dbg(xhci, "Supported page size of %iK\n", (1 << (i+12)) / 1024); + else + xhci_warn(xhci, "WARN: no supported page size\n"); + /* Use 4K pages, since that's common and the minimum the HC supports */ + xhci->page_shift = 12; + xhci->page_size = 1 << xhci->page_shift; + xhci_dbg(xhci, "HCD page size set to %iK\n", xhci->page_size / 1024); + + /* + * Program the Number of Device Slots Enabled field in the CONFIG + * register with the max value of slots the HC can handle. + */ + val = HCS_MAX_SLOTS(xhci_readl(xhci, &xhci->cap_regs->hcs_params1)); + xhci_dbg(xhci, "// xHC can handle at most %d device slots.\n", + (unsigned int) val); + val2 = xhci_readl(xhci, &xhci->op_regs->config_reg); + val |= (val2 & ~HCS_SLOTS_MASK); + xhci_dbg(xhci, "// Setting Max device slots reg = 0x%x.\n", + (unsigned int) val); + xhci_writel(xhci, val, &xhci->op_regs->config_reg); + + xhci->ir_set = &xhci->run_regs->ir_set[0]; + + return 0; +fail: + xhci_warn(xhci, "Couldn't initialize memory\n"); + xhci_mem_cleanup(xhci); + return -ENOMEM; +} diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c new file mode 100644 index 00000000000..4015082adf6 --- /dev/null +++ b/drivers/usb/host/xhci-pci.c @@ -0,0 +1,150 @@ +/* + * xHCI host controller driver PCI Bus Glue. + * + * Copyright (C) 2008 Intel Corp. + * + * Author: Sarah Sharp + * Some code borrowed from the Linux EHCI driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +#include "xhci.h" + +static const char hcd_name[] = "xhci_hcd"; + +/* called after powerup, by probe or system-pm "wakeup" */ +static int xhci_pci_reinit(struct xhci_hcd *xhci, struct pci_dev *pdev) +{ + /* + * TODO: Implement finding debug ports later. + * TODO: see if there are any quirks that need to be added to handle + * new extended capabilities. 
+ */ + + /* PCI Memory-Write-Invalidate cycle support is optional (uncommon) */ + if (!pci_set_mwi(pdev)) + xhci_dbg(xhci, "MWI active\n"); + + xhci_dbg(xhci, "Finished xhci_pci_reinit\n"); + return 0; +} + +/* called during probe() after chip reset completes */ +static int xhci_pci_setup(struct usb_hcd *hcd) +{ + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + struct pci_dev *pdev = to_pci_dev(hcd->self.controller); + int retval; + + xhci->cap_regs = hcd->regs; + xhci->op_regs = hcd->regs + + HC_LENGTH(xhci_readl(xhci, &xhci->cap_regs->hc_capbase)); + xhci->run_regs = hcd->regs + + (xhci_readl(xhci, &xhci->cap_regs->run_regs_off) & RTSOFF_MASK); + /* Cache read-only capability registers */ + xhci->hcs_params1 = xhci_readl(xhci, &xhci->cap_regs->hcs_params1); + xhci->hcs_params2 = xhci_readl(xhci, &xhci->cap_regs->hcs_params2); + xhci->hcs_params3 = xhci_readl(xhci, &xhci->cap_regs->hcs_params3); + xhci->hcc_params = xhci_readl(xhci, &xhci->cap_regs->hcc_params); + xhci_print_registers(xhci); + + /* Make sure the HC is halted. */ + retval = xhci_halt(xhci); + if (retval) + return retval; + + xhci_dbg(xhci, "Resetting HCD\n"); + /* Reset the internal HC memory state and registers. */ + retval = xhci_reset(xhci); + if (retval) + return retval; + xhci_dbg(xhci, "Reset complete\n"); + + xhci_dbg(xhci, "Calling HCD init\n"); + /* Initialize HCD and host controller data structures. */ + retval = xhci_init(hcd); + if (retval) + return retval; + xhci_dbg(xhci, "Called HCD init\n"); + + pci_read_config_byte(pdev, XHCI_SBRN_OFFSET, &xhci->sbrn); + xhci_dbg(xhci, "Got SBRN %u\n", (unsigned int) xhci->sbrn); + + /* Find any debug ports */ + return xhci_pci_reinit(xhci, pdev); +} + +static const struct hc_driver xhci_pci_hc_driver = { + .description = hcd_name, + .product_desc = "xHCI Host Controller", + .hcd_priv_size = sizeof(struct xhci_hcd), + + /* + * generic hardware linkage + */ + .flags = HCD_MEMORY | HCD_USB3, + + /* + * basic lifecycle operations + */ + .reset = xhci_pci_setup, + .start = xhci_run, + /* suspend and resume implemented later */ + .stop = xhci_stop, + .shutdown = xhci_shutdown, + + /* + * scheduling support + */ + .get_frame_number = xhci_get_frame, + + /* Implement root hub support later. */ +}; + +/*-------------------------------------------------------------------------*/ + +/* PCI driver selection metadata; PCI hotplugging uses this */ +static const struct pci_device_id pci_ids[] = { { + /* handle any USB 3.0 xHCI controller */ + PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_XHCI, ~0), + .driver_data = (unsigned long) &xhci_pci_hc_driver, + }, + { /* end: all zeroes */ } +}; +MODULE_DEVICE_TABLE(pci, pci_ids); + +/* pci driver glue; this is a "new style" PCI driver module */ +static struct pci_driver xhci_pci_driver = { + .name = (char *) hcd_name, + .id_table = pci_ids, + + .probe = usb_hcd_pci_probe, + .remove = usb_hcd_pci_remove, + /* suspend and resume implemented later */ + + .shutdown = usb_hcd_pci_shutdown, +}; + +int xhci_register_pci() +{ + return pci_register_driver(&xhci_pci_driver); +} + +void xhci_unregister_pci() +{ + pci_unregister_driver(&xhci_pci_driver); +} diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index a4d44aad069..59fae2e5ea5 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -32,6 +32,9 @@ /* xHCI PCI Configuration Registers */ #define XHCI_SBRN_OFFSET (0x60) +/* Max number of USB devices for any host controller - limit in section 6.1 */ +#define MAX_HC_SLOTS 256 + /* * xHCI register interface. 
* This corresponds to the eXtensible Host Controller Interface (xHCI) @@ -359,10 +362,35 @@ struct intr_reg { u32 erst_dequeue[2]; } __attribute__ ((packed)); +/* irq_pending bitmasks */ #define ER_IRQ_PENDING(p) ((p) & 0x1) -#define ER_IRQ_ENABLE(p) ((p) | 0x2) +/* bits 2:31 need to be preserved */ +#define ER_IRQ_CLEAR(p) ((p) & 0xfffffffe) +#define ER_IRQ_ENABLE(p) ((ER_IRQ_CLEAR(p)) | 0x2) +#define ER_IRQ_DISABLE(p) ((ER_IRQ_CLEAR(p)) & ~(0x2)) + +/* irq_control bitmasks */ +/* Minimum interval between interrupts (in 250ns intervals). The interval + * between interrupts will be longer if there are no events on the event ring. + * Default is 4000 (1 ms). + */ +#define ER_IRQ_INTERVAL_MASK (0xffff) +/* Counter used to count down the time to the next interrupt - HW use only */ +#define ER_IRQ_COUNTER_MASK (0xffff << 16) + +/* erst_size bitmasks */ /* Preserve bits 16:31 of erst_size */ -#define ERST_SIZE_MASK (0xffff<<16) +#define ERST_SIZE_MASK (0xffff << 16) + +/* erst_dequeue bitmasks */ +/* Dequeue ERST Segment Index (DESI) - Segment number (or alias) + * where the current dequeue pointer lies. This is an optional HW hint. + */ +#define ERST_DESI_MASK (0x7) +/* Event Handler Busy (EHB) - is the event ring scheduled to be serviced by + * a work queue (or delayed service routine)? + */ +#define ERST_EHB (1 << 3) /** * struct xhci_run_regs @@ -386,6 +414,8 @@ struct xhci_hcd { struct xhci_cap_regs __iomem *cap_regs; struct xhci_op_regs __iomem *op_regs; struct xhci_run_regs __iomem *run_regs; + /* Our HCD's current interrupter register set */ + struct intr_reg __iomem *ir_set; /* Cached register copies of read-only HC data */ __u32 hcs_params1; @@ -404,7 +434,13 @@ struct xhci_hcd { u8 isoc_threshold; int event_ring_max; int addr_64; + /* 4KB min, 128MB max */ int page_size; + /* Valid values are 12 to 20, inclusive */ + int page_shift; + /* only one MSI vector for now, but might need more later */ + int msix_count; + struct msix_entry *msix_entries; }; /* convert between an HCD pointer and the corresponding EHCI_HCD */ @@ -449,4 +485,27 @@ static inline void xhci_writel(const struct xhci_hcd *xhci, writel(val, regs); } +/* xHCI debugging */ +void xhci_print_ir_set(struct xhci_hcd *xhci, struct intr_reg *ir_set, int set_num); +void xhci_print_registers(struct xhci_hcd *xhci); + +/* xHCI memory management */ +void xhci_mem_cleanup(struct xhci_hcd *xhci); +int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags); + +#ifdef CONFIG_PCI +/* xHCI PCI glue */ +int xhci_register_pci(void); +void xhci_unregister_pci(void); +#endif + +/* xHCI host controller glue */ +int xhci_halt(struct xhci_hcd *xhci); +int xhci_reset(struct xhci_hcd *xhci); +int xhci_init(struct usb_hcd *hcd); +int xhci_run(struct usb_hcd *hcd); +void xhci_stop(struct usb_hcd *hcd); +void xhci_shutdown(struct usb_hcd *hcd); +int xhci_get_frame(struct usb_hcd *hcd); + #endif /* __LINUX_XHCI_HCD_H */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index aa01d38c997..a3b00036579 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -104,6 +104,7 @@ #define PCI_CLASS_SERIAL_USB_UHCI 0x0c0300 #define PCI_CLASS_SERIAL_USB_OHCI 0x0c0310 #define PCI_CLASS_SERIAL_USB_EHCI 0x0c0320 +#define PCI_CLASS_SERIAL_USB_XHCI 0x0c0330 #define PCI_CLASS_SERIAL_FIBER 0x0c04 #define PCI_CLASS_SERIAL_SMBUS 0x0c05 -- cgit v1.2.3-70-g09d2 From 6b403b020c1f42180b14d28d832da61167cff822 Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Mon, 27 Apr 2009 19:54:10 -0700 Subject: USB: Add SuperSpeed to the list of USB device speeds.
Modify the USB core to handle the new USB 3.0 speed, "SuperSpeed". This is 5.0 Gbps (wire speed). There are probably more places that check for speed that I've missed. SuperSpeed devices have a 512 byte endpoint 0 max packet size. This shows up as a bMaxPacketSize0 set to 0x09 (see table 9-8 of the USB 3.0 bus spec). xHCI spec says that the xHC can handle intervals up to 2^15 microframes. That might change when real silicon becomes available. Add FIXME note for SuperSpeed isochronous endpoints. They can transmit up to 16 packets in one "burst" before they wait for an acknowledgment of the packets. They can do up to 3 bursts per microframe (determined by the mult value in the endpoint companion descriptor). The xHCI driver doesn't have support for isoc yet, so fix this later. Signed-off-by: Sarah Sharp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 1 + drivers/usb/core/hcd.c | 16 ++++++++++++++-- drivers/usb/core/hcd.h | 1 + drivers/usb/core/hub.c | 11 +++++++++-- drivers/usb/core/urb.c | 6 ++++++ include/linux/usb/ch9.h | 1 + 6 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 568244c99bd..e9426acf568 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -92,6 +92,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, if (usb_endpoint_xfer_int(d)) { i = 1; switch (to_usb_device(ddev)->speed) { + case USB_SPEED_SUPER: case USB_SPEED_HIGH: /* Many device manufacturers are using full-speed * bInterval values in high-speed interrupt endpoint diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index c65d9560402..120bb56c4b8 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1870,8 +1870,20 @@ int usb_add_hcd(struct usb_hcd *hcd, retval = -ENOMEM; goto err_allocate_root_hub; } - rhdev->speed = (hcd->driver->flags & HCD_USB2) ? USB_SPEED_HIGH : - USB_SPEED_FULL; + + switch (hcd->driver->flags & HCD_MASK) { + case HCD_USB11: + rhdev->speed = USB_SPEED_FULL; + break; + case HCD_USB2: + rhdev->speed = USB_SPEED_HIGH; + break; + case HCD_USB3: + rhdev->speed = USB_SPEED_SUPER; + break; + default: + goto err_allocate_root_hub; + } hcd->self.root_hub = rhdev; /* wakeup flag init defaults to "everything works" for root hubs, diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h index 7a47498b0aa..4f6ee60d97c 100644 --- a/drivers/usb/core/hcd.h +++ b/drivers/usb/core/hcd.h @@ -174,6 +174,7 @@ struct hc_driver { #define HCD_USB11 0x0010 /* USB 1.1 */ #define HCD_USB2 0x0020 /* USB 2.0 */ #define HCD_USB3 0x0040 /* USB 3.0 */ +#define HCD_MASK 0x0070 /* called to init HCD and root hub */ int (*reset) (struct usb_hcd *hcd); diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index c7a1a1d4bcb..fa0208e0da6 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2471,6 +2471,7 @@ hub_port_init (struct usb_hub *hub, struct usb_device *udev, int port1, * reported as 0xff in the device descriptor). WUSB1.0[4.8.1]. 
*/ switch (udev->speed) { + case USB_SPEED_SUPER: case USB_SPEED_VARIABLE: /* fixed at 512 */ udev->ep0.desc.wMaxPacketSize = cpu_to_le16(512); break; @@ -2496,6 +2497,9 @@ hub_port_init (struct usb_hub *hub, struct usb_device *udev, int port1, case USB_SPEED_LOW: speed = "low"; break; case USB_SPEED_FULL: speed = "full"; break; case USB_SPEED_HIGH: speed = "high"; break; + case USB_SPEED_SUPER: + speed = "super"; + break; case USB_SPEED_VARIABLE: speed = "variable"; type = "Wireless "; @@ -2634,8 +2638,11 @@ hub_port_init (struct usb_hub *hub, struct usb_device *udev, int port1, if (retval) goto fail; - i = udev->descriptor.bMaxPacketSize0 == 0xff? /* wusb device? */ - 512 : udev->descriptor.bMaxPacketSize0; + if (udev->descriptor.bMaxPacketSize0 == 0xff || + udev->speed == USB_SPEED_SUPER) + i = 512; + else + i = udev->descriptor.bMaxPacketSize0; if (le16_to_cpu(udev->ep0.desc.wMaxPacketSize) != i) { if (udev->speed != USB_SPEED_FULL || !(i == 8 || i == 16 || i == 32 || i == 64)) { diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index 3376055f36e..02eb0ef7a4c 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -351,6 +351,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) if (xfertype == USB_ENDPOINT_XFER_ISOC) { int n, len; + /* FIXME SuperSpeed isoc endpoints have up to 16 bursts */ /* "high bandwidth" mode, 1-3 packets/uframe? */ if (dev->speed == USB_SPEED_HIGH) { int mult = 1 + ((max >> 11) & 0x03); @@ -426,6 +427,11 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) return -EINVAL; /* too big? */ switch (dev->speed) { + case USB_SPEED_SUPER: /* units are 125us */ + /* Handle up to 2^(16-1) microframes */ + if (urb->interval > (1 << 15)) + return -EINVAL; + max = 1 << 15; case USB_SPEED_HIGH: /* units are microframes */ /* NOTE usb handles 2^15 */ if (urb->interval > (1024 * 8)) diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h index b145119a90d..93bfe635234 100644 --- a/include/linux/usb/ch9.h +++ b/include/linux/usb/ch9.h @@ -752,6 +752,7 @@ enum usb_device_speed { USB_SPEED_LOW, USB_SPEED_FULL, /* usb 1.1 */ USB_SPEED_HIGH, /* usb 2.0 */ USB_SPEED_VARIABLE, /* wireless (usb 2.5) */ + USB_SPEED_SUPER, /* usb 3.0 */ }; enum usb_device_state { -- cgit v1.2.3-70-g09d2 From 7206b00164a1c3ca533e01db285955617e1019f8 Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Mon, 27 Apr 2009 19:54:49 -0700 Subject: USB: Add route string to struct usb_device. This patch adds a hex route string to each USB device. The route string is used by the USB 3.0 host controller to send packets through the device tree. USB 3.0 hubs use this string to route packets to the correct port. This is a fundamental bus change from USB 2.0, where all packets were broadcast across the bus. Devices (including hubs) under a root port receive the route string 0x0. Every four bits in the route string represent a port on a hub. This length works because USB 3.0 hubs are limited to 15 ports, and USB 2.0 hubs (with potentially more ports) will never see packets with a route string. A port number of 0 means the packet is destined for that hub. For example, a peripheral device might have a route string of 0x00097. This means the device is connected to port 9 of the hub at depth 1. The hub at depth 1 is connected to port 7 of a hub at depth 0. The hub at depth 0 is connected to a root port.
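
To make the encoding concrete, here is a small stand-alone sketch (illustration only, not part of the patch; route_for_port() and the sample topology are invented) that mirrors the computation usb_alloc_dev() performs and reproduces the 0x00097 example above:

        #include <stdio.h>

        /* Append port1 (1-15) below a parent hub whose tier is parent_level;
         * a hub plugged directly into a root port has level 1. */
        static unsigned int route_for_port(unsigned int parent_route,
                                           unsigned int parent_level,
                                           unsigned int port1)
        {
                return parent_route + (port1 << ((parent_level - 1) * 4));
        }

        int main(void)
        {
                unsigned int hub_depth0 = 0;    /* on a root port: route 0x0 */
                unsigned int hub_depth1 = route_for_port(hub_depth0, 1, 7);
                unsigned int device = route_for_port(hub_depth1, 2, 9);

                printf("0x%05x\n", device);     /* prints 0x00097 */
                return 0;
        }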
Signed-off-by: Sarah Sharp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/usb.c | 10 ++++++++-- include/linux/usb.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 020b58528d9..f026991d0bd 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -375,18 +375,24 @@ struct usb_device *usb_alloc_dev(struct usb_device *parent, */ if (unlikely(!parent)) { dev->devpath[0] = '0'; + dev->route = 0; dev->dev.parent = bus->controller; dev_set_name(&dev->dev, "usb%d", bus->busnum); root_hub = 1; } else { /* match any labeling on the hubs; it's one-based */ - if (parent->devpath[0] == '0') + if (parent->devpath[0] == '0') { snprintf(dev->devpath, sizeof dev->devpath, "%d", port1); - else + /* Root ports are not counted in route string */ + dev->route = 0; + } else { snprintf(dev->devpath, sizeof dev->devpath, "%s.%d", parent->devpath, port1); + dev->route = parent->route + + (port1 << ((parent->level - 1)*4)); + } dev->dev.parent = &parent->dev; dev_set_name(&dev->dev, "%d-%s", bus->busnum, dev->devpath); diff --git a/include/linux/usb.h b/include/linux/usb.h index fb1f2a32ae0..2b380a16c62 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -362,6 +362,7 @@ struct usb_tt; * struct usb_device - kernel's representation of a USB device * @devnum: device number; address on a USB bus * @devpath: device ID string for use in messages (e.g., /port/...) + * @route: tree topology hex string for use with xHCI * @state: device state: configured, not attached, etc. * @speed: device speed: high/full/low (or error) * @tt: Transaction Translator info; used with low/full speed dev, highspeed hub @@ -427,6 +428,7 @@ struct usb_tt; struct usb_device { int devnum; char devpath [16]; + u32 route; enum usb_device_state state; enum usb_device_speed speed; -- cgit v1.2.3-70-g09d2 From c6515272b858742962c1de0f3bf497a048b9abd7 Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Mon, 27 Apr 2009 19:57:26 -0700 Subject: USB: Support for addressing a USB device under xHCI Add host controller driver API and a slot_id variable to struct usb_device. This allows the xHCI host controller driver to ask the hardware to allocate a slot for the device when a struct usb_device is allocated. The slot needs to be allocated at that point because the hardware can run out of internal resources, and we want to know that very early in the device connection process. Don't call this new API for root hubs, since they aren't real devices. Add HCD API to let the host controller choose the device address. This is especially important for xHCI hardware running in a virtualized environment. The guests running under the VM don't need to know which addresses on the bus are taken, because the hardware picks the address for them. Announce SuperSpeed USB devices after the address has been assigned by the hardware. Don't use the new get descriptor/set address scheme with xHCI. Unless special handling is done in the host controller driver, the xHC can't issue control transfers before you set the device address. Support for the older addressing scheme will be added when the xHCI driver supports the Block Set Address Request (BSR) flag in the Address Device command. 
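
To sketch how a host controller driver plugs into this API (a rough illustration with invented xyz_* names; the real xHCI callbacks are added in later patches), the new hc_driver hooks would be wired up roughly like this:

        /* Hypothetical glue; the xyz_hw_*() helpers stand in for real
         * command-ring code and do not exist in the tree. */
        static int xyz_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev)
        {
                /* Grab a device slot while the connect is still young;
                 * returning zero here makes usb_alloc_dev() fail cleanly. */
                udev->slot_id = xyz_hw_enable_slot(hcd);
                return udev->slot_id > 0;
        }

        static void xyz_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
        {
                xyz_hw_disable_slot(hcd, udev->slot_id);
        }

        /* Returns 0 on success; the hardware, not the core, picks the address. */
        static int xyz_address_device(struct usb_hcd *hcd, struct usb_device *udev)
        {
                return xyz_hw_address_device(hcd, udev->slot_id);
        }

        static const struct hc_driver xyz_hc_driver = {
                /* ... the usual lifecycle operations ... */
                .alloc_dev      = xyz_alloc_dev,
                .free_dev       = xyz_free_dev,
                .address_device = xyz_address_device,
        };

With hooks like these in place, hub_set_address() calls ->address_device() instead of sending SET_ADDRESS itself, and usb_alloc_dev()/usb_release_dev() bracket the slot's lifetime for everything except root hubs.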
Signed-off-by: Sarah Sharp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.h | 8 ++++++ drivers/usb/core/hub.c | 74 +++++++++++++++++++++++++++++++++++++------------- drivers/usb/core/usb.c | 14 +++++++++- include/linux/usb.h | 2 ++ 4 files changed, 78 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h index 4f6ee60d97c..ae6d9db41ca 100644 --- a/drivers/usb/core/hcd.h +++ b/drivers/usb/core/hcd.h @@ -226,6 +226,14 @@ struct hc_driver { void (*relinquish_port)(struct usb_hcd *, int); /* has a port been handed over to a companion? */ int (*port_handed_over)(struct usb_hcd *, int); + + /* xHCI specific functions */ + /* Called by usb_alloc_dev to alloc HC device structures */ + int (*alloc_dev)(struct usb_hcd *, struct usb_device *); + /* Called by usb_release_dev to free HC device structures */ + void (*free_dev)(struct usb_hcd *, struct usb_device *); + /* Returns the hardware-chosen device address */ + int (*address_device)(struct usb_hcd *, struct usb_device *udev); }; extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb); diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 3c28bde6cbd..2af3b4f0605 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1328,6 +1328,11 @@ EXPORT_SYMBOL_GPL(usb_set_device_state); * 0 is reserved by USB for default address; (b) Linux's USB stack * uses always #1 for the root hub of the controller. So USB stack's * port #1, which is wusb virtual-port #0 has address #2. + * + * Devices connected under xHCI are not as simple. The host controller + * supports virtualization, so the hardware assigns device addresses and + * the HCD must set up data structures before issuing a set address + * command to the hardware. */ static void choose_address(struct usb_device *udev) { @@ -1647,6 +1652,9 @@ int usb_new_device(struct usb_device *udev) err = usb_configure_device(udev); /* detect & probe dev/intfs */ if (err < 0) goto fail; + dev_dbg(&udev->dev, "udev %d, busnum %d, minor = %d\n", + udev->devnum, udev->bus->busnum, + (((udev->bus->busnum-1) * 128) + (udev->devnum-1))); /* export the usbdev device-node for libusb */ udev->dev.devt = MKDEV(USB_DEVICE_MAJOR, (((udev->bus->busnum-1) * 128) + (udev->devnum-1))); @@ -2400,19 +2408,29 @@ EXPORT_SYMBOL_GPL(usb_ep0_reinit); static int hub_set_address(struct usb_device *udev, int devnum) { int retval; + struct usb_hcd *hcd = bus_to_hcd(udev->bus); - if (devnum <= 1) + /* + * The host controller will choose the device address, + * instead of the core having chosen it earlier + */ + if (!hcd->driver->address_device && devnum <= 1) return -EINVAL; if (udev->state == USB_STATE_ADDRESS) return 0; if (udev->state != USB_STATE_DEFAULT) return -EINVAL; - retval = usb_control_msg(udev, usb_sndaddr0pipe(), - USB_REQ_SET_ADDRESS, 0, devnum, 0, - NULL, 0, USB_CTRL_SET_TIMEOUT); + if (hcd->driver->address_device) { + retval = hcd->driver->address_device(hcd, udev); + } else { + retval = usb_control_msg(udev, usb_sndaddr0pipe(), + USB_REQ_SET_ADDRESS, 0, devnum, 0, + NULL, 0, USB_CTRL_SET_TIMEOUT); + if (retval == 0) + update_address(udev, devnum); + } if (retval == 0) { /* Device now using proper address.
*/ - update_address(udev, devnum); usb_set_device_state(udev, USB_STATE_ADDRESS); usb_ep0_reinit(udev); } @@ -2525,10 +2543,11 @@ hub_port_init (struct usb_hub *hub, struct usb_device *udev, int port1, break; default: speed = "?"; break; } - dev_info (&udev->dev, - "%s %s speed %sUSB device using %s and address %d\n", - (udev->config) ? "reset" : "new", speed, type, - udev->bus->controller->driver->name, devnum); + if (udev->speed != USB_SPEED_SUPER) + dev_info(&udev->dev, + "%s %s speed %sUSB device using %s and address %d\n", + (udev->config) ? "reset" : "new", speed, type, + udev->bus->controller->driver->name, devnum); /* Set up TT records, if needed */ if (hdev->tt) { @@ -2553,7 +2572,11 @@ hub_port_init (struct usb_hub *hub, struct usb_device *udev, int port1, * value. */ for (i = 0; i < GET_DESCRIPTOR_TRIES; (++i, msleep(100))) { - if (USE_NEW_SCHEME(retry_counter)) { + /* + * An xHCI controller cannot send any packets to a device until + * a set address command successfully completes. + */ + if (USE_NEW_SCHEME(retry_counter) && !(hcd->driver->flags & HCD_USB3)) { struct usb_device_descriptor *buf; int r = 0; @@ -2619,7 +2642,7 @@ hub_port_init (struct usb_hub *hub, struct usb_device *udev, int port1, * unauthorized address in the Connect Ack sequence; * authorization will assign the final address. */ - if (udev->wusb == 0) { + if (udev->wusb == 0) { for (j = 0; j < SET_ADDRESS_TRIES; ++j) { retval = hub_set_address(udev, devnum); if (retval >= 0) @@ -2632,13 +2655,20 @@ hub_port_init (struct usb_hub *hub, struct usb_device *udev, int port1, devnum, retval); goto fail; } + if (udev->speed == USB_SPEED_SUPER) { + devnum = udev->devnum; + dev_info(&udev->dev, + "%s SuperSpeed USB device using %s and address %d\n", + (udev->config) ? "reset" : "new", + udev->bus->controller->driver->name, devnum); + } /* cope with hardware quirkiness: * - let SET_ADDRESS settle, some device hardware wants it * - read ep0 maxpacket even for high and low speed, */ msleep(10); - if (USE_NEW_SCHEME(retry_counter)) + if (USE_NEW_SCHEME(retry_counter) && !(hcd->driver->flags & HCD_USB3)) break; } @@ -2877,13 +2907,6 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1, udev->level = hdev->level + 1; udev->wusb = hub_is_wusb(hub); - /* set the address */ - choose_address(udev); - if (udev->devnum <= 0) { - status = -ENOTCONN; /* Don't retry */ - goto loop; - } - /* * USB 3.0 devices are reset automatically before the connect * port status change appears, and the root hub port status @@ -2901,6 +2924,19 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1, else udev->speed = USB_SPEED_UNKNOWN; + /* + * xHCI needs to issue an address device command later + * in the hub_port_init sequence for SS/HS/FS/LS devices. 
+ */ + if (!(hcd->driver->flags & HCD_USB3)) { + /* set the address */ + choose_address(udev); + if (udev->devnum <= 0) { + status = -ENOTCONN; /* Don't retry */ + goto loop; + } + } + /* reset (non-USB 3.0 devices) and get descriptor */ status = hub_port_init(hub, udev, port1, i); if (status < 0) diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index f026991d0bd..55b8d3a22d2 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -184,11 +184,16 @@ EXPORT_SYMBOL_GPL(usb_find_interface); static void usb_release_dev(struct device *dev) { struct usb_device *udev; + struct usb_hcd *hcd; udev = to_usb_device(dev); + hcd = bus_to_hcd(udev->bus); usb_destroy_configuration(udev); - usb_put_hcd(bus_to_hcd(udev->bus)); + /* Root hubs aren't real devices, so don't free HCD resources */ + if (hcd->driver->free_dev && udev->parent) + hcd->driver->free_dev(hcd, udev); + usb_put_hcd(hcd); kfree(udev->product); kfree(udev->manufacturer); kfree(udev->serial); @@ -348,6 +353,13 @@ struct usb_device *usb_alloc_dev(struct usb_device *parent, kfree(dev); return NULL; } + /* Root hubs aren't true devices, so don't allocate HCD resources */ + if (usb_hcd->driver->alloc_dev && parent && + !usb_hcd->driver->alloc_dev(usb_hcd, dev)) { + usb_put_hcd(bus_to_hcd(bus)); + kfree(dev); + return NULL; + } device_initialize(&dev->dev); dev->dev.bus = &usb_bus_type; diff --git a/include/linux/usb.h b/include/linux/usb.h index 2b380a16c62..475cb75cc37 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -420,6 +420,7 @@ struct usb_tt; * @skip_sys_resume: skip the next system resume * @wusb_dev: if this is a Wireless USB device, link to the WUSB * specific data for the device. + * @slot_id: Slot ID assigned by xHCI * * Notes: * Usbcore drivers should not set usbdev->state directly. Instead use @@ -504,6 +505,7 @@ struct usb_device { unsigned skip_sys_resume:1; #endif struct wusb_dev *wusb_dev; + int slot_id; }; #define to_usb_device(d) container_of(d, struct usb_device, dev) -- cgit v1.2.3-70-g09d2 From 6d65b78a093552fb42448480d4c66bf093a6d4cf Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Mon, 27 Apr 2009 19:57:50 -0700 Subject: USB: Support for submitting control URBs under xHCI. Warn users of URB_NO_SETUP_DMA_MAP about xHCI behavior. Device drivers can choose to DMA map the setup packet of a control transfer before submitting the URB to the USB core. Drivers then set the URB_NO_SETUP_DMA_MAP and pass in the DMA memory address in setup_dma, instead of providing a kernel address for setup_packet. However, xHCI requires that the setup packet be copied into an internal data structure, and we need a kernel memory address pointer for that. Warn users of URB_NO_SETUP_DMA_MAP that they should provide a valid pointer for setup_packet, along with the DMA address. FIXME: I'm not entirely sure how to work around this in the xHCI driver or USB core. Signed-off-by: Sarah Sharp Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 475cb75cc37..112a2d6e922 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1044,7 +1044,9 @@ typedef void (*usb_complete_t)(struct urb *); * @setup_dma: For control transfers with URB_NO_SETUP_DMA_MAP set, the * device driver has provided this DMA address for the setup packet. * The host controller driver should use this in preference to - * setup_packet. 
+ * setup_packet, but the HCD may choose to ignore the address if it must + * copy the setup packet into internal structures. Therefore, setup_packet + * must always point to a valid buffer. + * @start_frame: Returns the initial frame for isochronous transfers. + * @number_of_packets: Lists the number of ISO transfer buffers. + * @interval: Specifies the polling interval for interrupt or isochronous -- cgit v1.2.3-70-g09d2 From 663c30d0829d556efabd5fbd98fb8473da7fe694 Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Mon, 27 Apr 2009 19:58:14 -0700 Subject: USB: Parse and store the SuperSpeed endpoint companion descriptors. The USB 3.0 bus specification added an "Endpoint Companion" descriptor that is supposed to follow all SuperSpeed Endpoint descriptors. This descriptor is used to extend the bus protocol to allow more packets to be sent to an endpoint per "microframe". The word microframe was removed from the USB 3.0 specification because the host controller does not send Start Of Frame (SOF) symbols down the USB 3.0 wires. The descriptor defines a bMaxBurst field, which indicates the number of packets of wMaxPacketSize that a SuperSpeed device can send or receive in a service interval. All non-control endpoints may set this value as high as 16 packets (bMaxBurst = 15). The descriptor also allows isochronous endpoints to further specify that they can send and receive multiple bursts per service interval. The bmAttributes allows them to specify a "Mult" of up to 3 (bmAttributes = 2). Bulk endpoints use bmAttributes to report the number of "Streams" they support. This was an extension of the endpoint pipe concept to allow multiple mass storage device commands to be outstanding for one bulk endpoint at a time. This should allow USB 3.0 mass storage devices to support SCSI command queueing. Bulk endpoints can say they support up to 2^16 (65,536) streams. The information in the endpoint companion descriptor must be stored with the other device, config, interface, and endpoint descriptors because the host controller needs to access them quickly, and we need to install some default values if a SuperSpeed device doesn't provide an endpoint companion descriptor. Signed-off-by: Sarah Sharp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 189 +++++++++++++++++++++++++++++++++++++++++++--- include/linux/usb.h | 16 ++++ include/linux/usb/ch9.h | 16 ++++ 3 files changed, 212 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index e9426acf568..7103758bb48 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -19,6 +19,32 @@ static inline const char *plural(int n) return (n == 1 ?
"" : "s"); } +/* FIXME: this is a kludge */ +static int find_next_descriptor_more(unsigned char *buffer, int size, + int dt1, int dt2, int dt3, int *num_skipped) +{ + struct usb_descriptor_header *h; + int n = 0; + unsigned char *buffer0 = buffer; + + /* Find the next descriptor of type dt1 or dt2 or dt3 */ + while (size > 0) { + h = (struct usb_descriptor_header *) buffer; + if (h->bDescriptorType == dt1 || h->bDescriptorType == dt2 || + h->bDescriptorType == dt3) + break; + buffer += h->bLength; + size -= h->bLength; + ++n; + } + + /* Store the number of descriptors skipped and return the + * number of bytes skipped */ + if (num_skipped) + *num_skipped = n; + return buffer - buffer0; +} + static int find_next_descriptor(unsigned char *buffer, int size, int dt1, int dt2, int *num_skipped) { @@ -43,6 +69,128 @@ static int find_next_descriptor(unsigned char *buffer, int size, return buffer - buffer0; } +static int usb_parse_endpoint_companion(struct device *ddev, int cfgno, + int inum, int asnum, struct usb_host_endpoint *ep, + int num_ep, unsigned char *buffer, int size) +{ + unsigned char *buffer_start = buffer; + struct usb_ep_comp_descriptor *desc; + int retval; + int num_skipped; + int max_tx; + int i; + + /* Allocate space for the companion descriptor */ + ep->ep_comp = kzalloc(sizeof(struct usb_host_ep_comp), GFP_KERNEL); + if (!ep->ep_comp) + return -ENOMEM; + desc = (struct usb_ep_comp_descriptor *) buffer; + if (desc->bDescriptorType != USB_DT_SS_ENDPOINT_COMP) { + dev_warn(ddev, "No SuperSpeed endpoint companion for config %d " + " interface %d altsetting %d ep %d: " + "using minimum values\n", + cfgno, inum, asnum, ep->desc.bEndpointAddress); + ep->ep_comp->desc.bLength = USB_DT_EP_COMP_SIZE; + ep->ep_comp->desc.bDescriptorType = USB_DT_SS_ENDPOINT_COMP; + ep->ep_comp->desc.bMaxBurst = 0; + /* + * Leave bmAttributes as zero, which will mean no streams for + * bulk, and isoc won't support multiple bursts of packets. + * With bursts of only one packet, and a Mult of 1, the max + * amount of data moved per endpoint service interval is one + * packet. + */ + if (usb_endpoint_xfer_isoc(&ep->desc) || + usb_endpoint_xfer_int(&ep->desc)) + ep->ep_comp->desc.wBytesPerInterval = + ep->desc.wMaxPacketSize; + /* + * The next descriptor is for an Endpoint or Interface, + * no extra descriptors to copy into the companion structure, + * and we didn't eat up any of the buffer. 
+ */ + retval = 0; + goto valid; + } + memcpy(&ep->ep_comp->desc, desc, USB_DT_EP_COMP_SIZE); + desc = &ep->ep_comp->desc; + buffer += desc->bLength; + size -= desc->bLength; + + /* Eat up the other descriptors we don't care about */ + ep->ep_comp->extra = buffer; + i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, + USB_DT_INTERFACE, &num_skipped); + ep->ep_comp->extralen = i; + buffer += i; + size -= i; + retval = buffer - buffer_start + i; + if (num_skipped > 0) + dev_dbg(ddev, "skipped %d descriptor%s after %s\n", + num_skipped, plural(num_skipped), + "SuperSpeed endpoint companion"); + + /* Check the various values */ + if (usb_endpoint_xfer_control(&ep->desc) && desc->bMaxBurst != 0) { + dev_warn(ddev, "Control endpoint with bMaxBurst = %d in " + "config %d interface %d altsetting %d ep %d: " + "setting to zero\n", desc->bMaxBurst, + cfgno, inum, asnum, ep->desc.bEndpointAddress); + desc->bMaxBurst = 0; + } + if (desc->bMaxBurst > 15) { + dev_warn(ddev, "Endpoint with bMaxBurst = %d in " + "config %d interface %d altsetting %d ep %d: " + "setting to 15\n", desc->bMaxBurst, + cfgno, inum, asnum, ep->desc.bEndpointAddress); + desc->bMaxBurst = 15; + } + if ((usb_endpoint_xfer_control(&ep->desc) || usb_endpoint_xfer_int(&ep->desc)) + && desc->bmAttributes != 0) { + dev_warn(ddev, "%s endpoint with bmAttributes = %d in " + "config %d interface %d altsetting %d ep %d: " + "setting to zero\n", + usb_endpoint_xfer_control(&ep->desc) ? "Control" : "Bulk", + desc->bmAttributes, + cfgno, inum, asnum, ep->desc.bEndpointAddress); + desc->bmAttributes = 0; + } + if (usb_endpoint_xfer_bulk(&ep->desc) && desc->bmAttributes > 16) { + dev_warn(ddev, "Bulk endpoint with more than 65536 streams in " + "config %d interface %d altsetting %d ep %d: " + "setting to max\n", + cfgno, inum, asnum, ep->desc.bEndpointAddress); + desc->bmAttributes = 16; + } + if (usb_endpoint_xfer_isoc(&ep->desc) && desc->bmAttributes > 2) { + dev_warn(ddev, "Isoc endpoint has Mult of %d in " + "config %d interface %d altsetting %d ep %d: " + "setting to 3\n", desc->bmAttributes + 1, + cfgno, inum, asnum, ep->desc.bEndpointAddress); + desc->bmAttributes = 2; + } + if (usb_endpoint_xfer_isoc(&ep->desc)) { + max_tx = ep->desc.wMaxPacketSize * (desc->bMaxBurst + 1) * + (desc->bmAttributes + 1); + } else if (usb_endpoint_xfer_int(&ep->desc)) { + max_tx = ep->desc.wMaxPacketSize * (desc->bMaxBurst + 1); + } else { + goto valid; + } + if (desc->wBytesPerInterval > max_tx) { + dev_warn(ddev, "%s endpoint with wBytesPerInterval of %d in " + "config %d interface %d altsetting %d ep %d: " + "setting to %d\n", + usb_endpoint_xfer_isoc(&ep->desc) ? 
"Isoc" : "Int", + desc->wBytesPerInterval, + cfgno, inum, asnum, ep->desc.bEndpointAddress, + max_tx); + desc->wBytesPerInterval = max_tx; + } +valid: + return retval; +} + static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, int asnum, struct usb_host_interface *ifp, int num_ep, unsigned char *buffer, int size) @@ -50,7 +198,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, unsigned char *buffer0 = buffer; struct usb_endpoint_descriptor *d; struct usb_host_endpoint *endpoint; - int n, i, j; + int n, i, j, retval; d = (struct usb_endpoint_descriptor *) buffer; buffer += d->bLength; @@ -162,17 +310,38 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, cfgno, inum, asnum, d->bEndpointAddress, maxp); } - - /* Skip over any Class Specific or Vendor Specific descriptors; - * find the next endpoint or interface descriptor */ - endpoint->extra = buffer; - i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, - USB_DT_INTERFACE, &n); - endpoint->extralen = i; + /* Allocate room for and parse any endpoint companion descriptors */ + if (to_usb_device(ddev)->speed == USB_SPEED_SUPER) { + endpoint->extra = buffer; + i = find_next_descriptor_more(buffer, size, USB_DT_SS_ENDPOINT_COMP, + USB_DT_ENDPOINT, USB_DT_INTERFACE, &n); + endpoint->extralen = i; + buffer += i; + size -= i; + + if (size > 0) { + retval = usb_parse_endpoint_companion(ddev, cfgno, inum, asnum, + endpoint, num_ep, buffer, size); + if (retval >= 0) { + buffer += retval; + retval = buffer - buffer0; + } + } else { + retval = buffer - buffer0; + } + } else { + /* Skip over any Class Specific or Vendor Specific descriptors; + * find the next endpoint or interface descriptor */ + endpoint->extra = buffer; + i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, + USB_DT_INTERFACE, &n); + endpoint->extralen = i; + retval = buffer - buffer0 + i; + } if (n > 0) dev_dbg(ddev, "skipped %d descriptor%s after %s\n", n, plural(n), "endpoint"); - return buffer - buffer0 + i; + return retval; skip_to_next_endpoint_or_interface_descriptor: i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, @@ -453,6 +622,8 @@ static int usb_parse_configuration(struct device *ddev, int cfgidx, kref_init(&intfc->ref); } + /* FIXME: parse the BOS descriptor */ + /* Skip over any Class Specific or Vendor Specific descriptors; * find the first interface descriptor */ config->extra = buffer; diff --git a/include/linux/usb.h b/include/linux/usb.h index 112a2d6e922..13bced521b8 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -36,6 +36,7 @@ struct wusb_dev; * - configs have one (often) or more interfaces; * - interfaces have one (usually) or more settings; * - each interface setting has zero or (usually) more endpoints. + * - a SuperSpeed endpoint has a companion descriptor * * And there might be other descriptors mixed in with those. 
* @@ -44,6 +45,19 @@ struct wusb_dev; struct ep_device; +/* For SS devices */ +/** + * struct usb_host_ep_comp - Valid for SuperSpeed devices only + * @desc: endpoint companion descriptor, wMaxPacketSize in native byteorder + * @extra: descriptors following this endpoint companion descriptor + * @extralen: how many bytes of "extra" are valid + */ +struct usb_host_ep_comp { + struct usb_ep_comp_descriptor desc; + unsigned char *extra; /* Extra descriptors */ + int extralen; +}; + /** * struct usb_host_endpoint - host-side endpoint descriptor and queue * @desc: descriptor for this endpoint, wMaxPacketSize in native byteorder @@ -51,6 +65,7 @@ struct ep_device; * @hcpriv: for use by HCD; typically holds hardware dma queue head (QH) * with one or more transfer descriptors (TDs) per urb * @ep_dev: ep_device for sysfs info + * @ep_comp: companion descriptor information for this endpoint * @extra: descriptors following this endpoint in the configuration * @extralen: how many bytes of "extra" are valid * @enabled: URBs may be submitted to this endpoint @@ -63,6 +78,7 @@ struct usb_host_endpoint { struct list_head urb_list; void *hcpriv; struct ep_device *ep_dev; /* For sysfs info */ + struct usb_host_ep_comp *ep_comp; /* For SS devices */ unsigned char *extra; /* Extra descriptors */ int extralen; diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h index 93bfe635234..9e9c5c0a3d7 100644 --- a/include/linux/usb/ch9.h +++ b/include/linux/usb/ch9.h @@ -191,6 +191,8 @@ struct usb_ctrlrequest { #define USB_DT_WIRE_ADAPTER 0x21 #define USB_DT_RPIPE 0x22 #define USB_DT_CS_RADIO_CONTROL 0x23 +/* From the USB 3.0 spec */ +#define USB_DT_SS_ENDPOINT_COMP 0x30 /* Conventional codes for class-specific descriptors. The convention is * defined in the USB "Common Class" Spec (3.11). Individual class specs @@ -535,6 +537,20 @@ static inline int usb_endpoint_is_isoc_out( /*-------------------------------------------------------------------------*/ +/* USB_DT_SS_ENDPOINT_COMP: SuperSpeed Endpoint Companion descriptor */ +struct usb_ep_comp_descriptor { + __u8 bLength; + __u8 bDescriptorType; + + __u8 bMaxBurst; + __u8 bmAttributes; + __u16 wBytesPerInterval; +} __attribute__ ((packed)); + +#define USB_DT_EP_COMP_SIZE 6 + +/*-------------------------------------------------------------------------*/ + /* USB_DT_DEVICE_QUALIFIER: Device Qualifier descriptor */ struct usb_qualifier_descriptor { __u8 bLength; -- cgit v1.2.3-70-g09d2 From e04748e3a87271fcf30d383e3780c5d3ee1c1618 Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Mon, 27 Apr 2009 19:59:01 -0700 Subject: USB: Push scatter gather lists down to host controller drivers. This is the original patch I created before David Vrabel posted a better patch (http://marc.info/?l=linux-usb&m=123377477209109&w=2) that does basically the same thing. This patch will get replaced with his (modified) patch later. Allow USB device drivers that use usb_sg_init() and usb_sg_wait() to push bulk endpoint scatter gather lists down to the host controller drivers. This allows host controller drivers to more efficiently enqueue these transfers, and allows the xHCI host controller to better take advantage of USB 3.0 "bursts" for bulk endpoints. This patch currently only enables scatter gather lists for bulk endpoints. Other endpoint types that use the usb_sg_* functions will not have their scatter gather lists pushed down to the host controller. For periodic endpoints, we want each scatterlist entry to be a separate transfer. 
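
For orientation, a minimal driver-side use of the API this affects looks like the sketch below (illustrative only: the endpoint number is invented, error handling is trimmed, and udev/sg/nents are assumed to be prepared by the caller):

        static int xfer_bulk_sg(struct usb_device *udev, struct scatterlist *sg,
                                int nents)
        {
                struct usb_sg_request io;
                int ret;

                /* A length of zero asks the core to move the whole sg list;
                 * with this patch, an xHCI-backed bulk pipe gets it as one URB. */
                ret = usb_sg_init(&io, udev, usb_sndbulkpipe(udev, 2), 0,
                                sg, nents, 0, GFP_KERNEL);
                if (ret)
                        return ret;
                usb_sg_wait(&io);       /* blocks until the request completes */
                return io.status;
        }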
Eventually, HCDs could parse these scatter-gather lists for periodic endpoints also. For now, we use the old code and call usb_submit_urb() for each scatterlist entry. The caller of usb_sg_init() can request that all bytes in the scatter gather list be transferred by passing in a length of zero. Handle that request for a bulk endpoint under xHCI by walking the scatter gather list and calculating the length. We could let the HCD handle a zero length in this case, but I'm not sure if the core layers in between will get confused by this. Signed-off-by: Sarah Sharp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 3 +- drivers/usb/core/message.c | 139 +++++++++++++++++++++++++++++---------------- include/linux/usb.h | 2 + 3 files changed, 95 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index b2da4753b12..1609623ec82 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1239,7 +1239,8 @@ static int map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, /* Map the URB's buffers for DMA access. * Lower level HCD code should use *_dma exclusively, - * unless it uses pio or talks to another transport. + * unless it uses pio or talks to another transport, + * or uses the provided scatter gather list for bulk. */ if (is_root_hub(urb->dev)) return 0; diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 3a2e69ec2f2..2bed83caacb 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -365,6 +365,7 @@ int usb_sg_init(struct usb_sg_request *io, struct usb_device *dev, int i; int urb_flags; int dma; + int use_sg; if (!io || !dev || !sg || usb_pipecontrol(pipe) @@ -392,7 +393,19 @@ int usb_sg_init(struct usb_sg_request *io, struct usb_device *dev, if (io->entries <= 0) return io->entries; - io->urbs = kmalloc(io->entries * sizeof *io->urbs, mem_flags); + /* If we're running on an xHCI host controller, queue the whole scatter + * gather list with one call to urb_enqueue(). This is only for bulk, + * as that endpoint type does not care how the data gets broken up + * across frames. + */ + if (usb_pipebulk(pipe) && + bus_to_hcd(dev->bus)->driver->flags & HCD_USB3) { + io->urbs = kmalloc(sizeof *io->urbs, mem_flags); + use_sg = true; + } else { + io->urbs = kmalloc(io->entries * sizeof *io->urbs, mem_flags); + use_sg = false; + } if (!io->urbs) goto nomem; @@ -402,62 +415,92 @@ int usb_sg_init(struct usb_sg_request *io, struct usb_device *dev, if (usb_pipein(pipe)) urb_flags |= URB_SHORT_NOT_OK; - for_each_sg(sg, sg, io->entries, i) { - unsigned len; - - io->urbs[i] = usb_alloc_urb(0, mem_flags); - if (!io->urbs[i]) { - io->entries = i; + if (use_sg) { + io->urbs[0] = usb_alloc_urb(0, mem_flags); + if (!io->urbs[0]) { + io->entries = 0; goto nomem; } - io->urbs[i]->dev = NULL; - io->urbs[i]->pipe = pipe; - io->urbs[i]->interval = period; - io->urbs[i]->transfer_flags = urb_flags; - - io->urbs[i]->complete = sg_complete; - io->urbs[i]->context = io; - - /* - * Some systems need to revert to PIO when DMA is temporarily - * unavailable. For their sakes, both transfer_buffer and - * transfer_dma are set when possible. However this can only - * work on systems without: - * - * - HIGHMEM, since DMA buffers located in high memory are - * not directly addressable by the CPU for PIO; - * - * - IOMMU, since dma_map_sg() is allowed to use an IOMMU to - * make virtually discontiguous buffers be "dma-contiguous" - * so that PIO and DMA need diferent numbers of URBs. 
- * - * So when HIGHMEM or IOMMU are in use, transfer_buffer is NULL - * to prevent stale pointers and to help spot bugs. - */ - if (dma) { - io->urbs[i]->transfer_dma = sg_dma_address(sg); - len = sg_dma_len(sg); + io->urbs[0]->dev = NULL; + io->urbs[0]->pipe = pipe; + io->urbs[0]->interval = period; + io->urbs[0]->transfer_flags = urb_flags; + + io->urbs[0]->complete = sg_complete; + io->urbs[0]->context = io; + /* A length of zero means transfer the whole sg list */ + io->urbs[0]->transfer_buffer_length = length; + if (length == 0) { + for_each_sg(sg, sg, io->entries, i) { + io->urbs[0]->transfer_buffer_length += + sg_dma_len(sg); + } + } + io->urbs[0]->sg = io; + io->urbs[0]->num_sgs = io->entries; + io->entries = 1; + } else { + for_each_sg(sg, sg, io->entries, i) { + unsigned len; + + io->urbs[i] = usb_alloc_urb(0, mem_flags); + if (!io->urbs[i]) { + io->entries = i; + goto nomem; + } + + io->urbs[i]->dev = NULL; + io->urbs[i]->pipe = pipe; + io->urbs[i]->interval = period; + io->urbs[i]->transfer_flags = urb_flags; + + io->urbs[i]->complete = sg_complete; + io->urbs[i]->context = io; + + /* + * Some systems need to revert to PIO when DMA is + * temporarily unavailable. For their sakes, both + * transfer_buffer and transfer_dma are set when + * possible. However this can only work on systems + * without: + * + * - HIGHMEM, since DMA buffers located in high memory + * are not directly addressable by the CPU for PIO; + * + * - IOMMU, since dma_map_sg() is allowed to use an + * IOMMU to make virtually discontiguous buffers be + * "dma-contiguous" so that PIO and DMA need diferent + * numbers of URBs. + * + * So when HIGHMEM or IOMMU are in use, transfer_buffer + * is NULL to prevent stale pointers and to help spot + * bugs. + */ + if (dma) { + io->urbs[i]->transfer_dma = sg_dma_address(sg); + len = sg_dma_len(sg); #if defined(CONFIG_HIGHMEM) || defined(CONFIG_GART_IOMMU) - io->urbs[i]->transfer_buffer = NULL; + io->urbs[i]->transfer_buffer = NULL; #else - io->urbs[i]->transfer_buffer = sg_virt(sg); + io->urbs[i]->transfer_buffer = sg_virt(sg); #endif - } else { - /* hc may use _only_ transfer_buffer */ - io->urbs[i]->transfer_buffer = sg_virt(sg); - len = sg->length; - } + } else { + /* hc may use _only_ transfer_buffer */ + io->urbs[i]->transfer_buffer = sg_virt(sg); + len = sg->length; + } - if (length) { - len = min_t(unsigned, len, length); - length -= len; - if (length == 0) - io->entries = i + 1; + if (length) { + len = min_t(unsigned, len, length); + length -= len; + if (length == 0) + io->entries = i + 1; + } + io->urbs[i]->transfer_buffer_length = len; } - io->urbs[i]->transfer_buffer_length = len; + io->urbs[--i]->transfer_flags &= ~URB_NO_INTERRUPT; } - io->urbs[--i]->transfer_flags &= ~URB_NO_INTERRUPT; /* transaction state */ io->count = io->entries; diff --git a/include/linux/usb.h b/include/linux/usb.h index 13bced521b8..0a1819a6497 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1198,6 +1198,8 @@ struct urb { unsigned int transfer_flags; /* (in) URB_SHORT_NOT_OK | ...*/ void *transfer_buffer; /* (in) associated data buffer */ dma_addr_t transfer_dma; /* (in) dma addr for transfer_buffer */ + struct usb_sg_request *sg; /* (in) scatter gather buffer list */ + int num_sgs; /* (in) number of entries in the sg list */ u32 transfer_buffer_length; /* (in) data buffer length */ u32 actual_length; /* (return) actual transfer length */ unsigned char *setup_packet; /* (in) setup packet (control only) */ -- cgit v1.2.3-70-g09d2 From 
f0058c627855ecb3b6c7185b7ad1910463c24c42 Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Wed, 29 Apr 2009 19:06:20 -0700 Subject: USB: Change names of SuperSpeed ep companion descriptor structs. Differentiate between SuperSpeed endpoint companion descriptor and the wireless USB endpoint companion descriptor. Make all structure names for this descriptor have "ss" (SuperSpeed) in them. David Vrabel asked for this change in http://marc.info/?l=linux-usb&m=124091465109367&w=2 Reported-by: David Vrabel Signed-off-by: Sarah Sharp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 36 +++++++++++++++++++----------------- drivers/usb/host/xhci-mem.c | 2 +- include/linux/usb.h | 14 +++++++------- include/linux/usb/ch9.h | 4 ++-- 4 files changed, 29 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 7103758bb48..24dfb33f90c 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -69,30 +69,31 @@ static int find_next_descriptor(unsigned char *buffer, int size, return buffer - buffer0; } -static int usb_parse_endpoint_companion(struct device *ddev, int cfgno, +static int usb_parse_ss_endpoint_companion(struct device *ddev, int cfgno, int inum, int asnum, struct usb_host_endpoint *ep, int num_ep, unsigned char *buffer, int size) { unsigned char *buffer_start = buffer; - struct usb_ep_comp_descriptor *desc; + struct usb_ss_ep_comp_descriptor *desc; int retval; int num_skipped; int max_tx; int i; - /* Allocate space for the companion descriptor */ - ep->ep_comp = kzalloc(sizeof(struct usb_host_ep_comp), GFP_KERNEL); - if (!ep->ep_comp) + /* Allocate space for the SS endpoint companion descriptor */ + ep->ss_ep_comp = kzalloc(sizeof(struct usb_host_ss_ep_comp), + GFP_KERNEL); + if (!ep->ss_ep_comp) return -ENOMEM; - desc = (struct usb_ep_comp_descriptor *) buffer; + desc = (struct usb_ss_ep_comp_descriptor *) buffer; if (desc->bDescriptorType != USB_DT_SS_ENDPOINT_COMP) { dev_warn(ddev, "No SuperSpeed endpoint companion for config %d " " interface %d altsetting %d ep %d: " "using minimum values\n", cfgno, inum, asnum, ep->desc.bEndpointAddress); - ep->ep_comp->desc.bLength = USB_DT_EP_COMP_SIZE; - ep->ep_comp->desc.bDescriptorType = USB_DT_SS_ENDPOINT_COMP; - ep->ep_comp->desc.bMaxBurst = 0; + ep->ss_ep_comp->desc.bLength = USB_DT_SS_EP_COMP_SIZE; + ep->ss_ep_comp->desc.bDescriptorType = USB_DT_SS_ENDPOINT_COMP; + ep->ss_ep_comp->desc.bMaxBurst = 0; /* * Leave bmAttributes as zero, which will mean no streams for * bulk, and isoc won't support multiple bursts of packets. 
@@ -102,7 +103,7 @@ static int usb_parse_endpoint_companion(struct device *ddev, int cfgno, */ if (usb_endpoint_xfer_isoc(&ep->desc) || usb_endpoint_xfer_int(&ep->desc)) - ep->ep_comp->desc.wBytesPerInterval = + ep->ss_ep_comp->desc.wBytesPerInterval = ep->desc.wMaxPacketSize; /* * The next descriptor is for an Endpoint or Interface, @@ -112,16 +113,16 @@ static int usb_parse_endpoint_companion(struct device *ddev, int cfgno, retval = 0; goto valid; } - memcpy(&ep->ep_comp->desc, desc, USB_DT_EP_COMP_SIZE); - desc = &ep->ep_comp->desc; + memcpy(&ep->ss_ep_comp->desc, desc, USB_DT_SS_EP_COMP_SIZE); + desc = &ep->ss_ep_comp->desc; buffer += desc->bLength; size -= desc->bLength; /* Eat up the other descriptors we don't care about */ - ep->ep_comp->extra = buffer; + ep->ss_ep_comp->extra = buffer; i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, USB_DT_INTERFACE, &num_skipped); - ep->ep_comp->extralen = i; + ep->ss_ep_comp->extralen = i; buffer += i; size -= i; retval = buffer - buffer_start + i; @@ -310,7 +311,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, cfgno, inum, asnum, d->bEndpointAddress, maxp); } - /* Allocate room for and parse any endpoint companion descriptors */ + /* Allocate room for and parse any SS endpoint companion descriptors */ if (to_usb_device(ddev)->speed == USB_SPEED_SUPER) { endpoint->extra = buffer; i = find_next_descriptor_more(buffer, size, USB_DT_SS_ENDPOINT_COMP, @@ -320,8 +321,9 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, size -= i; if (size > 0) { - retval = usb_parse_endpoint_companion(ddev, cfgno, inum, asnum, - endpoint, num_ep, buffer, size); + retval = usb_parse_ss_endpoint_companion(ddev, cfgno, + inum, asnum, endpoint, num_ep, buffer, + size); if (retval >= 0) { buffer += retval; retval = buffer - buffer0; diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index f49f280cfd4..a7fbd6c10ad 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -496,7 +496,7 @@ int xhci_endpoint_init(struct xhci_hcd *xhci, max_packet = ep->desc.wMaxPacketSize; ep_ctx->ep_info2 |= MAX_PACKET(max_packet); /* dig out max burst from ep companion desc */ - max_packet = ep->ep_comp->desc.bMaxBurst; + max_packet = ep->ss_ep_comp->desc.bMaxBurst; ep_ctx->ep_info2 |= MAX_BURST(max_packet); break; case USB_SPEED_HIGH: diff --git a/include/linux/usb.h b/include/linux/usb.h index 0a1819a6497..7e6b5259ea3 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -47,15 +47,15 @@ struct ep_device; /* For SS devices */ /** - * struct usb_host_ep_comp - Valid for SuperSpeed devices only + * struct usb_host_ss_ep_comp - Valid for SuperSpeed devices only * @desc: endpoint companion descriptor, wMaxPacketSize in native byteorder * @extra: descriptors following this endpoint companion descriptor * @extralen: how many bytes of "extra" are valid */ -struct usb_host_ep_comp { - struct usb_ep_comp_descriptor desc; - unsigned char *extra; /* Extra descriptors */ - int extralen; +struct usb_host_ss_ep_comp { + struct usb_ss_ep_comp_descriptor desc; + unsigned char *extra; /* Extra descriptors */ + int extralen; }; /** @@ -65,7 +65,7 @@ struct usb_host_ep_comp { * @hcpriv: for use by HCD; typically holds hardware dma queue head (QH) * with one or more transfer descriptors (TDs) per urb * @ep_dev: ep_device for sysfs info - * @ep_comp: companion descriptor information for this endpoint + * @ss_ep_comp: companion descriptor information for this endpoint * @extra: descriptors following this 
endpoint in the configuration * @extralen: how many bytes of "extra" are valid * @enabled: URBs may be submitted to this endpoint @@ -78,7 +78,7 @@ struct usb_host_endpoint { struct list_head urb_list; void *hcpriv; struct ep_device *ep_dev; /* For sysfs info */ - struct usb_host_ep_comp *ep_comp; /* For SS devices */ + struct usb_host_ss_ep_comp *ss_ep_comp; /* For SS devices */ unsigned char *extra; /* Extra descriptors */ int extralen; diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h index 9e9c5c0a3d7..93223638f70 100644 --- a/include/linux/usb/ch9.h +++ b/include/linux/usb/ch9.h @@ -538,7 +538,7 @@ static inline int usb_endpoint_is_isoc_out( /*-------------------------------------------------------------------------*/ /* USB_DT_SS_ENDPOINT_COMP: SuperSpeed Endpoint Companion descriptor */ -struct usb_ep_comp_descriptor { +struct usb_ss_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; @@ -547,7 +547,7 @@ struct usb_ep_comp_descriptor { __u16 wBytesPerInterval; } __attribute__ ((packed)); -#define USB_DT_EP_COMP_SIZE 6 +#define USB_DT_SS_EP_COMP_SIZE 6 /*-------------------------------------------------------------------------*/ -- cgit v1.2.3-70-g09d2 From e475bba2fdee9c3dbfe25f026f8fb8de69508ad2 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 16 Jun 2009 08:23:52 +0200 Subject: block: Introduce helper to reset queue limits to default values DM reuses the request queue when swapping in a new device table. Introduce blk_set_default_limits() which can be used to reset the queue_limits prior to stacking devices. Signed-off-by: Martin K. Petersen Acked-by: Alasdair G Kergon Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-settings.c | 33 +++++++++++++++++++++++++++------ include/linux/blkdev.h | 1 + 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 138610b9895..7541ea4bf9f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -95,6 +95,31 @@ void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn) } EXPORT_SYMBOL_GPL(blk_queue_lld_busy); +/** + * blk_set_default_limits - reset limits to default values + * @limits: the queue_limits structure to reset + * + * Description: + * Returns a queue_limit struct to its default state. Can be used by + * stacking drivers like DM that stage table swaps and reuse an + * existing device queue. 
+ */ +void blk_set_default_limits(struct queue_limits *lim) +{ + lim->max_phys_segments = MAX_PHYS_SEGMENTS; + lim->max_hw_segments = MAX_HW_SEGMENTS; + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + lim->max_segment_size = MAX_SEGMENT_SIZE; + lim->max_sectors = lim->max_hw_sectors = SAFE_MAX_SECTORS; + lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; + lim->bounce_pfn = BLK_BOUNCE_ANY; + lim->alignment_offset = 0; + lim->io_opt = 0; + lim->misaligned = 0; + lim->no_cluster = 0; +} +EXPORT_SYMBOL(blk_set_default_limits); + /** * blk_queue_make_request - define an alternate make_request function for a device * @q: the request queue for the device to be affected @@ -123,14 +148,8 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) * set defaults */ q->nr_requests = BLKDEV_MAX_RQ; - blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); - blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); - blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); - blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); q->make_request_fn = mfn; - blk_queue_max_sectors(q, SAFE_MAX_SECTORS); - blk_queue_logical_block_size(q, 512); blk_queue_dma_alignment(q, 511); blk_queue_congestion_threshold(q); q->nr_batching = BLK_BATCH_REQ; @@ -143,6 +162,8 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->unplug_timer.function = blk_unplug_timeout; q->unplug_timer.data = (unsigned long)q; + blk_set_default_limits(&q->limits); + /* * by default assume old behaviour and bounce for any highmem page */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0b1a6cae9de..8963d9149b5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -926,6 +926,7 @@ extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern void blk_set_default_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, -- cgit v1.2.3-70-g09d2 From 5ced504b1bd1979378de35c56aa5d3d79fb5033f Mon Sep 17 00:00:00 2001 From: unsik Kim Date: Tue, 16 Jun 2009 08:40:20 +0200 Subject: mg_disk: separate mg_disk.h again Commit eec9462088a26c046d4db3100796a340a50890b8 folded mg_disk.h into mg_disk.c, but the mg_disk platform driver needs private data for operation. This also makes mg_disk.c machine-independent. 
Separate out only the needed structures and defines into mg_disk.h. Signed-off-by: unsik Kim Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 28 +--------------------------- include/linux/mg_disk.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 27 deletions(-) create mode 100644 include/linux/mg_disk.h (limited to 'include/linux') diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 60de5a01e71..f703f547824 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -22,13 +22,12 @@ #include #include #include +#include #define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) /* name for block device */ #define MG_DISK_NAME "mgd" -/* name for platform device */ -#define MG_DEV_NAME "mg_disk" #define MG_DISK_MAJ 0 #define MG_DISK_MAX_PART 16 @@ -103,33 +102,8 @@ #define MG_TMAX_SWRST_TO_RDY 500 #define MG_TMAX_RSTOUT 3000 -/* device attribution */ -/* use mflash as boot device */ -#define MG_BOOT_DEV (1 << 0) -/* use mflash as storage device */ -#define MG_STORAGE_DEV (1 << 1) -/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ -#define MG_STORAGE_DEV_SKIP_RST (1 << 2) - #define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST) -/* names of GPIO resource */ -#define MG_RST_PIN "mg_rst" -/* except MG_BOOT_DEV, reset-out pin should be assigned */ -#define MG_RSTOUT_PIN "mg_rstout" - -/* private driver data */ -struct mg_drv_data { - /* disk resource */ - u32 use_polling; - - /* device attribution */ - u32 dev_attr; - - /* internally used */ - struct mg_host *host; -}; - /* main structure for mflash driver */ struct mg_host { struct device *dev; diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h new file mode 100644 index 00000000000..e11f4d9f1c2 --- /dev/null +++ b/include/linux/mg_disk.h @@ -0,0 +1,45 @@ +/* + * include/linux/mg_disk.c + * + * Private data for mflash platform driver + * + * (c) 2008 mGine Co.,LTD + * (c) 2008 unsik Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MG_DISK_H__ +#define __MG_DISK_H__ + +/* name for platform device */ +#define MG_DEV_NAME "mg_disk" + +/* names of GPIO resource */ +#define MG_RST_PIN "mg_rst" +/* except MG_BOOT_DEV, reset-out pin should be assigned */ +#define MG_RSTOUT_PIN "mg_rstout" + +/* device attribution */ +/* use mflash as boot device */ +#define MG_BOOT_DEV (1 << 0) +/* use mflash as storage device */ +#define MG_STORAGE_DEV (1 << 1) +/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ +#define MG_STORAGE_DEV_SKIP_RST (1 << 2) + +/* private driver data */ +struct mg_drv_data { + /* disk resource */ + u32 use_polling; + + /* device attribution */ + u32 dev_attr; + + /* internally used */ + void *host; +}; + +#endif -- cgit v1.2.3-70-g09d2 From 6c18ba9f5e506b8115b89b1aa7bdc25178f40b0a Mon Sep 17 00:00:00 2001 From: Alexandros Batsakis Date: Tue, 16 Jun 2009 04:19:13 +0300 Subject: nfsd41: move channel attributes from nfsd4_session to a nfsd4_channel_attr struct The change is valid for both the forechannel and the backchannel (currently dummy). Signed-off-by: Alexandros Batsakis Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 28 +++++++++++++++------------- fs/nfsd/nfs4xdr.c | 2 +- include/linux/nfsd/state.h | 18 +++++++++++++----- include/linux/nfsd/xdr4.h | 11 ----------- 4 files changed, 29 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 89d9ac55c03..d5caf2a709d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -444,8 +444,8 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) * fchan holds the client values on input, and the server values on output */ static int init_forechannel_attrs(struct svc_rqst *rqstp, - struct nfsd4_session *session, - struct nfsd4_channel_attrs *fchan) + struct nfsd4_channel_attrs *session_fchan, + struct nfsd4_channel_attrs *fchan) { int status = 0; __u32 maxcount = svc_max_payload(rqstp); @@ -455,21 +455,21 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp, /* Use the client's max request and max response size if possible */ if (fchan->maxreq_sz > maxcount) fchan->maxreq_sz = maxcount; - session->se_fmaxreq_sz = fchan->maxreq_sz; + session_fchan->maxreq_sz = fchan->maxreq_sz; if (fchan->maxresp_sz > maxcount) fchan->maxresp_sz = maxcount; - session->se_fmaxresp_sz = fchan->maxresp_sz; + session_fchan->maxresp_sz = fchan->maxresp_sz; /* Set the max response cached size our default which is * a multiple of PAGE_SIZE and small */ - session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; - fchan->maxresp_cached = session->se_fmaxresp_cached; + session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; + fchan->maxresp_cached = session_fchan->maxresp_cached; /* Use the client's maxops if possible */ if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; - session->se_fmaxops = fchan->maxops; + session_fchan->maxops = fchan->maxops; /* try to use the client requested number of slots */ if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) @@ -481,7 +481,7 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp, */ status = set_forechannel_maxreqs(fchan); - session->se_fnumslots = fchan->maxreqs; + session_fchan->maxreqs = fchan->maxreqs; return status; } @@ -495,12 +495,14 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, memset(&tmp, 0, sizeof(tmp)); /* FIXME: For now, we just accept the client back channel attributes. */ - status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel); + tmp.se_bchannel = cses->back_channel; + status = init_forechannel_attrs(rqstp, &tmp.se_fchannel, + &cses->fore_channel); if (status) goto out; /* allocate struct nfsd4_session and slot table in one piece */ - slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot); + slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot); new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); if (!new) goto out; @@ -574,7 +576,7 @@ free_session(struct kref *kref) int i; ses = container_of(kref, struct nfsd4_session, se_ref); - for (i = 0; i < ses->se_fnumslots; i++) { + for (i = 0; i < ses->se_fchannel.maxreqs; i++) { struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; nfsd4_release_respages(e->ce_respages, e->ce_resused); } @@ -1130,7 +1132,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, * is sent (lease renewal). 
*/ if (seq && nfsd4_not_cached(resp)) { - seq->maxslots = resp->cstate.session->se_fnumslots; + seq->maxslots = resp->cstate.session->se_fchannel.maxreqs; return nfs_ok; } @@ -1473,7 +1475,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, goto out; status = nfserr_badslot; - if (seq->slotid >= session->se_fnumslots) + if (seq->slotid >= session->se_fchannel.maxreqs) goto out; slot = &session->se_slots[seq->slotid]; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d07f704a2ac..2dcc7feaa6f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3183,7 +3183,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, length, xb->page_len, tlen, pad); - if (length <= session->se_fmaxresp_cached) + if (length <= session->se_fchannel.maxresp_cached) return status; else return nfserr_rep_too_big_to_cache; diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index c0c49215ddc..105cc100de0 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -115,6 +115,17 @@ struct nfsd4_slot { struct nfsd4_cache_entry sl_cache_entry; }; +struct nfsd4_channel_attrs { + u32 headerpadsz; + u32 maxreq_sz; + u32 maxresp_sz; + u32 maxresp_cached; + u32 maxops; + u32 maxreqs; + u32 nr_rdma_attrs; + u32 rdma_attrs; +}; + struct nfsd4_session { struct kref se_ref; struct list_head se_hash; /* hash by sessionid */ @@ -122,11 +133,8 @@ struct nfsd4_session { u32 se_flags; struct nfs4_client *se_client; /* for expire_client */ struct nfs4_sessionid se_sessionid; - u32 se_fmaxreq_sz; - u32 se_fmaxresp_sz; - u32 se_fmaxresp_cached; - u32 se_fmaxops; - u32 se_fnumslots; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; struct nfsd4_slot se_slots[]; /* forward channel slots */ }; diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index d0f050f01ec..2bacf753506 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -366,17 +366,6 @@ struct nfsd4_exchange_id { int spa_how; }; -struct nfsd4_channel_attrs { - u32 headerpadsz; - u32 maxreq_sz; - u32 maxresp_sz; - u32 maxresp_cached; - u32 maxops; - u32 maxreqs; - u32 nr_rdma_attrs; - u32 rdma_attrs; -}; - struct nfsd4_create_session { clientid_t clientid; struct nfs4_sessionid sessionid; -- cgit v1.2.3-70-g09d2 From 5fd29d6ccbc98884569d6f3105aeca70858b3e0f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Jun 2009 10:57:02 -0700 Subject: printk: clean up handling of log-levels and newlines It used to be that we would only look at the log-level in a printk() after explicit newlines, which can cause annoying problems when the previous printk() did not end with a '\n'. In that case, the log-level marker would be just printed out in the middle of the line, and be seen as just noise rather than change the logging level. This changes things to always look at the log-level in the first bytes of the printout. If a log level marker is found, it is always used as the log-level. Additionally, if no newline existed, one is added (unless the log-level is the explicit KERN_CONT marker, to explicitly show that it's a continuation of a previous line). 
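For illustration, the new semantics affect callers roughly as follows (usage sketch only, not part of the patch):

/*
 * A log-level marker at the start of the string now always takes
 * effect, and forces a newline if the previous printout left the
 * line open.
 */
printk(KERN_INFO "probing device");	/* no trailing '\n' */
printk(KERN_ERR "probe failed\n");	/* a '\n' is emitted first, and this
					 * line is logged at KERN_ERR */
printk(KERN_INFO "scanning bus");
printk(KERN_CONT " ... done\n");	/* KERN_CONT explicitly continues
					 * the open line */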
Acked-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- kernel/printk.c | 31 ++++++++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 883cd44ff76..066bb1eddfe 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -102,7 +102,7 @@ extern const char linux_proc_banner[]; * line that had no enclosing \n). Only to be used by core/arch code * during early bootup (a continued line is not SMP-safe otherwise). */ -#define KERN_CONT "" +#define KERN_CONT "<c>" extern int console_printk[]; diff --git a/kernel/printk.c b/kernel/printk.c index 5052b5497c6..a87770ce73a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -687,20 +687,33 @@ asmlinkage int vprintk(const char *fmt, va_list args) sizeof(printk_buf) - printed_len, fmt, args); + p = printk_buf; + + /* Do we have a loglevel in the string? */ + if (p[0] == '<') { + unsigned char c = p[1]; + if (c && p[2] == '>') { + switch (c) { + case '0' ... '7': /* loglevel */ + current_log_level = c - '0'; + if (!new_text_line) { + emit_log_char('\n'); + new_text_line = 1; + } + /* Fallthrough - skip the loglevel */ + case 'c': /* KERN_CONT */ + p += 3; + break; + } + } + } + /* * Copy the output into log_buf. If the caller didn't provide * appropriate log level tags, we insert them here */ - for (p = printk_buf; *p; p++) { + for ( ; *p; p++) { if (new_text_line) { - /* If a token, set current_log_level and skip over */ - if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' && - p[2] == '>') { - current_log_level = p[1] - '0'; - p += 3; - printed_len -= 3; - } - /* Always output the token */ emit_log_char('<'); emit_log_char(current_log_level + '0'); -- cgit v1.2.3-70-g09d2 From e28d713704117bca0820c732210df6075b09f13b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Jun 2009 11:02:28 -0700 Subject: printk: Add KERN_DEFAULT printk log-level This adds a KERN_DEFAULT loglevel marker, for when you cannot decide which loglevel you want, and just want to keep an existing printk with the default loglevel. The difference between having KERN_DEFAULT and having no log-level marker at all is two-fold: - having the log-level marker will now force a new-line if the previous printout had not added one (perhaps because it forgot, but perhaps because it expected a continuation) - having a log-level marker is required if you are printing out a message that could otherwise itself be mistaken for a log-level. Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 ++ kernel/printk.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 066bb1eddfe..1b2e1747df1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -97,6 +97,8 @@ extern const char linux_proc_banner[]; #define KERN_INFO "<6>" /* informational */ #define KERN_DEBUG "<7>" /* debug-level messages */ +/* Use the default kernel loglevel */ +#define KERN_DEFAULT "<d>" /* * Annotation for a "continued" line of log printout (only done after a * line that had no enclosing \n). Only to be used by core/arch code diff --git a/kernel/printk.c b/kernel/printk.c index a87770ce73a..b4d97b54c1e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -696,6 +696,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) switch (c) { case '0' ... 
'7': /* loglevel */ current_log_level = c - '0'; + /* Fallthrough - make sure we're on a new line */ + case 'd': /* KERN_DEFAULT */ if (!new_text_line) { emit_log_char('\n'); new_text_line = 1; -- cgit v1.2.3-70-g09d2 From 84845c070ce3ac4d3bd2c148fa20ba8ce5409167 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 26 May 2009 16:05:06 +0900 Subject: PCI: use pci_is_root_bus() in acpi_pci_get_bridge_handle() Use pci_is_root_bus() in acpi_pci_get_bridge_handle() to check if the pci bus is root, for code consistency. Reviewed-by: Grant Grundler Reviewed-by: Alex Chiang Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- include/linux/pci-acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 092e82e0048..df67c78dfe2 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -23,7 +23,7 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) { - if (pbus->parent) + if (!pci_is_root_bus(pbus)) return DEVICE_ACPI_HANDLE(&(pbus->self->dev)); return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), pbus->number); -- cgit v1.2.3-70-g09d2 From a222b8f83b995e9c6fe2aff2a8125facb49f658e Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 26 May 2009 16:05:33 +0900 Subject: PCI: use pci_is_root_bus() in acpi_find_root_bridge_handle() Use pci_is_root_bus() in acpi_find_root_bridge_handle() to check if the pci bus is root, for code consistency. Reviewed-by: Alex Chiang Reviewed-by: Grant Grundler Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- include/linux/pci-acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index df67c78dfe2..93a7c08f869 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -15,7 +15,7 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { struct pci_bus *pbus = pdev->bus; /* Find a PCI root bus */ - while (pbus->parent) + while (!pci_is_root_bus(pbus)) pbus = pbus->parent; return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), pbus->number); -- cgit v1.2.3-70-g09d2 From a72b46c3849cdb05993015991bde548ab8b6d7ac Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 24 Apr 2009 10:45:17 +0800 Subject: PCI: Add pci_bus_set_ops pci_bus_set_ops changes pci_ops associated with a pci_bus. This can be used by debug tools such as PCIE AER error injection to fake some PCI configuration registers. 
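A minimal sketch of the intended use by such a tool follows; the fake_* names are hypothetical, only pci_bus_set_ops() itself comes from this patch:

static struct pci_ops *real_ops;	/* saved for restoration */

/* intercept config reads, forwarding to the real ops by default */
static int fake_read(struct pci_bus *bus, unsigned int devfn,
		     int where, int size, u32 *val)
{
	/* an injection tool would return faked register values for
	 * selected offsets here */
	return real_ops->read(bus, devfn, where, size, val);
}

static int fake_write(struct pci_bus *bus, unsigned int devfn,
		      int where, int size, u32 val)
{
	return real_ops->write(bus, devfn, where, size, val);
}

static struct pci_ops fake_ops = {
	.read	= fake_read,
	.write	= fake_write,
};

static void install_fake_ops(struct pci_bus *bus)
{
	real_ops = pci_bus_set_ops(bus, &fake_ops);
}

static void remove_fake_ops(struct pci_bus *bus)
{
	pci_bus_set_ops(bus, real_ops);	/* put the original ops back */
}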
Acked-by: Kenji Kaneshige Signed-off-by: Huang Ying Signed-off-by: Jesse Barnes --- drivers/pci/access.c | 19 +++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 0f370651268..db23200c487 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -66,6 +66,25 @@ EXPORT_SYMBOL(pci_bus_write_config_byte); EXPORT_SYMBOL(pci_bus_write_config_word); EXPORT_SYMBOL(pci_bus_write_config_dword); +/** + * pci_bus_set_ops - Set raw operations of pci bus + * @bus: pci bus struct + * @ops: new raw operations + * + * Return previous raw operations + */ +struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops) +{ + struct pci_ops *old_ops; + unsigned long flags; + + spin_lock_irqsave(&pci_lock, flags); + old_ops = bus->ops; + bus->ops = ops; + spin_unlock_irqrestore(&pci_lock, flags); + return old_ops; +} +EXPORT_SYMBOL(pci_bus_set_ops); /** * pci_read_vpd - Read one entry from Vital Product Data diff --git a/include/linux/pci.h b/include/linux/pci.h index ec03b90d351..ea2a153a912 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -637,6 +637,7 @@ int pci_bus_write_config_word(struct pci_bus *bus, unsigned int devfn, int where, u16 val); int pci_bus_write_config_dword(struct pci_bus *bus, unsigned int devfn, int where, u32 val); +struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops); static inline int pci_read_config_byte(struct pci_dev *dev, int where, u8 *val) { -- cgit v1.2.3-70-g09d2 From bd3d99c17039fd05a29587db3f4a180c48da115a Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 2 Jun 2009 13:52:26 +0900 Subject: PCI: Remove untested Electromechanical Interlock (EMI) support in pciehp. The EMI support in pciehp is obviously broken. It is implemented using struct hotplug_slot_attribute, but the sysfs_ops for pci_slot_ktype are NOT for struct hotplug_slot_attribute, but for struct pci_slot_attribute. This bug has been there for a long time; it was probably introduced when the PCI slot framework was introduced. The likely reason it never caused any problem is that the EMI support was never tested, for lack of a test environment. Since the EMI support appears to be entirely untested, this patch removes it from pciehp instead of fixing the bug. 
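For reference, the dispatch behind pci_slot_ktype works roughly as sketched below (simplified; the actual code lives in drivers/pci/slot.c), which shows why an attribute of the wrong type misfires:

/* sysfs_ops for pci_slot_ktype container_of() the attribute to
 * struct pci_slot_attribute before calling ->show():
 */
static ssize_t pci_slot_attr_show(struct kobject *kobj,
				  struct attribute *attr, char *buf)
{
	struct pci_slot *slot = container_of(kobj, struct pci_slot, kobj);
	struct pci_slot_attribute *attribute =
		container_of(attr, struct pci_slot_attribute, attr);

	return attribute->show ? attribute->show(slot, buf) : -EIO;
}
/* A struct hotplug_slot_attribute registered on the same kobject is
 * therefore reinterpreted as a struct pci_slot_attribute, and its
 * ->show()/->store(), which expect a struct hotplug_slot *, end up
 * being called with a struct pci_slot * instead. */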
Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp.h | 3 -- drivers/pci/hotplug/pciehp_core.c | 111 +------------------------------------- drivers/pci/hotplug/pciehp_hpc.c | 31 ----------- include/linux/pci_hotplug.h | 8 --- 4 files changed, 1 insertion(+), 152 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 0a368547e63..e6cf096498b 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -81,7 +81,6 @@ struct slot { struct hpc_ops *hpc_ops; struct hotplug_slot *hotplug_slot; struct list_head slot_list; - unsigned long last_emi_toggle; struct delayed_work work; /* work for button event */ struct mutex lock; }; @@ -203,8 +202,6 @@ struct hpc_ops { int (*set_attention_status)(struct slot *slot, u8 status); int (*get_latch_status)(struct slot *slot, u8 *status); int (*get_adapter_status)(struct slot *slot, u8 *status); - int (*get_emi_status)(struct slot *slot, u8 *status); - int (*toggle_emi)(struct slot *slot); int (*get_max_bus_speed)(struct slot *slot, enum pci_bus_speed *speed); int (*get_cur_bus_speed)(struct slot *slot, enum pci_bus_speed *speed); int (*get_max_lnk_width)(struct slot *slot, enum pcie_link_width *val); diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index fb254b2454d..eb183d1d091 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -85,99 +85,6 @@ static struct hotplug_slot_ops pciehp_hotplug_slot_ops = { .get_cur_bus_speed = get_cur_bus_speed, }; -/* - * Check the status of the Electro Mechanical Interlock (EMI) - */ -static int get_lock_status(struct hotplug_slot *hotplug_slot, u8 *value) -{ - struct slot *slot = hotplug_slot->private; - return (slot->hpc_ops->get_emi_status(slot, value)); -} - -/* - * sysfs interface for the Electro Mechanical Interlock (EMI) - * 1 == locked, 0 == unlocked - */ -static ssize_t lock_read_file(struct hotplug_slot *slot, char *buf) -{ - int retval; - u8 value; - - retval = get_lock_status(slot, &value); - if (retval) - goto lock_read_exit; - retval = sprintf (buf, "%d\n", value); - -lock_read_exit: - return retval; -} - -/* - * Change the status of the Electro Mechanical Interlock (EMI) - * This is a toggle - in addition there must be at least 1 second - * in between toggles. - */ -static int set_lock_status(struct hotplug_slot *hotplug_slot, u8 status) -{ - struct slot *slot = hotplug_slot->private; - int retval; - u8 value; - - mutex_lock(&slot->ctrl->crit_sect); - - /* has it been >1 sec since our last toggle? */ - if ((get_seconds() - slot->last_emi_toggle) < 1) { - mutex_unlock(&slot->ctrl->crit_sect); - return -EINVAL; - } - - /* see what our current state is */ - retval = get_lock_status(hotplug_slot, &value); - if (retval || (value == status)) - goto set_lock_exit; - - slot->hpc_ops->toggle_emi(slot); -set_lock_exit: - mutex_unlock(&slot->ctrl->crit_sect); - return 0; -} - -/* - * sysfs interface which allows the user to toggle the Electro Mechanical - * Interlock. Valid values are either 0 or 1. 
0 == unlock, 1 == lock - */ -static ssize_t lock_write_file(struct hotplug_slot *hotplug_slot, - const char *buf, size_t count) -{ - struct slot *slot = hotplug_slot->private; - unsigned long llock; - u8 lock; - int retval = 0; - - llock = simple_strtoul(buf, NULL, 10); - lock = (u8)(llock & 0xff); - - switch (lock) { - case 0: - case 1: - retval = set_lock_status(hotplug_slot, lock); - break; - default: - ctrl_err(slot->ctrl, "%d is an invalid lock value\n", - lock); - retval = -EINVAL; - } - if (retval) - return retval; - return count; -} - -static struct hotplug_slot_attribute hotplug_slot_attr_lock = { - .attr = {.name = "lock", .mode = S_IFREG | S_IRUGO | S_IWUSR}, - .show = lock_read_file, - .store = lock_write_file -}; - /** * release_slot - free up the memory used by a slot * @hotplug_slot: slot to free @@ -236,17 +143,6 @@ static int init_slots(struct controller *ctrl) get_attention_status(hotplug_slot, &info->attention_status); get_latch_status(hotplug_slot, &info->latch_status); get_adapter_status(hotplug_slot, &info->adapter_status); - /* create additional sysfs entries */ - if (EMI(ctrl)) { - retval = sysfs_create_file(&hotplug_slot->pci_slot->kobj, - &hotplug_slot_attr_lock.attr); - if (retval) { - pci_hp_deregister(hotplug_slot); - ctrl_err(ctrl, "Cannot create additional sysfs " - "entries\n"); - goto error_info; - } - } } return 0; @@ -261,13 +157,8 @@ error: static void cleanup_slots(struct controller *ctrl) { struct slot *slot; - - list_for_each_entry(slot, &ctrl->slot_list, slot_list) { - if (EMI(ctrl)) - sysfs_remove_file(&slot->hotplug_slot->pci_slot->kobj, - &hotplug_slot_attr_lock.attr); + list_for_each_entry(slot, &ctrl->slot_list, slot_list) pci_hp_deregister(slot->hotplug_slot); - } } /* diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 07bd3215114..52813257e5b 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -422,35 +422,6 @@ static int hpc_query_power_fault(struct slot *slot) return !!(slot_status & PCI_EXP_SLTSTA_PFD); } -static int hpc_get_emi_status(struct slot *slot, u8 *status) -{ - struct controller *ctrl = slot->ctrl; - u16 slot_status; - int retval; - - retval = pciehp_readw(ctrl, PCI_EXP_SLTSTA, &slot_status); - if (retval) { - ctrl_err(ctrl, "Cannot check EMI status\n"); - return retval; - } - *status = !!(slot_status & PCI_EXP_SLTSTA_EIS); - return retval; -} - -static int hpc_toggle_emi(struct slot *slot) -{ - u16 slot_cmd; - u16 cmd_mask; - int rc; - - slot_cmd = PCI_EXP_SLTCTL_EIC; - cmd_mask = PCI_EXP_SLTCTL_EIC; - rc = pcie_write_cmd(slot->ctrl, slot_cmd, cmd_mask); - slot->last_emi_toggle = get_seconds(); - - return rc; -} - static int hpc_set_attention_status(struct slot *slot, u8 value) { struct controller *ctrl = slot->ctrl; @@ -874,8 +845,6 @@ static struct hpc_ops pciehp_hpc_ops = { .get_attention_status = hpc_get_attention_status, .get_latch_status = hpc_get_latch_status, .get_adapter_status = hpc_get_adapter_status, - .get_emi_status = hpc_get_emi_status, - .toggle_emi = hpc_toggle_emi, .get_max_bus_speed = hpc_get_max_lnk_speed, .get_cur_bus_speed = hpc_get_cur_lnk_speed, diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 20998746518..11936fd0b56 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -66,14 +66,6 @@ enum pcie_link_speed { PCIE_LNK_SPEED_UNKNOWN = 0xFF, }; -struct hotplug_slot; -struct hotplug_slot_attribute { - struct attribute attr; - ssize_t (*show)(struct hotplug_slot *, char *); - ssize_t 
(*store)(struct hotplug_slot *, const char *, size_t); -}; -#define to_hotplug_attr(n) container_of(n, struct hotplug_slot_attribute, attr); - /** * struct hotplug_slot_ops -the callbacks that the hotplug pci core can use * @owner: The module owner of this structure -- cgit v1.2.3-70-g09d2 From c825bc94c8c1908750ab20413eb639c6be029e2d Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 16 Jun 2009 11:01:25 +0900 Subject: PCI hotplug: create symlink to hotplug driver module Create a symbolic link to the hotplug driver module in the PCI slot directory (/sys/bus/pci/slots/). In the past, we needed to load hotplug drivers one by one to identify the hotplug driver that handles the slot, which was very inconvenient, especially for troubleshooting. With this change, we can easily identify the hotplug driver. Signed-off-by: Taku Izumi Signed-off-by: Kenji Kaneshige Reviewed-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 7 ++++++ drivers/pci/hotplug/pci_hotplug_core.c | 23 +++++++++++++------ drivers/pci/slot.c | 39 +++++++++++++++++++++++++++++++++ include/linux/pci.h | 5 +++++ include/linux/pci_hotplug.h | 15 +++++++++++-- 5 files changed, 80 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 97ad190e13a..6bf68053e4b 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -122,3 +122,10 @@ Description: This symbolic link appears when a device is a Virtual Function. The symbolic link points to the PCI device sysfs entry of the Physical Function this device associates with. + +What: /sys/bus/pci/slots/.../module +Date: June 2009 +Contact: linux-pci@vger.kernel.org +Description: + This symbolic link points to the PCI hotplug controller driver + module that manages the hotplug slot. diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c index ff32c6b4ae1..844580489d4 100644 --- a/drivers/pci/hotplug/pci_hotplug_core.c +++ b/drivers/pci/hotplug/pci_hotplug_core.c @@ -424,6 +424,9 @@ static int fs_add_slot(struct pci_slot *slot) { int retval = 0; + /* Create symbolic link to the hotplug driver module */ + pci_hp_create_module_link(slot); + if (has_power_file(slot)) { retval = sysfs_create_file(&slot->kobj, &hotplug_slot_attr_power.attr); @@ -498,6 +501,7 @@ exit_attention: if (has_power_file(slot)) sysfs_remove_file(&slot->kobj, &hotplug_slot_attr_power.attr); exit_power: + pci_hp_remove_module_link(slot); exit: return retval; } @@ -528,6 +532,8 @@ static void fs_remove_slot(struct pci_slot *slot) if (has_test_file(slot)) sysfs_remove_file(&slot->kobj, &hotplug_slot_attr_test.attr); + + pci_hp_remove_module_link(slot); } static struct hotplug_slot *get_slot_from_name (const char *name) @@ -544,10 +550,10 @@ static struct hotplug_slot *get_slot_from_name (const char *name) } /** - * pci_hp_register - register a hotplug_slot with the PCI hotplug subsystem + * __pci_hp_register - register a hotplug_slot with the PCI hotplug subsystem * @bus: bus this slot is on * @slot: pointer to the &struct hotplug_slot to register - * @slot_nr: slot number + * @devnr: device number * @name: name registered with kobject core * * Registers a hotplug slot with the pci hotplug subsystem, which will allow @@ -555,8 +561,9 @@ static struct hotplug_slot *get_slot_from_name (const char *name) * * Returns 0 if successful, anything else for an error. 
*/ -int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr, - const char *name) +int __pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, + int devnr, const char *name, + struct module *owner, const char *mod_name) { int result; struct pci_slot *pci_slot; @@ -571,14 +578,16 @@ int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr, return -EINVAL; } - mutex_lock(&pci_hp_mutex); + slot->ops->owner = owner; + slot->ops->mod_name = mod_name; + mutex_lock(&pci_hp_mutex); /* * No problems if we call this interface from both ACPI_PCI_SLOT * driver and call it here again. If we've already created the * pci_slot, the interface will simply bump the refcount. */ - pci_slot = pci_create_slot(bus, slot_nr, name, slot); + pci_slot = pci_create_slot(bus, devnr, name, slot); if (IS_ERR(pci_slot)) { result = PTR_ERR(pci_slot); goto out; @@ -688,6 +697,6 @@ MODULE_LICENSE("GPL"); module_param(debug, bool, 0644); MODULE_PARM_DESC(debug, "Debugging mode enabled or not"); -EXPORT_SYMBOL_GPL(pci_hp_register); +EXPORT_SYMBOL_GPL(__pci_hp_register); EXPORT_SYMBOL_GPL(pci_hp_deregister); EXPORT_SYMBOL_GPL(pci_hp_change_slot_info); diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index fe95ce20bcb..eddb0748b0e 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -307,6 +307,45 @@ void pci_destroy_slot(struct pci_slot *slot) } EXPORT_SYMBOL_GPL(pci_destroy_slot); +#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE) +#include +/** + * pci_hp_create_link - create symbolic link to the hotplug driver module. + * @slot: struct pci_slot + * + * Helper function for pci_hotplug_core.c to create symbolic link to + * the hotplug driver module. + */ +void pci_hp_create_module_link(struct pci_slot *pci_slot) +{ + struct hotplug_slot *slot = pci_slot->hotplug; + struct kobject *kobj = NULL; + int no_warn; + + if (!slot || !slot->ops) + return; + kobj = kset_find_obj(module_kset, slot->ops->mod_name); + if (!kobj) + return; + no_warn = sysfs_create_link(&pci_slot->kobj, kobj, "module"); + kobject_put(kobj); +} +EXPORT_SYMBOL_GPL(pci_hp_create_module_link); + +/** + * pci_hp_remove_link - remove symbolic link to the hotplug driver module. + * @slot: struct pci_slot + * + * Helper function for pci_hotplug_core.c to remove symbolic link to + * the hotplug driver module. 
+ */ +void pci_hp_remove_module_link(struct pci_slot *pci_slot) +{ + sysfs_remove_link(&pci_slot->kobj, "module"); +} +EXPORT_SYMBOL_GPL(pci_hp_remove_module_link); +#endif + static int pci_slot_init(void) { struct kset *pci_bus_kset; diff --git a/include/linux/pci.h b/include/linux/pci.h index ea2a153a912..6a1800ecd95 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1261,5 +1261,10 @@ static inline irqreturn_t pci_sriov_migration(struct pci_dev *dev) } #endif +#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE) +extern void pci_hp_create_module_link(struct pci_slot *pci_slot); +extern void pci_hp_remove_module_link(struct pci_slot *pci_slot); +#endif + #endif /* __KERNEL__ */ #endif /* LINUX_PCI_H */ diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 11936fd0b56..b3646cd7fd5 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -69,6 +69,7 @@ enum pcie_link_speed { /** * struct hotplug_slot_ops -the callbacks that the hotplug pci core can use * @owner: The module owner of this structure + * @mod_name: The module name (KBUILD_MODNAME) of this structure * @enable_slot: Called when the user wants to enable a specific pci slot * @disable_slot: Called when the user wants to disable a specific pci slot * @set_attention_status: Called to set the specific slot's attention LED to @@ -101,6 +102,7 @@ enum pcie_link_speed { */ struct hotplug_slot_ops { struct module *owner; + const char *mod_name; int (*enable_slot) (struct hotplug_slot *slot); int (*disable_slot) (struct hotplug_slot *slot); int (*set_attention_status) (struct hotplug_slot *slot, u8 value); @@ -159,12 +161,21 @@ static inline const char *hotplug_slot_name(const struct hotplug_slot *slot) return pci_slot_name(slot->pci_slot); } -extern int pci_hp_register(struct hotplug_slot *, struct pci_bus *, int nr, - const char *name); +extern int __pci_hp_register(struct hotplug_slot *slot, struct pci_bus *pbus, + int nr, const char *name, + struct module *owner, const char *mod_name); extern int pci_hp_deregister(struct hotplug_slot *slot); extern int __must_check pci_hp_change_slot_info (struct hotplug_slot *slot, struct hotplug_slot_info *info); +static inline int pci_hp_register(struct hotplug_slot *slot, + struct pci_bus *pbus, + int devnr, const char *name) +{ + return __pci_hp_register(slot, pbus, devnr, name, + THIS_MODULE, KBUILD_MODNAME); +} + /* PCI Setting Record (Type 0) */ struct hpp_type0 { u32 revision; -- cgit v1.2.3-70-g09d2 From 70298c6e6c1ba68346336b4ea54bd5c0abbf73c8 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Tue, 16 Jun 2009 13:34:38 +0800 Subject: PCI AER: support Multiple Error Received and no error source id Based on the PCI Express AER specs, a root port might receive multiple TLP errors while it can only save one correctable error source id and one uncorrectable error source id at a time. In addition, some root port hardware might be unable to provide a correct source id, i.e., the source id, or the bus id part of the source id provided by the root port, might be equal to 0. The patchset implements the support in the kernel by searching the device tree under the root port. Patch 1 changes the cb parameter of pci_walk_bus to return a value. When cb returns non-zero, pci_walk_bus stops searching the device tree. 
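A hypothetical callback following the new convention (the context struct and helper name are illustrative, not part of the patch):

/* stop the walk as soon as the device matching a requester id is found */
struct find_ctx {
	u16 id;			/* source id reported by the root port */
	struct pci_dev *found;
};

static int match_requester_id(struct pci_dev *dev, void *data)
{
	struct find_ctx *ctx = data;

	if (((dev->bus->number << 8) | dev->devfn) != ctx->id)
		return 0;	/* keep walking the subtree */

	ctx->found = dev;
	return 1;		/* non-zero makes pci_walk_bus() stop early */
}

Calling pci_walk_bus(rootport->subordinate, match_requester_id, &ctx) then terminates as soon as the error source device is located.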
Reviewed-by: Andrew Patterson Signed-off-by: Zhang Yanmin Signed-off-by: Jesse Barnes --- arch/powerpc/platforms/pseries/eeh_driver.c | 38 ++++++++++++++++++----------- drivers/pci/bus.c | 11 +++++++-- drivers/pci/pcie/aer/aerdrv_core.c | 30 ++++++++++++----------- include/linux/pci.h | 2 +- 4 files changed, 50 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 9a2a6e32f00..0e8db677125 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -122,7 +122,7 @@ static void eeh_enable_irq(struct pci_dev *dev) * passed back in "userdata". */ -static void eeh_report_error(struct pci_dev *dev, void *userdata) +static int eeh_report_error(struct pci_dev *dev, void *userdata) { enum pci_ers_result rc, *res = userdata; struct pci_driver *driver = dev->driver; @@ -130,19 +130,21 @@ static void eeh_report_error(struct pci_dev *dev, void *userdata) dev->error_state = pci_channel_io_frozen; if (!driver) - return; + return 0; eeh_disable_irq(dev); if (!driver->err_handler || !driver->err_handler->error_detected) - return; + return 0; rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen); /* A driver that needs a reset trumps all others */ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + return 0; } /** @@ -153,7 +155,7 @@ static void eeh_report_error(struct pci_dev *dev, void *userdata) * Cumulative response passed back in "userdata". */ -static void eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) +static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) { enum pci_ers_result rc, *res = userdata; struct pci_driver *driver = dev->driver; @@ -161,26 +163,28 @@ static void eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) if (!driver || !driver->err_handler || !driver->err_handler->mmio_enabled) - return; + return 0; rc = driver->err_handler->mmio_enabled (dev); /* A driver that needs a reset trumps all others */ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + return 0; } /** * eeh_report_reset - tell device that slot has been reset */ -static void eeh_report_reset(struct pci_dev *dev, void *userdata) +static int eeh_report_reset(struct pci_dev *dev, void *userdata) { enum pci_ers_result rc, *res = userdata; struct pci_driver *driver = dev->driver; if (!driver) - return; + return 0; dev->error_state = pci_channel_io_normal; @@ -188,35 +192,39 @@ static void eeh_report_reset(struct pci_dev *dev, void *userdata) if (!driver->err_handler || !driver->err_handler->slot_reset) - return; + return 0; rc = driver->err_handler->slot_reset(dev); if ((*res == PCI_ERS_RESULT_NONE) || (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc; if (*res == PCI_ERS_RESULT_DISCONNECT && rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + + return 0; } /** * eeh_report_resume - tell device to resume normal operations */ -static void eeh_report_resume(struct pci_dev *dev, void *userdata) +static int eeh_report_resume(struct pci_dev *dev, void *userdata) { struct pci_driver *driver = dev->driver; dev->error_state = pci_channel_io_normal; if (!driver) - return; + return 0; eeh_enable_irq(dev); if (!driver->err_handler || !driver->err_handler->resume) - return; + return 0; driver->err_handler->resume(dev); + + return 0; } /** @@ -226,22 +234,24 @@ static void eeh_report_resume(struct pci_dev *dev, void *userdata) * dead, and that no 
further recovery attempts will be made on it. */ -static void eeh_report_failure(struct pci_dev *dev, void *userdata) +static int eeh_report_failure(struct pci_dev *dev, void *userdata) { struct pci_driver *driver = dev->driver; dev->error_state = pci_channel_io_perm_failure; if (!driver) - return; + return 0; eeh_disable_irq(dev); if (!driver->err_handler || !driver->err_handler->error_detected) - return; + return 0; driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); + + return 0; } /* ------------------------------------------------------- */ diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 40af27f3104..cef28a79103 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -206,13 +206,18 @@ void pci_enable_bridges(struct pci_bus *bus) * Walk the given bus, including any bridged devices * on buses under this bus. Call the provided callback * on each device found. + * + * We check the return of @cb each time. If it returns anything + * other than 0, we break out. + * */ -void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *), +void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata) { struct pci_dev *dev; struct pci_bus *bus; struct list_head *next; + int retval; bus = top; down_read(&pci_bus_sem); @@ -236,8 +241,10 @@ void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *), /* Run device routines with the device locked */ down(&dev->dev.sem); - cb(dev, userdata); + retval = cb(dev, userdata); up(&dev->dev.sem); + if (retval) + break; } up_read(&pci_bus_sem); } diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index dd3829e68e3..a7a3919904b 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -109,7 +109,7 @@ int pci_cleanup_aer_correct_error_status(struct pci_dev *dev) #endif /* 0 */ -static void set_device_error_reporting(struct pci_dev *dev, void *data) +static int set_device_error_reporting(struct pci_dev *dev, void *data) { bool enable = *((bool *)data); @@ -124,6 +124,8 @@ static void set_device_error_reporting(struct pci_dev *dev, void *data) if (enable) pcie_set_ecrc_checking(dev); + + return 0; } /** @@ -207,7 +209,7 @@ static struct device* find_source_device(struct pci_dev *parent, u16 id) return NULL; } -static void report_error_detected(struct pci_dev *dev, void *data) +static int report_error_detected(struct pci_dev *dev, void *data) { pci_ers_result_t vote; struct pci_error_handlers *err_handler; @@ -232,16 +234,16 @@ static void report_error_detected(struct pci_dev *dev, void *data) dev->driver ? 
"no AER-aware driver" : "no driver"); } - return; + return 0; } err_handler = dev->driver->err_handler; vote = err_handler->error_detected(dev, result_data->state); result_data->result = merge_result(result_data->result, vote); - return; + return 0; } -static void report_mmio_enabled(struct pci_dev *dev, void *data) +static int report_mmio_enabled(struct pci_dev *dev, void *data) { pci_ers_result_t vote; struct pci_error_handlers *err_handler; @@ -251,15 +253,15 @@ static void report_mmio_enabled(struct pci_dev *dev, void *data) if (!dev->driver || !dev->driver->err_handler || !dev->driver->err_handler->mmio_enabled) - return; + return 0; err_handler = dev->driver->err_handler; vote = err_handler->mmio_enabled(dev); result_data->result = merge_result(result_data->result, vote); - return; + return 0; } -static void report_slot_reset(struct pci_dev *dev, void *data) +static int report_slot_reset(struct pci_dev *dev, void *data) { pci_ers_result_t vote; struct pci_error_handlers *err_handler; @@ -269,15 +271,15 @@ static void report_slot_reset(struct pci_dev *dev, void *data) if (!dev->driver || !dev->driver->err_handler || !dev->driver->err_handler->slot_reset) - return; + return 0; err_handler = dev->driver->err_handler; vote = err_handler->slot_reset(dev); result_data->result = merge_result(result_data->result, vote); - return; + return 0; } -static void report_resume(struct pci_dev *dev, void *data) +static int report_resume(struct pci_dev *dev, void *data) { struct pci_error_handlers *err_handler; @@ -286,11 +288,11 @@ static void report_resume(struct pci_dev *dev, void *data) if (!dev->driver || !dev->driver->err_handler || !dev->driver->err_handler->resume) - return; + return 0; err_handler = dev->driver->err_handler; err_handler->resume(dev); - return; + return 0; } /** @@ -307,7 +309,7 @@ static void report_resume(struct pci_dev *dev, void *data) static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, enum pci_channel_state state, char *error_mesg, - void (*cb)(struct pci_dev *, void *)) + int (*cb)(struct pci_dev *, void *)) { struct aer_broadcast_data result_data; diff --git a/include/linux/pci.h b/include/linux/pci.h index 6a1800ecd95..61d9b790d21 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -789,7 +789,7 @@ const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass); -void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *), +void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata); int pci_cfg_space_size_ext(struct pci_dev *dev); int pci_cfg_space_size(struct pci_dev *dev); -- cgit v1.2.3-70-g09d2 From 8c1c699fec9e9021bf6ff0285dee086bb27aec90 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 13 Jun 2009 15:52:13 +0800 Subject: PCI: cleanup Function Level Reset This patch enhances the FLR functions: 1) remove disable_irq() so the shared IRQ won't be disabled. 2) replace the 1s wait with 100, 200 and 400ms wait intervals for the Pending Transaction. 3) replace mdelay() with msleep(). 4) add might_sleep(). 5) lock the device to prevent PM suspend from accessing the CSRs during the reset. 6) coding style fixes. 
Reviewed-by: Kenji Kaneshige Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/iov.c | 4 +- drivers/pci/pci.c | 166 ++++++++++++++++++++++++++-------------------------- include/linux/pci.h | 2 +- 3 files changed, 87 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index e87fe95da81..03c7706c0a0 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -110,7 +110,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset) } if (reset) - pci_execute_reset_function(virtfn); + __pci_reset_function(virtfn); pci_device_add(virtfn, virtfn->bus); mutex_unlock(&iov->dev->sriov->lock); @@ -164,7 +164,7 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset) if (reset) { device_release_driver(&virtfn->dev); - pci_execute_reset_function(virtfn); + __pci_reset_function(virtfn); } sprintf(buf, "virtfn%u", id); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 8ea911e5572..6a052ada3fe 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2055,111 +2055,112 @@ int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask) EXPORT_SYMBOL(pci_set_dma_seg_boundary); #endif -static int __pcie_flr(struct pci_dev *dev, int probe) +static int pcie_flr(struct pci_dev *dev, int probe) { - u16 status; + int i; + int pos; u32 cap; - int exppos = pci_find_capability(dev, PCI_CAP_ID_EXP); + u16 status; - if (!exppos) + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (!pos) return -ENOTTY; - pci_read_config_dword(dev, exppos + PCI_EXP_DEVCAP, &cap); + + pci_read_config_dword(dev, pos + PCI_EXP_DEVCAP, &cap); if (!(cap & PCI_EXP_DEVCAP_FLR)) return -ENOTTY; if (probe) return 0; - pci_block_user_cfg_access(dev); - /* Wait for Transaction Pending bit clean */ - pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); - if (!(status & PCI_EXP_DEVSTA_TRPND)) - goto transaction_done; + for (i = 0; i < 4; i++) { + if (i) + msleep((1 << (i - 1)) * 100); - msleep(100); - pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); - if (!(status & PCI_EXP_DEVSTA_TRPND)) - goto transaction_done; - - dev_info(&dev->dev, "Busy after 100ms while trying to reset; " - "sleeping for 1 second\n"); - ssleep(1); - pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); - if (status & PCI_EXP_DEVSTA_TRPND) - dev_info(&dev->dev, "Still busy after 1s; " - "proceeding with reset anyway\n"); - -transaction_done: - pci_write_config_word(dev, exppos + PCI_EXP_DEVCTL, + pci_read_config_word(dev, pos + PCI_EXP_DEVSTA, &status); + if (!(status & PCI_EXP_DEVSTA_TRPND)) + goto clear; + } + + dev_err(&dev->dev, "transaction is not cleared; " + "proceeding with reset anyway\n"); + +clear: + pci_write_config_word(dev, pos + PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR); - mdelay(100); + msleep(100); - pci_unblock_user_cfg_access(dev); return 0; } -static int __pci_af_flr(struct pci_dev *dev, int probe) +static int pci_af_flr(struct pci_dev *dev, int probe) { - int cappos = pci_find_capability(dev, PCI_CAP_ID_AF); - u8 status; + int i; + int pos; u8 cap; + u8 status; - if (!cappos) + pos = pci_find_capability(dev, PCI_CAP_ID_AF); + if (!pos) return -ENOTTY; - pci_read_config_byte(dev, cappos + PCI_AF_CAP, &cap); + + pci_read_config_byte(dev, pos + PCI_AF_CAP, &cap); if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR)) return -ENOTTY; if (probe) return 0; - pci_block_user_cfg_access(dev); - /* Wait for Transaction Pending bit clean */ - pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status); - if (!(status & 
PCI_AF_STATUS_TP)) - goto transaction_done; + for (i = 0; i < 4; i++) { + if (i) + msleep((1 << (i - 1)) * 100); + + pci_read_config_byte(dev, pos + PCI_AF_STATUS, &status); + if (!(status & PCI_AF_STATUS_TP)) + goto clear; + } + + dev_err(&dev->dev, "transaction is not cleared; " + "proceeding with reset anyway\n"); +clear: + pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); msleep(100); - pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status); - if (!(status & PCI_AF_STATUS_TP)) - goto transaction_done; - - dev_info(&dev->dev, "Busy after 100ms while trying to" - " reset; sleeping for 1 second\n"); - ssleep(1); - pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status); - if (status & PCI_AF_STATUS_TP) - dev_info(&dev->dev, "Still busy after 1s; " - "proceeding with reset anyway\n"); - -transaction_done: - pci_write_config_byte(dev, cappos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); - mdelay(100); - - pci_unblock_user_cfg_access(dev); + return 0; } -static int __pci_reset_function(struct pci_dev *pdev, int probe) +static int pci_dev_reset(struct pci_dev *dev, int probe) { - int res; + int rc; + + might_sleep(); + + if (!probe) { + pci_block_user_cfg_access(dev); + /* block PM suspend, driver probe, etc. */ + down(&dev->dev.sem); + } - res = __pcie_flr(pdev, probe); - if (res != -ENOTTY) - return res; + rc = pcie_flr(dev, probe); + if (rc != -ENOTTY) + goto done; - res = __pci_af_flr(pdev, probe); - if (res != -ENOTTY) - return res; + rc = pci_af_flr(dev, probe); +done: + if (!probe) { + up(&dev->dev.sem); + pci_unblock_user_cfg_access(dev); + } - return res; + return rc; } /** - * pci_execute_reset_function() - Reset a PCI device function - * @dev: Device function to reset + * __pci_reset_function - reset a PCI device function + * @dev: PCI device to reset * * Some devices allow an individual function to be reset without affecting * other functions in the same device. The PCI device must be responsive @@ -2171,18 +2172,18 @@ static int __pci_reset_function(struct pci_dev *pdev, int probe) * device including MSI, bus mastering, BARs, decoding IO and memory spaces, * etc. * - * Returns 0 if the device function was successfully reset or -ENOTTY if the + * Returns 0 if the device function was successfully reset or negative if the * device doesn't support resetting a single function. */ -int pci_execute_reset_function(struct pci_dev *dev) +int __pci_reset_function(struct pci_dev *dev) { - return __pci_reset_function(dev, 0); + return pci_dev_reset(dev, 0); } -EXPORT_SYMBOL_GPL(pci_execute_reset_function); +EXPORT_SYMBOL_GPL(__pci_reset_function); /** - * pci_reset_function() - quiesce and reset a PCI device function - * @dev: Device function to reset + * pci_reset_function - quiesce and reset a PCI device function + * @dev: PCI device to reset * * Some devices allow an individual function to be reset without affecting * other functions in the same device. The PCI device must be responsive @@ -2190,32 +2191,33 @@ EXPORT_SYMBOL_GPL(pci_execute_reset_function); * * This function does not just reset the PCI portion of a device, but * clears all the state associated with the device. This function differs - * from pci_execute_reset_function in that it saves and restores device state + * from __pci_reset_function in that it saves and restores device state * over the reset. * - * Returns 0 if the device function was successfully reset or -ENOTTY if the + * Returns 0 if the device function was successfully reset or negative if the * device doesn't support resetting a single function. 
*/ int pci_reset_function(struct pci_dev *dev) { - int r = __pci_reset_function(dev, 1); + int rc; - if (r < 0) - return r; + rc = pci_dev_reset(dev, 1); + if (rc) + return rc; - if (!dev->msi_enabled && !dev->msix_enabled && dev->irq != 0) - disable_irq(dev->irq); pci_save_state(dev); + /* + * both INTx and MSI are disabled after the Interrupt Disable bit + * is set and the Bus Master bit is cleared. + */ pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); - r = pci_execute_reset_function(dev); + rc = pci_dev_reset(dev, 0); pci_restore_state(dev); - if (!dev->msi_enabled && !dev->msix_enabled && dev->irq != 0) - enable_irq(dev->irq); - return r; + return rc; } EXPORT_SYMBOL_GPL(pci_reset_function); diff --git a/include/linux/pci.h b/include/linux/pci.h index 61d9b790d21..91b06be2f01 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -702,8 +702,8 @@ int pcix_get_mmrbc(struct pci_dev *dev); int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc); int pcie_get_readrq(struct pci_dev *dev); int pcie_set_readrq(struct pci_dev *dev, int rq); +int __pci_reset_function(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); -int pci_execute_reset_function(struct pci_dev *dev); void pci_update_resource(struct pci_dev *dev, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); int pci_select_bars(struct pci_dev *dev, unsigned long flags); -- cgit v1.2.3-70-g09d2 From 49809d6a511960e5ccfb85b780894f45ac119065 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 7 Jun 2009 12:10:39 -0300 Subject: V4L/DVB (11970): gspca - ov519: Add support for the ov518 bridge. Signed-off-by: Hans de Goede Signed-off-by: Jean-Francois Moine Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/gspca/ov519.c | 520 +++++++++++++++++++++++++++++++++++--- include/linux/videodev2.h | 3 +- 2 files changed, 488 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/video/gspca/ov519.c b/drivers/media/video/gspca/ov519.c index 1fff37b7989..188866ac6ce 100644 --- a/drivers/media/video/gspca/ov519.c +++ b/drivers/media/video/gspca/ov519.c @@ -50,6 +50,13 @@ static int i2c_detect_tries = 10; struct sd { struct gspca_dev gspca_dev; /* !! 
must be the first item */ + char bridge; +#define BRIDGE_OV511 0 +#define BRIDGE_OV511PLUS 1 +#define BRIDGE_OV518 2 +#define BRIDGE_OV518PLUS 3 +#define BRIDGE_OV519 4 + /* Determined by sensor type */ __u8 sif; @@ -87,6 +94,9 @@ static int sd_sethflip(struct gspca_dev *gspca_dev, __s32 val); static int sd_gethflip(struct gspca_dev *gspca_dev, __s32 *val); static int sd_setvflip(struct gspca_dev *gspca_dev, __s32 val); static int sd_getvflip(struct gspca_dev *gspca_dev, __s32 *val); +static void setbrightness(struct gspca_dev *gspca_dev); +static void setcontrast(struct gspca_dev *gspca_dev); +static void setcolors(struct gspca_dev *gspca_dev); static struct ctrl sd_ctrls[] = { { @@ -164,7 +174,7 @@ static struct ctrl sd_ctrls[] = { }, }; -static const struct v4l2_pix_format vga_mode[] = { +static const struct v4l2_pix_format ov519_vga_mode[] = { {320, 240, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 320, .sizeimage = 320 * 240 * 3 / 8 + 590, @@ -176,7 +186,7 @@ static const struct v4l2_pix_format vga_mode[] = { .colorspace = V4L2_COLORSPACE_JPEG, .priv = 0}, }; -static const struct v4l2_pix_format sif_mode[] = { +static const struct v4l2_pix_format ov519_sif_mode[] = { {176, 144, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 176, .sizeimage = 176 * 144 * 3 / 8 + 590, @@ -189,6 +199,47 @@ static const struct v4l2_pix_format sif_mode[] = { .priv = 0}, }; +static const struct v4l2_pix_format ov518_vga_mode[] = { + {320, 240, V4L2_PIX_FMT_OV518, V4L2_FIELD_NONE, + .bytesperline = 320, + .sizeimage = 320 * 240 * 3 / 8 + 590, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 1}, + {640, 480, V4L2_PIX_FMT_OV518, V4L2_FIELD_NONE, + .bytesperline = 640, + .sizeimage = 640 * 480 * 3 / 8 + 590, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 0}, +}; +static const struct v4l2_pix_format ov518_sif_mode[] = { + {176, 144, V4L2_PIX_FMT_OV518, V4L2_FIELD_NONE, + .bytesperline = 176, + .sizeimage = 40000, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 1}, + {352, 288, V4L2_PIX_FMT_OV518, V4L2_FIELD_NONE, + .bytesperline = 352, + .sizeimage = 352 * 288 * 3 / 8 + 590, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 0}, +}; + + +/* Registers common to OV511 / OV518 */ +#define R51x_SYS_RESET 0x50 +#define R51x_SYS_INIT 0x53 +#define R51x_SYS_SNAP 0x52 +#define R51x_SYS_CUST_ID 0x5F +#define R51x_COMP_LUT_BEGIN 0x80 + +/* OV511 Camera interface register numbers */ +#define R511_SYS_LED_CTL 0x55 /* OV511+ only */ +#define OV511_RESET_NOREGS 0x3F /* All but OV511 & regs */ + +/* OV518 Camera interface register numbers */ +#define R518_GPIO_OUT 0x56 /* OV518(+) only */ +#define R518_GPIO_CTL 0x57 /* OV518(+) only */ + /* OV519 Camera interface register numbers */ #define OV519_R10_H_SIZE 0x10 #define OV519_R11_V_SIZE 0x11 @@ -224,6 +275,8 @@ static const struct v4l2_pix_format sif_mode[] = { /* OV7610 registers */ #define OV7610_REG_GAIN 0x00 /* gain setting (5:0) */ +#define OV7610_REG_BLUE 0x01 /* blue channel balance */ +#define OV7610_REG_RED 0x02 /* red channel balance */ #define OV7610_REG_SAT 0x03 /* saturation */ #define OV8610_REG_HUE 0x04 /* 04 reserved */ #define OV7610_REG_CNT 0x05 /* Y contrast */ @@ -846,11 +899,12 @@ static unsigned char ov7670_abs_to_sm(unsigned char v) static int reg_w(struct sd *sd, __u16 index, __u8 value) { int ret; + int req = (sd->bridge <= BRIDGE_OV511PLUS) ? 
2 : 1; sd->gspca_dev.usb_buf[0] = value; ret = usb_control_msg(sd->gspca_dev.dev, usb_sndctrlpipe(sd->gspca_dev.dev, 0), - 1, /* REQ_IO (ov518/519) */ + req, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, sd->gspca_dev.usb_buf, 1, 500); @@ -864,10 +918,11 @@ static int reg_w(struct sd *sd, __u16 index, __u8 value) static int reg_r(struct sd *sd, __u16 index) { int ret; + int req = (sd->bridge <= BRIDGE_OV511PLUS) ? 3 : 1; ret = usb_control_msg(sd->gspca_dev.dev, usb_rcvctrlpipe(sd->gspca_dev.dev, 0), - 1, /* REQ_IO */ + req, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, sd->gspca_dev.usb_buf, 1, 500); @@ -923,6 +978,28 @@ static int reg_w_mask(struct sd *sd, return reg_w(sd, index, value); } +/* + * Writes multiple (n) byte value to a single register. Only valid with certain + * registers (0x30 and 0xc4 - 0xce). + */ +static int ov518_reg_w32(struct sd *sd, __u16 index, u32 value, int n) +{ + int ret; + + *((u32 *)sd->gspca_dev.usb_buf) = __cpu_to_le32(value); + + ret = usb_control_msg(sd->gspca_dev.dev, + usb_sndctrlpipe(sd->gspca_dev.dev, 0), + 1 /* REG_IO */, + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, + 0, index, + sd->gspca_dev.usb_buf, n, 500); + if (ret < 0) + PDEBUG(D_ERR, "Write reg32 [%02x] %08x failed", index, value); + return ret; +} + + /* * The OV518 I2C I/O procedure is different, hence, this function. * This is normally only called from i2c_w(). Note that this function @@ -1014,20 +1091,47 @@ static inline int ov51x_stop(struct sd *sd) { PDEBUG(D_STREAM, "stopping"); sd->stopped = 1; - return reg_w(sd, OV519_SYS_RESET1, 0x0f); + switch (sd->bridge) { + case BRIDGE_OV511: + case BRIDGE_OV511PLUS: + return reg_w(sd, R51x_SYS_RESET, 0x3d); + case BRIDGE_OV518: + case BRIDGE_OV518PLUS: + return reg_w_mask(sd, R51x_SYS_RESET, 0x3a, 0x3a); + case BRIDGE_OV519: + return reg_w(sd, OV519_SYS_RESET1, 0x0f); + } + + return 0; } /* Restarts OV511 after ov511_stop() is called. Has no effect if it is not * actually stopped (for performance). */ static inline int ov51x_restart(struct sd *sd) { + int rc; + PDEBUG(D_STREAM, "restarting"); if (!sd->stopped) return 0; sd->stopped = 0; /* Reinitialize the stream */ - return reg_w(sd, OV519_SYS_RESET1, 0x00); + switch (sd->bridge) { + case BRIDGE_OV511: + case BRIDGE_OV511PLUS: + return reg_w(sd, R51x_SYS_RESET, 0x00); + case BRIDGE_OV518: + case BRIDGE_OV518PLUS: + rc = reg_w(sd, 0x2f, 0x80); + if (rc < 0) + return rc; + return reg_w(sd, R51x_SYS_RESET, 0x00); + case BRIDGE_OV519: + return reg_w(sd, OV519_SYS_RESET1, 0x00); + } + + return 0; } /* This does an initial reset of an OmniVision sensor and ensures that I2C @@ -1287,16 +1391,161 @@ static int ov6xx0_configure(struct sd *sd) /* Turns on or off the LED. Only has an effect with OV511+/OV518(+)/OV519 */ static void ov51x_led_control(struct sd *sd, int on) { - reg_w_mask(sd, OV519_GPIO_DATA_OUT0, !on, 1); /* 0 / 1 */ + switch (sd->bridge) { + /* OV511 has no LED control */ + case BRIDGE_OV511PLUS: + reg_w(sd, R511_SYS_LED_CTL, on ? 1 : 0); + break; + case BRIDGE_OV518: + case BRIDGE_OV518PLUS: + reg_w_mask(sd, R518_GPIO_OUT, on ? 
0x02 : 0x00, 0x02); + break; + case BRIDGE_OV519: + reg_w_mask(sd, OV519_GPIO_DATA_OUT0, !on, 1); /* 0 / 1 */ + break; + } } -/* this function is called at probe time */ -static int sd_config(struct gspca_dev *gspca_dev, - const struct usb_device_id *id) +/* OV518 quantization tables are 8x4 (instead of 8x8) */ +static int ov518_upload_quan_tables(struct sd *sd) +{ + const unsigned char yQuanTable518[] = { + 5, 4, 5, 6, 6, 7, 7, 7, + 5, 5, 5, 5, 6, 7, 7, 7, + 6, 6, 6, 6, 7, 7, 7, 8, + 7, 7, 6, 7, 7, 7, 8, 8 + }; + + const unsigned char uvQuanTable518[] = { + 6, 6, 6, 7, 7, 7, 7, 7, + 6, 6, 6, 7, 7, 7, 7, 7, + 6, 6, 6, 7, 7, 7, 7, 8, + 7, 7, 7, 7, 7, 7, 8, 8 + }; + + const unsigned char *pYTable = yQuanTable518; + const unsigned char *pUVTable = uvQuanTable518; + unsigned char val0, val1; + int i, rc, reg = R51x_COMP_LUT_BEGIN; + + PDEBUG(D_PROBE, "Uploading quantization tables"); + + for (i = 0; i < 16; i++) { + val0 = *pYTable++; + val1 = *pYTable++; + val0 &= 0x0f; + val1 &= 0x0f; + val0 |= val1 << 4; + rc = reg_w(sd, reg, val0); + if (rc < 0) + return rc; + + val0 = *pUVTable++; + val1 = *pUVTable++; + val0 &= 0x0f; + val1 &= 0x0f; + val0 |= val1 << 4; + rc = reg_w(sd, reg + 16, val0); + if (rc < 0) + return rc; + + reg++; + } + + return 0; +} + +/* This initializes the OV518/OV518+ and the sensor */ +static int ov518_configure(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; - struct cam *cam; + int rc; + + /* For 518 and 518+ */ + static struct ov_regvals init_518[] = { + { R51x_SYS_RESET, 0x40 }, + { R51x_SYS_INIT, 0xe1 }, + { R51x_SYS_RESET, 0x3e }, + { R51x_SYS_INIT, 0xe1 }, + { R51x_SYS_RESET, 0x00 }, + { R51x_SYS_INIT, 0xe1 }, + { 0x46, 0x00 }, + { 0x5d, 0x03 }, + }; + + static struct ov_regvals norm_518[] = { + { R51x_SYS_SNAP, 0x02 }, /* Reset */ + { R51x_SYS_SNAP, 0x01 }, /* Enable */ + { 0x31, 0x0f }, + { 0x5d, 0x03 }, + { 0x24, 0x9f }, + { 0x25, 0x90 }, + { 0x20, 0x00 }, + { 0x51, 0x04 }, + { 0x71, 0x19 }, + { 0x2f, 0x80 }, + }; + + static struct ov_regvals norm_518_p[] = { + { R51x_SYS_SNAP, 0x02 }, /* Reset */ + { R51x_SYS_SNAP, 0x01 }, /* Enable */ + { 0x31, 0x0f }, + { 0x5d, 0x03 }, + { 0x24, 0x9f }, + { 0x25, 0x90 }, + { 0x20, 0x60 }, + { 0x51, 0x02 }, + { 0x71, 0x19 }, + { 0x40, 0xff }, + { 0x41, 0x42 }, + { 0x46, 0x00 }, + { 0x33, 0x04 }, + { 0x21, 0x19 }, + { 0x3f, 0x10 }, + { 0x2f, 0x80 }, + }; + + /* First 5 bits of custom ID reg are a revision ID on OV518 */ + PDEBUG(D_PROBE, "Device revision %d", + 0x1F & reg_r(sd, R51x_SYS_CUST_ID)); + + rc = write_regvals(sd, init_518, ARRAY_SIZE(init_518)); + if (rc < 0) + return rc; + + /* Set LED GPIO pin to output mode */ + rc = reg_w_mask(sd, R518_GPIO_CTL, 0x00, 0x02); + if (rc < 0) + return rc; + switch (sd->bridge) { + case BRIDGE_OV518: + rc = write_regvals(sd, norm_518, ARRAY_SIZE(norm_518)); + if (rc < 0) + return rc; + break; + case BRIDGE_OV518PLUS: + rc = write_regvals(sd, norm_518_p, ARRAY_SIZE(norm_518_p)); + if (rc < 0) + return rc; + break; + } + + rc = ov518_upload_quan_tables(sd); + if (rc < 0) { + PDEBUG(D_ERR, "Error uploading quantization tables"); + return rc; + } + + rc = reg_w(sd, 0x2f, 0x80); + if (rc < 0) + return rc; + + return 0; +} + +static int ov519_configure(struct sd *sd) +{ static const struct ov_regvals init_519[] = { { 0x5a, 0x6d }, /* EnableSystem */ { 0x53, 0x9b }, @@ -1313,8 +1562,32 @@ static int sd_config(struct gspca_dev *gspca_dev, /* windows reads 0x55 at this point*/ }; - if (write_regvals(sd, init_519, ARRAY_SIZE(init_519))) + return write_regvals(sd, 
init_519, ARRAY_SIZE(init_519)); +} + +/* this function is called at probe time */ +static int sd_config(struct gspca_dev *gspca_dev, + const struct usb_device_id *id) +{ + struct sd *sd = (struct sd *) gspca_dev; + struct cam *cam; + int ret = 0; + + sd->bridge = id->driver_info; + + switch (sd->bridge) { + case BRIDGE_OV518: + case BRIDGE_OV518PLUS: + ret = ov518_configure(gspca_dev); + break; + case BRIDGE_OV519: + ret = ov519_configure(sd); + break; + } + + if (ret) goto error; + ov51x_led_control(sd, 0); /* turn LED off */ /* Test for 76xx */ @@ -1360,12 +1633,26 @@ static int sd_config(struct gspca_dev *gspca_dev, } cam = &gspca_dev->cam; - if (!sd->sif) { - cam->cam_mode = vga_mode; - cam->nmodes = ARRAY_SIZE(vga_mode); - } else { - cam->cam_mode = sif_mode; - cam->nmodes = ARRAY_SIZE(sif_mode); + switch (sd->bridge) { + case BRIDGE_OV518: + case BRIDGE_OV518PLUS: + if (!sd->sif) { + cam->cam_mode = ov518_vga_mode; + cam->nmodes = ARRAY_SIZE(ov518_vga_mode); + } else { + cam->cam_mode = ov518_sif_mode; + cam->nmodes = ARRAY_SIZE(ov518_sif_mode); + } + break; + case BRIDGE_OV519: + if (!sd->sif) { + cam->cam_mode = ov519_vga_mode; + cam->nmodes = ARRAY_SIZE(ov519_vga_mode); + } else { + cam->cam_mode = ov519_sif_mode; + cam->nmodes = ARRAY_SIZE(ov519_sif_mode); + } + break; } sd->brightness = BRIGHTNESS_DEF; sd->contrast = CONTRAST_DEF; @@ -1422,6 +1709,106 @@ static int sd_init(struct gspca_dev *gspca_dev) return 0; } +/* Sets up the OV518/OV518+ with the given image parameters + * + * OV518 needs a completely different approach, until we can figure out what + * the individual registers do. Also, only 15 FPS is supported now. + * + * Do not put any sensor-specific code in here (including I2C I/O functions) + */ +static int ov518_mode_init_regs(struct sd *sd) +{ + int hsegs, vsegs; + + /******** Set the mode ********/ + + reg_w(sd, 0x2b, 0); + reg_w(sd, 0x2c, 0); + reg_w(sd, 0x2d, 0); + reg_w(sd, 0x2e, 0); + reg_w(sd, 0x3b, 0); + reg_w(sd, 0x3c, 0); + reg_w(sd, 0x3d, 0); + reg_w(sd, 0x3e, 0); + + if (sd->bridge == BRIDGE_OV518) { + /* Set 8-bit (YVYU) input format */ + reg_w_mask(sd, 0x20, 0x08, 0x08); + + /* Set 12-bit (4:2:0) output format */ + reg_w_mask(sd, 0x28, 0x80, 0xf0); + reg_w_mask(sd, 0x38, 0x80, 0xf0); + } else { + reg_w(sd, 0x28, 0x80); + reg_w(sd, 0x38, 0x80); + } + + hsegs = sd->gspca_dev.width / 16; + vsegs = sd->gspca_dev.height / 4; + + reg_w(sd, 0x29, hsegs); + reg_w(sd, 0x2a, vsegs); + + reg_w(sd, 0x39, hsegs); + reg_w(sd, 0x3a, vsegs); + + /* Windows driver does this here; who knows why */ + reg_w(sd, 0x2f, 0x80); + + /******** Set the framerate (to 30 FPS) ********/ + if (sd->bridge == BRIDGE_OV518PLUS) + sd->clockdiv = 1; + else + sd->clockdiv = 0; + + /* Mode independent, but framerate dependent, regs */ + reg_w(sd, 0x51, 0x04); /* Clock divider; lower==faster */ + reg_w(sd, 0x22, 0x18); + reg_w(sd, 0x23, 0xff); + + if (sd->bridge == BRIDGE_OV518PLUS) + reg_w(sd, 0x21, 0x19); + else + reg_w(sd, 0x71, 0x17); /* Compression-related? */ + + /* FIXME: Sensor-specific */ + /* Bit 5 is what matters here. 
Of course, it is "reserved" */ + i2c_w(sd, 0x54, 0x23); + + reg_w(sd, 0x2f, 0x80); + + if (sd->bridge == BRIDGE_OV518PLUS) { + reg_w(sd, 0x24, 0x94); + reg_w(sd, 0x25, 0x90); + ov518_reg_w32(sd, 0xc4, 400, 2); /* 190h */ + ov518_reg_w32(sd, 0xc6, 540, 2); /* 21ch */ + ov518_reg_w32(sd, 0xc7, 540, 2); /* 21ch */ + ov518_reg_w32(sd, 0xc8, 108, 2); /* 6ch */ + ov518_reg_w32(sd, 0xca, 131098, 3); /* 2001ah */ + ov518_reg_w32(sd, 0xcb, 532, 2); /* 214h */ + ov518_reg_w32(sd, 0xcc, 2400, 2); /* 960h */ + ov518_reg_w32(sd, 0xcd, 32, 2); /* 20h */ + ov518_reg_w32(sd, 0xce, 608, 2); /* 260h */ + } else { + reg_w(sd, 0x24, 0x9f); + reg_w(sd, 0x25, 0x90); + ov518_reg_w32(sd, 0xc4, 400, 2); /* 190h */ + ov518_reg_w32(sd, 0xc6, 381, 2); /* 17dh */ + ov518_reg_w32(sd, 0xc7, 381, 2); /* 17dh */ + ov518_reg_w32(sd, 0xc8, 128, 2); /* 80h */ + ov518_reg_w32(sd, 0xca, 183331, 3); /* 2cc23h */ + ov518_reg_w32(sd, 0xcb, 746, 2); /* 2eah */ + ov518_reg_w32(sd, 0xcc, 1750, 2); /* 6d6h */ + ov518_reg_w32(sd, 0xcd, 45, 2); /* 2dh */ + ov518_reg_w32(sd, 0xce, 851, 2); /* 353h */ + } + + reg_w(sd, 0x2f, 0x80); + + return 0; +} + + /* Sets up the OV519 with the given image parameters * * OV519 needs a completely different approach, until we can figure out what @@ -1740,6 +2127,11 @@ static int set_ov_sensor_window(struct sd *sd) hwebase = 0x3a; vwsbase = 0x05; vwebase = 0x06; + if (qvga) { + /* HDG: this fixes U and V getting swapped */ + hwsbase--; + vwsbase--; + } break; case SEN_OV7620: hwsbase = 0x2f; /* From 7620.SET (spec is wrong) */ @@ -1855,15 +2247,28 @@ static int set_ov_sensor_window(struct sd *sd) static int sd_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; - int ret; + int ret = 0; - ret = ov519_mode_init_regs(sd); + switch (sd->bridge) { + case BRIDGE_OV518: + case BRIDGE_OV518PLUS: + ret = ov518_mode_init_regs(sd); + break; + case BRIDGE_OV519: + ret = ov519_mode_init_regs(sd); + break; + } if (ret < 0) goto out; + ret = set_ov_sensor_window(sd); if (ret < 0) goto out; + setcontrast(gspca_dev); + setbrightness(gspca_dev); + setcolors(gspca_dev); + ret = ov51x_restart(sd); if (ret < 0) goto out; @@ -1882,7 +2287,30 @@ static void sd_stopN(struct gspca_dev *gspca_dev) ov51x_led_control(sd, 0); } -static void sd_pkt_scan(struct gspca_dev *gspca_dev, +static void ov518_pkt_scan(struct gspca_dev *gspca_dev, + struct gspca_frame *frame, /* target */ + __u8 *data, /* isoc packet */ + int len) /* iso packet length */ +{ + PDEBUG(D_STREAM, "ov518_pkt_scan: %d bytes", len); + + if (len & 7) { + len--; + PDEBUG(D_STREAM, "packet number: %d\n", (int)data[len]); + } + + /* A false positive here is likely, until OVT gives me + * the definitive SOF/EOF format */ + if ((!(data[0] | data[1] | data[2] | data[3] | data[5])) && data[6]) { + gspca_frame_add(gspca_dev, LAST_PACKET, frame, data, 0); + gspca_frame_add(gspca_dev, FIRST_PACKET, frame, data, 0); + } + + /* intermediate packet */ + gspca_frame_add(gspca_dev, INTER_PACKET, frame, data, len); +} + +static void ov519_pkt_scan(struct gspca_dev *gspca_dev, struct gspca_frame *frame, /* target */ __u8 *data, /* isoc packet */ int len) /* iso packet length */ @@ -1926,6 +2354,27 @@ static void sd_pkt_scan(struct gspca_dev *gspca_dev, data, len); } +static void sd_pkt_scan(struct gspca_dev *gspca_dev, + struct gspca_frame *frame, /* target */ + __u8 *data, /* isoc packet */ + int len) /* iso packet length */ +{ + struct sd *sd = (struct sd *) gspca_dev; + + switch (sd->bridge) { + case BRIDGE_OV511: + case BRIDGE_OV511PLUS: + break; + case 
BRIDGE_OV518: + case BRIDGE_OV518PLUS: + ov518_pkt_scan(gspca_dev, frame, data, len); + break; + case BRIDGE_OV519: + ov519_pkt_scan(gspca_dev, frame, data, len); + break; + } +} + /* -- management routines -- */ static void setbrightness(struct gspca_dev *gspca_dev) @@ -1970,6 +2419,7 @@ static void setcontrast(struct gspca_dev *gspca_dev) break; case SEN_OV6630: i2c_w_mask(sd, OV7610_REG_CNT, val >> 4, 0x0f); + break; case SEN_OV8610: { static const __u8 ctab[] = { 0x03, 0x09, 0x0b, 0x0f, 0x53, 0x6f, 0x35, 0x7f @@ -2136,19 +2586,21 @@ static const struct sd_desc sd_desc = { /* -- module initialisation -- */ static const __devinitdata struct usb_device_id device_table[] = { - {USB_DEVICE(0x041e, 0x4052)}, - {USB_DEVICE(0x041e, 0x405f)}, - {USB_DEVICE(0x041e, 0x4060)}, - {USB_DEVICE(0x041e, 0x4061)}, - {USB_DEVICE(0x041e, 0x4064)}, - {USB_DEVICE(0x041e, 0x4068)}, - {USB_DEVICE(0x045e, 0x028c)}, - {USB_DEVICE(0x054c, 0x0154)}, - {USB_DEVICE(0x054c, 0x0155)}, - {USB_DEVICE(0x05a9, 0x0519)}, - {USB_DEVICE(0x05a9, 0x0530)}, - {USB_DEVICE(0x05a9, 0x4519)}, - {USB_DEVICE(0x05a9, 0x8519)}, + {USB_DEVICE(0x041e, 0x4052), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x041e, 0x405f), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x041e, 0x4060), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x041e, 0x4061), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x041e, 0x4064), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x041e, 0x4068), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x045e, 0x028c), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x054c, 0x0154), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x054c, 0x0155), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x05a9, 0x0518), .driver_info = BRIDGE_OV518 }, + {USB_DEVICE(0x05a9, 0x0519), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x05a9, 0x0530), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x05a9, 0x4519), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x05a9, 0x8519), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x05a9, 0xa518), .driver_info = BRIDGE_OV518PLUS }, {} }; diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index ebb2ea6b499..f24eceecc5a 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -347,7 +347,8 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_MR97310A v4l2_fourcc('M', '3', '1', '0') /* compressed BGGR bayer */ #define V4L2_PIX_FMT_SQ905C v4l2_fourcc('9', '0', '5', 'C') /* compressed RGGB bayer */ #define V4L2_PIX_FMT_PJPG v4l2_fourcc('P', 'J', 'P', 'G') /* Pixart 73xx JPEG */ -#define V4L2_PIX_FMT_YVYU v4l2_fourcc('Y', 'V', 'Y', 'U') /* 16 YVU 4:2:2 */ +#define V4L2_PIX_FMT_YVYU v4l2_fourcc('Y', 'V', 'Y', 'U') /* 16 YVU 4:2:2 */ +#define V4L2_PIX_FMT_OV518 v4l2_fourcc('O', '5', '1', '8') /* ov518 JPEG */ /* * F O R M A T E N U M E R A T I O N -- cgit v1.2.3-70-g09d2 From ecfe0cfa3cae9a8402df12d81b159d851b61cf29 Mon Sep 17 00:00:00 2001 From: Uri Shkolnik Date: Thu, 12 Mar 2009 10:10:40 -0300 Subject: V4L/DVB (11239): sdio: add cards ids for sms (Siano Mobile Silicon) MDTV receivers sdio: add cards id for sms (Siano Mobile Silicon) MDTV receivers Add SDIO vendor ID, and multiple device IDs for various SMS-based MDTV SDIO adapters. 
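For illustration, a driver consumes these IDs through a standard SDIO match table; a minimal sketch, assuming the usual SDIO core macros (the smsmdtv_ids table name and the particular IDs matched are hypothetical choices, not part of this patch):

    #include <linux/module.h>
    #include <linux/mmc/sdio_func.h>
    #include <linux/mmc/sdio_ids.h>

    /* match the Siano MDTV functions by the vendor/device IDs added below */
    static const struct sdio_device_id smsmdtv_ids[] = {
            { SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_STELLAR) },
            { SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_NOVA_A0) },
            { SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_NOVA_B0) },
            { SDIO_DEVICE(SDIO_VENDOR_ID_SIANO, SDIO_DEVICE_ID_SIANO_VEGA_A0) },
            { /* end: all zeroes */ },
    };
    MODULE_DEVICE_TABLE(sdio, smsmdtv_ids);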
Signed-off-by: Uri Shkolnik Signed-off-by: Mauro Carvalho Chehab --- include/linux/mmc/sdio_ids.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index c7211ab6dd4..39751c8cde9 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -28,4 +28,12 @@ #define SDIO_DEVICE_ID_MARVELL_8688WLAN 0x9104 #define SDIO_DEVICE_ID_MARVELL_8688BT 0x9105 +#define SDIO_VENDOR_ID_SIANO 0x039a +#define SDIO_DEVICE_ID_SIANO_NOVA_B0 0x0201 +#define SDIO_DEVICE_ID_SIANO_NICE 0x0202 +#define SDIO_DEVICE_ID_SIANO_VEGA_A0 0x0300 +#define SDIO_DEVICE_ID_SIANO_VENICE 0x0301 +#define SDIO_DEVICE_ID_SIANO_NOVA_A0 0x1100 +#define SDIO_DEVICE_ID_SIANO_STELLAR 0x5347 + #endif -- cgit v1.2.3-70-g09d2 From 7d9a73f6dcf4390d256bf19330c81e91523a26d5 Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Wed, 17 Jun 2009 00:16:15 +0200 Subject: PCI PM: consistently use type bool for wake enable variable Other functions use type bool, so use that for pci_enable_wake as well. Signed-off-by: Frans Pop Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 2 +- include/linux/pci.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7b59fd7c957..ccc0a0ccbef 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1205,7 +1205,7 @@ void pci_pme_active(struct pci_dev *dev, bool enable) * Error code depending on the platform is returned if both the platform and * the native mechanism fail to enable the generation of wake-up events */ -int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable) +int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable) { int error = 0; bool pme_done = false; diff --git a/include/linux/pci.h b/include/linux/pci.h index 91b06be2f01..62e8452c2ec 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -723,7 +723,7 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); bool pci_pme_capable(struct pci_dev *dev, pci_power_t state); void pci_pme_active(struct pci_dev *dev, bool enable); -int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable); +int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable); int pci_wake_from_d3(struct pci_dev *dev, bool enable); pci_power_t pci_target_state(struct pci_dev *dev); int pci_prepare_to_sleep(struct pci_dev *dev); -- cgit v1.2.3-70-g09d2 From 08604bd9935dc98fb62ef61d5b7baa7ccc10f8c2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jun 2009 15:31:12 -0700 Subject: time: move PIT_TICK_RATE to linux/timex.h PIT_TICK_RATE is currently defined in four architectures, but in three different places. While linux/timex.h is not the perfect place for it, it is still a reasonable replacement for those drivers that traditionally use asm/timex.h to get CLOCK_TICK_RATE and expect it to be the PIT frequency. Note that for Alpha, the actual value changed from 1193182UL to 1193180UL. This is unlikely to make a difference, and probably can only improve accuracy. There was a discussion on the correct value of CLOCK_TICK_RATE a few years ago, after which every existing instance was getting changed to 1193182. According to the specification, it should be 1193181.818181... Signed-off-by: Arnd Bergmann Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Len Brown Cc: john stultz Cc: Dmitry Torokhov Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/8253pit.h | 7 ------- arch/alpha/kernel/sys_ruffian.c | 1 + arch/mips/include/asm/i8253.h | 2 -- arch/powerpc/include/asm/8253pit.h | 7 ------- arch/x86/include/asm/timex.h | 4 +--- arch/x86/kernel/i8253.c | 1 + arch/x86/kernel/tsc.c | 1 + drivers/clocksource/acpi_pm.c | 1 + drivers/input/joystick/analog.c | 2 +- drivers/input/misc/pcspkr.c | 1 + include/linux/timex.h | 3 +++ sound/drivers/pcsp/pcsp.h | 1 + sound/oss/pas2_pcm.c | 2 +- 13 files changed, 12 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/8253pit.h b/arch/alpha/include/asm/8253pit.h index fef5c1450e4..a71c9c1455a 100644 --- a/arch/alpha/include/asm/8253pit.h +++ b/arch/alpha/include/asm/8253pit.h @@ -1,10 +1,3 @@ /* * 8253/8254 Programmable Interval Timer */ - -#ifndef _8253PIT_H -#define _8253PIT_H - -#define PIT_TICK_RATE 1193180UL - -#endif diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c index f15a329b601..d9f9cfeb993 100644 --- a/arch/alpha/kernel/sys_ruffian.c +++ b/arch/alpha/kernel/sys_ruffian.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h index 5dabc870b32..032ca73f181 100644 --- a/arch/mips/include/asm/i8253.h +++ b/arch/mips/include/asm/i8253.h @@ -12,8 +12,6 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 -#define PIT_TICK_RATE 1193182UL - extern spinlock_t i8253_lock; extern void setup_pit_timer(void); diff --git a/arch/powerpc/include/asm/8253pit.h b/arch/powerpc/include/asm/8253pit.h index b70d6e53b30..a71c9c1455a 100644 --- a/arch/powerpc/include/asm/8253pit.h +++ b/arch/powerpc/include/asm/8253pit.h @@ -1,10 +1,3 @@ -#ifndef _ASM_POWERPC_8253PIT_H -#define _ASM_POWERPC_8253PIT_H - /* * 8253/8254 Programmable Interval Timer */ - -#define PIT_TICK_RATE 1193182UL - -#endif /* _ASM_POWERPC_8253PIT_H */ diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h index b5c9d45c981..1375cfc9396 100644 --- a/arch/x86/include/asm/timex.h +++ b/arch/x86/include/asm/timex.h @@ -4,9 +4,7 @@ #include #include -/* The PIT ticks at this frequency (in HZ): */ -#define PIT_TICK_RATE 1193182 - +/* Assume we use the PIT time source for the clock tick */ #define CLOCK_TICK_RATE PIT_TICK_RATE #define ARCH_HAS_READ_CURRENT_TIMER diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c2e0bb0890d..5cf36c053ac 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3e1c057e98f..ae3180c506a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index 40bd8c61c7d..72a633a6ec9 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 356b3a25efa..1c0b529c06a 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #define DRIVER_DESC "Analog joystick and gamepad driver" diff --git 
a/drivers/input/misc/pcspkr.c b/drivers/input/misc/pcspkr.c index d6a30cee7bc..6d67af5387a 100644 --- a/drivers/input/misc/pcspkr.c +++ b/drivers/input/misc/pcspkr.c @@ -17,6 +17,7 @@ #include #include #include +#include #include MODULE_AUTHOR("Vojtech Pavlik "); diff --git a/include/linux/timex.h b/include/linux/timex.h index 9910e3bd5b3..e6967d10d9e 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -280,6 +280,9 @@ extern int do_adjtimex(struct timex *); int read_current_timer(unsigned long *timer_val); +/* The clock frequency of the i8253/i8254 PIT */ +#define PIT_TICK_RATE 1193182ul + #endif /* KERNEL */ #endif /* LINUX_TIMEX_H */ diff --git a/sound/drivers/pcsp/pcsp.h b/sound/drivers/pcsp/pcsp.h index cdef2664218..174dd2ff0f2 100644 --- a/sound/drivers/pcsp/pcsp.h +++ b/sound/drivers/pcsp/pcsp.h @@ -10,6 +10,7 @@ #define __PCSP_H__ #include +#include #if defined(CONFIG_MIPS) || defined(CONFIG_X86) /* Use the global PIT lock ! */ #include diff --git a/sound/oss/pas2_pcm.c b/sound/oss/pas2_pcm.c index 36c3ea62086..8f7d175767a 100644 --- a/sound/oss/pas2_pcm.c +++ b/sound/oss/pas2_pcm.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include "sound_config.h" #include "pas2.h" -- cgit v1.2.3-70-g09d2 From 3b0fde0fac19c180317eb0601b3504083f4b9bf5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 16 Jun 2009 15:31:16 -0700 Subject: firmware_map: fix hang with x86/32bit Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13484 Peer reported: | The bug is introduced from kernel 2.6.27, if E820 table reserve the memory | above 4G in 32bit OS(BIOS-e820: 00000000fff80000 - 0000000120000000 | (reserved)), system will report Int 6 error and hang up. The bug is caused by | the following code in drivers/firmware/memmap.c, the resource_size_t is 32bit | variable in 32bit OS, the BUG_ON() will be invoked to result in the Int 6 | error. I try the latest 32bit Ubuntu and Fedora distributions, all hit this | bug. |====== |static int firmware_map_add_entry(resource_size_t start, resource_size_t end, | const char *type, | struct firmware_map_entry *entry) and it only happen with CONFIG_PHYS_ADDR_T_64BIT is not set. it turns out we need to pass u64 instead of resource_size_t for that. [akpm@linux-foundation.org: add comment] Reported-and-tested-by: Peer Chen Signed-off-by: Yinghai Lu Cc: Ingo Molnar Acked-by: H. Peter Anvin Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/memmap.c | 16 +++++++++------- include/linux/firmware-map.h | 12 ++++-------- 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 05aa2d406ac..d5ea8a68d33 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -31,8 +31,12 @@ * information is necessary as for the resource tree. */ struct firmware_map_entry { - resource_size_t start; /* start of the memory range */ - resource_size_t end; /* end of the memory range (incl.) */ + /* + * start and end must be u64 rather than resource_size_t, because e820 + * resources can lie at addresses above 4G. + */ + u64 start; /* start of the memory range */ + u64 end; /* end of the memory range (incl.) 
*/ const char *type; /* type of the memory range */ struct list_head list; /* entry for the linked list */ struct kobject kobj; /* kobject for each entry */ @@ -101,7 +105,7 @@ static LIST_HEAD(map_entries); * Common implementation of firmware_map_add() and firmware_map_add_early() * which expects a pre-allocated struct firmware_map_entry. **/ -static int firmware_map_add_entry(resource_size_t start, resource_size_t end, +static int firmware_map_add_entry(u64 start, u64 end, const char *type, struct firmware_map_entry *entry) { @@ -132,8 +136,7 @@ static int firmware_map_add_entry(resource_size_t start, resource_size_t end, * * Returns 0 on success, or -ENOMEM if no memory could be allocated. **/ -int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type) +int firmware_map_add(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; @@ -157,8 +160,7 @@ int firmware_map_add(resource_size_t start, resource_size_t end, * * Returns 0 on success, or -ENOMEM if no memory could be allocated. **/ -int __init firmware_map_add_early(resource_size_t start, resource_size_t end, - const char *type) +int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h index cca686b3912..875451f1373 100644 --- a/include/linux/firmware-map.h +++ b/include/linux/firmware-map.h @@ -24,21 +24,17 @@ */ #ifdef CONFIG_FIRMWARE_MEMMAP -int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type); -int firmware_map_add_early(resource_size_t start, resource_size_t end, - const char *type); +int firmware_map_add(u64 start, u64 end, const char *type); +int firmware_map_add_early(u64 start, u64 end, const char *type); #else /* CONFIG_FIRMWARE_MEMMAP */ -static inline int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type) +static inline int firmware_map_add(u64 start, u64 end, const char *type) { return 0; } -static inline int firmware_map_add_early(resource_size_t start, - resource_size_t end, const char *type) +static inline int firmware_map_add_early(u64 start, u64 end, const char *type) { return 0; } -- cgit v1.2.3-70-g09d2 From bb1f17b0372de93758653ca3454bc0df18dc2e5c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jun 2009 15:31:18 -0700 Subject: mm: consolidate init_mm definition * create mm/init-mm.c, move init_mm there * remove INIT_MM, initialize init_mm with C99 initializer * unexport init_mm on all arches: init_mm is already unexported on x86. One strange place is some OMAP driver (drivers/video/omap/) which won't build modular, but it already wants a get_vm_area() export. Somebody should look there.
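The shape of that change, in miniature (an illustrative sketch with a made-up struct foo, not the actual mm code; the real INIT_MM removal and the new mm/init-mm.c appear in the hunks below):

    struct foo {
            atomic_t refs;
            spinlock_t lock;
    };

    /* before: a macro that every arch expanded at its own definition site */
    #define INIT_FOO(name) {                                \
            .refs = ATOMIC_INIT(1),                         \
            .lock = __SPIN_LOCK_UNLOCKED(name.lock),        \
    }
    struct foo init_foo = INIT_FOO(init_foo);

    /* after: one shared definition using a C99 designated initializer */
    struct foo init_foo = {
            .refs = ATOMIC_INIT(1),
            .lock = __SPIN_LOCK_UNLOCKED(init_foo.lock),
    };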
[akpm@linux-foundation.org: add missing #includes] Signed-off-by: Alexey Dobriyan Cc: Mike Frysinger Cc: Americo Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/init_task.c | 3 --- arch/arm/kernel/init_task.c | 4 ---- arch/avr32/kernel/init_task.c | 4 ---- arch/blackfin/kernel/init_task.c | 4 ---- arch/cris/kernel/process.c | 4 ---- arch/frv/kernel/init_task.c | 4 ---- arch/h8300/kernel/init_task.c | 4 ---- arch/ia64/kernel/init_task.c | 4 ---- arch/m32r/kernel/init_task.c | 4 ---- arch/m68k/kernel/process.c | 4 ---- arch/m68knommu/kernel/init_task.c | 4 ---- arch/mips/kernel/init_task.c | 4 ---- arch/mn10300/kernel/init_task.c | 3 --- arch/parisc/kernel/init_task.c | 4 ---- arch/powerpc/kernel/init_task.c | 4 ---- arch/s390/kernel/init_task.c | 4 ---- arch/sh/kernel/init_task.c | 3 --- arch/sparc/kernel/init_task.c | 3 --- arch/um/kernel/init_task.c | 3 --- arch/x86/kernel/init_task.c | 1 - arch/xtensa/kernel/init_task.c | 4 ---- include/linux/init_task.h | 12 ------------ mm/Makefile | 1 + mm/init-mm.c | 20 ++++++++++++++++++++ 24 files changed, 21 insertions(+), 88 deletions(-) create mode 100644 mm/init-mm.c (limited to 'include/linux') diff --git a/arch/alpha/kernel/init_task.c b/arch/alpha/kernel/init_task.c index c2938e574a4..19b86328ffd 100644 --- a/arch/alpha/kernel/init_task.c +++ b/arch/alpha/kernel/init_task.c @@ -10,10 +10,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_mm); EXPORT_SYMBOL(init_task); union thread_union init_thread_union diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c index e859af34946..3f470866bb8 100644 --- a/arch/arm/kernel/init_task.c +++ b/arch/arm/kernel/init_task.c @@ -14,10 +14,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c index 993d56ee3cf..57ec9f2dcd9 100644 --- a/arch/avr32/kernel/init_task.c +++ b/arch/avr32/kernel/init_task.c @@ -15,10 +15,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. Must be aligned on an 8192-byte boundary. */ diff --git a/arch/blackfin/kernel/init_task.c b/arch/blackfin/kernel/init_task.c index 2c228c02097..c26c34de9f3 100644 --- a/arch/blackfin/kernel/init_task.c +++ b/arch/blackfin/kernel/init_task.c @@ -35,10 +35,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); - -struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. 
* diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 4df0b320d52..51dcd04d277 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -38,10 +38,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c index 29429a8b7f6..1d3df1d9495 100644 --- a/arch/frv/kernel/init_task.c +++ b/arch/frv/kernel/init_task.c @@ -12,10 +12,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c index cb5dc552da9..089c65ed6eb 100644 --- a/arch/h8300/kernel/init_task.c +++ b/arch/h8300/kernel/init_task.c @@ -14,10 +14,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index 5b0e830c6f3..c475fc281be 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -19,10 +19,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c index 016885c6f26..fce57e5d3f9 100644 --- a/arch/m32r/kernel/init_task.c +++ b/arch/m32r/kernel/init_task.c @@ -13,10 +13,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index ec37fb56c12..72bad65dba3 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -42,10 +42,6 @@ */ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - union thread_union init_thread_union __attribute__((section(".data.init_task"), aligned(THREAD_SIZE))) = { INIT_THREAD_INFO(init_task) }; diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c index fe282de1d59..45e97a207fe 100644 --- a/arch/m68knommu/kernel/init_task.c +++ b/arch/m68knommu/kernel/init_task.c @@ -14,10 +14,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. 
* diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c index 149cd914526..5b457a40c78 100644 --- a/arch/mips/kernel/init_task.c +++ b/arch/mips/kernel/init_task.c @@ -11,10 +11,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c index 5ac3566f8c9..80d423b80af 100644 --- a/arch/mn10300/kernel/init_task.c +++ b/arch/mn10300/kernel/init_task.c @@ -20,9 +20,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c index 1e25a45d64c..82974b20fc1 100644 --- a/arch/parisc/kernel/init_task.c +++ b/arch/parisc/kernel/init_task.c @@ -36,10 +36,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c index 688b329800b..ffc4253fef5 100644 --- a/arch/powerpc/kernel/init_task.c +++ b/arch/powerpc/kernel/init_task.c @@ -9,10 +9,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c index 7db95c0b869..fe787f9e5f3 100644 --- a/arch/s390/kernel/init_task.c +++ b/arch/s390/kernel/init_task.c @@ -18,10 +18,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c index 80c35ff71d5..1719957c0a6 100644 --- a/arch/sh/kernel/init_task.c +++ b/arch/sh/kernel/init_task.c @@ -10,9 +10,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct pt_regs fake_swapper_regs; -struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. 
* diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c index f28cb8278e9..28125c5b3d3 100644 --- a/arch/sparc/kernel/init_task.c +++ b/arch/sparc/kernel/init_task.c @@ -10,10 +10,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_mm); EXPORT_SYMBOL(init_task); /* .text section in head.S is aligned at 8k boundary and this gets linked diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c index 806d381947b..b25121b537d 100644 --- a/arch/um/kernel/init_task.c +++ b/arch/um/kernel/init_task.c @@ -10,11 +10,8 @@ #include "linux/mqueue.h" #include "asm/uaccess.h" -struct mm_struct init_mm = INIT_MM(init_mm); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index df3bf269bea..270ff83efc1 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -12,7 +12,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); /* * Initial thread structure. diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c index e07f5c9fcd3..c4302f0e4ba 100644 --- a/arch/xtensa/kernel/init_task.c +++ b/arch/xtensa/kernel/init_task.c @@ -23,10 +23,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = { INIT_THREAD_INFO(init_task) }; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 28b1f30601b..5368fbdc780 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,18 +15,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#define INIT_MM(name) \ -{ \ - .mm_rb = RB_ROOT, \ - .pgd = swapper_pg_dir, \ - .mm_users = ATOMIC_INIT(2), \ - .mm_count = ATOMIC_INIT(1), \ - .mmap_sem = __RWSEM_INITIALIZER(name.mmap_sem), \ - .page_table_lock = __SPIN_LOCK_UNLOCKED(name.page_table_lock), \ - .mmlist = LIST_HEAD_INIT(name.mmlist), \ - .cpu_vm_mask = CPU_MASK_ALL, \ -} - #define INIT_SIGNALS(sig) { \ .count = ATOMIC_INIT(1), \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ diff --git a/mm/Makefile b/mm/Makefile index e89acb090b4..cf76be785ad 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) +obj-y += init-mm.o obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 00000000000..57aba0da966 --- /dev/null +++ b/mm/init-mm.c @@ -0,0 +1,20 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +struct mm_struct init_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + 
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; -- cgit v1.2.3-70-g09d2 From 1ebf26a9b338534def47f307c6c8694b6dfc0a79 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:19 -0700 Subject: readahead: make mmap_miss an unsigned int This makes the performance impact of a possible mmap_miss wrap-around temporary and tolerable: i.e. at most MMAP_LOTSAMISS=100 extra read-arounds. Otherwise, if mmap_miss ever wraps around to a negative value, it takes INT_MAX cache misses to bring it back to its normal state, and during that time mmap read-around will be _enabled_ for whatever wild random workload happens to be running. That is an almost permanent performance impact. Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ede84fa7da5..8146e0264ef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -879,7 +879,7 @@ struct file_ra_state { there are only # of pages ahead */ unsigned int ra_pages; /* Maximum readahead window */ - int mmap_miss; /* Cache miss stat for mmap accesses */ + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ loff_t prev_pos; /* Cache last read() position */ }; -- cgit v1.2.3-70-g09d2 From d30a11004e3411909f2448546f036a011978062e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:30 -0700 Subject: readahead: record mmap read-around states in file_ra_state Mmap read-around now shares the same code style and data structure with the readahead code. This also removes do_page_cache_readahead(). Its last user, mmap read-around, has been changed to call ra_submit(). The no-readahead-if-congested logic is dropped along the way. Users are pretty sensitive to slow loading of executables, so it is unfavorable to disable mmap read-around on a congested queue.
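In outline, the converted path centers a window on the faulting offset, records it in the shared file_ra_state and submits it, instead of calling do_page_cache_readahead() directly; a simplified sketch of the filemap.c hunk below:

    /* mmap read-around expressed through file_ra_state (sketch) */
    ra->start = max_t(long, 0, offset - ra_pages / 2);
    ra->size = ra_pages;
    ra->async_size = 0;             /* read-around is fully synchronous */
    ra_submit(ra, mapping, file);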
[akpm@linux-foundation.org: coding-style fixes] Cc: Nick Piggin Signed-off-by: Fengguang Wu Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++-- mm/filemap.c | 12 +++++++----- mm/readahead.c | 23 ++--------------------- 3 files changed, 12 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ad613ed66ab..33da7f53884 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1178,8 +1178,6 @@ void task_dirty_inc(struct task_struct *tsk); #define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); int force_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read); @@ -1197,6 +1195,9 @@ void page_cache_async_readahead(struct address_space *mapping, unsigned long size); unsigned long max_sane_readahead(unsigned long nr); +unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, + struct file *filp); /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/mm/filemap.c b/mm/filemap.c index 5c0c6518f34..734891d0663 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1488,13 +1488,15 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > MMAP_LOTSAMISS) return; + /* + * mmap read-around + */ ra_pages = max_sane_readahead(ra->ra_pages); if (ra_pages) { - pgoff_t start = 0; - - if (offset > ra_pages / 2) - start = offset - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); } } diff --git a/mm/readahead.c b/mm/readahead.c index d7c6e143a12..a7f01fcce9e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -133,15 +133,12 @@ out: } /* - * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * * Returns the number of pages requested, or the maximum amount of I/O allowed. - * - * do_page_cache_readahead() returns -1 if it encountered request queue - * congestion. */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, @@ -231,22 +228,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return ret; } -/* - * This version skips the IO if the queue is read-congested, and will tell the - * block layer to abandon the readahead if request allocation would block. - * - * force_page_cache_readahead() will ignore queue congestion and will block on - * request queues. - */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) -{ - if (bdi_read_congested(mapping->backing_dev_info)) - return -1; - - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); -} - /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. 
@@ -260,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) /* * Submit IO for the read-ahead request in file_ra_state. */ -static unsigned long ra_submit(struct file_ra_state *ra, +unsigned long ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { int actual; -- cgit v1.2.3-70-g09d2 From dc566127dd161b6c997466a2349ac179527ea89b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:32 -0700 Subject: radix-tree: add radix_tree_prev_hole() The counterpart of radix_tree_next_hole(). To be used by context readahead. Signed-off-by: Wu Fengguang Cc: Vladislav Bolkhovitin Cc: Jens Axboe Cc: Jeff Moyer Cc: Nick Piggin Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 2 ++ lib/radix-tree.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 355f6e80db0..c5da7491809 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -167,6 +167,8 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items); unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); +unsigned long radix_tree_prev_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 4bb42a0344e..5301a52cdb4 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -666,6 +666,43 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_next_hole); +/** + * radix_tree_prev_hole - find the prev hole (not-present entry) + * @root: tree root + * @index: index key + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] + * for the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= max_scan' + * will be true). In rare cases of wrap-around, LONG_MAX will be returned. + * + * radix_tree_prev_hole may be called under rcu_read_lock. However, like + * radix_tree_gang_lookup, this will not atomically search a snapshot of + * the tree at a single point in time. For example, if a hole is created + * at index 10, then subsequently a hole is created at index 5, + * radix_tree_prev_hole covering both indexes may return 5 if called under + * rcu_read_lock. + */ +unsigned long radix_tree_prev_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(root, index)) + break; + index--; + if (index == LONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(radix_tree_prev_hole); + static unsigned int __lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) -- cgit v1.2.3-70-g09d2 From d2bf6be8ab63aa84e6149aac934649aadf3828b1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:31:39 -0700 Subject: mm: clean up get_user_pages_fast() documentation Move more documentation for get_user_pages_fast into the new kerneldoc comment. Add some comments for get_user_pages as well.
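For context, the calling convention those comments document looks roughly like this (an illustrative sketch with a hypothetical page count, error handling trimmed):

    struct page *pages[16];
    int i, got;

    /* pin up to 16 user pages for writing; no mmap_sem taken by the caller */
    got = get_user_pages_fast(start, 16, 1, pages);
    if (got < 0)
            return got;
    /* ... access the pinned pages, e.g. via kmap() or DMA ... */
    for (i = 0; i < got; i++) {
            set_page_dirty_lock(pages[i]);  /* the pages were written to */
            put_page(pages[i]);
    }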
Also, move get_user_pages_fast declaration up to get_user_pages. It wasn't there initially because it was once a static inline function. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Nick Piggin Cc: Andy Grover Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 20 +++++--------------- mm/memory.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/util.c | 16 ++++++++++++---- 3 files changed, 67 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 33da7f53884..a880161a385 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -824,8 +824,11 @@ static inline int handle_mm_fault(struct mm_struct *mm, extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas); +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); @@ -849,19 +852,6 @@ extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); -/* - * get_user_pages_fast provides equivalent functionality to get_user_pages, - * operating on current and current->mm (force=0 and doesn't return any vmas). - * - * get_user_pages_fast may take mmap_sem and page tables, so no assumptions - * can be made about locking. get_user_pages_fast is to be implemented in a - * way that is advantageous (vs get_user_pages()) when the user memory area is - * already faulted in and present in ptes. However if the pages have to be - * faulted in, it may turn out to be slightly slower). - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); - /* * A callback you can register to apply pressure to ageable caches. * diff --git a/mm/memory.c b/mm/memory.c index 4126dd16778..891bad0613f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i; } +/** + * get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @len: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. This will result in the page being COWed even + * in MAP_SHARED mappings. You do not want this. + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If len is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. 
Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) diff --git a/mm/util.c b/mm/util.c index abc65aa7cdf..d5d2213728c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -233,13 +233,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. + * + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm, with force=0 and vma=NULL. However + * unlike get_user_pages, it must be called without mmap_sem held. + * + * get_user_pages_fast may take mmap_sem and page table locks, so no + * assumptions can be made about lack of locking. get_user_pages_fast is to be + * implemented in a way that is advantageous (vs get_user_pages()) when the + * user memory area is already faulted in and present in ptes. However if the + * pages have to be faulted in, it may turn out to be slightly slower so + * callers need to carefully consider what to use. On many architectures, + * get_user_pages_fast simply falls back to get_user_pages. 
*/ int __attribute__((weak)) get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) -- cgit v1.2.3-70-g09d2 From 58568d2a8215cb6f55caf2332017d7bdff954e1c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Jun 2009 15:31:49 -0700 Subject: cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab objects on an unallowed node when memory spread is set, by updating tasks' mems_allowed after their cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the memory policy code, because the memory policy was originally applied in the process's own context. After applying this patch, one task directly manipulates another's mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use a lock to protect them, because adding a lock may lead to a performance regression. But if we don't add a lock, the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES' flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 13 ++-- include/linux/sched.h | 8 +-- init/main.c | 6 +- kernel/cpuset.c | 184 +++++++++++++------------------------------------ kernel/kthread.c | 2 + mm/mempolicy.c | 143 +++++++++++++++++++++++++++----------- mm/page_alloc.c | 5 +- 7 files changed, 170 insertions(+), 191 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 05ea1dd7d68..a5740fc4d04 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -18,7 +18,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system?
*/ -extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); @@ -27,7 +26,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p, extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); -void cpuset_update_task_memory_state(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); @@ -92,9 +90,13 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); +static inline void set_mems_allowed(nodemask_t nodemask) +{ + current->mems_allowed = nodemask; +} + #else /* !CONFIG_CPUSETS */ -static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} @@ -116,7 +118,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} -static inline void cpuset_update_task_memory_state(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { @@ -188,6 +189,10 @@ static inline void cpuset_print_task_mems_allowed(struct task_struct *p) { } +static inline void set_mems_allowed(nodemask_t nodemask) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c900aa53007..1048bf50540 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1318,7 +1318,8 @@ struct task_struct { /* Thread group tracking */ u32 parent_exec_id; u32 self_exec_id; -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, + * mempolicy */ spinlock_t alloc_lock; #ifdef CONFIG_GENERIC_HARDIRQS @@ -1386,8 +1387,7 @@ struct task_struct { cputime_t acct_timexpd; /* stime + utime since last update */ #endif #ifdef CONFIG_CPUSETS - nodemask_t mems_allowed; - int cpuset_mems_generation; + nodemask_t mems_allowed; /* Protected by alloc_lock */ int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS @@ -1410,7 +1410,7 @@ struct task_struct { struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA - struct mempolicy *mempolicy; + struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; #endif atomic_t fs_excl; /* holding fs exclusive resources */ diff --git a/init/main.c b/init/main.c index f6204f712e7..5e0d3f047ea 100644 --- a/init/main.c +++ b/init/main.c @@ -670,7 +670,6 @@ asmlinkage void __init start_kernel(void) initrd_start = 0; } #endif - cpuset_init_early(); page_cgroup_init(); enable_debug_pagealloc(); cpu_hotplug_init(); @@ -867,6 +866,11 @@ static noinline int init_post(void) static int __init kernel_init(void * unused) { lock_kernel(); + + /* + * init can allocate pages on any node + */ + set_mems_allowed(node_possible_map); /* * init can run on any cpu. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index af5a83d5218..7e75a41bd50 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -97,12 +97,6 @@ struct cpuset { struct cpuset *parent; /* my parent */ - /* - * Copy of global cpuset_mems_generation as of the most - * recent time this cpuset changed its mems_allowed. 
- */ - int mems_generation; - struct fmeter fmeter; /* memory_pressure filter */ /* partition number for rebuild_sched_domains() */ @@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } -/* - * Increment this integer everytime any cpuset changes its - * mems_allowed value. Users of cpusets can track this generation - * number, and avoid having to lock and reload mems_allowed unless - * the cpuset they're using changes generation. - * - * A single, global generation is needed because cpuset_attach_task() could - * reattach a task to a different cpuset, which must not have its - * generation numbers aliased with those of that tasks previous cpuset. - * - * Generations are needed for mems_allowed because one task cannot - * modify another's memory placement. So we must enable every task, - * on every visit to __alloc_pages(), to efficiently check whether - * its current->cpuset->mems_allowed has changed, requiring an update - * of its current->mems_allowed. - * - * Since writes to cpuset_mems_generation are guarded by the cgroup lock - * there is no need to mark it atomic. - */ -static int cpuset_mems_generation; - static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), }; @@ -228,8 +201,9 @@ static struct cpuset top_cpuset = { * If a task is only holding callback_mutex, then it has read-only * access to cpusets. * - * The task_struct fields mems_allowed and mems_generation may only - * be accessed in the context of that task, so require no locks. + * Now, the task_struct fields mems_allowed and mempolicy may be changed + * by other task, we use alloc_lock in the task_struct fields to protect + * them. * * The cpuset_common_file_read() handlers only hold callback_mutex across * small pieces of code, such as when reading out possibly multi-word @@ -349,69 +323,6 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, tsk->flags &= ~PF_SPREAD_SLAB; } -/** - * cpuset_update_task_memory_state - update task memory placement - * - * If the current tasks cpusets mems_allowed changed behind our - * backs, update current->mems_allowed, mems_generation and task NUMA - * mempolicy to the new value. - * - * Task mempolicy is updated by rebinding it relative to the - * current->cpuset if a task has its memory placement changed. - * Do not call this routine if in_interrupt(). - * - * Call without callback_mutex or task_lock() held. May be - * called with or without cgroup_mutex held. Thanks in part to - * 'the_top_cpuset_hack', the task's cpuset pointer will never - * be NULL. This routine also might acquire callback_mutex during - * call. - * - * Reading current->cpuset->mems_generation doesn't need task_lock - * to guard the current->cpuset derefence, because it is guarded - * from concurrent freeing of current->cpuset using RCU. - * - * The rcu_dereference() is technically probably not needed, - * as I don't actually mind if I see a new cpuset pointer but - * an old value of mems_generation. However this really only - * matters on alpha systems using cpusets heavily. If I dropped - * that rcu_dereference(), it would save them a memory barrier. - * For all other arch's, rcu_dereference is a no-op anyway, and for - * alpha systems not using cpusets, another planned optimization, - * avoiding the rcu critical section for tasks in the root cpuset - * which is statically allocated, so can't vanish, will make this - * irrelevant. 
Better to use RCU as intended, than to engage in - * some cute trick to save a memory barrier that is impossible to - * test, for alpha systems using cpusets heavily, which might not - * even exist. - * - * This routine is needed to update the per-task mems_allowed data, - * within the tasks context, when it is trying to allocate memory - * (in various mm/mempolicy.c routines) and notices that some other - * task has been modifying its cpuset. - */ - -void cpuset_update_task_memory_state(void) -{ - int my_cpusets_mem_gen; - struct task_struct *tsk = current; - struct cpuset *cs; - - rcu_read_lock(); - my_cpusets_mem_gen = task_cs(tsk)->mems_generation; - rcu_read_unlock(); - - if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { - mutex_lock(&callback_mutex); - task_lock(tsk); - cs = task_cs(tsk); /* Maybe changed when task not locked */ - guarantee_online_mems(cs, &tsk->mems_allowed); - tsk->cpuset_mems_generation = cs->mems_generation; - task_unlock(tsk); - mutex_unlock(&callback_mutex); - mpol_rebind_task(tsk, &tsk->mems_allowed); - } -} - /* * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? * @@ -1017,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that * migrating memory region. - * - * We call cpuset_update_task_memory_state() before hacking - * our tasks mems_allowed, so that we are assured of being in - * sync with our tasks cpuset, and in particular, callbacks to - * cpuset_update_task_memory_state() from nested page allocations - * won't see any mismatch of our cpuset and task mems_generation - * values, so won't overwrite our hacked tasks mems_allowed - * nodemask. */ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, @@ -1032,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, { struct task_struct *tsk = current; - cpuset_update_task_memory_state(); - - mutex_lock(&callback_mutex); tsk->mems_allowed = *to; - mutex_unlock(&callback_mutex); do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); - mutex_lock(&callback_mutex); guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); - mutex_unlock(&callback_mutex); } /* - * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new - * nodes if memory_migrate flag is set. Called with cgroup_mutex held. + * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy + * @tsk: the task to change + * @newmems: new nodes that the task will be set + * + * In order to avoid seeing no nodes if the old and new nodes are disjoint, + * we structure updates as setting all new allowed nodes, then clearing newly + * disallowed ones. + * + * Called with task's alloc_lock held + */ +static void cpuset_change_task_nodemask(struct task_struct *tsk, + nodemask_t *newmems) +{ + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, &tsk->mems_allowed); + mpol_rebind_task(tsk, newmems); + tsk->mems_allowed = *newmems; +} + +/* + * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy + * of it to cpuset's new mems_allowed, and migrate pages to new nodes if + * memory_migrate flag is set. Called with cgroup_mutex held. 
*/ static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) @@ -1056,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; + nodemask_t newmems; + + cs = cgroup_cs(scan->cg); + guarantee_online_mems(cs, &newmems); + + task_lock(p); + cpuset_change_task_nodemask(p, &newmems); + task_unlock(p); mm = get_task_mm(p); if (!mm) return; - cs = cgroup_cs(scan->cg); migrate = is_memory_migrate(cs); mpol_rebind_mm(mm, &cs->mems_allowed); @@ -1114,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, /* * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the - * cpusets mems_allowed and mems_generation, and for each - * task in the cpuset, rebind any vma mempolicies and if - * the cpuset is marked 'memory_migrate', migrate the tasks - * pages to the new memory. + * cpusets mems_allowed, and for each task in the cpuset, + * update mems_allowed and rebind task's mempolicy and any vma + * mempolicies and if the cpuset is marked 'memory_migrate', + * migrate the tasks pages to the new memory. * * Call with cgroup_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, @@ -1170,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, mutex_lock(&callback_mutex); cs->mems_allowed = trialcs->mems_allowed; - cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); update_tasks_nodemask(cs, &oldmem, &heap); @@ -1434,15 +1358,18 @@ static void cpuset_attach(struct cgroup_subsys *ss, if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); + to = node_possible_map; } else { - mutex_lock(&callback_mutex); guarantee_online_cpus(cs, cpus_attach); - mutex_unlock(&callback_mutex); + guarantee_online_mems(cs, &to); } err = set_cpus_allowed_ptr(tsk, cpus_attach); if (err) return; + task_lock(tsk); + cpuset_change_task_nodemask(tsk, &to); + task_unlock(tsk); cpuset_update_task_spread_flag(cs, tsk); from = oldcs->mems_allowed; @@ -1848,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create( struct cpuset *parent; if (!cont->parent) { - /* This is early initialization for the top cgroup */ - top_cpuset.mems_generation = cpuset_mems_generation++; return &top_cpuset.css; } parent = cgroup_cs(cont->parent); @@ -1861,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create( return ERR_PTR(-ENOMEM); } - cpuset_update_task_memory_state(); cs->flags = 0; if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); @@ -1870,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create( set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); - cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; @@ -1889,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); - cpuset_update_task_memory_state(); - if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -1911,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = { .early_init = 1, }; -/* - * cpuset_init_early - just enough so that the calls to - * cpuset_update_task_memory_state() in early init code - * are harmless. 
- */ - -int __init cpuset_init_early(void) -{ - alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT); - - top_cpuset.mems_generation = cpuset_mems_generation++; - return 0; -} - - /** * cpuset_init - initialize cpusets at system boot * @@ -1936,11 +1842,13 @@ int __init cpuset_init(void) { int err = 0; + if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) + BUG(); + cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); - top_cpuset.mems_generation = cpuset_mems_generation++; set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; diff --git a/kernel/kthread.c b/kernel/kthread.c index 41c88fe4050..7fa44133352 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +237,7 @@ int kthreadd(void *unused) ignore_signals(tsk); set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_mems_allowed(node_possible_map); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6fdc04..46bdf9ddf2b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) return 0; } -/* Create a new policy */ +/* + * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if + * any, for the new policy. mpol_new() has already validated the nodes + * parameter with respect to the policy mode and flags. But, we need to + * handle an empty nodemask with MPOL_PREFERRED here. + * + * Must be called holding task's alloc_lock to protect task's mems_allowed + * and mempolicy. May also be called holding the mmap_semaphore for write. + */ +static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +{ + nodemask_t cpuset_context_nmask; + int ret; + + /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ + if (pol == NULL) + return 0; + + VM_BUG_ON(!nodes); + if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) + nodes = NULL; /* explicit local allocation */ + else { + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&cpuset_context_nmask, nodes, + &cpuset_current_mems_allowed); + else + nodes_and(cpuset_context_nmask, *nodes, + cpuset_current_mems_allowed); + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; + else + pol->w.cpuset_mems_allowed = + cpuset_current_mems_allowed; + } + + ret = mpol_ops[pol->mode].create(pol, + nodes ? &cpuset_context_nmask : NULL); + return ret; +} + +/* + * This function just creates a new policy, does some check and simple + * initialization. You must invoke mpol_set_nodemask() to set nodes. + */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; - nodemask_t cpuset_context_nmask; - int ret; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? 
nodes_addr(*nodes)[0] : -1); @@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); - nodes = NULL; /* flag local alloc */ } } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy->mode = mode; policy->flags = flags; - if (nodes) { - /* - * cpuset related setup doesn't apply to local allocation - */ - cpuset_update_task_memory_state(); - if (flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); - else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); - if (mpol_store_user_nodemask(policy)) - policy->w.user_nodemask = *nodes; - else - policy->w.cpuset_mems_allowed = - cpuset_mems_allowed(current); - } - - ret = mpol_ops[mode].create(policy, - nodes ? &cpuset_context_nmask : NULL); - if (ret < 0) { - kmem_cache_free(policy_cache, policy); - return ERR_PTR(ret); - } return policy; } @@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. + * + * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) @@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void) static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { - struct mempolicy *new; + struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + int ret; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) @@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, */ if (mm) down_write(&mm->mmap_sem); - mpol_put(current->mempolicy); + task_lock(current); + ret = mpol_set_nodemask(new, nodes); + if (ret) { + task_unlock(current); + if (mm) + up_write(&mm->mmap_sem); + mpol_put(new); + return ret; + } + old = current->mempolicy; current->mempolicy = new; mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); + task_unlock(current); if (mm) up_write(&mm->mmap_sem); + mpol_put(old); return 0; } /* * Return nodemask for policy for get_mempolicy() query + * + * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { @@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; @@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ + task_lock(current); *nmask = cpuset_current_mems_allowed; + task_unlock(current); return 0; } @@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } err = 0; - if (nmask) + if (nmask) { + task_lock(current); get_policy_nodemask(pol, nmask); + task_unlock(current); + } out: mpol_cond_put(pol); @@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len, return err; } down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask); + task_unlock(current); + if (err) { + 
up_write(&mm->mmap_sem); + mpol_put(new); + return err; + } vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); @@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; - cpuset_update_task_memory_state(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; - if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_task_memory_state(); if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; @@ -1854,6 +1894,8 @@ restart: */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { + int ret; + sp->root = RB_ROOT; /* empty tree == default mempolicy */ spin_lock_init(&sp->lock); @@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - mpol_put(mpol); /* drop our ref on sb mpol */ - if (IS_ERR(new)) + if (IS_ERR(new)) { + mpol_put(mpol); /* drop our ref on sb mpol */ return; /* no valid nodemask intersection */ + } + + task_lock(current); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + task_unlock(current); + mpol_put(mpol); /* drop our ref on sb mpol */ + if (ret) { + mpol_put(new); + return; + } /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); @@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) err = 1; - else if (no_context) - new->w.user_nodemask = nodes; /* save for contextualization */ + else { + int ret; + + task_lock(current); + ret = mpol_set_nodemask(new, &nodes); + task_unlock(current); + if (ret) + err = 1; + else if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } + } out: /* Restore string for error message */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17d5f539a9a..7cc3179e359 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1569,10 +1569,7 @@ nofail_alloc: /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - /* - * The task's cpuset might have expanded its set of allowable nodes - */ - cpuset_update_task_memory_state(); + p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); -- cgit v1.2.3-70-g09d2 From d239171e4f6efd58d7e423853056b1b6a74f1446 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:52 -0700 Subject: page allocator: replace __alloc_pages_internal() with __alloc_pages_nodemask() The start of a large patch series to clean up and optimise the page allocator. The performance improvements are in a wide range depending on the exact machine but the results I've seen so far are approximately: kernbench: 0 to 0.12% (elapsed time) 0.49% to 3.20% (sys time) aim9: -4% to 30% (for page_test and brk_test) tbench: -1% to 4% hackbench: -2.5% to 3.45% (mostly within the noise though) netperf-udp -1.34% to 4.06% (varies between machines a bit) netperf-tcp -0.44% to 5.22% (varies between machines a bit) I don't have sysbench figures at hand, but previously they were within the -0.5% to 2% range.
On netperf, the client and server were bound to opposite number CPUs to maximise the problems with cache line bouncing of the struct pages so I expect different people to report different results for netperf depending on their exact machine and how they ran the test (different machines, same cpus client/server, shared cache but two threads client/server, different socket client/server etc). I also measured the vmlinux sizes for a single x86-based config with CONFIG_DEBUG_INFO enabled but not CONFIG_DEBUG_VM. The core of the .config is based on the Debian Lenny kernel config so I expect it to be reasonably typical. This patch: __alloc_pages_internal is the core page allocator function but essentially it is an alias of __alloc_pages_nodemask. Naming a publicly available and exported function "internal" is also a bit ugly. This patch renames __alloc_pages_internal() to __alloc_pages_nodemask() and deletes the old nodemask function. Warning - This patch renames an exported symbol. No kernel driver is affected, but external drivers calling __alloc_pages_internal() should change the call to __alloc_pages_nodemask() without any alteration of parameters. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++---------- mm/page_alloc.c | 4 ++-- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3760e7c5de0..549ec558310 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -172,24 +172,16 @@ static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask); static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { - return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); + return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); } -static inline struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) -{ - return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); -} - - static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbed869fd83..d58df903150 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1458,7 +1458,7 @@ try_next_zone: * This is the 'heart' of the zoned buddy allocator. */ struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1667,7 +1667,7 @@ nopage: got_pg: return page; } -EXPORT_SYMBOL(__alloc_pages_internal); +EXPORT_SYMBOL(__alloc_pages_nodemask); /* * Common helper functions. -- cgit v1.2.3-70-g09d2 From b3c466ce512923298ae8c0121d3e9f397a3f1210 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:53 -0700 Subject: page allocator: do not sanity check order in the fast path No user of the allocator API should be passing in an order >= MAX_ORDER but we check for it on each and every allocation.
Delete this check and make it a VM_BUG_ON check further down the call path. [akpm@linux-foundation.org: s/VM_BUG_ON/WARN_ON_ONCE/] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 ------ mm/page_alloc.c | 3 +++ 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 549ec558310..c2d3fe03b5d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -185,9 +185,6 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - /* Unknown node is current node */ if (nid < 0) nid = numa_node_id(); @@ -201,9 +198,6 @@ extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); static inline struct page * alloc_pages(gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_page_vma(gfp_t gfp_mask, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d58df903150..bfbd95c0610 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1401,6 +1401,9 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, classzone_idx = zone_idx(preferred_zone); + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return NULL; + zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. -- cgit v1.2.3-70-g09d2 From 6484eb3e2a81807722c5f28efef94d8338b7b996 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:54 -0700 Subject: page allocator: do not check NUMA node ID when the caller knows the node is valid Callers of alloc_pages_node() can optionally specify -1 as a node to mean "allocate from the current node". However, a number of the callers in fast paths know for a fact their node is valid. To avoid a comparison and branch, this patch adds alloc_pages_exact_node() that only checks the nid with VM_BUG_ON(). Callers that know their node is valid are then converted. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Acked-by: Paul Mundt [for the SLOB NUMA bits] Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/hp/common/sba_iommu.c | 2 +- arch/ia64/kernel/mca.c | 3 +-- arch/ia64/kernel/uncached.c | 3 ++- arch/ia64/sn/pci/pci_dma.c | 3 ++- arch/powerpc/platforms/cell/ras.c | 4 ++-- arch/x86/kvm/vmx.c | 2 +- drivers/misc/sgi-gru/grufile.c | 2 +- drivers/misc/sgi-xp/xpc_uv.c | 2 +- include/linux/gfp.h | 9 +++++++++ include/linux/mm.h | 1 - kernel/profile.c | 8 ++++---- mm/filemap.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- mm/slab.c | 4 ++-- mm/slob.c | 4 ++-- 17 files changed, 33 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 56ceb68eb99..fe63b2dc9d0 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1131,7 +1131,7 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp #ifdef CONFIG_NUMA { struct page *page; - page = alloc_pages_node(ioc->node == MAX_NUMNODES ? 
+ page = alloc_pages_exact_node(ioc->node == MAX_NUMNODES ? numa_node_id() : ioc->node, flags, get_order(size)); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 8f33a884042..5b17bd40227 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1829,8 +1829,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = page_address(alloc_pages_node(numa_node_id(), - GFP_KERNEL, get_order(sz))); + data = __get_free_pages(GFP_KERNEL, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", cpu); diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 8eff8c1d40a..6ba72ab42fc 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -98,7 +98,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { mutex_unlock(&uc_pool->add_chunk_mutex); diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index d876423e4e7..98b684928e1 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -90,7 +90,8 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, */ node = pcibus_to_node(pdev->bus); if (likely(node >=0)) { - struct page *p = alloc_pages_node(node, flags, get_order(size)); + struct page *p = alloc_pages_exact_node(node, + flags, get_order(size)); if (likely(p)) cpuaddr = page_address(p); diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 296b5268754..5e0a191764f 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -122,8 +122,8 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_node(area->nid, GFP_KERNEL | GFP_THISNODE, - area->order); + area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL|GFP_THISNODE, + area->order); if (!area->pages) { printk(KERN_WARNING "%s: no page on node %d\n", diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32d6ae8fb60..e770bf349ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1277,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index bbefe77c67a..3ce2920e2bf 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -302,7 +302,7 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr) pnode = uv_node_to_pnode(nid); if (bid < 0 || gru_base[bid]) continue; - page = alloc_pages_node(nid, GFP_KERNEL, order); + page = alloc_pages_exact_node(nid, GFP_KERNEL, order); if (!page) goto fail; gru_base[bid] = page_address(page); diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 9172fcdee4e..c76677afda1 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -232,7 +232,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, mq->mmr_blade = uv_cpu_to_blade_id(cpu); nid = cpu_to_node(cpu); - page = alloc_pages_node(nid, 
GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c2d3fe03b5d..4efa33088a8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -5,6 +5,7 @@ #include #include #include +#include struct vm_area_struct; @@ -192,6 +193,14 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } +static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, + unsigned int order) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + + return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); +} + #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); diff --git a/include/linux/mm.h b/include/linux/mm.h index a880161a385..7b548e7cfbd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index 28cf26ad2d2..69911b5745e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -365,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -373,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -564,14 +564,14 @@ static int create_hash_tables(void) int node = cpu_to_node(cpu); struct page *page; - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) diff --git a/mm/filemap.c b/mm/filemap.c index 6846a902f5c..22396713feb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e83ad2c9228..2f8241f300f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -630,7 +630,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +649,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. 
* But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. Move nid forward in the mask even diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 46bdf9ddf2b..e08e2c4da63 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8f88..5a24923e7fd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } diff --git a/mm/slab.c b/mm/slab.c index 18e3164de09..bb3254c95cd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1707,7 +1707,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_exact_node(nodeid, flags, cachep->gfporder); if (!page) return NULL; @@ -3261,7 +3261,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, -1); + obj = kmem_getpages(cache, local_flags, numa_node_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { diff --git a/mm/slob.c b/mm/slob.c index 12f26149992..64f6db1943b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -46,7 +46,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_node() with the specified node id is used + * provided, alloc_pages_exact_node() with the specified node id is used * instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != -1) - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_exact_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); -- cgit v1.2.3-70-g09d2 From 49255c619fbd482d704289b5eb2795f8e3b7ff2e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:58 -0700 Subject: page allocator: move check for disabled anti-fragmentation out of fastpath On low-memory systems, anti-fragmentation gets disabled as there is nothing it can do and it would just incur overhead shuffling pages between lists constantly. Currently the check is made in the free page fast path for every page. This patch moves it to a slow path. On machines with low memory, there will be a small amount of additional overhead as pages get shuffled between lists but it should quickly settle.
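The pattern in miniature: instead of every reader of the pageblock type re-testing page_group_by_mobility_disabled, the stored value is clamped once at write time, so the read side becomes a plain load. A minimal standalone sketch of the idea (illustrative only, not the kernel code; the names are invented):

/* Writer clamps the value once, so readers need no extra branch. */
static int mobility_disabled;	/* stand-in for page_group_by_mobility_disabled */
static int stored_type;		/* stand-in for the pageblock migratetype bits */

static void example_set_type(int type)
{
	if (mobility_disabled)
		type = 0;	/* clamp at store time, as with MIGRATE_UNMOVABLE */
	stored_type = type;
}

static int example_get_type(void)
{
	return stored_type;	/* fast path: no check needed */
}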
Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 --- mm/page_alloc.c | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a47c879e130..0aa4445b0b8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -50,9 +50,6 @@ extern int page_group_by_mobility_disabled; static inline int get_pageblock_migratetype(struct page *page) { - if (unlikely(page_group_by_mobility_disabled)) - return MIGRATE_UNMOVABLE; - return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 512bf9a618c..b09859629e9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -168,6 +168,10 @@ int page_group_by_mobility_disabled __read_mostly; static void set_pageblock_migratetype(struct page *page, int migratetype) { + + if (unlikely(page_group_by_mobility_disabled)) + migratetype = MIGRATE_UNMOVABLE; + set_pageblock_flags_group(page, (unsigned long)migratetype, PB_migrate, PB_migrate_end); } -- cgit v1.2.3-70-g09d2 From 418589663d6011de9006425b6c5721e1544fb47a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:12 -0700 Subject: page allocator: use allocation flags as an index to the zone watermark ALLOC_WMARK_MIN, ALLOC_WMARK_LOW and ALLOC_WMARK_HIGH determine whether pages_min, pages_low or pages_high is used as the zone watermark when allocating the pages. Two branches in the allocator hotpath determine which watermark to use. This patch uses the flags as an array index into a watermark array that is indexed with WMARK_* defines accessed via helpers. All call sites that use zone->pages_* are updated to use the helpers for accessing the values and the array offsets for setting. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 11 +++++----- Documentation/vm/balance | 18 ++++++++-------- arch/m32r/mm/discontig.c | 6 +++--- include/linux/mmzone.h | 16 +++++++++++++- mm/page_alloc.c | 51 +++++++++++++++++++++++---------------------- mm/vmscan.c | 39 ++++++++++++++++++---------------- mm/vmstat.c | 6 +++--- 7 files changed, 83 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6fab2dcbb4d..0ea5adbc5b1 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -233,8 +233,8 @@ These protections are added to score to judge whether this zone should be used for page allocation or should be reclaimed. In this example, if normal pages (index=2) are required to this DMA zone and -pages_high is used for watermark, the kernel judges this zone should not be -used because pages_free(1355) is smaller than watermark + protection[2] +watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should +not be used because pages_free(1355) is smaller than watermark + protection[2] (4 + 2004 = 2008). If this protection value is 0, this zone would be used for normal page requirement. If requirement is DMA zone(index=0), protection[0] (=0) is used. @@ -280,9 +280,10 @@ The default value is 65536.
min_free_kbytes: This is used to force the Linux VM to keep a minimum number -of kilobytes free. The VM uses this number to compute a pages_min -value for each lowmem zone in the system. Each lowmem zone gets -a number of reserved free pages based proportionally on its size. +of kilobytes free. The VM uses this number to compute a +watermark[WMARK_MIN] value for each lowmem zone in the system. +Each lowmem zone gets a number of reserved free pages based +proportionally on its size. Some minimal amount of memory is needed to satisfy PF_MEMALLOC allocations; if you set this to lower than 1024KB, your system will diff --git a/Documentation/vm/balance b/Documentation/vm/balance index bd3d31bc491..c46e68cf934 100644 --- a/Documentation/vm/balance +++ b/Documentation/vm/balance @@ -75,15 +75,15 @@ Page stealing from process memory and shm is done if stealing the page would alleviate memory pressure on any zone in the page's node that has fallen below its watermark. -pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are -per-zone fields, used to determine when a zone needs to be balanced. When -the number of pages falls below pages_min, the hysteric field low_on_memory -gets set. This stays set till the number of free pages becomes pages_high. -When low_on_memory is set, page allocation requests will try to free some -pages in the zone (providing GFP_WAIT is set in the request). Orthogonal -to this, is the decision to poke kswapd to free some zone pages. That -decision is not hysteresis based, and is done when the number of free -pages is below pages_low; in which case zone_wake_kswapd is also set. +watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These +are per-zone fields, used to determine when a zone needs to be balanced. When +the number of pages falls below watermark[WMARK_MIN], the hysteric field +low_on_memory gets set. This stays set till the number of free pages becomes +watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will +try to free some pages in the zone (providing GFP_WAIT is set in the request). +Orthogonal to this, is the decision to poke kswapd to free some zone pages. +That decision is not hysteresis based, and is done when the number of free +pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. (Good) Ideas that I have heard: diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c index 7daf897292c..b7a78ad429b 100644 --- a/arch/m32r/mm/discontig.c +++ b/arch/m32r/mm/discontig.c @@ -154,9 +154,9 @@ unsigned long __init zone_sizes_init(void) * Use all area of internal RAM. 
* see __alloc_pages() */ - NODE_DATA(1)->node_zones->pages_min = 0; - NODE_DATA(1)->node_zones->pages_low = 0; - NODE_DATA(1)->node_zones->pages_high = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_MIN] = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_LOW] = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_HIGH] = 0; return holes; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0aa4445b0b8..dd8487f0442 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -163,6 +163,17 @@ static inline int is_unevictable_lru(enum lru_list l) #endif } +enum zone_watermarks { + WMARK_MIN, + WMARK_LOW, + WMARK_HIGH, + NR_WMARK +}; + +#define min_wmark_pages(z) (z->watermark[WMARK_MIN]) +#define low_wmark_pages(z) (z->watermark[WMARK_LOW]) +#define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) + struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ @@ -275,7 +286,10 @@ struct zone_reclaim_stat { struct zone { /* Fields commonly accessed by the page allocator */ - unsigned long pages_min, pages_low, pages_high; + + /* zone watermarks, access with *_wmark_pages(zone) macros */ + unsigned long watermark[NR_WMARK]; + /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8485735fc69..abe26003124 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1150,10 +1150,15 @@ failed: return NULL; } -#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ -#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ @@ -1440,14 +1445,10 @@ zonelist_scan: !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; - if (alloc_flags & ALLOC_WMARK_MIN) - mark = zone->pages_min; - else if (alloc_flags & ALLOC_WMARK_LOW) - mark = zone->pages_low; - else - mark = zone->pages_high; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) { if (!zone_reclaim_mode || @@ -1959,7 +1960,7 @@ static unsigned int nr_free_zone_pages(int offset) for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; + unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; } @@ -2096,9 +2097,9 @@ void show_free_areas(void) "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), K(zone_page_state(zone, NR_ACTIVE_ANON)), K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), @@ -2702,8 +2703,8 @@ static inline unsigned long 
wait_table_bits(unsigned long size) /* * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on zone->pages_min. The memory within the - * reserve will tend to store contiguous free pages. Setting min_free_kbytes + * of blocks reserved is based on min_wmark_pages(zone). The memory within + * the reserve will tend to store contiguous free pages. Setting min_free_kbytes * higher will lead to a bigger reserve which will get freed as contiguous * blocks as reclaim kicks in */ @@ -2716,7 +2717,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; - reserve = roundup(zone->pages_min, pageblock_nr_pages) >> + reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -4319,8 +4320,8 @@ static void calculate_totalreserve_pages(void) max = zone->lowmem_reserve[j]; } - /* we treat pages_high as reserved pages. */ - max += zone->pages_high; + /* we treat the high watermark as reserved pages. */ + max += high_wmark_pages(zone); if (max > zone->present_pages) max = zone->present_pages; @@ -4400,7 +4401,7 @@ void setup_per_zone_pages_min(void) * need highmem pages, so cap pages_min to a small * value here. * - * The (pages_high-pages_low) and (pages_low-pages_min) + * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas controls asynch page reclaim, and so should * not be capped for highmem. */ @@ -4411,17 +4412,17 @@ void setup_per_zone_pages_min(void) min_pages = SWAP_CLUSTER_MAX; if (min_pages > 128) min_pages = 128; - zone->pages_min = min_pages; + zone->watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = tmp; + zone->watermark[WMARK_MIN] = tmp; } - zone->pages_low = zone->pages_min + (tmp >> 2); - zone->pages_high = zone->pages_min + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -4566,7 +4567,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * whenever sysctl_lowmem_reserve_ratio changes. * * The reserve ratio obviously has absolutely no relation with the - * pages_min watermarks. The lowmem reserve ratio can only make sense + * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, diff --git a/mm/vmscan.c b/mm/vmscan.c index a6b7d14812e..e5245d05164 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1401,7 +1401,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= zone->pages_high)) { + if (unlikely(file + free <= high_wmark_pages(zone))) { percent[0] = 100; percent[1] = 0; return; @@ -1533,11 +1533,13 @@ static void shrink_zone(int priority, struct zone *zone, * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over pages_high. Because: + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). 
+ * Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or - * b) The zones may be over pages_high but they must go *over* pages_high to - * satisfy the `incremental min' zone defense algorithm. + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. @@ -1743,7 +1745,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, /* * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at pages_high. + * they are all at high_wmark_pages(zone). * * Returns the number of pages which were actually freed. * @@ -1756,11 +1758,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * the zone for when the problem goes away. * * kswapd scans the zones in the highmem->normal->dma direction. It skips - * zones which have free_pages > pages_high, but once a zone is found to have - * free_pages <= pages_high, we scan that zone and the lower zones regardless - * of the number of free pages in the lower zones. This interoperates with - * the page allocator fallback scheme to ensure that aging of pages is balanced - * across the zones. + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the + * lower zones regardless of the number of free pages in the lower zones. This + * interoperates with the page allocator fallback scheme to ensure that aging + * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { @@ -1781,7 +1783,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) }; /* * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. + * this zone was successfully refilled to + * free_pages == high_wmark_pages(zone). */ int temp_priority[MAX_NR_ZONES]; @@ -1826,8 +1829,8 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), 0, 0)) { end_zone = i; break; } @@ -1861,8 +1864,8 @@ loop_again: priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; @@ -1871,8 +1874,8 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. 
*/ - if (!zone_watermark_ok(zone, order, 8*zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -2038,7 +2041,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; diff --git a/mm/vmstat.c b/mm/vmstat.c index 74d66dba0cb..415110772c7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -714,9 +714,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), - zone->pages_min, - zone->pages_low, - zone->pages_high, + min_wmark_pages(zone), + low_wmark_pages(zone), + high_wmark_pages(zone), zone->pages_scanned, zone->lru[LRU_ACTIVE_ANON].nr_scan, zone->lru[LRU_INACTIVE_ANON].nr_scan, -- cgit v1.2.3-70-g09d2 From 62bc62a873116805774ffd37d7f86aa4faa832b1 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:15 -0700 Subject: page allocator: use a pre-calculated value instead of num_online_nodes() in fast paths num_online_nodes() is called in a number of places but most often by the page allocator when deciding whether the zonelist needs to be filtered based on cpusets or the zonelist cache. This is actually a heavy function and touches a number of cache lines. This patch stores the number of online nodes at boot time and updates the value when nodes get onlined and offlined. The value is then used in a number of important paths in place of num_online_nodes(). 
[rientjes@google.com: do not override definition of node_set_online() with macro] Signed-off-by: Christoph Lameter Signed-off-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 19 ++++++++++++++++--- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 10 ++++++---- mm/slub.c | 2 +- net/sunrpc/svc.c | 2 +- 5 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 848025cd708..829b94b156f 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -408,6 +408,19 @@ static inline int num_node_state(enum node_states state) #define next_online_node(nid) next_node((nid), node_states[N_ONLINE]) extern int nr_node_ids; +extern int nr_online_nodes; + +static inline void node_set_online(int nid) +{ + node_set_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +} + +static inline void node_set_offline(int nid) +{ + node_clear_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +} #else static inline int node_state(int node, enum node_states state) @@ -434,7 +447,10 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1 +#define nr_online_nodes 1 +#define node_set_online(node) node_set_state((node), N_ONLINE) +#define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif #define node_online_map node_states[N_ONLINE] @@ -454,9 +470,6 @@ static inline int num_node_state(enum node_states state) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) -#define node_set_online(node) node_set_state((node), N_ONLINE) -#define node_set_offline(node) node_clear_state((node), N_ONLINE) - #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2f8241f300f..7b9b6015b2e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -875,7 +875,7 @@ static void return_unused_surplus_pages(struct hstate *h, * can no longer free unreserved surplus pages. This occurs when * the nodes with surplus pages have no free pages. 
*/ - unsigned long remaining_iterations = num_online_nodes(); + unsigned long remaining_iterations = nr_online_nodes; /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -904,7 +904,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; nr_pages--; - remaining_iterations = num_online_nodes(); + remaining_iterations = nr_online_nodes; } } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e60e4147433..0c9f406e3c4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -161,7 +161,9 @@ static unsigned long __meminitdata dma_reserve; #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; +int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); +EXPORT_SYMBOL(nr_online_nodes); #endif int page_group_by_mobility_disabled __read_mostly; @@ -1466,7 +1468,7 @@ this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && num_online_nodes() > 1) { + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup after the first zone is tried but only * if there are multiple nodes make it worthwhile @@ -2265,7 +2267,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } -#define MAX_NODE_LOAD (num_online_nodes()) +#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -2474,7 +2476,7 @@ static void build_zonelists(pg_data_t *pgdat) /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = num_online_nodes(); + load = nr_online_nodes; prev_node = local_node; nodes_clear(used_mask); @@ -2625,7 +2627,7 @@ void build_all_zonelists(void) printk("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", - num_online_nodes(), + nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); diff --git a/mm/slub.c b/mm/slub.c index 30354bfeb43..1c950775d6b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3737,7 +3737,7 @@ static int list_locations(struct kmem_cache *s, char *buf, to_cpumask(l->cpus)); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 8847add6ca1..5ed8931dfe9 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -124,7 +124,7 @@ svc_pool_map_choose_mode(void) { unsigned int node; - if (num_online_nodes() > 1) { + if (nr_online_nodes > 1) { /* * Actually have multiple NUMA nodes, * so split pools on NUMA node boundaries -- cgit v1.2.3-70-g09d2 From 20a0307c0396c2edb651401d2f2db193dda2f3c9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:22 -0700 Subject: mm: introduce PageHuge() for testing huge/gigantic pages A series of patches to enhance the /proc/pagemap interface and to add a userspace executable which can be used to present the pagemap data. Export 10 more flags to end users (and more for kernel developers): 11. KPF_MMAP (pseudo flag) memory mapped page 12. KPF_ANON (pseudo flag) memory mapped page (anonymous) 13. KPF_SWAPCACHE page is in swap cache 14. KPF_SWAPBACKED page is swap/RAM backed 15. KPF_COMPOUND_HEAD (*) 16. KPF_COMPOUND_TAIL (*) 17. KPF_HUGE hugeTLB pages 18. KPF_UNEVICTABLE page is in the unevictable LRU list 19. KPF_HWPOISON hardware detected corruption 20. 
KPF_NOPAGE (pseudo flag) no page frame at the address (*) For compound pages, exporting _both_ head/tail info enables users to tell where a compound page starts/ends, and its order. a simple demo of the page-types tool # ./page-types -h page-types [options] -r|--raw Raw mode, for kernel developers -a|--addr addr-spec Walk a range of pages -b|--bits bits-spec Walk pages with specified bits -l|--list Show page details in ranges -L|--list-each Show page details one by one -N|--no-summary Don't show summay info -h|--help Show this usage message addr-spec: N one page at offset N (unit: pages) N+M pages range from N to N+M-1 N,M pages range from N to M-1 N, pages range from N to end ,M pages range from 0 to M bits-spec: bit1,bit2 (flags & (bit1|bit2)) != 0 bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1 bit1,~bit2 (flags & (bit1|bit2)) == bit1 =bit1,bit2 flags == (bit1|bit2) bit-names: locked error referenced uptodate dirty lru active slab writeback reclaim buddy mmap anonymous swapcache swapbacked compound_head compound_tail huge unevictable hwpoison nopage reserved(r) mlocked(r) mappedtodisk(r) private(r) private_2(r) owner_private(r) arch(r) uncached(r) readahead(o) slob_free(o) slub_frozen(o) slub_debug(o) (r) raw mode bits (o) overloaded bits # ./page-types flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 487369 1903 _________________________________ 0x0000000000000014 5 0 __R_D____________________________ referenced,dirty 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000000000024 34 0 __R__l___________________________ referenced,lru 0x0000000000000028 3838 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 48 0 ___U_l_______________________I___ uptodate,lru,readahead 0x000000000000002c 6478 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x0000000000000040 8344 32 ______A__________________________ active 0x0000000000000060 1 0 _____lA__________________________ lru,active 0x0000000000000068 348 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x000000000000006c 988 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 503 1 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ 
uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 30 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types -r flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 468002 1828 _________________________________ 0x0000000100000000 19102 74 _____________________r___________ reserved 0x0000000000008000 41 0 _______________H_________________ compound_head 0x0000000000010000 188 0 ________________T________________ compound_tail 0x0000000000008014 1 0 __R_D__________H_________________ referenced,dirty,compound_head 0x0000000000010014 4 0 __R_D___________T________________ referenced,dirty,compound_tail 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000800000024 34 0 __R__l__________________P________ referenced,lru,private 0x0000000000000028 3794 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 46 0 ___U_l_______________________I___ uptodate,lru,readahead 0x0000000400000028 44 0 ___U_l_________________d_________ uptodate,lru,mappedtodisk 0x0001000400000028 2 0 ___U_l_________________d_____I___ uptodate,lru,mappedtodisk,readahead 0x000000000000002c 6434 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x000000040000002c 14 0 __RU_l_________________d_________ referenced,uptodate,lru,mappedtodisk 0x000000080000002c 30 0 __RU_l__________________P________ referenced,uptodate,lru,private 0x0000000800000040 8124 31 ______A_________________P________ active,private 0x0000000000000040 219 0 ______A__________________________ active 0x0000000800000060 1 0 _____lA_________________P________ lru,active,private 0x0000000000000068 322 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x0000000400000068 13 0 ___U_lA________________d_________ uptodate,lru,active,mappedtodisk 0x0000000800000068 12 0 ___U_lA_________________P________ uptodate,lru,active,private 0x000000000000006c 977 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x000000040000006c 5 0 __RU_lA________________d_________ referenced,uptodate,lru,active,mappedtodisk 0x000000080000006c 3 0 __RU_lA_________________P________ referenced,uptodate,lru,active,private 0x0000000c0000006c 3 0 __RU_lA________________dP________ referenced,uptodate,lru,active,mappedtodisk,private 0x0000000c00000068 1 0 ___U_lA________________dP________ uptodate,lru,active,mappedtodisk,private 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 538 2 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 
0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005008 2 0 ___U________a_b__________________ uptodate,anonymous,swapbacked 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x000000000000580c 1 0 __RU_______Ma_b__________________ referenced,uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 29 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types --raw --list --no-summary --bits reserved offset count flags 0 15 _____________________r___________ 31 4 _____________________r___________ 159 97 _____________________r___________ 4096 2067 _____________________r___________ 6752 2390 _____________________r___________ 9355 3 _____________________r___________ 9728 14526 _____________________r___________ This patch: Introduce PageHuge(), which identifies huge/gigantic pages by their dedicated compound destructor functions. Also move prep_compound_gigantic_page() to hugetlb.c and make __free_pages_ok() non-static. Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 1 + include/linux/hugetlb.h | 7 ++++ mm/hugetlb.c | 98 +++++++++++++++++++++++++++++++------------------ mm/internal.h | 5 +-- mm/page_alloc.c | 17 --------- 5 files changed, 73 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/page.c b/fs/proc/page.c index e9983837d08..38dd88b7ce8 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 03be7f29ca0..a05a5ef3339 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,8 @@ struct ctl_table; +int PageHuge(struct page *page); + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return vma->vm_flags & VM_HUGETLB; @@ -61,6 +63,11 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #else /* !CONFIG_HUGETLB_PAGE */ +static inline int PageHuge(struct page *page) +{ + return 0; +} + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7b9b6015b2e..a56e6f3ce97 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) hugetlb_put_quota(mapping, 1); } -/* - * Increment or decrement surplus_huge_pages. Keep node-specific counters - * balanced by operating on them in a round-robin fashion. - * Returns 1 if an adjustment was made. 
- */ -static int adjust_pool_surplus(struct hstate *h, int delta) -{ - static int prev_nid; - int nid = prev_nid; - int ret = 0; - - VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; - - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (nid != prev_nid); - - prev_nid = nid; - return ret; -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); @@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} + +int PageHuge(struct page *page) +{ + compound_page_dtor *dtor; + + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + dtor = get_compound_page_dtor(page); + + return dtor == free_huge_page; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) } #endif +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. 
+ */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + static int prev_nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !h->surplus_huge_pages_node[nid]) + continue; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) + continue; + + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (nid != prev_nid); + + prev_nid = nid; + return ret; +} + #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { diff --git a/mm/internal.h b/mm/internal.h index 4b1672a8cf7..b4ac332e807 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -16,9 +16,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -extern void prep_compound_page(struct page *page, unsigned long order); -extern void prep_compound_gigantic_page(struct page *page, unsigned long order); - static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); @@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page); */ extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void prep_compound_page(struct page *page, unsigned long order); + /* * function for dealing with page's order in buddy system. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ca06d87dc1..131655cdb6b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -300,23 +300,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -#ifdef CONFIG_HUGETLBFS -void prep_compound_gigantic_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - struct page *p = page + 1; - - set_compound_page_dtor(page, free_compound_page); - set_compound_order(page, order); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); - p->first_page = page; - } -} -#endif - static int destroy_compound_page(struct page *page, unsigned long order) { int i; -- cgit v1.2.3-70-g09d2 From 56e49d218890f49b0057710a4b6fef31f5ffbfec Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 16 Jun 2009 15:32:28 -0700 Subject: vmscan: evict use-once pages first When the file LRU lists are dominated by streaming IO pages, evict those pages first, before considering evicting other pages. This should be safe from deadlocks or performance problems because only three things can happen to an inactive file page: 1) referenced twice and promoted to the active list 2) evicted by the pageout code 3) under IO, after which it will get evicted or promoted The pages freed in this way can either be reused for streaming IO, or allocated for something else. If the pages are used for streaming IO, this pageout pattern continues. Otherwise, we will fall back to the normal pageout pattern. 
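The policy this patch adds boils down to a single comparison: keep deactivating (aging) active file pages only while the inactive file list holds less than half of the file pages; once the inactive list has the majority, leave the working set alone and reclaim the use-once pages first. A stand-alone sketch of that check, with the LRU counters reduced to plain fields rather than the kernel's zone statistics:

#include <stdbool.h>
#include <stdio.h>

struct zone_counts {
        unsigned long active_file;
        unsigned long inactive_file;
};

/*
 * Age active file pages only while fewer than half of the file pages
 * sit on the inactive list; otherwise protect the working set and let
 * the use-once pages on the inactive list be evicted first.
 */
static bool inactive_file_is_low(const struct zone_counts *z)
{
        return z->active_file > z->inactive_file;
}

int main(void)
{
        struct zone_counts streaming = { .active_file = 100, .inactive_file = 900 };
        struct zone_counts busy      = { .active_file = 800, .inactive_file = 200 };

        printf("streaming IO zone: deactivate active file pages? %s\n",
               inactive_file_is_low(&streaming) ? "yes" : "no");
        printf("working-set zone:  deactivate active file pages? %s\n",
               inactive_file_is_low(&busy) ? "yes" : "no");
        return 0;
}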
Signed-off-by: Rik van Riel Reported-by: Elladan Cc: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Lee Schermerhorn Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++++++ mm/memcontrol.c | 11 +++++++++++ mm/vmscan.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25b9ca93d23..45add35dda1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,6 +94,7 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru); @@ -239,6 +240,12 @@ mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 1; } +static inline int +mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + return 1; +} + static inline unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78eb8552818..70db6e0a5ee 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 0; } +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + unsigned long active; + unsigned long inactive; + + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + + return (active > inactive); +} + unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) diff --git a/mm/vmscan.c b/mm/vmscan.c index e5245d05164..9673437a545 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1351,12 +1351,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_file_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_FILE); + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + + return (active > inactive); +} + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. 
+ */ +static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_file_is_low_global(zone); + else + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + return low; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE) { + if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } -- cgit v1.2.3-70-g09d2 From 6e08a369ee10b361ac1cdcdf4fabd420fd08beb3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:29 -0700 Subject: vmscan: cleanup the scan batching code The vmscan batching logic is convoluted. Move it into a standalone function nr_scan_try_batch() and document it. No behavior change. Signed-off-by: Wu Fengguang Acked-by: Rik van Riel Cc: Nick Piggin Cc: Christoph Lameter Acked-by: Peter Zijlstra Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- mm/page_alloc.c | 2 +- mm/vmscan.c | 39 ++++++++++++++++++++++++++++----------- mm/vmstat.c | 8 ++++---- 4 files changed, 35 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dd8487f0442..db976b9f879 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -334,9 +334,9 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; - struct { + struct zone_lru { struct list_head list; - unsigned long nr_scan; + unsigned long nr_saved_scan; /* accumulated for batching */ } lru[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 131655cdb6b..e5b8f628d16 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3657,7 +3657,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9673437a545..d4da097533c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1492,6 +1492,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, percent[1] = 100 - percent[0]; } +/* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan, + unsigned long swap_cluster_max) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= swap_cluster_max) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
@@ -1517,14 +1537,11 @@ static void shrink_zone(int priority, struct zone *zone, scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) { - zone->lru[l].nr_scan += scan; - nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= swap_cluster_max) - zone->lru[l].nr_scan = 0; - else - nr[l] = 0; - } else + if (scanning_global_lru(sc)) + nr[l] = nr_scan_try_batch(scan, + &zone->lru[l].nr_saved_scan, + swap_cluster_max); + else nr[l] = scan; } @@ -2124,11 +2141,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; + if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { unsigned long nr_to_scan; - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; nr_to_scan = min(nr_pages, lru_pages); nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); diff --git a/mm/vmstat.c b/mm/vmstat.c index 415110772c7..84c05555691 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -718,10 +718,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, low_wmark_pages(zone), high_wmark_pages(zone), zone->pages_scanned, - zone->lru[LRU_ACTIVE_ANON].nr_scan, - zone->lru[LRU_INACTIVE_ANON].nr_scan, - zone->lru[LRU_ACTIVE_FILE].nr_scan, - zone->lru[LRU_INACTIVE_FILE].nr_scan, + zone->lru[LRU_ACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_INACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_ACTIVE_FILE].nr_saved_scan, + zone->lru[LRU_INACTIVE_FILE].nr_saved_scan, zone->spanned_pages, zone->present_pages); -- cgit v1.2.3-70-g09d2 From 3b6748e2dd69906af3835db4dc9d1c8a3ee4c68c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:35 -0700 Subject: mm: introduce follow_pfn() Analogous to follow_phys(), add a helper that looks up the PFN at a user virtual address in an IO mapping or a raw PFN mapping. Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b548e7cfbd..c80dbd4a62c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -792,6 +792,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 24ff20480bb..d5d1653d60a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3140,6 +3140,35 @@ out: return -EINVAL; } +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. 
+ */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, -- cgit v1.2.3-70-g09d2 From 7f33d49a2ed546e01f7b1d0607661810f2421859 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Jun 2009 15:32:41 -0700 Subject: mm, PM/Freezer: Disable OOM killer when tasks are frozen Currently, the following scenario appears to be possible in theory: * Tasks are frozen for hibernation or suspend. * Free pages are almost exhausted. * Certain piece of code in the suspend code path attempts to allocate some memory using GFP_KERNEL and allocation order less than or equal to PAGE_ALLOC_COSTLY_ORDER. * __alloc_pages_internal() cannot find a free page so it invokes the OOM killer. * The OOM killer attempts to kill a task, but the task is frozen, so it doesn't die immediately. * __alloc_pages_internal() jumps to 'restart', unsuccessfully tries to find a free page and invokes the OOM killer. * No progress can be made. Although it is now hard to trigger during hibernation due to the memory shrinking carried out by the hibernation code, it is theoretically possible to trigger during suspend after the memory shrinking has been removed from that code path. Moreover, since memory allocations are going to be used for the hibernation memory shrinking, it will be even more likely to happen during hibernation. To prevent it from happening, introduce the oom_killer_disabled switch that will cause __alloc_pages_internal() to fail in the situations in which the OOM killer would have been called and make the freezer set this switch after tasks have been successfully frozen. [akpm@linux-foundation.org: be nicer to the namespace] Signed-off-by: Rafael J. Wysocki Cc: Fengguang Wu Cc: David Rientjes Acked-by: Pavel Machek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++++++++++ kernel/power/process.c | 5 +++++ mm/page_alloc.c | 4 ++++ 3 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4efa33088a8..06b7e8cc80a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -243,4 +243,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(void); void drain_local_pages(void *dummy); +extern bool oom_killer_disabled; + +static inline void oom_killer_disable(void) +{ + oom_killer_disabled = true; +} + +static inline void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} + #endif /* __LINUX_GFP_H */ diff --git a/kernel/power/process.c b/kernel/power/process.c index ca634019497..da2072d7381 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -117,9 +117,12 @@ int freeze_processes(void) if (error) goto Exit; printk("done."); + + oom_killer_disable(); Exit: BUG_ON(in_atomic()); printk("\n"); + return error; } @@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only) void thaw_processes(void) { + oom_killer_enable(); + printk("Restarting tasks ... 
"); thaw_tasks(true); thaw_tasks(false); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 61290ea721c..5b09488d0f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -178,6 +178,8 @@ static void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } +bool oom_killer_disabled __read_mostly; + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -1769,6 +1771,8 @@ rebalance: */ if (!did_some_progress) { if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_killer_disabled) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, -- cgit v1.2.3-70-g09d2 From 31c911329e048b715a1dfeaaf617be9430fd7f4e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 Jun 2009 15:32:45 -0700 Subject: mm: check the argument of kunmap on architectures without highmem If you're using a non-highmem architecture, passing an argument with the wrong type to kunmap() doesn't give you a warning because the ifdef doesn't check the type. Using a static inline function solves the problem nicely. Reported-by: David Woodhouse Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 1fcb7126a01..211ff449726 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -55,7 +55,9 @@ static inline void *kmap(struct page *page) return page_address(page); } -#define kunmap(page) do { (void) (page); } while (0) +static inline void kunmap(struct page *page) +{ +} static inline void *kmap_atomic(struct page *page, enum km_type idx) { -- cgit v1.2.3-70-g09d2 From b70d94ee438b3fd9c15c7691d7a932a135c18101 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:46 -0700 Subject: page-allocator: use integer fields lookup for gfp_zone and check for errors in flags passed to the page allocator This simplifies the code in gfp_zone() and also keeps the ability of the compiler to use constant folding to get rid of gfp_zone processing. The lookup of the zone is done using a bitfield stored in an integer. So the code in gfp_zone is a simple extraction of bits from a constant bitfield. The compiler is generating a load of a constant into a register and then performs a shift and mask operation to get the zone from a gfp_t. No cachelines are touched and no branches have to be predicted by the compiler. We are doing some macro tricks here to convince the compiler to always do the constant folding if possible. 
Signed-off-by: Christoph Lameter Cc: KAMEZAWA Hiroyuki Reviewed-by: Mel Gorman Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 111 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 06b7e8cc80a..412178afd42 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -21,7 +21,8 @@ struct vm_area_struct; #define __GFP_DMA ((__force gfp_t)0x01u) #define __GFP_HIGHMEM ((__force gfp_t)0x02u) #define __GFP_DMA32 ((__force gfp_t)0x04u) - +#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */ +#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /* * Action modifiers - doesn't change the zoning * @@ -51,7 +52,6 @@ struct vm_area_struct; #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ -#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ #define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -116,24 +116,105 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) ((gfp_flags & __GFP_RECLAIMABLE) != 0); } -static inline enum zone_type gfp_zone(gfp_t flags) -{ +#ifdef CONFIG_HIGHMEM +#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM +#else +#define OPT_ZONE_HIGHMEM ZONE_NORMAL +#endif + #ifdef CONFIG_ZONE_DMA - if (flags & __GFP_DMA) - return ZONE_DMA; +#define OPT_ZONE_DMA ZONE_DMA +#else +#define OPT_ZONE_DMA ZONE_NORMAL #endif + #ifdef CONFIG_ZONE_DMA32 - if (flags & __GFP_DMA32) - return ZONE_DMA32; +#define OPT_ZONE_DMA32 ZONE_DMA32 +#else +#define OPT_ZONE_DMA32 ZONE_NORMAL #endif - if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) == - (__GFP_HIGHMEM | __GFP_MOVABLE)) - return ZONE_MOVABLE; -#ifdef CONFIG_HIGHMEM - if (flags & __GFP_HIGHMEM) - return ZONE_HIGHMEM; + +/* + * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the + * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long + * and there are 16 of them to cover all possible combinations of + * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM + * + * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. + * But GFP_MOVABLE is not only a zone specifier but also an allocation + * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. + * Only 1bit of the lowest 3 bit (DMA,DMA32,HIGHMEM) can be set to "1". + * + * bit result + * ================= + * 0x0 => NORMAL + * 0x1 => DMA or NORMAL + * 0x2 => HIGHMEM or NORMAL + * 0x3 => BAD (DMA+HIGHMEM) + * 0x4 => DMA32 or DMA or NORMAL + * 0x5 => BAD (DMA+DMA32) + * 0x6 => BAD (HIGHMEM+DMA32) + * 0x7 => BAD (HIGHMEM+DMA32+DMA) + * 0x8 => NORMAL (MOVABLE+0) + * 0x9 => DMA or NORMAL (MOVABLE+DMA) + * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) + * 0xb => BAD (MOVABLE+HIGHMEM+DMA) + * 0xc => DMA32 (MOVABLE+HIGHMEM+DMA32) + * 0xd => BAD (MOVABLE+DMA32+DMA) + * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) + * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) + * + * ZONES_SHIFT must be <= 2 on 32 bit platforms. 
+ */ + +#if 16 * ZONES_SHIFT > BITS_PER_LONG +#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer +#endif + +#define GFP_ZONE_TABLE ( \ + (ZONE_NORMAL << 0 * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \ + | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \ + | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT) \ + | (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\ + | (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\ +) + +/* + * GFP_ZONE_BAD is a bitmap for all combination of __GFP_DMA, __GFP_DMA32 + * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per + * entry starting with bit 0. Bit is set if the combination is not + * allowed. + */ +#define GFP_ZONE_BAD ( \ + 1 << (__GFP_DMA | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32) \ + | 1 << (__GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\ +) + +static inline enum zone_type gfp_zone(gfp_t flags) +{ + enum zone_type z; + int bit = flags & GFP_ZONEMASK; + + z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & + ((1 << ZONES_SHIFT) - 1); + + if (__builtin_constant_p(bit)) + BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + else { +#ifdef CONFIG_DEBUG_VM + BUG_ON((GFP_ZONE_BAD >> bit) & 1); #endif - return ZONE_NORMAL; + } + return z; } /* -- cgit v1.2.3-70-g09d2 From bc75d33f0fc1d56e734db1f56d3cfc8097b8e0cf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:48 -0700 Subject: page-allocator: clean up functions related to pages_min Change the names of two functions. It doesn't affect behavior. Presently, setup_per_zone_pages_min() changes low, high of zone as well as min. So a better name is setup_per_zone_wmarks(). That's because Mel changed zone->pages_[hig/low/min] to zone->watermark array in "page allocator: replace the watermark-related union in struct zone with a watermark[] array". * setup_per_zone_pages_min => setup_per_zone_wmarks Of course, we have to change init_per_zone_pages_min, too. There are not pages_min any more. 
* init_per_zone_pages_min => init_per_zone_wmark_min [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 17 +++++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c80dbd4a62c..6e11cda1ba0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1052,7 +1052,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn); extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); -extern void setup_per_zone_pages_min(void); +extern void setup_per_zone_wmarks(void); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5fd6d..037291e15b2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -422,7 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b09488d0f5..8629c84fd9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4396,12 +4396,13 @@ static void setup_per_zone_lowmem_reserve(void) } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-added * - * Ensures that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. */ -void setup_per_zone_pages_min(void) +void setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4519,7 +4520,7 @@ static void __init setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_pages_min(void) +static int __init init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -4530,12 +4531,12 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 128; if (min_free_kbytes > 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_pages_min) +module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -4547,7 +4548,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length, ppos); if (write) - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); return 0; } -- cgit v1.2.3-70-g09d2 From 96cb4df5ddf5e6d5785b5acd4003e3689b87f896 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:49 -0700 Subject: page-allocator: add inactive ratio calculation function of each zone Factor the per-zone arithmetic inside setup_per_zone_inactive_ratio()'s loop into a separate function, calculate_zone_inactive_ratio(). 
This function will be used in a later patch [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/page_alloc.c | 28 ++++++++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e11cda1ba0..f0473a88ca2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1053,6 +1053,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_wmarks(void); +extern void calculate_zone_inactive_ratio(struct zone *zone); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8629c84fd9b..303607f1d32 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4478,22 +4478,26 @@ void setup_per_zone_wmarks(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void __init setup_per_zone_inactive_ratio(void) +void calculate_zone_inactive_ratio(struct zone *zone) { - struct zone *zone; + unsigned int gb, ratio; - for_each_zone(zone) { - unsigned int gb, ratio; + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + if (gb) + ratio = int_sqrt(10 * gb); + else + ratio = 1; - /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; + zone->inactive_ratio = ratio; +} - zone->inactive_ratio = ratio; - } +static void __init setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) + calculate_zone_inactive_ratio(zone); } /* -- cgit v1.2.3-70-g09d2 From 6837765963f1723e80ca97b1fae660f3a60d77df Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:32:51 -0700 Subject: mm: remove CONFIG_UNEVICTABLE_LRU config option Currently, nobody wants to turn UNEVICTABLE_LRU off. Thus this configurability is unnecessary. 
Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Andi Kleen Acked-by: Minchan Kim Cc: David Woodhouse Cc: Matt Mackall Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 ---- fs/proc/meminfo.c | 4 ---- fs/proc/page.c | 2 -- include/linux/mmzone.h | 13 ------------- include/linux/page-flags.h | 16 +--------------- include/linux/pagemap.h | 12 ------------ include/linux/rmap.h | 7 ------- include/linux/swap.h | 19 ------------------- include/linux/vmstat.h | 2 -- kernel/sysctl.c | 2 -- mm/Kconfig | 14 +------------- mm/internal.h | 6 ------ mm/mlock.c | 22 ---------------------- mm/page_alloc.c | 9 --------- mm/rmap.c | 3 +-- mm/vmscan.c | 17 ----------------- mm/vmstat.c | 4 ---- 17 files changed, 3 insertions(+), 153 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 40b809742a1..91d4087b403 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -72,10 +72,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d Inactive(anon): %8lu kB\n" "Node %d Active(file): %8lu kB\n" "Node %d Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Node %d Unevictable: %8lu kB\n" "Node %d Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "Node %d HighTotal: %8lu kB\n" "Node %d HighFree: %8lu kB\n" @@ -105,10 +103,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_INACTIVE_ANON)), nid, K(node_page_state(nid, NR_ACTIVE_FILE)), nid, K(node_page_state(nid, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU nid, K(node_page_state(nid, NR_UNEVICTABLE)), nid, K(node_page_state(nid, NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c6b0302af4c..d5c410d47fa 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Inactive(anon): %8lu kB\n" "Active(file): %8lu kB\n" "Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Unevictable: %8lu kB\n" "Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" @@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(pages[LRU_INACTIVE_ANON]), K(pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_FILE]), -#ifdef CONFIG_UNEVICTABLE_LRU K(pages[LRU_UNEVICTABLE]), K(global_page_state(NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/fs/proc/page.c b/fs/proc/page.c index 9d926bd279a..2707c6c7a20 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -172,10 +172,8 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); -#ifdef CONFIG_UNEVICTABLE_LRU u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); -#endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db976b9f879..88959853737 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,13 +83,8 @@ enum zone_stat_item { NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ -#ifdef CONFIG_UNEVICTABLE_LRU NR_UNEVICTABLE, /* " " " " " */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ -#else - NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */ - 
NR_MLOCK = NR_ACTIVE_FILE, -#endif NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ @@ -132,11 +127,7 @@ enum lru_list { LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, -#ifdef CONFIG_UNEVICTABLE_LRU LRU_UNEVICTABLE, -#else - LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */ -#endif NR_LRU_LISTS }; @@ -156,11 +147,7 @@ static inline int is_active_lru(enum lru_list l) static inline int is_unevictable_lru(enum lru_list l) { -#ifdef CONFIG_UNEVICTABLE_LRU return (l == LRU_UNEVICTABLE); -#else - return 0; -#endif } enum zone_watermarks { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 62214c7d2d9..d6792f88a17 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -95,9 +95,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ -#ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif @@ -248,14 +246,8 @@ PAGEFLAG_FALSE(SwapCache) SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif -#ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) -#else -PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) - SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) - __CLEARPAGEFLAG_NOOP(Unevictable) -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define MLOCK_PAGES 1 @@ -382,12 +374,6 @@ static inline void __ClearPageTail(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ -#ifdef CONFIG_UNEVICTABLE_LRU -#define __PG_UNEVICTABLE (1 << PG_unevictable) -#else -#define __PG_UNEVICTABLE 0 -#endif - #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define __PG_MLOCKED (1 << PG_mlocked) #else @@ -403,7 +389,7 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - __PG_UNEVICTABLE | __PG_MLOCKED) + 1 << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 34da5230faa..aec3252afcf 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -22,9 +22,7 @@ enum mapping_flags { AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ -#ifdef CONFIG_UNEVICTABLE_LRU AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ -#endif }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -37,8 +35,6 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } } -#ifdef CONFIG_UNEVICTABLE_LRU - static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); @@ -55,14 +51,6 @@ static inline int mapping_unevictable(struct address_space *mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -#else -static inline void mapping_set_unevictable(struct address_space *mapping) { } -static inline void mapping_clear_unevictable(struct address_space *mapping) { } -static inline int mapping_unevictable(struct address_space *mapping) -{ - return 0; -} -#endif static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b35bc0e19cd..619379a1dd9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -105,18 +105,11 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int page_mkclean(struct page *); -#ifdef CONFIG_UNEVICTABLE_LRU /* * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ int try_to_munlock(struct page *); -#else -static inline int try_to_munlock(struct page *page) -{ - return 0; /* a.k.a. 
SWAP_SUCCESS */ -} -#endif #else /* !CONFIG_MMU */ diff --git a/include/linux/swap.h b/include/linux/swap.h index d476aad3ff5..f30c06908f0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,7 +235,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU extern int page_evictable(struct page *page, struct vm_area_struct *vma); extern void scan_mapping_unevictable_pages(struct address_space *); @@ -244,24 +243,6 @@ extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); -#else -static inline int page_evictable(struct page *page, - struct vm_area_struct *vma) -{ - return 1; -} - -static inline void scan_mapping_unevictable_pages(struct address_space *mapping) -{ -} - -static inline int scan_unevictable_register_node(struct node *node) -{ - return 0; -} - -static inline void scan_unevictable_unregister_node(struct node *node) { } -#endif extern int kswapd_run(int nid); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 524cd1b28ec..ff4696c6dce 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -41,7 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif -#ifdef CONFIG_UNEVICTABLE_LRU UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ @@ -50,7 +49,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ UNEVICTABLE_MLOCKFREED, -#endif NR_VM_EVENT_ITEMS }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0e51a35a448..2ccee08f92f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1325,7 +1325,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif -#ifdef CONFIG_UNEVICTABLE_LRU { .ctl_name = CTL_UNNUMBERED, .procname = "scan_unevictable_pages", @@ -1334,7 +1333,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, -#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/mm/Kconfig b/mm/Kconfig index 71830ba7b98..97d2c88b745 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -203,25 +203,13 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config UNEVICTABLE_LRU - bool "Add LRU list to track non-evictable pages" - default y - help - Keeps unevictable pages off of the active and inactive pageout - lists, so kswapd will not waste CPU time or have its balancing - algorithms thrown off by scanning these pages. Selecting this - will use one page flag and increase the code size a little, - say Y unless you know what you are doing. - - See Documentation/vm/unevictable-lru.txt for more information. 
- config HAVE_MLOCK bool default y if MMU=y config HAVE_MLOCKED_PAGE_BIT bool - default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + default y if HAVE_MLOCK=y config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index b4ac332e807..f02c7508068 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -73,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * unevictable_migrate_page() called only from migrate_page_copy() to * migrate unevictable flag to new page. @@ -85,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) if (TestClearPageUnevictable(old)) SetPageUnevictable(new); } -#else -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ -} -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* diff --git a/mm/mlock.c b/mm/mlock.c index ac130433c7d..45eb650b965 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -31,7 +31,6 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); -#ifdef CONFIG_UNEVICTABLE_LRU /* * Mlocked pages are marked with PageMlocked() flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate @@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -#else /* CONFIG_UNEVICTABLE_LRU */ - -/* - * Just make pages present if VM_LOCKED. No-op if unlocking. - */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) -{ - if (mlock && (vma->vm_flags & VM_LOCKED)) - return make_pages_present(start, end); - return 0; -} - -static inline int __mlock_posix_error_return(long retval) -{ - return 0; -} - -#endif /* CONFIG_UNEVICTABLE_LRU */ - /** * mlock_vma_pages_range() - mlock pages in specified vma range. * @vma - the vma containing the specfied address range diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00e293734fc..c95a77cd581 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2077,19 +2077,14 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" -//TODO: check/adjust line lengths -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lu" -#endif " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -2113,9 +2108,7 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2129,9 +2122,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? 
"yes" : "no") diff --git a/mm/rmap.c b/mm/rmap.c index 23122af3261..316c9d6930a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1202,7 +1202,6 @@ int try_to_unmap(struct page *page, int migration) return ret; } -#ifdef CONFIG_UNEVICTABLE_LRU /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1226,4 +1225,4 @@ int try_to_munlock(struct page *page) else return try_to_unmap_file(page, 1, 0); } -#endif + diff --git a/mm/vmscan.c b/mm/vmscan.c index 879d034930c..2c4b945b011 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -514,7 +514,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) * * lru_lock must not be held, interrupts must be enabled. */ -#ifdef CONFIG_UNEVICTABLE_LRU void putback_lru_page(struct page *page) { int lru; @@ -568,20 +567,6 @@ redo: put_page(page); /* drop ref from isolate */ } -#else /* CONFIG_UNEVICTABLE_LRU */ - -void putback_lru_page(struct page *page) -{ - int lru; - VM_BUG_ON(PageLRU(page)); - - lru = !!TestClearPageActive(page) + page_is_file_cache(page); - lru_cache_add_lru(page, lru); - put_page(page); -} -#endif /* CONFIG_UNEVICTABLE_LRU */ - - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -2470,7 +2455,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * page_evictable - test whether a page is evictable * @page: the page to test @@ -2717,4 +2701,3 @@ void scan_unevictable_unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e151cf6bf8..1e3aa8139f2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", -#ifdef CONFIG_UNEVICTABLE_LRU "nr_unevictable", "nr_mlock", -#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -687,7 +685,6 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif -#ifdef CONFIG_UNEVICTABLE_LRU "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -697,7 +694,6 @@ static const char * const vmstat_text[] = { "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", #endif -#endif }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, -- cgit v1.2.3-70-g09d2 From cb4b86ba47bb0937b71fb825b3ed88adf7a190f0 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:52 -0700 Subject: mm: add swap cache interface for swap reference In a following patch, the usage of swap cache is recorded into swap_map. This patch is for necessary interface changes to do that. 2 interfaces: - swapcache_prepare() - swapcache_free() are added for allocating/freeing refcnt from swap-cache to existing swap entries. But implementation itself is not changed under this patch. At adding swapcache_free(), memcg's hook code is moved under swapcache_free(). This is better than using scattered hooks. 
Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Acked-by: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++++++ mm/shmem.c | 2 +- mm/swap_state.c | 11 +++++------ mm/swapfile.c | 19 +++++++++++++++++++ mm/vmscan.c | 3 +-- 5 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index f30c06908f0..259e96c150e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -282,8 +282,10 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); +extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); +extern void swapcache_free(swp_entry_t, struct page *page); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); @@ -352,11 +354,16 @@ static inline void show_swap_cache_info(void) #define free_swap_and_cache(swp) is_migration_entry(swp) #define swap_duplicate(swp) is_migration_entry(swp) +#define swapcache_prepare(swp) is_migration_entry(swp) static inline void swap_free(swp_entry_t swp) { } +static inline void swapcache_free(swp_entry_t swp, struct page *page) +{ +} + static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/shmem.c b/mm/shmem.c index 0132fbd45a2..47ab1918228 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); - swap_free(swap); + swapcache_free(swap, NULL); redirty: set_page_dirty(page); if (wbc->for_reclaim) diff --git a/mm/swap_state.c b/mm/swap_state.c index 1416e7e9e02..19bdf3017a9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -162,11 +162,11 @@ int add_to_swap(struct page *page) return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); + swapcache_free(entry, NULL); continue; default: /* -ENOMEM radix-tree allocation failure */ - swap_free(entry); + swapcache_free(entry, NULL); return 0; } } @@ -188,8 +188,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); - mem_cgroup_uncharge_swapcache(page, entry); - swap_free(entry); + swapcache_free(entry, page); page_cache_release(page); } @@ -293,7 +292,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swap_duplicate(entry)) + if (!swapcache_prepare(entry)) break; /* @@ -317,7 +316,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } ClearPageSwapBacked(new_page); __clear_page_locked(new_page); - swap_free(entry); + swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe0ab6..3187079903f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -509,6 +509,16 @@ void swap_free(swp_entry_t entry) } } +/* + * Called after dropping swapcache to decrease refcnt to swap entries. 
+ */ +void swapcache_free(swp_entry_t entry, struct page *page) +{ + if (page) + mem_cgroup_uncharge_swapcache(page, entry); + return swap_free(entry); +} + /* * How many references to page are currently swapped out? */ @@ -1979,6 +1989,15 @@ bad_file: goto out; } +/* + * Called when allocating swap cache for an existing swap entry. + */ +int swapcache_prepare(swp_entry_t entry) +{ + return swap_duplicate(entry); +} + + struct swap_info_struct * get_swap_info_struct(unsigned type) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2c4b945b011..52339dd7bf8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_swapcache(page, swap); - swap_free(swap); + swapcache_free(swap, page); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); -- cgit v1.2.3-70-g09d2 From 355cfa73ddff2fb8fa14e93bd94a057cc022512e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:53 -0700 Subject: mm: modify swap_map and add SWAP_HAS_CACHE flag This is a part of the patches for fixing memcg's swap accounting leak. But, IMHO, not a bad patch even without memcg. There are 2 kinds of references to swap. - reference from swap entry - reference from swap cache Then, - If there is swap cache && swap's refcnt is 1, there is only swap cache. (*) swapcount(entry) == 1 && find_get_page(swapper_space, entry) != NULL This counting logic has worked well for a long time. But since swap_map[] alone cannot tell whether a _real_ reference remains, the current use of the counter is not very good. This patch adds a flag SWAP_HAS_CACHE and records whether a swap entry has a cache or not. This will remove the -1 magic used in swapfile.c and help avoid unnecessary find_get_page() calls. Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Daisuke Nishimura Cc: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 14 ++-- mm/swap_state.c | 5 +- mm/swapfile.c | 214 ++++++++++++++++++++++++++++++++++++++------- 3 files changed, 172 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 259e96c150e..fed5e8e1104 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -129,9 +129,10 @@ enum { #define SWAP_CLUSTER_MAX 32 -#define SWAP_MAP_MAX 0x7fff -#define SWAP_MAP_BAD 0x8000 - +#define SWAP_MAP_MAX 0x7ffe +#define SWAP_MAP_BAD 0x7fff +#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */ +#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE) /* * The in-memory structure used to track swap areas.
*/ @@ -281,7 +282,7 @@ extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); -extern int swap_duplicate(swp_entry_t); +extern void swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); @@ -353,9 +354,12 @@ static inline void show_swap_cache_info(void) } #define free_swap_and_cache(swp) is_migration_entry(swp) -#define swap_duplicate(swp) is_migration_entry(swp) #define swapcache_prepare(swp) is_migration_entry(swp) +static inline void swap_duplicate(swp_entry_t swp) +{ +} + static inline void swap_free(swp_entry_t swp) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 19bdf3017a9..b9ca029673a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -292,7 +292,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swapcache_prepare(entry)) + err = swapcache_prepare(entry); + if (err == -EEXIST) /* seems racy */ + continue; + if (err) /* swp entry is obsolete ? */ break; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 3187079903f..0d7296971ad 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,6 +53,33 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +/* For reference count accounting in swap_map */ +/* enum for swap_map[] handling. internal use only */ +enum { + SWAP_MAP = 0, /* ops for reference from swap users */ + SWAP_CACHE, /* ops for reference from swap cache */ +}; + +static inline int swap_count(unsigned short ent) +{ + return ent & SWAP_COUNT_MASK; +} + +static inline bool swap_has_cache(unsigned short ent) +{ + return !!(ent & SWAP_HAS_CACHE); +} + +static inline unsigned short encode_swapmap(int count, bool has_cache) +{ + unsigned short ret = count; + + if (has_cache) + return SWAP_HAS_CACHE | ret; + return ret; +} + + /* * We need this because the bdev->unplug_fn can sleep and we cannot * hold swap_lock while calling the unplug_fn. 
And swap_lock @@ -167,7 +194,8 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 -static inline unsigned long scan_swap_map(struct swap_info_struct *si) +static inline unsigned long scan_swap_map(struct swap_info_struct *si, + int cache) { unsigned long offset; unsigned long scan_base; @@ -285,7 +313,10 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - si->swap_map[offset] = 1; + if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ + si->swap_map[offset] = encode_swapmap(0, true); + else /* at suspend */ + si->swap_map[offset] = encode_swapmap(1, false); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -401,7 +432,8 @@ swp_entry_t get_swap_page(void) continue; swap_list.next = next; - offset = scan_swap_map(si); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -415,6 +447,7 @@ noswap: return (swp_entry_t) {0}; } +/* The only caller of this function is now susupend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -424,7 +457,8 @@ swp_entry_t get_swap_page_of_type(int type) si = swap_info + type; if (si->flags & SWP_WRITEOK) { nr_swap_pages--; - offset = scan_swap_map(si); + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, SWAP_MAP); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -471,25 +505,38 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) +static int swap_entry_free(struct swap_info_struct *p, + swp_entry_t ent, int cache) { unsigned long offset = swp_offset(ent); - int count = p->swap_map[offset]; - - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = count; - if (!count) { - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; - nr_swap_pages++; - p->inuse_pages--; - mem_cgroup_uncharge_swap(ent); + int count = swap_count(p->swap_map[offset]); + bool has_cache; + + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_MAP) { /* dropping usage count of swap */ + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = encode_swapmap(count, has_cache); } + } else { /* dropping swap cache flag */ + VM_BUG_ON(!has_cache); + p->swap_map[offset] = encode_swapmap(count, false); + + } + /* return code. 
*/ + count = p->swap_map[offset]; + /* free if no reference */ + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = p - swap_info; + nr_swap_pages++; + p->inuse_pages--; + mem_cgroup_uncharge_swap(ent); } return count; } @@ -504,7 +551,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry); + swap_entry_free(p, entry, SWAP_MAP); spin_unlock(&swap_lock); } } @@ -514,9 +561,16 @@ void swap_free(swp_entry_t entry) */ void swapcache_free(swp_entry_t entry, struct page *page) { + struct swap_info_struct *p; + if (page) mem_cgroup_uncharge_swapcache(page, entry); - return swap_free(entry); + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, entry, SWAP_CACHE); + spin_unlock(&swap_lock); + } + return; } /* @@ -531,8 +585,7 @@ static inline int page_swapcount(struct page *page) entry.val = page_private(page); p = swap_info_get(entry); if (p) { - /* Subtract the 1 for the swap cache itself */ - count = p->swap_map[swp_offset(entry)] - 1; + count = swap_count(p->swap_map[swp_offset(entry)]); spin_unlock(&swap_lock); } return count; @@ -594,7 +647,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry) == 1) { + if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -901,7 +954,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, i = 1; } count = si->swap_map[i]; - if (count && count != SWAP_MAP_BAD) + if (count && swap_count(count) != SWAP_MAP_BAD) break; } return i; @@ -1005,13 +1058,13 @@ static int try_to_unuse(unsigned int type) */ shmem = 0; swcount = *swap_map; - if (swcount > 1) { + if (swap_count(swcount)) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else retval = unuse_mm(start_mm, entry, page); } - if (*swap_map > 1) { + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; struct mm_struct *new_start_mm = start_mm; @@ -1021,7 +1074,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (*swap_map > 1 && !retval && !shmem && + while (swap_count(*swap_map) && !retval && !shmem && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1033,14 +1086,16 @@ static int try_to_unuse(unsigned int type) cond_resched(); swcount = *swap_map; - if (swcount <= 1) + if (!swap_count(swcount)) /* any usage ? */ ; else if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else retval = unuse_mm(mm, entry, page); - if (set_start_mm && *swap_map < swcount) { + + if (set_start_mm && + swap_count(*swap_map) < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); new_start_mm = mm; @@ -1067,21 +1122,25 @@ static int try_to_unuse(unsigned int type) } /* - * How could swap count reach 0x7fff when the maximum - * pid is 0x7fff, and there's no way to repeat a swap - * page within an mm (except in shmem, where it's the - * shared object which takes the reference count)? - * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. - * + * How could swap count reach 0x7ffe ? 
+ * There's no way to repeat a swap page within an mm + * (except in shmem, where it's the shared object which takes + * the reference count)? + * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned + * short is too small....) * If that's wrong, then we should worry more about * exit_mmap() and do_munmap() cases described above: * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. */ - if (*swap_map == SWAP_MAP_MAX) { + /* We might release the lock_page() in unuse_mm(). */ + if (!PageSwapCache(page) || page_private(page) != entry.val) + goto retry; + + if (swap_count(*swap_map) == SWAP_MAP_MAX) { spin_lock(&swap_lock); - *swap_map = 1; + *swap_map = encode_swapmap(0, true); spin_unlock(&swap_lock); reset_overflow = 1; } @@ -1099,7 +1158,8 @@ static int try_to_unuse(unsigned int type) * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. */ - if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { + if (swap_count(*swap_map) && + PageDirty(page) && PageSwapCache(page)) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; @@ -1126,6 +1186,7 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); +retry: unlock_page(page); page_cache_release(page); @@ -1952,15 +2013,23 @@ void si_swapinfo(struct sysinfo *val) * * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as * "permanent", but will be reclaimed by the next swapoff. + * Returns error code in following case. + * - success -> 0 + * - swp_entry is invalid -> EINVAL + * - swp_entry is migration entry -> EINVAL + * - swap-cache reference is requested but there is already one. -> EEXIST + * - swap-cache reference is requested but the entry is not used. 
-> ENOENT */ -int swap_duplicate(swp_entry_t entry) +static int __swap_duplicate(swp_entry_t entry, bool cache) { struct swap_info_struct * p; unsigned long offset, type; - int result = 0; + int result = -EINVAL; + int count; + bool has_cache; if (is_migration_entry(entry)) - return 1; + return -EINVAL; type = swp_type(entry); if (type >= nr_swapfiles) @@ -1969,17 +2038,40 @@ int swap_duplicate(swp_entry_t entry) offset = swp_offset(entry); spin_lock(&swap_lock); - if (offset < p->max && p->swap_map[offset]) { - if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { - p->swap_map[offset]++; - result = 1; - } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + + if (unlikely(offset >= p->max)) + goto unlock_out; + + count = swap_count(p->swap_map[offset]); + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + + /* set SWAP_HAS_CACHE if there is no cache and entry is used */ + if (!has_cache && count) { + p->swap_map[offset] = encode_swapmap(count, true); + result = 0; + } else if (has_cache) /* someone added cache */ + result = -EEXIST; + else if (!count) /* no users */ + result = -ENOENT; + + } else if (count || has_cache) { + if (count < SWAP_MAP_MAX - 1) { + p->swap_map[offset] = encode_swapmap(count + 1, + has_cache); + result = 0; + } else if (count <= SWAP_MAP_MAX) { if (swap_overflow++ < 5) - printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = SWAP_MAP_MAX; - result = 1; + printk(KERN_WARNING + "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, + has_cache); + result = 0; } - } + } else + result = -ENOENT; /* unused swap entry */ +unlock_out: spin_unlock(&swap_lock); out: return result; @@ -1988,13 +2080,25 @@ bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } +/* + * increase reference count of swap entry by 1. + */ +void swap_duplicate(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP); +} /* + * @entry: swap entry for which we allocate swap cache. + * + * Called when allocating swap cache for an existing swap entry. + * This can return error codes. Returns 0 on success. + * -EEXIST means there is already a swap cache. + * Note: return code is different from swap_duplicate(). + */ int swapcache_prepare(swp_entry_t entry) { - return swap_duplicate(entry); + return __swap_duplicate(entry, SWAP_CACHE); } @@ -2035,7 +2139,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } /* Count contiguous allocated slots below our target */ @@ -2043,7 +2147,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } spin_unlock(&swap_lock); -- cgit v1.2.3-70-g09d2 From 2ff05b2b4eac2e63d345fc731ea151a060247f53 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:56 -0700 Subject: oom: move oom_adj value from task_struct to mm_struct The per-task oom_adj value is a characteristic of its mm more than the task itself since it's not possible to oom kill any thread that shares the mm.
If a task were to be killed while attached to an mm that could not be freed because another thread was set to OOM_DISABLE, it would have needlessly been terminated since there is no potential for future memory freeing. This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct mm_struct. This requires task_lock() on a task to check its oom_adj value to protect against exec, but it's already necessary to take the lock when dereferencing the mm to find the total VM size for the badness heuristic. This fixes a livelock if the oom killer chooses a task and another thread sharing the same memory has an oom_adj value of OOM_DISABLE. This occurs because oom_kill_task() repeatedly returns 1 and refuses to kill the chosen task while select_bad_process() will repeatedly choose the same task during the next retry. Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in oom_kill_task() to check for threads sharing the same memory will be removed in the next patch in this series where it will no longer be necessary. Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since these threads are immune from oom killing already. They simply report an oom_adj value of OOM_DISABLE. Cc: Nick Piggin Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 15 ++++++++++----- fs/proc/base.c | 19 ++++++++++++++++--- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 1 - mm/oom_kill.c | 34 ++++++++++++++++++++------------ 5 files changed, 50 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cd8717a3627..ebff3c10a07 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score ------------------------------------------------------ -This file can be used to adjust the score used to select which processes -should be killed in an out-of-memory situation. Giving it a high score will -increase the likelihood of this process being killed by the oom-killer. Valid -values are in the range -16 to +15, plus the special value -17, which disables -oom-killing altogether for this process. +This file can be used to adjust the score used to select which processes should +be killed in an out-of-memory situation. The oom_adj value is a characteristic +of the task's mm, so all threads that share an mm with pid will have the same +oom_adj value. A high value will increase the likelihood of this process being +killed by the oom-killer. Valid values are in the range -16 to +15 as +explained below and a special value of -17, which disables oom-killing +altogether for threads sharing pid's mm. The process to be killed in an out-of-memory situation is selected among all others based on its badness score. This value equals the original memory size of the process @@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers are the prime candidates to be killed. Having only one 'hungry' child will make parent less preferable than the child. +/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from +oom-killing already. + /proc/<pid>/oom_score shows process' current badness score.
The following heuristics are then applied: diff --git a/fs/proc/base.c b/fs/proc/base.c index 1539e630c47..3ce5ae9e3d2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - oom_adjust = task->oomkilladj; + task_lock(task); + if (task->mm) + oom_adjust = task->mm->oom_adj; + else + oom_adjust = OOM_DISABLE; + task_unlock(task); put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { + task_lock(task); + if (!task->mm) { + task_unlock(task); + put_task_struct(task); + return -EINVAL; + } + if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { + task_unlock(task); put_task_struct(task); return -EACCES; } - task->oomkilladj = oom_adjust; + task->mm->oom_adj = oom_adjust; + task_unlock(task); put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e80e26ecf2..f4408106fcb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -232,6 +232,8 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + s8 oom_adj; /* OOM kill score adjustment (bit shift) */ + cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1048bf50540..1bc6fae0c13 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1178,7 +1178,6 @@ struct task_struct { * a short time */ unsigned char fpu_counter; - s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460e922..b60913520ef 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; + int oom_adj; task_lock(p); mm = p->mm; @@ -65,6 +66,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } + oom_adj = mm->oom_adj; /* * The memory size of the process is the basis for the badness. @@ -148,15 +150,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oomkilladj. + * Adjust the score by oom_adj. 
*/ - if (p->oomkilladj) { - if (p->oomkilladj > 0) { + if (oom_adj) { + if (oom_adj > 0) { if (!points) points = 1; - points <<= p->oomkilladj; + points <<= oom_adj; } else - points >>= -(p->oomkilladj); + points >>= -(oom_adj); } #ifdef DEBUG @@ -251,8 +253,12 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - if (p->oomkilladj == OOM_DISABLE) + task_lock(p); + if (p->mm && p->mm->oom_adj == OOM_DISABLE) { + task_unlock(p); continue; + } + task_unlock(p); points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { @@ -304,8 +310,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, - p->comm); + get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -367,8 +372,12 @@ static int oom_kill_task(struct task_struct *p) * Don't kill the process if any threads are set to OOM_DISABLE */ do_each_thread(g, q) { - if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + task_lock(q); + if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) { + task_unlock(q); return 1; + } + task_unlock(q); } while_each_thread(g, q); __oom_kill_task(p, 1); @@ -393,10 +402,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); task_lock(current); + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oom_adj=%d\n", + current->comm, gfp_mask, order, + current->mm ? current->mm->oom_adj : OOM_DISABLE); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); -- cgit v1.2.3-70-g09d2 From 286973552f051404abdb58dd9b2f8f7558efe4e5 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Tue, 16 Jun 2009 15:32:59 -0700 Subject: mm: remove __invalidate_mapping_pages variant Remove __invalidate_mapping_pages atomic variant now that its sole caller can sleep (fixed in eccb95cee4f0d56faa46ef22fb94dd4a3578d3eb ("vfs: fix lock inversion in drop_pagecache_sb()")). This fixes softlockups that can occur while in the drop_caches path. 
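The fix comes down to rescheduling between pagevec batches, which the atomic variant used to skip. A toy user-space model of that batching pattern (illustrative only; sched_yield() stands in for cond_resched(), and the batch size is made up):

#include <sched.h>
#include <stdio.h>

#define BATCH	14	/* pagevec-sized batch; the value is illustrative */

static void invalidate_range_demo(unsigned long start, unsigned long end)
{
	unsigned long next = start;

	while (next <= end) {
		unsigned long stop = next + BATCH;

		for (; next < stop && next <= end; next++)
			;	/* ... try to invalidate one unlocked page ... */
		sched_yield();	/* the scan now always yields between batches */
	}
}

int main(void)
{
	invalidate_range_demo(0, 100);
	puts("scanned in batches, yielding between them");
	return 0;
}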
Signed-off-by: Mike Waychison Cc: Jan Kara Cc: Wu Fengguang Cc: Dave Chinner Cc: Nick Piggin Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/drop_caches.c | 2 +- include/linux/fs.h | 3 --- mm/truncate.c | 39 ++++++++++++++++----------------------- 3 files changed, 17 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index b6a719a909f..a2edb791344 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -24,7 +24,7 @@ static void drop_pagecache_sb(struct super_block *sb) continue; __iget(inode); spin_unlock(&inode_lock); - __invalidate_mapping_pages(inode->i_mapping, 0, -1, true); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; spin_lock(&inode_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8146e0264ef..f5ae9f19b8a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2036,9 +2036,6 @@ extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif extern int invalidate_inodes(struct super_block *); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, - bool be_atomic); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/truncate.c b/mm/truncate.c index 12e1579f916..ccc3ecf7cb9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, bool be_atomic) +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t next = start; @@ -309,30 +322,10 @@ unlock: break; } pagevec_release(&pvec); - if (likely(!be_atomic)) - cond_resched(); + cond_resched(); } return ret; } - -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. 
- */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - return __invalidate_mapping_pages(mapping, start, end, false); -} EXPORT_SYMBOL(invalidate_mapping_pages); /* -- cgit v1.2.3-70-g09d2 From aca8bf323edd31ad462dc98c107c23a5c6022ca2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm: remove file argument from swap_readpage() The file argument dates back to address_space's readpage from long ago. We don't use it any more, so remove the unnecessary argument. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/page_io.c | 2 +- mm/swap_state.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index fed5e8e1104..0cedf31af0b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -256,7 +256,7 @@ extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ -extern int swap_readpage(struct file *, struct page *); +extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); extern void end_swap_bio_read(struct bio *bio, int err); diff --git a/mm/page_io.c b/mm/page_io.c index 3023c475e04..c6f3e5071de 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -120,7 +120,7 @@ out: return ret; } -int swap_readpage(struct file *file, struct page *page) +int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; diff --git a/mm/swap_state.c b/mm/swap_state.c index b62e7f56051..42cd38eba79 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -313,7 +313,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); - swap_readpage(NULL, new_page); + swap_readpage(new_page); return new_page; } ClearPageSwapBacked(new_page); -- cgit v1.2.3-70-g09d2 From 168f5ac668f63dfb64439766e3ef9e866b83719d Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm cleanup: shmem_file_setup: 'char *' -> 'const char *' for name argument Since shmem_file_setup() does not modify, allocate, free, or pass on the given filename, mark it as const.
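The practical effect of the const change can be shown with a mocked-up signature (a hypothetical demo; loff_t is approximated by long long and the body is invented): callers almost always pass string literals, which only bind cleanly to a const char * parameter.

#include <stdio.h>

/* mock of the new prototype, for illustration only */
static int shmem_file_setup(const char *name, long long size, unsigned long flags)
{
	printf("shmem file \"%s\", %lld bytes, flags %#lx\n", name, size, flags);
	return 0;
}

int main(void)
{
	/*
	 * A string literal is effectively const data; with the old
	 * 'char *name' prototype this call deserved a warning.
	 */
	return shmem_file_setup("dev/zero", 4096LL, 0);
}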
Signed-off-by: Sergei Trofimovich Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/shmem.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f0473a88ca2..d88d6fc530a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -724,7 +724,7 @@ static inline int shmem_lock(struct file *file, int lock, return 0; } #endif -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); diff --git a/mm/shmem.c b/mm/shmem.c index 47ab1918228..e89d7ec18ed 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { int error; struct file *file; -- cgit v1.2.3-70-g09d2 From 6fe6b7e35785e3232ffe7f81d3893f1316710a02 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:05 -0700 Subject: vmscan: report vm_flags in page_referenced() Collect vma->vm_flags of the VMAs that actually referenced the page. This is preparing for more informed reclaim heuristics, eg. to protect executable file pages more aggressively. For now only the VM_EXEC bit will be used by the caller. Thanks to Johannes, Peter and Minchan for all the good tips. Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 5 +++-- mm/rmap.c | 37 ++++++++++++++++++++++++++----------- mm/vmscan.c | 7 +++++-- 3 files changed, 34 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 619379a1dd9..216d024f830 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,7 +83,8 @@ static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, /* * Called from mm/vmscan.c to handle paging out */ -int page_referenced(struct page *, int is_locked, struct mem_cgroup *cnt); +int page_referenced(struct page *, int is_locked, + struct mem_cgroup *cnt, unsigned long *vm_flags); int try_to_unmap(struct page *, int ignore_refs); /* @@ -117,7 +118,7 @@ int try_to_munlock(struct page *); #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) -#define page_referenced(page,l,cnt) TestClearPageReferenced(page) +#define page_referenced(page, locked, cnt, flags) TestClearPageReferenced(page) #define try_to_unmap(page, refs) SWAP_FAIL static inline int page_mkclean(struct page *page) diff --git a/mm/rmap.c b/mm/rmap.c index 316c9d6930a..c9ccc1a72dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * repeatedly from either page_referenced_anon or page_referenced_file. 
*/ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount) + struct vm_area_struct *vma, + unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -381,11 +383,14 @@ out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: + if (referenced) + *vm_flags |= vma->vm_flags; return referenced; } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and * check/clear the referenced flag. This is done by following the page->mapping @@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. */ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, * @page: the page to test * @is_locked: caller holds lock on the page * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. 
*/ -int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont) +int page_referenced(struct page *page, + int is_locked, + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { int referenced = 0; if (TestClearPageReferenced(page)) referenced++; + *vm_flags = 0; if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont); + referenced += page_referenced_anon(page, mem_cont, + vm_flags); else if (is_locked) - referenced += page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, mem_cont, + vm_flags); else if (!trylock_page(page)) referenced++; else { if (page->mapping) - referenced += - page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, + mem_cont, vm_flags); unlock_page(page); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 52339dd7bf8..6be2068f61c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -577,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; + unsigned long vm_flags; cond_resched(); @@ -627,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, sc->mem_cgroup); + referenced = page_referenced(page, 1, + sc->mem_cgroup, &vm_flags); /* In active use or really unfreeable? Activate it. */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced && page_mapping_inuse(page)) @@ -1208,6 +1210,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, { unsigned long pgmoved; unsigned long pgscanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); struct page *page; @@ -1248,7 +1251,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) pgmoved++; list_add(&page->lru, &l_inactive); -- cgit v1.2.3-70-g09d2 From 24cf72518c79cdcda486ed26074ff8151291cf65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:23 -0700 Subject: vmscan: count the number of times zone_reclaim() scans and fails On NUMA machines, the administrator can configure zone_reclaim_mode, a more targeted form of direct reclaim. On machines with large NUMA distances, for example, zone_reclaim_mode defaults to 1, meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. There is a heuristic that determines if the scan is worthwhile, but it is possible that the heuristic will fail and the CPU gets tied up scanning uselessly. Detecting the situation requires some guesswork and experimentation, so this patch adds a counter "zreclaim_failed" to /proc/vmstat. If during high CPU utilisation this counter is increasing rapidly, then the resolution to the problem may be to set /proc/sys/vm/zone_reclaim_mode to 0.
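A small user-space reader for the new counter can help with the experimentation described above (a sketch assuming a CONFIG_NUMA kernel carrying this patch, so that /proc/vmstat contains a zone_reclaim_failed line; sample it twice under high CPU load and compare):

#include <stdio.h>
#include <string.h>

static long read_zone_reclaim_failed(void)
{
	char name[64];
	long value;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fscanf(f, "%63s %ld", name, &value) == 2) {
		if (!strcmp(name, "zone_reclaim_failed")) {
			fclose(f);
			return value;
		}
	}
	fclose(f);
	return -1;	/* counter absent: not a CONFIG_NUMA kernel */
}

int main(void)
{
	printf("zone_reclaim_failed = %ld\n", read_zone_reclaim_failed());
	return 0;
}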
[akpm@linux-foundation.org: name things consistently] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 +++ mm/vmscan.c | 3 +++ mm/vmstat.c | 3 +++ 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ff4696c6dce..81a97cf8f0a 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -36,6 +36,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGSTEAL), FOR_ALL_ZONES(PGSCAN_KSWAPD), FOR_ALL_ZONES(PGSCAN_DIRECT), +#ifdef CONFIG_NUMA + PGSCAN_ZONE_RECLAIM_FAILED, +#endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/vmscan.c b/mm/vmscan.c index 16c82a868e2..3018ad75613 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2519,6 +2519,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e3aa8139f2..138bed53706 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -673,6 +673,9 @@ static const char * const vmstat_text[] = { TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") +#ifdef CONFIG_NUMA + "zone_reclaim_failed", +#endif "pginodesteal", "slabs_scanned", "kswapd_steal", -- cgit v1.2.3-70-g09d2 From a7d932af06e8eee2163627d19898e18da5635449 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Tue, 16 Jun 2009 15:33:33 -0700 Subject: utsname.h: make new_utsname fields use the proper length constant The members of the new_utsname structure are defined with magic numbers that *should* correspond to the constant __NEW_UTS_LEN+1. Everywhere else, code assumes this and uses the constant, so this patch makes the structure match. Originally suggested by Serge here: https://lists.linux-foundation.org/pipermail/containers/2009-March/016258.html Signed-off-by: Dan Smith Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/utsname.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 11232676bff..3656b300de3 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -22,12 +22,12 @@ struct old_utsname { }; struct new_utsname { - char sysname[65]; - char nodename[65]; - char release[65]; - char version[65]; - char machine[65]; - char domainname[65]; + char sysname[__NEW_UTS_LEN + 1]; + char nodename[__NEW_UTS_LEN + 1]; + char release[__NEW_UTS_LEN + 1]; + char version[__NEW_UTS_LEN + 1]; + char machine[__NEW_UTS_LEN + 1]; + char domainname[__NEW_UTS_LEN + 1]; }; #ifdef __KERNEL__ -- cgit v1.2.3-70-g09d2 From 4938d7e0233a455f04507bac81d0886c71529537 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Jun 2009 15:33:36 -0700 Subject: poll: avoid extra wakeups in select/poll After the introduction of the keyed wakeups Davide Libenzi added to epoll, we are able to avoid spurious wakeups in poll()/select() code too. For example, a typical use of poll()/select() is to wait for incoming network frames on many sockets. But TX completion for UDP/TCP frames calls sock_wfree(), which in turn schedules the thread.
When scheduled, the thread does a full scan of all polled fds and can sleep again, because nothing is really available. If the number of fds is large, this causes significant load. This patch makes select()/poll() aware of keyed wakeups, so useless wakeups are avoided. This reduces the number of context switches by about 50% on some setups, and the work performed by softirq handlers. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Acked-by: Andi Kleen Acked-by: Ingo Molnar Acked-by: Davide Libenzi Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 40 ++++++++++++++++++++++++++++++++++++---- include/linux/poll.h | 3 +++ 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/select.c b/fs/select.c index 0fe0e1469df..d870237e42c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -168,7 +168,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -194,6 +194,16 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_table_entry *entry; + + entry = container_of(wait, struct poll_table_entry, wait); + if (key && !((unsigned long)key & entry->key)) + return 0; + return __pollwake(wait, mode, sync, key); +} + /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) @@ -205,6 +215,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, get_file(filp); entry->filp = filp; entry->wait_address = wait_address; + entry->key = p->key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); @@ -362,6 +373,18 @@ get_max: #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) #define POLLEX_SET (POLLPRI) +static inline void wait_key_set(poll_table *wait, unsigned long in, + unsigned long out, unsigned long bit) +{ + if (wait) { + wait->key = POLLEX_SET; + if (in & bit) + wait->key |= POLLIN_SET; + if (out & bit) + wait->key |= POLLOUT_SET; + } +} + int do_select(int n, fd_set_bits *fds, struct timespec *end_time) { ktime_t expire, *to = NULL; @@ -418,20 +441,25 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) if (file) { f_op = file->f_op; mask = DEFAULT_POLLMASK; - if (f_op && f_op->poll) - mask = (*f_op->poll)(file, retval ? NULL : wait); + if (f_op && f_op->poll) { + wait_key_set(wait, in, out, bit); + mask = (*f_op->poll)(file, wait); + } fput_light(file, fput_needed); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; + wait = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; + wait = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; + wait = NULL; } } } @@ -685,8 +713,12 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = POLLNVAL; if (file != NULL) { mask = DEFAULT_POLLMASK; - if (file->f_op && file->f_op->poll) + if (file->f_op && file->f_op->poll) { + if (pwait) + pwait->key = pollfd->events | + POLLERR | POLLHUP; mask = file->f_op->poll(file, pwait); + } /* Mask out unneeded events.
*/ mask &= pollfd->events | POLLERR | POLLHUP; fput_light(file, fput_needed); diff --git a/include/linux/poll.h b/include/linux/poll.h index 8c24ef8d997..fa287f25138 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -32,6 +32,7 @@ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_ typedef struct poll_table_struct { poll_queue_proc qproc; + unsigned long key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) @@ -43,10 +44,12 @@ static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_addres static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { pt->qproc = qproc; + pt->key = ~0UL; /* all events enabled */ } struct poll_table_entry { struct file *filp; + unsigned long key; wait_queue_t wait; wait_queue_head_t *wait_address; }; -- cgit v1.2.3-70-g09d2 From 0d9c25dde878a636ee9a9b53923569171bf9a55b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 16 Jun 2009 15:33:37 -0700 Subject: headers: move module_bug_finalize()/module_bug_cleanup() definitions into module.h They're in linux/bug.h at present, which causes include order tangles. In particular, linux/bug.h cannot be used by linux/atomic.h because, according to Nikanth: linux/bug.h pulls in linux/module.h => linux/spinlock.h => asm/spinlock.h (which uses atomic_inc) => asm/atomic.h. bug.h is a pretty low-level thing and module.h is a higher-level thing, IMO. Cc: Nikanth Karthikesan Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 12 ------------ include/linux/module.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bug.h b/include/linux/bug.h index 54398d2c6d8..d276b5510c8 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -1,7 +1,6 @@ #ifndef _LINUX_BUG_H #define _LINUX_BUG_H -#include #include enum bug_trap_type { @@ -24,10 +23,6 @@ const struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); -int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, - struct module *); -void module_bug_cleanup(struct module *); - /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); @@ -38,13 +33,6 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, { return BUG_TRAP_TYPE_BUG; } -static inline int module_bug_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *mod) -{ - return 0; -} -static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ #endif /* _LINUX_BUG_H */ diff --git a/include/linux/module.h b/include/linux/module.h index a7bc6e7b43a..505f20dcc1c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -697,4 +697,21 @@ static inline void module_remove_modinfo_attrs(struct module *mod) #define __MODULE_STRING(x) __stringify(x) + +#ifdef CONFIG_GENERIC_BUG +int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, + struct module *); +void module_bug_cleanup(struct module *); + +#else /* !CONFIG_GENERIC_BUG */ + +static inline int module_bug_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod) +{ + return 0; +} +static inline void module_bug_cleanup(struct module *mod) {} +#endif /* CONFIG_GENERIC_BUG */ + #endif /* _LINUX_MODULE_H */ -- cgit v1.2.3-70-g09d2 From 10fc89d01a7ea2ecc2a58d2f4e7700f47178cd62 Mon Sep 17 00:00:00 2001 From: 
Philipp Reisner Date: Tue, 16 Jun 2009 15:33:38 -0700 Subject: drbd: add major number to major.h Since we have had a LANANA major number for years, and it is documented in devices.txt, I think that this first patch can go upstream. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/major.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/major.h b/include/linux/major.h index 058ec15dd06..6a8ca98c9a9 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -145,6 +145,7 @@ #define UNIX98_PTY_MAJOR_COUNT 8 #define UNIX98_PTY_SLAVE_MAJOR (UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) +#define DRBD_MAJOR 147 #define RTF_MAJOR 150 #define RAW_MAJOR 162 -- cgit v1.2.3-70-g09d2 From 8b0b1db0133e4218a9b45c09e53793c039edebe1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jun 2009 15:33:39 -0700 Subject: remove put_cpu_no_resched() put_cpu_no_resched() is an optimization of put_cpu() which unfortunately can cause high latencies. The nfs iostats code uses put_cpu_no_resched() in a code sequence where a reschedule request caused by an interrupt between the get_cpu() and the put_cpu_no_resched() can delay the reschedule for at least HZ. The other users of put_cpu_no_resched() optimize correctly in interrupt code, but there is no real harm in using the put_cpu() function, which is an alias for preempt_enable(). The extra check of the preempt count is not as critical as the risk of missing a reschedule. Debugged in the preempt-rt tree and verified in mainline. Impact: remove a high latency source [akpm@linux-foundation.org: build fix] Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Tony Luck Cc: Trond Myklebust Cc: "J.
Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 2 +- fs/nfs/iostat.h | 6 +++--- include/linux/smp.h | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 8a06dc48059..bdc176cb5e8 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -5595,7 +5595,7 @@ pfm_interrupt_handler(int irq, void *arg) (*pfm_alt_intr_handler->handler)(irq, arg, regs); } - put_cpu_no_resched(); + put_cpu(); return IRQ_HANDLED; } diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index a2ab2529b5c..ceda50aad73 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -31,7 +31,7 @@ static inline void nfs_inc_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->events[stat]++; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_inc_stats(const struct inode *inode, @@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->bytes[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_add_stats(const struct inode *inode, @@ -71,7 +71,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode, cpu = get_cpu(); iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); iostats->fscache[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } #endif diff --git a/include/linux/smp.h b/include/linux/smp.h index a69db820eed..9e3d8af0920 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -177,7 +177,6 @@ static inline void init_call_single_data(void) #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#define put_cpu_no_resched() preempt_enable_no_resched() /* * Callback to arch code if there's nosmp or maxcpus=0 on the -- cgit v1.2.3-70-g09d2 From cc6f26774136b7f5307abcd3887f08360c9b7554 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Tue, 16 Jun 2009 15:33:49 -0700 Subject: syscalls.h: remove duplicated declarations for sys_pipe2 sys_pipe2 is declared twice in include/linux/syscalls.h. 
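For context, an illustrative-only sketch (not part of this patch): the two syscalls differ only in the trailing flags argument, which is why syscalls.h wants both declarations, each exactly once. Using the glibc wrappers:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int plain[2], flagged[2];

		if (pipe(plain))	/* sys_pipe */
			return 1;
		if (pipe2(flagged, O_NONBLOCK | O_CLOEXEC))	/* sys_pipe2 */
			return 1;
		return 0;
	}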
Signed-off-by: Masatake YAMATO Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 418d90f5eff..fa4242cdade 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -434,6 +434,7 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg); asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg); #endif +asmlinkage long sys_pipe(int __user *fildes); asmlinkage long sys_pipe2(int __user *fildes, int flags); asmlinkage long sys_dup(unsigned int fildes); asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd); @@ -751,8 +752,6 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, struct timespec __user *, const sigset_t __user *, size_t); -asmlinkage long sys_pipe2(int __user *, int); -asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -- cgit v1.2.3-70-g09d2 From 55e331cf7ebe20665253770589cd9eb06048bf25 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 16 Jun 2009 15:33:53 -0700 Subject: drivers: add support for the TI VLYNQ bus Add support for the TI VLYNQ high-speed, serial and packetized bus. This bus allows external devices to be connected to the System-on-Chip and appear in the main system memory just like any memory mapped peripheral. It is widely used in TI's networking and multimedia SoCs, including the AR7. Signed-off-by: Eugene Konev Signed-off-by: Florian Fainelli Cc: Ralf Baechle Cc: Alan Cox Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 + drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/vlynq/Kconfig | 20 ++ drivers/vlynq/Makefile | 5 + drivers/vlynq/vlynq.c | 814 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/vlynq.h | 161 ++++++++++ 7 files changed, 1011 insertions(+) create mode 100644 drivers/vlynq/Kconfig create mode 100644 drivers/vlynq/Makefile create mode 100644 drivers/vlynq/vlynq.c create mode 100644 include/linux/vlynq.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 2cb7566904b..b735f7dc91d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6281,6 +6281,14 @@ F: drivers/net/macvlan.c F: include/linux/if_*vlan.h F: net/8021q/ +VLYNQ BUS +P: Florian Fainelli +M: florian@openwrt.org +L: openwrt-devel@lists.openwrt.org +S: Maintained +F: drivers/vlynq/vlynq.c +F: include/linux/vlynq.h + VOLTAGE AND CURRENT REGULATOR FRAMEWORK P: Liam Girdwood M: lrg@slimlogic.co.uk diff --git a/drivers/Kconfig b/drivers/Kconfig index 00cf9553f74..a442c8f29fc 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -104,6 +104,8 @@ source "drivers/auxdisplay/Kconfig" source "drivers/uio/Kconfig" +source "drivers/vlynq/Kconfig" + source "drivers/xen/Kconfig" source "drivers/staging/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 9e7d4e56c85..00b44f4ccf0 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -105,6 +105,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_VIRTIO) += virtio/ +obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ obj-y += platform/ obj-y += ieee802154/ diff --git a/drivers/vlynq/Kconfig b/drivers/vlynq/Kconfig new file mode 100644 index 00000000000..f6542211db4 --- /dev/null +++
b/drivers/vlynq/Kconfig @@ -0,0 +1,20 @@ +menu "TI VLYNQ" + +config VLYNQ + bool "TI VLYNQ bus support" + depends on AR7 && EXPERIMENTAL + help + Support for Texas Instruments(R) VLYNQ bus. + The VLYNQ bus is a high-speed, serial and packetized + data bus which allows external peripherals of a SoC + to appear in the system's main memory. + + If unsure, say N + +config VLYNQ_DEBUG + bool "VLYNQ bus debug" + depends on VLYNQ && KERNEL_DEBUG + help + Turn on VLYNQ bus debugging. + +endmenu diff --git a/drivers/vlynq/Makefile b/drivers/vlynq/Makefile new file mode 100644 index 00000000000..b3f61149b59 --- /dev/null +++ b/drivers/vlynq/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for kernel vlynq drivers +# + +obj-$(CONFIG_VLYNQ) += vlynq.o diff --git a/drivers/vlynq/vlynq.c b/drivers/vlynq/vlynq.c new file mode 100644 index 00000000000..7335433b067 --- /dev/null +++ b/drivers/vlynq/vlynq.c @@ -0,0 +1,814 @@ +/* + * Copyright (C) 2006, 2007 Eugene Konev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Parts of the VLYNQ specification can be found here: + * http://www.ti.com/litv/pdf/sprue36a + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define VLYNQ_CTRL_PM_ENABLE 0x80000000 +#define VLYNQ_CTRL_CLOCK_INT 0x00008000 +#define VLYNQ_CTRL_CLOCK_DIV(x) (((x) & 7) << 16) +#define VLYNQ_CTRL_INT_LOCAL 0x00004000 +#define VLYNQ_CTRL_INT_ENABLE 0x00002000 +#define VLYNQ_CTRL_INT_VECTOR(x) (((x) & 0x1f) << 8) +#define VLYNQ_CTRL_INT2CFG 0x00000080 +#define VLYNQ_CTRL_RESET 0x00000001 + +#define VLYNQ_CTRL_CLOCK_MASK (0x7 << 16) + +#define VLYNQ_INT_OFFSET 0x00000014 +#define VLYNQ_REMOTE_OFFSET 0x00000080 + +#define VLYNQ_STATUS_LINK 0x00000001 +#define VLYNQ_STATUS_LERROR 0x00000080 +#define VLYNQ_STATUS_RERROR 0x00000100 + +#define VINT_ENABLE 0x00000100 +#define VINT_TYPE_EDGE 0x00000080 +#define VINT_LEVEL_LOW 0x00000040 +#define VINT_VECTOR(x) ((x) & 0x1f) +#define VINT_OFFSET(irq) (8 * ((irq) % 4)) + +#define VLYNQ_AUTONEGO_V2 0x00010000 + +struct vlynq_regs { + u32 revision; + u32 control; + u32 status; + u32 int_prio; + u32 int_status; + u32 int_pending; + u32 int_ptr; + u32 tx_offset; + struct vlynq_mapping rx_mapping[4]; + u32 chip; + u32 autonego; + u32 unused[6]; + u32 int_device[8]; +}; + +#ifdef VLYNQ_DEBUG +static void vlynq_dump_regs(struct vlynq_device *dev) +{ + int i; + + printk(KERN_DEBUG "VLYNQ local=%p remote=%p\n", + dev->local, dev->remote); + for (i = 0; i < 32; i++) { + printk(KERN_DEBUG "VLYNQ: local %d: %08x\n", + i + 1, ((u32 *)dev->local)[i]); + printk(KERN_DEBUG "VLYNQ: remote %d: %08x\n", + i + 1, ((u32 *)dev->remote)[i]); + } +} + +static void vlynq_dump_mem(u32 *base, int count) +{ + int i; + + for (i = 0; i < (count + 3) / 4; i++) { + if (i % 4 == 0) + printk(KERN_DEBUG "\nMEM[0x%04x]:", i * 4); +
printk(KERN_DEBUG " 0x%08x", *(base + i)); + } + printk(KERN_DEBUG "\n"); +} +#endif + +/* Check the VLYNQ link status with a given device */ +static int vlynq_linked(struct vlynq_device *dev) +{ + int i; + + for (i = 0; i < 100; i++) + if (readl(&dev->local->status) & VLYNQ_STATUS_LINK) + return 1; + else + cpu_relax(); + + return 0; +} + +static void vlynq_reset(struct vlynq_device *dev) +{ + writel(readl(&dev->local->control) | VLYNQ_CTRL_RESET, + &dev->local->control); + + /* Wait for the devices to finish resetting */ + msleep(5); + + /* Remove reset bit */ + writel(readl(&dev->local->control) & ~VLYNQ_CTRL_RESET, + &dev->local->control); + + /* Give some time for the devices to settle */ + msleep(5); +} + +static void vlynq_irq_unmask(unsigned int irq) +{ + u32 val; + struct vlynq_device *dev = get_irq_chip_data(irq); + int virq; + + BUG_ON(!dev); + virq = irq - dev->irq_start; + val = readl(&dev->remote->int_device[virq >> 2]); + val |= (VINT_ENABLE | virq) << VINT_OFFSET(virq); + writel(val, &dev->remote->int_device[virq >> 2]); +} + +static void vlynq_irq_mask(unsigned int irq) +{ + u32 val; + struct vlynq_device *dev = get_irq_chip_data(irq); + int virq; + + BUG_ON(!dev); + virq = irq - dev->irq_start; + val = readl(&dev->remote->int_device[virq >> 2]); + val &= ~(VINT_ENABLE << VINT_OFFSET(virq)); + writel(val, &dev->remote->int_device[virq >> 2]); +} + +static int vlynq_irq_type(unsigned int irq, unsigned int flow_type) +{ + u32 val; + struct vlynq_device *dev = get_irq_chip_data(irq); + int virq; + + BUG_ON(!dev); + virq = irq - dev->irq_start; + val = readl(&dev->remote->int_device[virq >> 2]); + switch (flow_type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_RISING: + case IRQ_TYPE_EDGE_FALLING: + case IRQ_TYPE_EDGE_BOTH: + val |= VINT_TYPE_EDGE << VINT_OFFSET(virq); + val &= ~(VINT_LEVEL_LOW << VINT_OFFSET(virq)); + break; + case IRQ_TYPE_LEVEL_HIGH: + val &= ~(VINT_TYPE_EDGE << VINT_OFFSET(virq)); + val &= ~(VINT_LEVEL_LOW << VINT_OFFSET(virq)); + break; + case IRQ_TYPE_LEVEL_LOW: + val &= ~(VINT_TYPE_EDGE << VINT_OFFSET(virq)); + val |= VINT_LEVEL_LOW << VINT_OFFSET(virq); + break; + default: + return -EINVAL; + } + writel(val, &dev->remote->int_device[virq >> 2]); + return 0; +} + +static void vlynq_local_ack(unsigned int irq) +{ + struct vlynq_device *dev = get_irq_chip_data(irq); + + u32 status = readl(&dev->local->status); + + pr_debug("%s: local status: 0x%08x\n", + dev_name(&dev->dev), status); + writel(status, &dev->local->status); +} + +static void vlynq_remote_ack(unsigned int irq) +{ + struct vlynq_device *dev = get_irq_chip_data(irq); + + u32 status = readl(&dev->remote->status); + + pr_debug("%s: remote status: 0x%08x\n", + dev_name(&dev->dev), status); + writel(status, &dev->remote->status); +} + +static irqreturn_t vlynq_irq(int irq, void *dev_id) +{ + struct vlynq_device *dev = dev_id; + u32 status; + int virq = 0; + + status = readl(&dev->local->int_status); + writel(status, &dev->local->int_status); + + if (unlikely(!status)) + spurious_interrupt(); + + while (status) { + if (status & 1) + do_IRQ(dev->irq_start + virq); + status >>= 1; + virq++; + } + + return IRQ_HANDLED; +} + +static struct irq_chip vlynq_irq_chip = { + .name = "vlynq", + .unmask = vlynq_irq_unmask, + .mask = vlynq_irq_mask, + .set_type = vlynq_irq_type, +}; + +static struct irq_chip vlynq_local_chip = { + .name = "vlynq local error", + .unmask = vlynq_irq_unmask, + .mask = vlynq_irq_mask, + .ack = vlynq_local_ack, +}; + +static struct irq_chip vlynq_remote_chip = { + .name = "vlynq 
local error", + .unmask = vlynq_irq_unmask, + .mask = vlynq_irq_mask, + .ack = vlynq_remote_ack, +}; + +static int vlynq_setup_irq(struct vlynq_device *dev) +{ + u32 val; + int i, virq; + + if (dev->local_irq == dev->remote_irq) { + printk(KERN_ERR + "%s: local vlynq irq should be different from remote\n", + dev_name(&dev->dev)); + return -EINVAL; + } + + /* Clear local and remote error bits */ + writel(readl(&dev->local->status), &dev->local->status); + writel(readl(&dev->remote->status), &dev->remote->status); + + /* Now setup interrupts */ + val = VLYNQ_CTRL_INT_VECTOR(dev->local_irq); + val |= VLYNQ_CTRL_INT_ENABLE | VLYNQ_CTRL_INT_LOCAL | + VLYNQ_CTRL_INT2CFG; + val |= readl(&dev->local->control); + writel(VLYNQ_INT_OFFSET, &dev->local->int_ptr); + writel(val, &dev->local->control); + + val = VLYNQ_CTRL_INT_VECTOR(dev->remote_irq); + val |= VLYNQ_CTRL_INT_ENABLE; + val |= readl(&dev->remote->control); + writel(VLYNQ_INT_OFFSET, &dev->remote->int_ptr); + writel(val, &dev->remote->int_ptr); + writel(val, &dev->remote->control); + + for (i = dev->irq_start; i <= dev->irq_end; i++) { + virq = i - dev->irq_start; + if (virq == dev->local_irq) { + set_irq_chip_and_handler(i, &vlynq_local_chip, + handle_level_irq); + set_irq_chip_data(i, dev); + } else if (virq == dev->remote_irq) { + set_irq_chip_and_handler(i, &vlynq_remote_chip, + handle_level_irq); + set_irq_chip_data(i, dev); + } else { + set_irq_chip_and_handler(i, &vlynq_irq_chip, + handle_simple_irq); + set_irq_chip_data(i, dev); + writel(0, &dev->remote->int_device[virq >> 2]); + } + } + + if (request_irq(dev->irq, vlynq_irq, IRQF_SHARED, "vlynq", dev)) { + printk(KERN_ERR "%s: request_irq failed\n", + dev_name(&dev->dev)); + return -EAGAIN; + } + + return 0; +} + +static void vlynq_device_release(struct device *dev) +{ + struct vlynq_device *vdev = to_vlynq_device(dev); + kfree(vdev); +} + +static int vlynq_device_match(struct device *dev, + struct device_driver *drv) +{ + struct vlynq_device *vdev = to_vlynq_device(dev); + struct vlynq_driver *vdrv = to_vlynq_driver(drv); + struct vlynq_device_id *ids = vdrv->id_table; + + while (ids->id) { + if (ids->id == vdev->dev_id) { + vdev->divisor = ids->divisor; + vlynq_set_drvdata(vdev, ids); + printk(KERN_INFO "Driver found for VLYNQ " + "device: %08x\n", vdev->dev_id); + return 1; + } + printk(KERN_DEBUG "Not using the %08x VLYNQ device's driver" + " for VLYNQ device: %08x\n", ids->id, vdev->dev_id); + ids++; + } + return 0; +} + +static int vlynq_device_probe(struct device *dev) +{ + struct vlynq_device *vdev = to_vlynq_device(dev); + struct vlynq_driver *drv = to_vlynq_driver(dev->driver); + struct vlynq_device_id *id = vlynq_get_drvdata(vdev); + int result = -ENODEV; + + if (drv->probe) + result = drv->probe(vdev, id); + if (result) + put_device(dev); + return result; +} + +static int vlynq_device_remove(struct device *dev) +{ + struct vlynq_driver *drv = to_vlynq_driver(dev->driver); + + if (drv->remove) + drv->remove(to_vlynq_device(dev)); + + return 0; +} + +int __vlynq_register_driver(struct vlynq_driver *driver, struct module *owner) +{ + driver->driver.name = driver->name; + driver->driver.bus = &vlynq_bus_type; + return driver_register(&driver->driver); +} +EXPORT_SYMBOL(__vlynq_register_driver); + +void vlynq_unregister_driver(struct vlynq_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL(vlynq_unregister_driver); + +/* + * A VLYNQ remote device can clock the VLYNQ bus master + * using a dedicated clock line. 
In that case, both the + * remote device and the bus master should have the same + * serial clock dividers configured. Iterate through the + * 8 possible dividers until we actually link with the + * device. + */ +static int __vlynq_try_remote(struct vlynq_device *dev) +{ + int i; + + vlynq_reset(dev); + for (i = dev->dev_id ? vlynq_rdiv2 : vlynq_rdiv8; dev->dev_id ? + i <= vlynq_rdiv8 : i >= vlynq_rdiv2; + dev->dev_id ? i++ : i--) { + + if (!vlynq_linked(dev)) + break; + + writel((readl(&dev->remote->control) & + ~VLYNQ_CTRL_CLOCK_MASK) | + VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(i - vlynq_rdiv1), + &dev->remote->control); + writel((readl(&dev->local->control) + & ~(VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_MASK)) | + VLYNQ_CTRL_CLOCK_DIV(i - vlynq_rdiv1), + &dev->local->control); + + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using remote clock divisor %d\n", + dev_name(&dev->dev), i - vlynq_rdiv1 + 1); + dev->divisor = i; + return 0; + } else { + vlynq_reset(dev); + } + } + + return -ENODEV; +} + +/* + * A VLYNQ remote device can be clocked by the VLYNQ bus + * master using a dedicated clock line. In that case, only + * the bus master configures the serial clock divider. + * Iterate through the 8 possible dividers until we + * actually get a link with the device. + */ +static int __vlynq_try_local(struct vlynq_device *dev) +{ + int i; + + vlynq_reset(dev); + + for (i = dev->dev_id ? vlynq_ldiv2 : vlynq_ldiv8; dev->dev_id ? + i <= vlynq_ldiv8 : i >= vlynq_ldiv2; + dev->dev_id ? i++ : i--) { + + writel((readl(&dev->local->control) & + ~VLYNQ_CTRL_CLOCK_MASK) | + VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(i - vlynq_ldiv1), + &dev->local->control); + + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using local clock divisor %d\n", + dev_name(&dev->dev), i - vlynq_ldiv1 + 1); + dev->divisor = i; + return 0; + } else { + vlynq_reset(dev); + } + } + + return -ENODEV; +} + +/* + * When using the external clocking method, the serial clock + * is supplied by an external oscillator; therefore we + * should mask the local clock bit in the clock control + * register for both the bus master and the remote device. + */ +static int __vlynq_try_external(struct vlynq_device *dev) +{ + vlynq_reset(dev); + if (!vlynq_linked(dev)) + return -ENODEV; + + writel((readl(&dev->remote->control) & + ~VLYNQ_CTRL_CLOCK_INT), + &dev->remote->control); + + writel((readl(&dev->local->control) & + ~VLYNQ_CTRL_CLOCK_INT), + &dev->local->control); + + if (vlynq_linked(dev)) { + printk(KERN_DEBUG "%s: using external clock\n", + dev_name(&dev->dev)); + dev->divisor = vlynq_div_external; + return 0; + } + + return -ENODEV; +} + +static int __vlynq_enable_device(struct vlynq_device *dev) +{ + int result; + struct plat_vlynq_ops *ops = dev->dev.platform_data; + + result = ops->on(dev); + if (result) + return result; + + switch (dev->divisor) { + case vlynq_div_external: + case vlynq_div_auto: + /* When the device is brought from reset it should have clock + * generation negotiated by hardware.
+ * Check which device is generating clocks and perform setup + * accordingly */ + if (vlynq_linked(dev) && readl(&dev->remote->control) & + VLYNQ_CTRL_CLOCK_INT) { + if (!__vlynq_try_remote(dev) || + !__vlynq_try_local(dev) || + !__vlynq_try_external(dev)) + return 0; + } else { + if (!__vlynq_try_external(dev) || + !__vlynq_try_local(dev) || + !__vlynq_try_remote(dev)) + return 0; + } + break; + case vlynq_ldiv1: + case vlynq_ldiv2: + case vlynq_ldiv3: + case vlynq_ldiv4: + case vlynq_ldiv5: + case vlynq_ldiv6: + case vlynq_ldiv7: + case vlynq_ldiv8: + writel(VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(dev->divisor - + vlynq_ldiv1), &dev->local->control); + writel(0, &dev->remote->control); + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using local clock divisor %d\n", + dev_name(&dev->dev), + dev->divisor - vlynq_ldiv1 + 1); + return 0; + } + break; + case vlynq_rdiv1: + case vlynq_rdiv2: + case vlynq_rdiv3: + case vlynq_rdiv4: + case vlynq_rdiv5: + case vlynq_rdiv6: + case vlynq_rdiv7: + case vlynq_rdiv8: + writel(0, &dev->local->control); + writel(VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(dev->divisor - + vlynq_rdiv1), &dev->remote->control); + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using remote clock divisor %d\n", + dev_name(&dev->dev), + dev->divisor - vlynq_rdiv1 + 1); + return 0; + } + break; + } + + ops->off(dev); + return -ENODEV; +} + +int vlynq_enable_device(struct vlynq_device *dev) +{ + struct plat_vlynq_ops *ops = dev->dev.platform_data; + int result = -ENODEV; + + result = __vlynq_enable_device(dev); + if (result) + return result; + + result = vlynq_setup_irq(dev); + if (result) + ops->off(dev); + + dev->enabled = !result; + return result; +} +EXPORT_SYMBOL(vlynq_enable_device); + + +void vlynq_disable_device(struct vlynq_device *dev) +{ + struct plat_vlynq_ops *ops = dev->dev.platform_data; + + dev->enabled = 0; + free_irq(dev->irq, dev); + ops->off(dev); +} +EXPORT_SYMBOL(vlynq_disable_device); + +int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping) +{ + int i; + + if (!dev->enabled) + return -ENXIO; + + writel(tx_offset, &dev->local->tx_offset); + for (i = 0; i < 4; i++) { + writel(mapping[i].offset, &dev->local->rx_mapping[i].offset); + writel(mapping[i].size, &dev->local->rx_mapping[i].size); + } + return 0; +} +EXPORT_SYMBOL(vlynq_set_local_mapping); + +int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping) +{ + int i; + + if (!dev->enabled) + return -ENXIO; + + writel(tx_offset, &dev->remote->tx_offset); + for (i = 0; i < 4; i++) { + writel(mapping[i].offset, &dev->remote->rx_mapping[i].offset); + writel(mapping[i].size, &dev->remote->rx_mapping[i].size); + } + return 0; +} +EXPORT_SYMBOL(vlynq_set_remote_mapping); + +int vlynq_set_local_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if (dev->enabled) + return -EBUSY; + + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + if (virq == dev->remote_irq) + return -EINVAL; + + dev->local_irq = virq; + + return 0; +} +EXPORT_SYMBOL(vlynq_set_local_irq); + +int vlynq_set_remote_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if (dev->enabled) + return -EBUSY; + + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + if (virq == dev->local_irq) + return -EINVAL; + + dev->remote_irq = virq; + + return 0; +} +EXPORT_SYMBOL(vlynq_set_remote_irq); + +static int vlynq_probe(struct platform_device 
*pdev) +{ + struct vlynq_device *dev; + struct resource *regs_res, *mem_res, *irq_res; + int len, result; + + regs_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs"); + if (!regs_res) + return -ENODEV; + + mem_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mem"); + if (!mem_res) + return -ENODEV; + + irq_res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, "devirq"); + if (!irq_res) + return -ENODEV; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + printk(KERN_ERR + "vlynq: failed to allocate device structure\n"); + return -ENOMEM; + } + + dev->id = pdev->id; + dev->dev.bus = &vlynq_bus_type; + dev->dev.parent = &pdev->dev; + dev_set_name(&dev->dev, "vlynq%d", dev->id); + dev->dev.platform_data = pdev->dev.platform_data; + dev->dev.release = vlynq_device_release; + + dev->regs_start = regs_res->start; + dev->regs_end = regs_res->end; + dev->mem_start = mem_res->start; + dev->mem_end = mem_res->end; + + len = regs_res->end - regs_res->start; + if (!request_mem_region(regs_res->start, len, dev_name(&dev->dev))) { + printk(KERN_ERR "%s: Can't request vlynq registers\n", + dev_name(&dev->dev)); + result = -ENXIO; + goto fail_request; + } + + dev->local = ioremap(regs_res->start, len); + if (!dev->local) { + printk(KERN_ERR "%s: Can't remap vlynq registers\n", + dev_name(&dev->dev)); + result = -ENXIO; + goto fail_remap; + } + + dev->remote = (struct vlynq_regs *)((void *)dev->local + + VLYNQ_REMOTE_OFFSET); + + dev->irq = platform_get_irq_byname(pdev, "irq"); + dev->irq_start = irq_res->start; + dev->irq_end = irq_res->end; + dev->local_irq = dev->irq_end - dev->irq_start; + dev->remote_irq = dev->local_irq - 1; + + if (device_register(&dev->dev)) + goto fail_register; + platform_set_drvdata(pdev, dev); + + printk(KERN_INFO "%s: regs 0x%p, irq %d, mem 0x%p\n", + dev_name(&dev->dev), (void *)dev->regs_start, dev->irq, + (void *)dev->mem_start); + + dev->dev_id = 0; + dev->divisor = vlynq_div_auto; + result = __vlynq_enable_device(dev); + if (result == 0) { + dev->dev_id = readl(&dev->remote->chip); + ((struct plat_vlynq_ops *)(dev->dev.platform_data))->off(dev); + } + if (dev->dev_id) + printk(KERN_INFO "Found a VLYNQ device: %08x\n", dev->dev_id); + + return 0; + +fail_register: + iounmap(dev->local); +fail_remap: +fail_request: + release_mem_region(regs_res->start, len); + kfree(dev); + return result; +} + +static int vlynq_remove(struct platform_device *pdev) +{ + struct vlynq_device *dev = platform_get_drvdata(pdev); + + device_unregister(&dev->dev); + iounmap(dev->local); + release_mem_region(dev->regs_start, dev->regs_end - dev->regs_start); + + kfree(dev); + + return 0; +} + +static struct platform_driver vlynq_platform_driver = { + .driver.name = "vlynq", + .probe = vlynq_probe, + .remove = __devexit_p(vlynq_remove), +}; + +struct bus_type vlynq_bus_type = { + .name = "vlynq", + .match = vlynq_device_match, + .probe = vlynq_device_probe, + .remove = vlynq_device_remove, +}; +EXPORT_SYMBOL(vlynq_bus_type); + +static int __devinit vlynq_init(void) +{ + int res = 0; + + res = bus_register(&vlynq_bus_type); + if (res) + goto fail_bus; + + res = platform_driver_register(&vlynq_platform_driver); + if (res) + goto fail_platform; + + return 0; + +fail_platform: + bus_unregister(&vlynq_bus_type); +fail_bus: + return res; +} + +static void __devexit vlynq_exit(void) +{ + platform_driver_unregister(&vlynq_platform_driver); + bus_unregister(&vlynq_bus_type); +} + +module_init(vlynq_init); +module_exit(vlynq_exit); diff --git a/include/linux/vlynq.h 
b/include/linux/vlynq.h new file mode 100644 index 00000000000..8f6a95882b0 --- /dev/null +++ b/include/linux/vlynq.h @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2006, 2007 Eugene Konev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __VLYNQ_H__ +#define __VLYNQ_H__ + +#include +#include +#include + +#define VLYNQ_NUM_IRQS 32 + +struct vlynq_mapping { + u32 size; + u32 offset; +}; + +enum vlynq_divisor { + vlynq_div_auto = 0, + vlynq_ldiv1, + vlynq_ldiv2, + vlynq_ldiv3, + vlynq_ldiv4, + vlynq_ldiv5, + vlynq_ldiv6, + vlynq_ldiv7, + vlynq_ldiv8, + vlynq_rdiv1, + vlynq_rdiv2, + vlynq_rdiv3, + vlynq_rdiv4, + vlynq_rdiv5, + vlynq_rdiv6, + vlynq_rdiv7, + vlynq_rdiv8, + vlynq_div_external +}; + +struct vlynq_device_id { + u32 id; + enum vlynq_divisor divisor; + unsigned long driver_data; +}; + +struct vlynq_regs; +struct vlynq_device { + u32 id, dev_id; + int local_irq; + int remote_irq; + enum vlynq_divisor divisor; + u32 regs_start, regs_end; + u32 mem_start, mem_end; + u32 irq_start, irq_end; + int irq; + int enabled; + struct vlynq_regs *local; + struct vlynq_regs *remote; + struct device dev; +}; + +struct vlynq_driver { + char *name; + struct vlynq_device_id *id_table; + int (*probe)(struct vlynq_device *dev, struct vlynq_device_id *id); + void (*remove)(struct vlynq_device *dev); + struct device_driver driver; +}; + +struct plat_vlynq_ops { + int (*on)(struct vlynq_device *dev); + void (*off)(struct vlynq_device *dev); +}; + +static inline struct vlynq_driver *to_vlynq_driver(struct device_driver *drv) +{ + return container_of(drv, struct vlynq_driver, driver); +} + +static inline struct vlynq_device *to_vlynq_device(struct device *device) +{ + return container_of(device, struct vlynq_device, dev); +} + +extern struct bus_type vlynq_bus_type; + +extern int __vlynq_register_driver(struct vlynq_driver *driver, + struct module *owner); + +static inline int vlynq_register_driver(struct vlynq_driver *driver) +{ + return __vlynq_register_driver(driver, THIS_MODULE); +} + +static inline void *vlynq_get_drvdata(struct vlynq_device *dev) +{ + return dev_get_drvdata(&dev->dev); +} + +static inline void vlynq_set_drvdata(struct vlynq_device *dev, void *data) +{ + dev_set_drvdata(&dev->dev, data); +} + +static inline u32 vlynq_mem_start(struct vlynq_device *dev) +{ + return dev->mem_start; +} + +static inline u32 vlynq_mem_end(struct vlynq_device *dev) +{ + return dev->mem_end; +} + +static inline u32 vlynq_mem_len(struct vlynq_device *dev) +{ + return dev->mem_end - dev->mem_start + 1; +} + +static inline int vlynq_virq_to_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + return irq; +} + +static inline int vlynq_irq_to_virq(struct vlynq_device *dev, int irq) +{ + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + 
return -EINVAL; + + return irq - dev->irq_start; +} + +extern void vlynq_unregister_driver(struct vlynq_driver *driver); +extern int vlynq_enable_device(struct vlynq_device *dev); +extern void vlynq_disable_device(struct vlynq_device *dev); +extern int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping); +extern int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping); +extern int vlynq_set_local_irq(struct vlynq_device *dev, int virq); +extern int vlynq_set_remote_irq(struct vlynq_device *dev, int virq); + +#endif /* __VLYNQ_H__ */ -- cgit v1.2.3-70-g09d2 From 8f3128e714ded7cf1e8c786c204a4f253b5d8ff4 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 16 Jun 2009 15:34:17 -0700 Subject: lis3: add click function The LIS302DL accelerometer chip has a 'click' feature which can be used to detect sudden motion on any of the three axes. Configuration data is passed via spi platform_data and no action is taken if that's not specified, so it won't harm any existing platform. To make the configuration effective, the IRQ lines need to be set up appropriately. This patch also adds a way to do that from board support code. The DD_* definitions were factored out into their own enum because they are specific to LIS3LV02D devices. Signed-off-by: Daniel Mack Acked-by: Pavel Machek Acked-by: Eric Piel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/lis3lv02d.c | 20 ++++++++++++++++++++ drivers/hwmon/lis3lv02d.h | 19 ++++++++++++++++++- drivers/hwmon/lis3lv02d_spi.c | 1 + include/linux/lis3lv02d.h | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 include/linux/lis3lv02d.h (limited to 'include/linux') diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 366190609c7..271338bdb6b 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -438,6 +438,26 @@ int lis3lv02d_init_device(struct lis3lv02d *dev) if (lis3lv02d_joystick_enable()) printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n"); + /* passing in platform specific data is purely optional and only + * used by the SPI transport layer at the moment */ + if (dev->pdata) { + struct lis3lv02d_platform_data *p = dev->pdata; + + if (p->click_flags && (dev->whoami == LIS_SINGLE_ID)) { + dev->write(dev, CLICK_CFG, p->click_flags); + dev->write(dev, CLICK_TIMELIMIT, p->click_time_limit); + dev->write(dev, CLICK_LATENCY, p->click_latency); + dev->write(dev, CLICK_WINDOW, p->click_window); + dev->write(dev, CLICK_THSZ, p->click_thresh_z & 0xf); + dev->write(dev, CLICK_THSY_X, + (p->click_thresh_x & 0xf) | + (p->click_thresh_y << 4)); + } + + if (p->irq_cfg) + dev->write(dev, CTRL_REG3, p->irq_cfg); + } + /* bail if we did not get an IRQ from the bus layer */ if (!dev->irq) { printk(KERN_ERR DRIVER_NAME diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index 5a5a196e6a6..e320e2f511f 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -29,12 +29,14 @@ * They can also be connected via I²C.
*/ +#include + /* 2-byte registers */ #define LIS_DOUBLE_ID 0x3A /* LIS3LV02D[LQ] */ /* 1-byte registers */ #define LIS_SINGLE_ID 0x3B /* LIS[32]02DL and others */ -enum lis3lv02d_reg { +enum lis3_reg { WHO_AM_I = 0x0F, OFFSET_X = 0x16, OFFSET_Y = 0x17, @@ -62,6 +64,19 @@ enum lis3lv02d_reg { FF_WU_THS_L = 0x34, FF_WU_THS_H = 0x35, FF_WU_DURATION = 0x36, +}; + +enum lis302d_reg { + CLICK_CFG = 0x38, + CLICK_SRC = 0x39, + CLICK_THSY_X = 0x3B, + CLICK_THSZ = 0x3C, + CLICK_TIMELIMIT = 0x3D, + CLICK_LATENCY = 0x3E, + CLICK_WINDOW = 0x3F, +}; + +enum lis3lv02d_reg { DD_CFG = 0x38, DD_SRC = 0x39, DD_ACK = 0x3A, @@ -183,6 +198,8 @@ struct lis3lv02d { struct fasync_struct *async_queue; /* queue for the misc device */ wait_queue_head_t misc_wait; /* Wait queue for the misc device */ unsigned long misc_opened; /* bit0: whether the device is open */ + + struct lis3lv02d_platform_data *pdata; /* for passing board config */ }; int lis3lv02d_init_device(struct lis3lv02d *lis3); diff --git a/drivers/hwmon/lis3lv02d_spi.c b/drivers/hwmon/lis3lv02d_spi.c index 07ae74b0e19..3827ff04485 100644 --- a/drivers/hwmon/lis3lv02d_spi.c +++ b/drivers/hwmon/lis3lv02d_spi.c @@ -72,6 +72,7 @@ static int __devinit lis302dl_spi_probe(struct spi_device *spi) lis3_dev.write = lis3_spi_write; lis3_dev.irq = spi->irq; lis3_dev.ac = lis3lv02d_axis_normal; + lis3_dev.pdata = spi->dev.platform_data; spi_set_drvdata(spi, &lis3_dev); ret = lis3lv02d_init_device(&lis3_dev); diff --git a/include/linux/lis3lv02d.h b/include/linux/lis3lv02d.h new file mode 100644 index 00000000000..ad651f4e45a --- /dev/null +++ b/include/linux/lis3lv02d.h @@ -0,0 +1,39 @@ +#ifndef __LIS3LV02D_H_ +#define __LIS3LV02D_H_ + +struct lis3lv02d_platform_data { + /* please note: the 'click' feature is only supported for + * LIS[32]02DL variants of the chip and will be ignored for + * others */ +#define LIS3_CLICK_SINGLE_X (1 << 0) +#define LIS3_CLICK_DOUBLE_X (1 << 1) +#define LIS3_CLICK_SINGLE_Y (1 << 2) +#define LIS3_CLICK_DOUBLE_Y (1 << 3) +#define LIS3_CLICK_SINGLE_Z (1 << 4) +#define LIS3_CLICK_DOUBLE_Z (1 << 5) + unsigned char click_flags; + unsigned char click_thresh_x; + unsigned char click_thresh_y; + unsigned char click_thresh_z; + unsigned char click_time_limit; + unsigned char click_latency; + unsigned char click_window; + +#define LIS3_IRQ1_DISABLE (0 << 0) +#define LIS3_IRQ1_FF_WU_1 (1 << 0) +#define LIS3_IRQ1_FF_WU_2 (2 << 0) +#define LIS3_IRQ1_FF_WU_12 (3 << 0) +#define LIS3_IRQ1_DATA_READY (4 << 0) +#define LIS3_IRQ1_CLICK (7 << 0) +#define LIS3_IRQ2_DISABLE (0 << 3) +#define LIS3_IRQ2_FF_WU_1 (1 << 3) +#define LIS3_IRQ2_FF_WU_2 (2 << 3) +#define LIS3_IRQ2_FF_WU_12 (3 << 3) +#define LIS3_IRQ2_DATA_READY (4 << 3) +#define LIS3_IRQ2_CLICK (7 << 3) +#define LIS3_IRQ_OPEN_DRAIN (1 << 6) +#define LIS3_IRQ_ACTIVE_HIGH (1 << 7) + unsigned char irq_cfg; +}; + +#endif /* __LIS3LV02D_H_ */ -- cgit v1.2.3-70-g09d2 From ae52bb2384f721562f15f719de1acb8e934733cb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 16 Jun 2009 15:34:19 -0700 Subject: fbdev: move logo externs to header file Now we have __initconst, we can finally move the external declarations for the various Linux logo structures to <linux/linux_logo.h>. James' ack dates back to the previous submission (way too long ago), when the logos were still __initdata, which caused failures on some platforms with some toolchain versions.
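To make the pattern concrete, a hedged sketch (the logo name and 1x1 payload are invented, not taken from this patch) of the kind of declaration scripts/pnmtologo now emits, using only the struct linux_logo fields and externs centralized by this change:

	#include <linux/linux_logo.h>

	/* sketch only: a 1x1 clut224 logo with a one-entry palette */
	static const unsigned char logo_example_clut[] __initconst = {
		0x00, 0x00, 0x00,	/* palette entry: black */
	};

	static const unsigned char logo_example_data[] __initconst = {
		0x20,			/* clut224 pixels index the palette from 32 */
	};

	static const struct linux_logo logo_example_clut224 __initconst = {
		.type		= LINUX_LOGO_CLUT224,
		.width		= 1,
		.height		= 1,
		.clutsize	= 1,
		.clut		= logo_example_clut,
		.data		= logo_example_data,
	};

With CONFIG_FB_LOGO_EXTRA enabled, fb_append_extra_logo(&logo_example_clut224, 1) would then publish it next to the main logo, as spu_base.c does above.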
Signed-off-by: Geert Uytterhoeven Acked-by: James Simmons Cc: Krzysztof Helt Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/prom_init.c | 3 --- arch/powerpc/platforms/cell/spu_base.c | 11 +---------- arch/um/include/shared/init.h | 2 +- drivers/video/logo/logo.c | 15 --------------- include/linux/init.h | 2 +- include/linux/linux_logo.h | 16 ++++++++++++++++ scripts/pnmtologo.c | 18 +++++++++--------- 7 files changed, 28 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 2f0e64b5364..ef6f64950e9 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -44,10 +44,7 @@ #include #include -#ifdef CONFIG_LOGO_LINUX_CLUT224 #include -extern const struct linux_logo logo_linux_clut224; -#endif /* * Properties whose value is longer than this get excluded from our diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 9abd210d87c..8547e86bfb4 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -752,17 +752,8 @@ static int __init init_spu_base(void) goto out_unregister_sysdev_class; } - if (ret > 0) { - /* - * We cannot put the forward declaration in - * because of conflicting session type - * conflicts for const and __initdata with different compiler - * versions - */ - extern const struct linux_logo logo_spe_clut224; - + if (ret > 0) fb_append_extra_logo(&logo_spe_clut224, ret); - } mutex_lock(&spu_full_list_mutex); xmon_register_spus(&spu_full_list); diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h index 37dd097c16c..b3906f860a8 100644 --- a/arch/um/include/shared/init.h +++ b/arch/um/include/shared/init.h @@ -27,7 +27,7 @@ * sign followed by value, e.g.: * * static int init_variable __initdata = 0; - * static char linux_logo[] __initdata = { 0x32, 0x36, ... }; + * static const char linux_logo[] __initconst = { 0x32, 0x36, ... }; * * Don't forget to initialize data not at file scope, i.e. within a function, * as gcc otherwise puts the data into the bss section and not into the init diff --git a/drivers/video/logo/logo.c b/drivers/video/logo/logo.c index 2e85a2b52d0..ea7a8ccc830 100644 --- a/drivers/video/logo/logo.c +++ b/drivers/video/logo/logo.c @@ -21,21 +21,6 @@ #include #endif -extern const struct linux_logo logo_linux_mono; -extern const struct linux_logo logo_linux_vga16; -extern const struct linux_logo logo_linux_clut224; -extern const struct linux_logo logo_blackfin_vga16; -extern const struct linux_logo logo_blackfin_clut224; -extern const struct linux_logo logo_dec_clut224; -extern const struct linux_logo logo_mac_clut224; -extern const struct linux_logo logo_parisc_clut224; -extern const struct linux_logo logo_sgi_clut224; -extern const struct linux_logo logo_sun_clut224; -extern const struct linux_logo logo_superh_mono; -extern const struct linux_logo logo_superh_vga16; -extern const struct linux_logo logo_superh_clut224; -extern const struct linux_logo logo_m32r_clut224; - static int nologo; module_param(nologo, bool, 0); MODULE_PARM_DESC(nologo, "Disables startup logo"); diff --git a/include/linux/init.h b/include/linux/init.h index b2189803f19..8c2c9989626 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -29,7 +29,7 @@ * sign followed by value, e.g.: * * static int init_variable __initdata = 0; - * static char linux_logo[] __initdata = { 0x32, 0x36, ... 
}; + * static const char linux_logo[] __initconst = { 0x32, 0x36, ... }; + * Don't forget to initialize data not at file scope, i.e. within a function, * as gcc otherwise puts the data into the bss section and not into the init diff --git a/include/linux/linux_logo.h b/include/linux/linux_logo.h index 08a92969c76..ca5bd91d12e 100644 --- a/include/linux/linux_logo.h +++ b/include/linux/linux_logo.h @@ -32,6 +32,22 @@ struct linux_logo { const unsigned char *data; }; +extern const struct linux_logo logo_linux_mono; +extern const struct linux_logo logo_linux_vga16; +extern const struct linux_logo logo_linux_clut224; +extern const struct linux_logo logo_blackfin_vga16; +extern const struct linux_logo logo_blackfin_clut224; +extern const struct linux_logo logo_dec_clut224; +extern const struct linux_logo logo_mac_clut224; +extern const struct linux_logo logo_parisc_clut224; +extern const struct linux_logo logo_sgi_clut224; +extern const struct linux_logo logo_sun_clut224; +extern const struct linux_logo logo_superh_mono; +extern const struct linux_logo logo_superh_vga16; +extern const struct linux_logo logo_superh_clut224; +extern const struct linux_logo logo_m32r_clut224; +extern const struct linux_logo logo_spe_clut224; + extern const struct linux_logo *fb_find_logo(int depth); #ifdef CONFIG_FB_LOGO_EXTRA extern void fb_append_extra_logo(const struct linux_logo *logo, diff --git a/scripts/pnmtologo.c b/scripts/pnmtologo.c index 6aa2a2483f8..64f5ddb09ea 100644 --- a/scripts/pnmtologo.c +++ b/scripts/pnmtologo.c @@ -237,22 +237,22 @@ static void write_header(void) fprintf(out, " * Linux logo %s\n", logoname); fputs(" */\n\n", out); fputs("#include \n\n", out); - fprintf(out, "static unsigned char %s_data[] __initdata = {\n", + fprintf(out, "static const unsigned char %s_data[] __initconst = {\n", logoname); } static void write_footer(void) { fputs("\n};\n\n", out); - fprintf(out, "struct linux_logo %s __initdata = {\n", logoname); - fprintf(out, " .type\t= %s,\n", logo_types[logo_type]); - fprintf(out, " .width\t= %d,\n", logo_width); - fprintf(out, " .height\t= %d,\n", logo_height); + fprintf(out, "const struct linux_logo %s __initconst = {\n", logoname); + fprintf(out, "\t.type\t\t= %s,\n", logo_types[logo_type]); + fprintf(out, "\t.width\t\t= %d,\n", logo_width); + fprintf(out, "\t.height\t\t= %d,\n", logo_height); if (logo_type == LINUX_LOGO_CLUT224) { - fprintf(out, " .clutsize\t= %d,\n", logo_clutsize); - fprintf(out, " .clut\t= %s_clut,\n", logoname); + fprintf(out, "\t.clutsize\t= %d,\n", logo_clutsize); + fprintf(out, "\t.clut\t\t= %s_clut,\n", logoname); } - fprintf(out, " .data\t= %s_data\n", logoname); + fprintf(out, "\t.data\t\t= %s_data\n", logoname); fputs("};\n\n", out); /* close logo file */ @@ -374,7 +374,7 @@ static void write_logo_clut224(void) fputs("\n};\n\n", out); /* write logo clut */ - fprintf(out, "static unsigned char %s_clut[] __initdata = {\n", + fprintf(out, "static const unsigned char %s_clut[] __initconst = {\n", logoname); write_hex_cnt = 0; for (i = 0; i < logo_clutsize; i++) { -- cgit v1.2.3-70-g09d2 From 4410f3910947dcea8672280b3adecd53cec4e85e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 16 Jun 2009 15:34:38 -0700 Subject: fbdev: add support for handoff from firmware to hw framebuffers With KMS we have run into an issue where we really want the KMS fb driver to be the one running the console, so panics etc can be shown by switching out of X etc.
However with vesafb/efifb built-in, we end up with those on fb0 and the KMS fb driver on fb1, driving the same piece of hw, so this adds an fb info flag to denote a firmware fbdev, and adds a new aperture base/size range which can be compared when the hw drivers are installed to see if there is a conflict with a firmware driver; if there is, the firmware driver is unregistered and the hw driver takes over. It uses the new aperture_base/size members instead of comparing on the fix smem_start/length, as smem_start/length might for example only cover the first 1MB of the PCI aperture, and we could allocate the kms fb from 8MB into the aperture, thus they would never overlap. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Dave Airlie Acked-by: Peter Jones Cc: Geert Uytterhoeven Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/intel_fb.c | 8 ++++++++ drivers/video/efifb.c | 5 ++++- drivers/video/fbmem.c | 31 +++++++++++++++++++++++++++++++ drivers/video/vesafb.c | 15 ++++++++++++++- include/linux/fb.h | 12 +++++++++++- 5 files changed, 68 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/intel_fb.c b/drivers/gpu/drm/i915/intel_fb.c index 0ecf6b76a40..8e28e5993df 100644 --- a/drivers/gpu/drm/i915/intel_fb.c +++ b/drivers/gpu/drm/i915/intel_fb.c @@ -504,6 +504,14 @@ static int intelfb_create(struct drm_device *dev, uint32_t fb_width, info->fbops = &intelfb_ops; info->fix.line_length = fb->pitch; + + /* setup aperture base/size for vesafb takeover */ + info->aperture_base = dev->mode_config.fb_base; + if (IS_I9XX(dev)) + info->aperture_size = pci_resource_len(dev->pdev, 2); + else + info->aperture_size = pci_resource_len(dev->pdev, 0); + info->fix.smem_start = dev->mode_config.fb_base + obj_priv->gtt_offset; info->fix.smem_len = size; diff --git a/drivers/video/efifb.c b/drivers/video/efifb.c index 8dea2bc9270..eb12182b205 100644 --- a/drivers/video/efifb.c +++ b/drivers/video/efifb.c @@ -280,6 +280,9 @@ static int __init efifb_probe(struct platform_device *dev) info->pseudo_palette = info->par; info->par = NULL; + info->aperture_base = efifb_fix.smem_start; + info->aperture_size = size_total; + info->screen_base = ioremap(efifb_fix.smem_start, efifb_fix.smem_len); if (!info->screen_base) { printk(KERN_ERR "efifb: abort, cannot ioremap video memory " @@ -337,7 +340,7 @@ static int __init efifb_probe(struct platform_device *dev) info->fbops = &efifb_ops; info->var = efifb_defined; info->fix = efifb_fix; - info->flags = FBINFO_FLAG_DEFAULT; + info->flags = FBINFO_FLAG_DEFAULT | FBINFO_MISC_FIRMWARE; if ((err = fb_alloc_cmap(&info->cmap, 256, 0)) < 0) { printk(KERN_ERR "efifb: cannot allocate colormap\n"); @@ -1462,6 +1462,16 @@ static int fb_check_foreignness(struct fb_info *fi) return 0; } +static bool fb_do_apertures_overlap(struct fb_info *gen, struct fb_info *hw) +{ + /* is the generic aperture base the same as the HW one */ + if (gen->aperture_base == hw->aperture_base) + return true; + /* is the generic aperture base inside the hw base->hw base+size */ + if (gen->aperture_base > hw->aperture_base && gen->aperture_base <= hw->aperture_base + hw->aperture_size) + return true; + return false; +} /** * register_framebuffer - registers a frame buffer device * @fb_info: frame buffer info structure @@ -1485,6 +1495,23 @@
register_framebuffer(struct fb_info *fb_info) if (fb_check_foreignness(fb_info)) return -ENOSYS; + /* check all firmware fbs and kick off if the base addr overlaps */ + for (i = 0 ; i < FB_MAX; i++) { + if (!registered_fb[i]) + continue; + + if (registered_fb[i]->flags & FBINFO_MISC_FIRMWARE) { + if (fb_do_apertures_overlap(registered_fb[i], fb_info)) { + printk(KERN_ERR "fb: conflicting fb hw usage " + "%s vs %s - removing generic driver\n", + fb_info->fix.id, + registered_fb[i]->fix.id); + unregister_framebuffer(registered_fb[i]); + break; + } + } + } + num_registered_fb++; for (i = 0 ; i < FB_MAX; i++) if (!registered_fb[i]) @@ -1586,6 +1613,10 @@ unregister_framebuffer(struct fb_info *fb_info) device_destroy(fb_class, MKDEV(FB_MAJOR, i)); event.info = fb_info; fb_notifier_call_chain(FB_EVENT_FB_UNREGISTERED, &event); + + /* this may free fb info */ + if (fb_info->fbops->fb_destroy) + fb_info->fbops->fb_destroy(fb_info); done: return ret; } diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index d6856f43d24..bd37ee1f6a2 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -174,8 +174,17 @@ static int vesafb_setcolreg(unsigned regno, unsigned red, unsigned green, return err; } +static void vesafb_destroy(struct fb_info *info) +{ + if (info->screen_base) + iounmap(info->screen_base); + release_mem_region(info->aperture_base, info->aperture_size); + framebuffer_release(info); +} + static struct fb_ops vesafb_ops = { .owner = THIS_MODULE, + .fb_destroy = vesafb_destroy, .fb_setcolreg = vesafb_setcolreg, .fb_pan_display = vesafb_pan_display, .fb_fillrect = cfb_fillrect, @@ -286,6 +295,10 @@ static int __init vesafb_probe(struct platform_device *dev) info->pseudo_palette = info->par; info->par = NULL; + /* set vesafb aperture size for generic probing */ + info->aperture_base = screen_info.lfb_base; + info->aperture_size = size_total; + info->screen_base = ioremap(vesafb_fix.smem_start, vesafb_fix.smem_len); if (!info->screen_base) { printk(KERN_ERR @@ -437,7 +450,7 @@ static int __init vesafb_probe(struct platform_device *dev) info->fbops = &vesafb_ops; info->var = vesafb_defined; info->fix = vesafb_fix; - info->flags = FBINFO_FLAG_DEFAULT | + info->flags = FBINFO_FLAG_DEFAULT | FBINFO_MISC_FIRMWARE | (ypan ? FBINFO_HWACCEL_YPAN : 0); if (!ypan) diff --git a/include/linux/fb.h b/include/linux/fb.h index 330c4b1bfca..3c5562a5216 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -677,6 +677,9 @@ struct fb_ops { /* get capability given var */ void (*fb_get_caps)(struct fb_info *info, struct fb_blit_caps *caps, struct fb_var_screeninfo *var); + + /* teardown any resources to do with this framebuffer */ + void (*fb_destroy)(struct fb_info *info); }; #ifdef CONFIG_FB_TILEBLITTING @@ -786,6 +789,8 @@ struct fb_tile_ops { #define FBINFO_MISC_USEREVENT 0x10000 /* event request from userspace */ #define FBINFO_MISC_TILEBLITTING 0x20000 /* use tile blitting */ +#define FBINFO_MISC_FIRMWARE 0x40000 /* a replaceable firmware + inited framebuffer */ /* A driver may set this flag to indicate that it does want a set_par to be * called every time when fbcon_switch is executed. 
The advantage is that with @@ -854,7 +859,12 @@ struct fb_info { u32 state; /* Hardware state i.e suspend */ void *fbcon_par; /* fbcon use-only private area */ /* From here on everything is device dependent */ - void *par; + void *par; + /* we need the PCI or similar aperture base/size not + smem_start/size as smem_start may just be an object + allocated inside the aperture so may not actually overlap */ + resource_size_t aperture_base; + resource_size_t aperture_size; }; #ifdef MODULE -- cgit v1.2.3-70-g09d2 From 00115e6690ed5aa90530e1e41c467cd895dd18fc Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 16 Jun 2009 15:34:40 -0700 Subject: fbdev: blackfin has __raw I/O accessors, so use them in fb.h Signed-off-by: Mike Frysinger Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 3c5562a5216..dd68358996b 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -903,7 +903,7 @@ struct fb_info { #define fb_writeq sbus_writeq #define fb_memset sbus_memset_io -#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || defined(__sh__) || defined(__powerpc__) || defined(__avr32__) +#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || defined(__sh__) || defined(__powerpc__) || defined(__avr32__) || defined(__bfin__) #define fb_readb __raw_readb #define fb_readw __raw_readw -- cgit v1.2.3-70-g09d2 From 608ba50bd0225d95469154feba8f00a6457848c1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2009 14:52:13 -0400 Subject: Cleanup of adfs headers Signed-off-by: Al Viro --- fs/adfs/adfs.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++ fs/adfs/dir.c | 8 ------- fs/adfs/dir_f.c | 8 ------- fs/adfs/dir_fplus.c | 8 ------- fs/adfs/file.c | 4 ---- fs/adfs/inode.c | 10 --------- fs/adfs/map.c | 6 ----- fs/adfs/super.c | 17 ++------------ include/linux/adfs_fs.h | 13 ----------- include/linux/adfs_fs_i.h | 24 -------------------- include/linux/adfs_fs_sb.h | 38 -------------------------------- 11 files changed, 57 insertions(+), 134 deletions(-) delete mode 100644 include/linux/adfs_fs_i.h delete mode 100644 include/linux/adfs_fs_sb.h (limited to 'include/linux') diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index a6665f37f45..9cc18775b83 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -1,3 +1,6 @@ +#include +#include + /* Internal data structures for ADFS */ #define ADFS_FREE_FRAG 0 @@ -16,6 +19,58 @@ struct buffer_head; +/* + * adfs file system inode data in memory + */ +struct adfs_inode_info { + loff_t mmu_private; + unsigned long parent_id; /* object id of parent */ + __u32 loadaddr; /* RISC OS load address */ + __u32 execaddr; /* RISC OS exec address */ + unsigned int filetype; /* RISC OS file type */ + unsigned int attr; /* RISC OS permissions */ + unsigned int stamped:1; /* RISC OS file has date/time */ + struct inode vfs_inode; +}; + +/* + * Forward-declare this + */ +struct adfs_discmap; +struct adfs_dir_ops; + +/* + * ADFS file system superblock data in memory + */ +struct adfs_sb_info { + struct adfs_discmap *s_map; /* bh list containing map */ + struct adfs_dir_ops *s_dir; /* directory operations */ + + uid_t s_uid; /* owner uid */ + gid_t s_gid; /* owner gid */ + umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ + umode_t s_other_mask; /* ADFS other perm -> unix perm */ + + __u32 s_ids_per_zone; /* max.
no ids in one zone */ + __u32 s_idlen; /* length of ID in map */ + __u32 s_map_size; /* sector size of a map */ + unsigned long s_size; /* total size (in blocks) of this fs */ + signed int s_map2blk; /* shift left by this for map->sector */ + unsigned int s_log2sharesize;/* log2 share size */ + __le32 s_version; /* disc format version */ + unsigned int s_namelen; /* maximum number of characters in name */ +}; + +static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct adfs_inode_info *ADFS_I(struct inode *inode) +{ + return container_of(inode, struct adfs_inode_info, vfs_inode); +} + /* * Directory handling */ diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 4d4073447d1..23aa52f548a 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -9,15 +9,7 @@ * * Common directory handling for ADFS */ -#include -#include -#include -#include -#include -#include #include -#include /* for file_fsync() */ - #include "adfs.h" /* diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index 31df6adf0de..bafc71222e2 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -9,15 +9,7 @@ * * E and F format directory handling */ -#include -#include -#include -#include -#include -#include #include -#include - #include "adfs.h" #include "dir_f.h" diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index 139e0f345f1..1796bb352d0 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -7,15 +7,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include -#include -#include #include -#include - #include "adfs.h" #include "dir_fplus.h" diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 8224d54a2af..005ea34d175 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -19,10 +19,6 @@ * * adfs regular file handling primitives */ -#include -#include /* for file_fsync() */ -#include - #include "adfs.h" const struct file_operations adfs_file_operations = { diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 05b3a677201..798cb071d13 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -7,17 +7,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include -#include -#include -#include #include -#include #include - #include "adfs.h" /* @@ -395,4 +386,3 @@ int adfs_write_inode(struct inode *inode, int wait) unlock_kernel(); return ret; } -MODULE_LICENSE("GPL"); diff --git a/fs/adfs/map.c b/fs/adfs/map.c index 568081b93f7..d1a5932bb0f 100644 --- a/fs/adfs/map.c +++ b/fs/adfs/map.c @@ -7,14 +7,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include #include - #include - #include "adfs.h" /* diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 0ec5aaf47aa..aad92f0a104 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -8,26 +8,12 @@ * published by the Free Software Foundation. 
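The ADFS_I() helper moved into fs/adfs/adfs.h above is the standard embedded-inode idiom: the VFS inode lives inside the filesystem's private inode structure, and container_of() recovers the outer object from a pointer to the inner one. A self-contained userspace model of the idiom, with toy structures and invented values standing in for the real VFS types:

#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* toy stand-in for the VFS inode */
struct inode {
	unsigned long i_ino;
};

struct adfs_inode_info {
	unsigned int filetype;	/* RISC OS file type */
	struct inode vfs_inode;	/* embedded, not pointed to */
};

static struct adfs_inode_info *ADFS_I(struct inode *inode)
{
	return container_of(inode, struct adfs_inode_info, vfs_inode);
}

int main(void)
{
	struct adfs_inode_info info = { .filetype = 0xfff };
	struct inode *inode = &info.vfs_inode; /* what the VFS hands around */

	/* container_of() recovers the private structure wrapped around it */
	assert(ADFS_I(inode) == &info);
	assert(ADFS_I(inode)->filetype == 0xfff);
	return 0;
}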
*/ #include -#include -#include -#include -#include -#include -#include -#include #include #include -#include #include -#include #include #include - -#include -#include - -#include - +#include #include "adfs.h" #include "dir_f.h" #include "dir_fplus.h" @@ -534,3 +520,4 @@ static void __exit exit_adfs_fs(void) module_init(init_adfs_fs) module_exit(exit_adfs_fs) +MODULE_LICENSE("GPL"); diff --git a/include/linux/adfs_fs.h b/include/linux/adfs_fs.h index ef788c2085a..b19801f7389 100644 --- a/include/linux/adfs_fs.h +++ b/include/linux/adfs_fs.h @@ -41,8 +41,6 @@ struct adfs_discrecord { #define ADFS_DR_SIZE_BITS (ADFS_DR_SIZE << 3) #ifdef __KERNEL__ -#include -#include /* * Calculate the boot block checksum on an ADFS drive. Note that this will * appear to be correct if the sector contains all zeros, so also check that @@ -60,17 +58,6 @@ static inline int adfs_checkbblk(unsigned char *ptr) return (result & 0xff) != ptr[511]; } - -static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline struct adfs_inode_info *ADFS_I(struct inode *inode) -{ - return container_of(inode, struct adfs_inode_info, vfs_inode); -} - #endif #endif diff --git a/include/linux/adfs_fs_i.h b/include/linux/adfs_fs_i.h deleted file mode 100644 index cb543034e54..00000000000 --- a/include/linux/adfs_fs_i.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * linux/include/linux/adfs_fs_i.h - * - * Copyright (C) 1997 Russell King - */ - -#ifndef _ADFS_FS_I -#define _ADFS_FS_I - -/* - * adfs file system inode data in memory - */ -struct adfs_inode_info { - loff_t mmu_private; - unsigned long parent_id; /* object id of parent */ - __u32 loadaddr; /* RISC OS load address */ - __u32 execaddr; /* RISC OS exec address */ - unsigned int filetype; /* RISC OS file type */ - unsigned int attr; /* RISC OS permissions */ - unsigned int stamped:1; /* RISC OS file has date/time */ - struct inode vfs_inode; -}; - -#endif diff --git a/include/linux/adfs_fs_sb.h b/include/linux/adfs_fs_sb.h deleted file mode 100644 index d9bf05c02cc..00000000000 --- a/include/linux/adfs_fs_sb.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * linux/include/linux/adfs_fs_sb.h - * - * Copyright (C) 1997-1999 Russell King - */ - -#ifndef _ADFS_FS_SB -#define _ADFS_FS_SB - -/* - * Forward-declare this - */ -struct adfs_discmap; -struct adfs_dir_ops; - -/* - * ADFS file system superblock data in memory - */ -struct adfs_sb_info { - struct adfs_discmap *s_map; /* bh list containing map */ - struct adfs_dir_ops *s_dir; /* directory operations */ - - uid_t s_uid; /* owner uid */ - gid_t s_gid; /* owner gid */ - umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ - umode_t s_other_mask; /* ADFS other perm -> unix perm */ - - __u32 s_ids_per_zone; /* max. no ids in one zone */ - __u32 s_idlen; /* length of ID in map */ - __u32 s_map_size; /* sector size of a map */ - unsigned long s_size; /* total size (in blocks) of this fs */ - signed int s_map2blk; /* shift left by this for map->sector */ - unsigned int s_log2sharesize;/* log2 share size */ - __le32 s_version; /* disc format version */ - unsigned int s_namelen; /* maximum number of characters in name */ -}; - -#endif -- cgit v1.2.3-70-g09d2 From a42fc8f6943127787ad2a416436cf211d5531229 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 16 Jun 2009 16:56:38 +0000 Subject: skbuff.h: fix skb_dst kernel-doc Fix kernel-doc warnings (missing + extra entries) in skbuff.h. Signed-off-by: Randy Dunlap Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fa51293f270..3d289367aae 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -264,7 +264,7 @@ typedef unsigned char *sk_buff_data_t; * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header - * @dst: destination entry + * @_skb_dst: destination entry * @sp: the security path, used for xfrm * @cb: Control buffer. Free for use by every layer. Put private vars here * @len: Length of actual data -- cgit v1.2.3-70-g09d2 From e088a4ad7fa53c3dc3c29f930025f41ccf01953e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 22 May 2009 13:49:49 -0700 Subject: [IA64] Convert ia64 to use int-ll64.h It is generally agreed that it would be beneficial for u64 to be an unsigned long long on all architectures. ia64 (in common with several other 64-bit architectures) currently uses unsigned long. Migrating piecemeal is too painful; this giant patch fixes all compilation warnings and errors that come as a result of switching to use int-ll64.h. Note that userspace will still see __u64 defined as unsigned long. This is important as it affects C++ name mangling. [Updated by Tony Luck to change efi.h:efi_freemem_callback_t to use u64 for start/end rather than unsigned long] Signed-off-by: Matthew Wilcox Signed-off-by: Tony Luck --- arch/ia64/hp/common/sba_iommu.c | 2 +- arch/ia64/include/asm/gcc_intrin.h | 18 ++++----- arch/ia64/include/asm/mca.h | 38 +++++++++--------- arch/ia64/include/asm/meminit.h | 18 ++++----- arch/ia64/include/asm/pal.h | 24 ++++++------ arch/ia64/include/asm/processor.h | 56 +++++++++++++-------------- arch/ia64/include/asm/sal.h | 8 ++-- arch/ia64/include/asm/sn/sn_sal.h | 2 +- arch/ia64/include/asm/types.h | 13 +++++-- arch/ia64/kernel/efi.c | 10 ++--- arch/ia64/kernel/mca.c | 18 ++++----- arch/ia64/kernel/module.c | 14 ++++--- arch/ia64/kernel/palinfo.c | 68 ++++++++++++++++----------------- arch/ia64/kernel/pci-dma.c | 2 +- arch/ia64/kernel/perfmon.c | 6 +-- arch/ia64/kernel/setup.c | 32 ++++++++-------- arch/ia64/kernel/smp.c | 2 +- arch/ia64/kernel/smpboot.c | 2 +- arch/ia64/kernel/time.c | 2 +- arch/ia64/kernel/topology.c | 4 +- arch/ia64/kernel/uncached.c | 3 +- arch/ia64/mm/contig.c | 9 ++--- arch/ia64/mm/init.c | 15 +++----- arch/ia64/mm/tlb.c | 4 +- arch/ia64/pci/pci.c | 12 +++--- arch/ia64/sn/kernel/io_acpi_init.c | 4 +- arch/ia64/sn/kernel/io_common.c | 2 +- arch/ia64/sn/kernel/sn2/sn_hwperf.c | 4 +- arch/ia64/sn/kernel/sn2/sn_proc_fs.c | 2 +- arch/ia64/sn/kernel/tiocx.c | 2 +- arch/ia64/sn/pci/pcibr/pcibr_provider.c | 2 +- arch/ia64/sn/pci/tioca_provider.c | 6 +-- arch/ia64/sn/pci/tioce_provider.c | 6 +-- drivers/char/agp/hp-agp.c | 5 ++- drivers/firmware/pcdp.c | 4 +- include/linux/efi.h | 2 +- 36 files changed, 212 insertions(+), 209 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 56ceb68eb99..21c04114ddd 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1787,7 +1787,7 @@ static struct ioc_iommu ioc_iommu_info[] __initdata = { }; static struct ioc * __init -ioc_init(u64 hpa, void *handle) +ioc_init(unsigned long hpa, void *handle) { struct ioc *ioc; struct ioc_iommu *info; diff --git a/arch/ia64/include/asm/gcc_intrin.h b/arch/ia64/include/asm/gcc_intrin.h index c2c5fd8fcac..21ddee54ada 100644 --- 
a/arch/ia64/include/asm/gcc_intrin.h +++ b/arch/ia64/include/asm/gcc_intrin.h @@ -388,7 +388,7 @@ register unsigned long ia64_r13 asm ("r13") __used; #define ia64_native_thash(addr) \ ({ \ - __u64 ia64_intri_res; \ + unsigned long ia64_intri_res; \ asm volatile ("thash %0=%1" : "=r"(ia64_intri_res) : "r" (addr)); \ ia64_intri_res; \ }) @@ -419,7 +419,7 @@ register unsigned long ia64_r13 asm ("r13") __used; #define ia64_tpa(addr) \ ({ \ - __u64 ia64_pa; \ + unsigned long ia64_pa; \ asm volatile ("tpa %0 = %1" : "=r"(ia64_pa) : "r"(addr) : "memory"); \ ia64_pa; \ }) @@ -444,35 +444,35 @@ register unsigned long ia64_r13 asm ("r13") __used; #define ia64_native_get_cpuid(index) \ ({ \ - __u64 ia64_intri_res; \ + unsigned long ia64_intri_res; \ asm volatile ("mov %0=cpuid[%r1]" : "=r"(ia64_intri_res) : "rO"(index)); \ ia64_intri_res; \ }) #define __ia64_get_dbr(index) \ ({ \ - __u64 ia64_intri_res; \ + unsigned long ia64_intri_res; \ asm volatile ("mov %0=dbr[%1]" : "=r"(ia64_intri_res) : "r"(index)); \ ia64_intri_res; \ }) #define ia64_get_ibr(index) \ ({ \ - __u64 ia64_intri_res; \ + unsigned long ia64_intri_res; \ asm volatile ("mov %0=ibr[%1]" : "=r"(ia64_intri_res) : "r"(index)); \ ia64_intri_res; \ }) #define ia64_get_pkr(index) \ ({ \ - __u64 ia64_intri_res; \ + unsigned long ia64_intri_res; \ asm volatile ("mov %0=pkr[%1]" : "=r"(ia64_intri_res) : "r"(index)); \ ia64_intri_res; \ }) #define ia64_get_pmc(index) \ ({ \ - __u64 ia64_intri_res; \ + unsigned long ia64_intri_res; \ asm volatile ("mov %0=pmc[%1]" : "=r"(ia64_intri_res) : "r"(index)); \ ia64_intri_res; \ }) @@ -480,14 +480,14 @@ register unsigned long ia64_r13 asm ("r13") __used; #define ia64_native_get_pmd(index) \ ({ \ - __u64 ia64_intri_res; \ + unsigned long ia64_intri_res; \ asm volatile ("mov %0=pmd[%1]" : "=r"(ia64_intri_res) : "r"(index)); \ ia64_intri_res; \ }) #define ia64_native_get_rr(index) \ ({ \ - __u64 ia64_intri_res; \ + unsigned long ia64_intri_res; \ asm volatile ("mov %0=rr[%1]" : "=r"(ia64_intri_res) : "r" (index)); \ ia64_intri_res; \ }) diff --git a/arch/ia64/include/asm/mca.h b/arch/ia64/include/asm/mca.h index 18a4321349a..44a0b53df90 100644 --- a/arch/ia64/include/asm/mca.h +++ b/arch/ia64/include/asm/mca.h @@ -72,39 +72,39 @@ typedef struct ia64_mc_info_s { struct ia64_sal_os_state { /* SAL to OS */ - u64 os_gp; /* GP of the os registered with the SAL, physical */ - u64 pal_proc; /* PAL_PROC entry point, physical */ - u64 sal_proc; /* SAL_PROC entry point, physical */ - u64 rv_rc; /* MCA - Rendezvous state, INIT - reason code */ - u64 proc_state_param; /* from R18 */ - u64 monarch; /* 1 for a monarch event, 0 for a slave */ + unsigned long os_gp; /* GP of the os registered with the SAL, physical */ + unsigned long pal_proc; /* PAL_PROC entry point, physical */ + unsigned long sal_proc; /* SAL_PROC entry point, physical */ + unsigned long rv_rc; /* MCA - Rendezvous state, INIT - reason code */ + unsigned long proc_state_param; /* from R18 */ + unsigned long monarch; /* 1 for a monarch event, 0 for a slave */ /* common */ - u64 sal_ra; /* Return address in SAL, physical */ - u64 sal_gp; /* GP of the SAL - physical */ + unsigned long sal_ra; /* Return address in SAL, physical */ + unsigned long sal_gp; /* GP of the SAL - physical */ pal_min_state_area_t *pal_min_state; /* from R17. physical in asm, virtual in C */ /* Previous values of IA64_KR(CURRENT) and IA64_KR(CURRENT_STACK). 
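Most of what follows is mechanical fallout from that typedef switch: once u64 is unsigned long long, printing a u64 with %lx draws a -Wformat warning, so format strings move to %llx while values that are genuinely unsigned long keep %lx. A small compilable sketch of the distinction; the two typedef names are invented stand-ins for the int-ll64.h and int-l64.h definitions:

#include <stdio.h>

typedef unsigned long long u64_new;	/* int-ll64.h definition of u64 */
typedef unsigned long u64_old;		/* old int-l64.h definition on ia64 */

int main(void)
{
	u64_new a = 0xdeadbeefULL;
	u64_old b = 0xdeadbeefUL;

	printf("%llx\n", a);	/* matches the new definition */
	printf("%lx\n", b);	/* still right for plain unsigned long */
	/*
	 * printf("%lx\n", a) would now draw -Wformat: "expects argument
	 * of type 'long unsigned int', but argument has type 'long long
	 * unsigned int'", which is most of what this patch is fixing.
	 */
	return 0;
}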
* Note: if the MCA/INIT recovery code wants to resume to a new context * then it must change these values to reflect the new kernel stack. */ - u64 prev_IA64_KR_CURRENT; /* previous value of IA64_KR(CURRENT) */ - u64 prev_IA64_KR_CURRENT_STACK; + unsigned long prev_IA64_KR_CURRENT; /* previous value of IA64_KR(CURRENT) */ + unsigned long prev_IA64_KR_CURRENT_STACK; struct task_struct *prev_task; /* previous task, NULL if it is not useful */ /* Some interrupt registers are not saved in minstate, pt_regs or * switch_stack. Because MCA/INIT can occur when interrupts are * disabled, we need to save the additional interrupt registers over * MCA/INIT and resume. */ - u64 isr; - u64 ifa; - u64 itir; - u64 iipa; - u64 iim; - u64 iha; + unsigned long isr; + unsigned long ifa; + unsigned long itir; + unsigned long iipa; + unsigned long iim; + unsigned long iha; /* OS to SAL */ - u64 os_status; /* OS status to SAL, enum below */ - u64 context; /* 0 if return to same context + unsigned long os_status; /* OS status to SAL, enum below */ + unsigned long context; /* 0 if return to same context 1 if return to new context */ }; @@ -150,7 +150,7 @@ extern void ia64_slave_init_handler(void); extern void ia64_mca_cmc_vector_setup(void); extern int ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *)); extern void ia64_unreg_MCA_extension(void); -extern u64 ia64_get_rnat(u64 *); +extern unsigned long ia64_get_rnat(unsigned long *); extern void ia64_mca_printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))); diff --git a/arch/ia64/include/asm/meminit.h b/arch/ia64/include/asm/meminit.h index c0cea375620..688a812c017 100644 --- a/arch/ia64/include/asm/meminit.h +++ b/arch/ia64/include/asm/meminit.h @@ -25,8 +25,8 @@ #define IA64_MAX_RSVD_REGIONS 9 struct rsvd_region { - unsigned long start; /* virtual address of beginning of element */ - unsigned long end; /* virtual address of end of element + 1 */ + u64 start; /* virtual address of beginning of element */ + u64 end; /* virtual address of end of element + 1 */ }; extern struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1]; @@ -35,13 +35,13 @@ extern int num_rsvd_regions; extern void find_memory (void); extern void reserve_memory (void); extern void find_initrd (void); -extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg); -extern int filter_memory (unsigned long start, unsigned long end, void *arg); -extern unsigned long efi_memmap_init(unsigned long *s, unsigned long *e); -extern int find_max_min_low_pfn (unsigned long , unsigned long, void *); +extern int filter_rsvd_memory (u64 start, u64 end, void *arg); +extern int filter_memory (u64 start, u64 end, void *arg); +extern unsigned long efi_memmap_init(u64 *s, u64 *e); +extern int find_max_min_low_pfn (u64, u64, void *); extern unsigned long vmcore_find_descriptor_size(unsigned long address); -extern int reserve_elfcorehdr(unsigned long *start, unsigned long *end); +extern int reserve_elfcorehdr(u64 *start, u64 *end); /* * For rounding an address to the next IA64_GRANULE_SIZE or order @@ -63,8 +63,8 @@ extern int register_active_ranges(u64 start, u64 len, int nid); # define LARGE_GAP 0x40000000 /* Use virtual mem map if hole is > than this */ extern unsigned long vmalloc_end; extern struct page *vmem_map; - extern int find_largest_hole (u64 start, u64 end, void *arg); - extern int create_mem_map_page_table (u64 start, u64 end, void *arg); + extern int find_largest_hole(u64 start, u64 end, void *arg); + extern int create_mem_map_page_table(u64 
start, u64 end, void *arg); extern int vmemmap_find_next_valid_pfn(int, int); #else static inline int vmemmap_find_next_valid_pfn(int node, int i) diff --git a/arch/ia64/include/asm/pal.h b/arch/ia64/include/asm/pal.h index 67b02901ead..6a292505b39 100644 --- a/arch/ia64/include/asm/pal.h +++ b/arch/ia64/include/asm/pal.h @@ -989,8 +989,8 @@ ia64_pal_cache_read (pal_cache_line_id_u_t line_id, u64 physical_addr) } /* Return summary information about the hierarchy of caches controlled by the processor */ -static inline s64 -ia64_pal_cache_summary (u64 *cache_levels, u64 *unique_caches) +static inline long ia64_pal_cache_summary(unsigned long *cache_levels, + unsigned long *unique_caches) { struct ia64_pal_retval iprv; PAL_CALL(iprv, PAL_CACHE_SUMMARY, 0, 0, 0); @@ -1038,8 +1038,8 @@ ia64_pal_copy_pal (u64 target_addr, u64 alloc_size, u64 processor, u64 *pal_proc } /* Return the number of instruction and data debug register pairs */ -static inline s64 -ia64_pal_debug_info (u64 *inst_regs, u64 *data_regs) +static inline long ia64_pal_debug_info(unsigned long *inst_regs, + unsigned long *data_regs) { struct ia64_pal_retval iprv; PAL_CALL(iprv, PAL_DEBUG_INFO, 0, 0, 0); @@ -1074,8 +1074,7 @@ ia64_pal_fixed_addr (u64 *global_unique_addr) } /* Get base frequency of the platform if generated by the processor */ -static inline s64 -ia64_pal_freq_base (u64 *platform_base_freq) +static inline long ia64_pal_freq_base(unsigned long *platform_base_freq) { struct ia64_pal_retval iprv; PAL_CALL(iprv, PAL_FREQ_BASE, 0, 0, 0); @@ -1437,7 +1436,7 @@ ia64_pal_proc_set_features (u64 feature_select) * possible. */ typedef struct ia64_ptce_info_s { - u64 base; + unsigned long base; u32 count[2]; u32 stride[2]; } ia64_ptce_info_t; @@ -1478,9 +1477,9 @@ ia64_pal_register_info (u64 info_request, u64 *reg_info_1, u64 *reg_info_2) } typedef union pal_hints_u { - u64 ph_data; + unsigned long ph_data; struct { - u64 si : 1, + unsigned long si : 1, li : 1, reserved : 62; } pal_hints_s; @@ -1489,8 +1488,8 @@ typedef union pal_hints_u { /* Return information about the register stack and RSE for this processor * implementation. */ -static inline s64 -ia64_pal_rse_info (u64 *num_phys_stacked, pal_hints_u_t *hints) +static inline long ia64_pal_rse_info(unsigned long *num_phys_stacked, + pal_hints_u_t *hints) { struct ia64_pal_retval iprv; PAL_CALL(iprv, PAL_RSE_INFO, 0, 0, 0); @@ -1608,8 +1607,7 @@ ia64_pal_vm_info (u64 tc_level, u64 tc_type, pal_tc_info_u_t *tc_info, u64 *tc_ /* Get page size information about the virtual memory characteristics of the processor * implementation. */ -static inline s64 -ia64_pal_vm_page_size (u64 *tr_pages, u64 *vw_pages) +static inline s64 ia64_pal_vm_page_size(u64 *tr_pages, u64 *vw_pages) { struct ia64_pal_retval iprv; PAL_CALL(iprv, PAL_VM_PAGE_SIZE, 0, 0, 0); diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index f88fa054d01..3eaeedf1aef 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -187,40 +187,40 @@ union ia64_rr { * state comes earlier: */ struct cpuinfo_ia64 { - __u32 softirq_pending; - __u64 itm_delta; /* # of clock cycles between clock ticks */ - __u64 itm_next; /* interval timer mask value to use for next clock tick */ - __u64 nsec_per_cyc; /* (1000000000<. 
* @@ -13,7 +14,11 @@ * David Mosberger-Tang , Hewlett-Packard Co */ +#ifdef __KERNEL__ +#include +#else #include +#endif #ifdef __ASSEMBLY__ # define __IA64_UL(x) (x) diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 7ef80e8161c..c745d0aeb6e 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -46,7 +46,7 @@ extern efi_status_t efi_call_phys (void *, ...); struct efi efi; EXPORT_SYMBOL(efi); static efi_runtime_services_t *runtime; -static unsigned long mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL; +static u64 mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL; #define efi_call_virt(f, args...) (*(f))(args) @@ -356,7 +356,7 @@ efi_get_pal_addr (void) if (++pal_code_count > 1) { printk(KERN_ERR "Too many EFI Pal Code memory ranges, " - "dropped @ %lx\n", md->phys_addr); + "dropped @ %llx\n", md->phys_addr); continue; } /* @@ -490,10 +490,10 @@ efi_init (void) } } if (min_addr != 0UL) - printk(KERN_INFO "Ignoring memory below %luMB\n", + printk(KERN_INFO "Ignoring memory below %lluMB\n", min_addr >> 20); if (max_addr != ~0UL) - printk(KERN_INFO "Ignoring memory above %luMB\n", + printk(KERN_INFO "Ignoring memory above %lluMB\n", max_addr >> 20); efi.systab = __va(ia64_boot_param->efi_systab); @@ -1066,7 +1066,7 @@ find_memmap_space (void) * parts exist, and are WB. */ unsigned long -efi_memmap_init(unsigned long *s, unsigned long *e) +efi_memmap_init(u64 *s, u64 *e) { struct kern_memdesc *k, *prev = NULL; u64 contig_low=0, contig_high=0; diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 1cce4ceb48a..c259b9467fc 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -850,7 +850,7 @@ EXPORT_SYMBOL(ia64_unreg_MCA_extension); static inline void -copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat) +copy_reg(const u64 *fr, u64 fnat, unsigned long *tr, unsigned long *tnat) { u64 fslot, tslot, nat; *tr = *fr; @@ -914,9 +914,9 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, struct switch_stack *old_sw; unsigned size = sizeof(struct pt_regs) + sizeof(struct switch_stack) + 16; - u64 *old_bspstore, *old_bsp; - u64 *new_bspstore, *new_bsp; - u64 old_unat, old_rnat, new_rnat, nat; + unsigned long *old_bspstore, *old_bsp; + unsigned long *new_bspstore, *new_bsp; + unsigned long old_unat, old_rnat, new_rnat, nat; u64 slots, loadrs = regs->loadrs; u64 r12 = ms->pmsa_gr[12-1], r13 = ms->pmsa_gr[13-1]; u64 ar_bspstore = regs->ar_bspstore; @@ -968,10 +968,10 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, * loadrs for the new stack and save it in the new pt_regs, where * ia64_old_stack() can get it. 
*/ - old_bspstore = (u64 *)ar_bspstore; - old_bsp = (u64 *)ar_bsp; + old_bspstore = (unsigned long *)ar_bspstore; + old_bsp = (unsigned long *)ar_bsp; slots = ia64_rse_num_regs(old_bspstore, old_bsp); - new_bspstore = (u64 *)((u64)current + IA64_RBS_OFFSET); + new_bspstore = (unsigned long *)((u64)current + IA64_RBS_OFFSET); new_bsp = ia64_rse_skip_regs(new_bspstore, slots); regs->loadrs = (new_bsp - new_bspstore) * 8 << 16; @@ -1918,9 +1918,9 @@ ia64_mca_init(void) ia64_fptr_t *init_hldlr_ptr_slave = (ia64_fptr_t *)ia64_os_init_dispatch_slave; ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch; int i; - s64 rc; + long rc; struct ia64_sal_retval isrv; - u64 timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */ + unsigned long timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */ static struct notifier_block default_init_monarch_nb = { .notifier_call = default_monarch_init_process, .priority = 0/* we need to notified last */ diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index da3b0cf495a..1481b0a28ca 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -171,7 +171,8 @@ apply_imm60 (struct module *mod, struct insn *insn, uint64_t val) return 0; } if (val + ((uint64_t) 1 << 59) >= (1UL << 60)) { - printk(KERN_ERR "%s: value %ld out of IMM60 range\n", mod->name, (int64_t) val); + printk(KERN_ERR "%s: value %ld out of IMM60 range\n", + mod->name, (long) val); return 0; } ia64_patch_imm60((u64) insn, val); @@ -182,7 +183,8 @@ static int apply_imm22 (struct module *mod, struct insn *insn, uint64_t val) { if (val + (1 << 21) >= (1 << 22)) { - printk(KERN_ERR "%s: value %li out of IMM22 range\n", mod->name, (int64_t)val); + printk(KERN_ERR "%s: value %li out of IMM22 range\n", + mod->name, (long)val); return 0; } ia64_patch((u64) insn, 0x01fffcfe000UL, ( ((val & 0x200000UL) << 15) /* bit 21 -> 36 */ @@ -196,7 +198,8 @@ static int apply_imm21b (struct module *mod, struct insn *insn, uint64_t val) { if (val + (1 << 20) >= (1 << 21)) { - printk(KERN_ERR "%s: value %li out of IMM21b range\n", mod->name, (int64_t)val); + printk(KERN_ERR "%s: value %li out of IMM21b range\n", + mod->name, (long)val); return 0; } ia64_patch((u64) insn, 0x11ffffe000UL, ( ((val & 0x100000UL) << 16) /* bit 20 -> 36 */ @@ -701,8 +704,9 @@ do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend, case RV_PCREL2: if (r_type == R_IA64_PCREL21BI) { if (!is_internal(mod, val)) { - printk(KERN_ERR "%s: %s reloc against non-local symbol (%lx)\n", - __func__, reloc_name[r_type], val); + printk(KERN_ERR "%s: %s reloc against " + "non-local symbol (%lx)\n", __func__, + reloc_name[r_type], (unsigned long)val); return -ENOEXEC; } format = RF_INSN21B; diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index a4f19c70aad..fdf6f9d013e 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -218,10 +218,10 @@ static int cache_info(char *page) { char *p = page; - u64 i, levels, unique_caches; + unsigned long i, levels, unique_caches; pal_cache_config_info_t cci; int j, k; - s64 status; + long status; if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) { printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status); @@ -303,7 +303,7 @@ vm_info(char *page) ia64_ptce_info_t ptce; const char *sep; int i, j; - s64 status; + long status; if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) { printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status); @@ -431,9 +431,9 @@ register_info(char *page) char *p = page; 
u64 reg_info[2]; u64 info; - u64 phys_stacked; + unsigned long phys_stacked; pal_hints_u_t hints; - u64 iregs, dregs; + unsigned long iregs, dregs; char *info_type[]={ "Implemented AR(s)", "AR(s) with read side-effects", @@ -530,8 +530,8 @@ static char **proc_features[]={ NULL, NULL, NULL, NULL, }; -static char * -feature_set_info(char *page, u64 avail, u64 status, u64 control, u64 set) +static char * feature_set_info(char *page, u64 avail, u64 status, u64 control, + unsigned long set) { char *p = page; char **vf, **v; @@ -714,7 +714,7 @@ frequency_info(char *page) { char *p = page; struct pal_freq_ratio proc, itc, bus; - u64 base; + unsigned long base; if (ia64_pal_freq_base(&base) == -1) p += sprintf(p, "Output clock : not implemented\n"); @@ -736,43 +736,43 @@ static int tr_info(char *page) { char *p = page; - s64 status; + long status; pal_tr_valid_u_t tr_valid; u64 tr_buffer[4]; pal_vm_info_1_u_t vm_info_1; pal_vm_info_2_u_t vm_info_2; - u64 i, j; - u64 max[3], pgm; + unsigned long i, j; + unsigned long max[3], pgm; struct ifa_reg { - u64 valid:1; - u64 ig:11; - u64 vpn:52; + unsigned long valid:1; + unsigned long ig:11; + unsigned long vpn:52; } *ifa_reg; struct itir_reg { - u64 rv1:2; - u64 ps:6; - u64 key:24; - u64 rv2:32; + unsigned long rv1:2; + unsigned long ps:6; + unsigned long key:24; + unsigned long rv2:32; } *itir_reg; struct gr_reg { - u64 p:1; - u64 rv1:1; - u64 ma:3; - u64 a:1; - u64 d:1; - u64 pl:2; - u64 ar:3; - u64 ppn:38; - u64 rv2:2; - u64 ed:1; - u64 ig:11; + unsigned long p:1; + unsigned long rv1:1; + unsigned long ma:3; + unsigned long a:1; + unsigned long d:1; + unsigned long pl:2; + unsigned long ar:3; + unsigned long ppn:38; + unsigned long rv2:2; + unsigned long ed:1; + unsigned long ig:11; } *gr_reg; struct rid_reg { - u64 ig1:1; - u64 rv1:1; - u64 ig2:6; - u64 rid:24; - u64 rv2:32; + unsigned long ig1:1; + unsigned long rv1:1; + unsigned long ig2:6; + unsigned long rid:24; + unsigned long rv2:32; } *rid_reg; if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) { diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c index eb987386f69..1376da45fd0 100644 --- a/arch/ia64/kernel/pci-dma.c +++ b/arch/ia64/kernel/pci-dma.c @@ -91,7 +91,7 @@ int iommu_dma_supported(struct device *dev, u64 mask) type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { - dev_info(dev, "Force SAC with mask %lx\n", mask); + dev_info(dev, "Force SAC with mask %llx\n", mask); return 0; } diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 8a06dc48059..89ad0bbb861 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -312,7 +312,7 @@ typedef struct pfm_context { unsigned long th_pmcs[PFM_NUM_PMC_REGS]; /* PMC thread save state */ unsigned long th_pmds[PFM_NUM_PMD_REGS]; /* PMD thread save state */ - u64 ctx_saved_psr_up; /* only contains psr.up value */ + unsigned long ctx_saved_psr_up; /* only contains psr.up value */ unsigned long ctx_last_activation; /* context last activation number for last_cpu */ unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */ @@ -5213,8 +5213,8 @@ pfm_end_notify_user(pfm_context_t *ctx) * main overflow processing routine. 
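tr_info() above decodes raw 64-bit TLB registers by overlaying bitfield structs on them, and the patch makes each field unsigned long so the fields keep a 64-bit carrier type. The overlay technique itself, as a standalone sketch: the register layout is invented, and bitfield ordering is implementation-defined, so this assumes GCC on a little-endian target.

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* invented 64-bit register layout in the style of palinfo's itir_reg */
struct demo_reg {
	unsigned long long ps : 6;	/* page size (log2) */
	unsigned long long key : 24;	/* protection key */
	unsigned long long rv : 34;	/* reserved */
};

int main(void)
{
	unsigned long long raw = (16ULL << 0) | (0x1234ULL << 6);
	struct demo_reg reg;

	/* overlay the struct on the raw value; memcpy stands in for the
	 * pointer cast tr_info() uses on its register buffer */
	memcpy(&reg, &raw, sizeof(reg));
	assert(reg.ps == 16);
	assert(reg.key == 0x1234);
	printf("page size 2^%llu, key %#llx\n",
	       (unsigned long long)reg.ps, (unsigned long long)reg.key);
	return 0;
}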
* it can be called from the interrupt path or explicitly during the context switch code */ -static void -pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs) +static void pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, + unsigned long pmc0, struct pt_regs *regs) { pfm_ovfl_arg_t *ovfl_arg; unsigned long mask; diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 714066aeda7..1b23ec126b6 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -151,9 +151,9 @@ int num_rsvd_regions __initdata; * This routine does not assume the incoming segments are sorted. */ int __init -filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) +filter_rsvd_memory (u64 start, u64 end, void *arg) { - unsigned long range_start, range_end, prev_start; + u64 range_start, range_end, prev_start; void (*func)(unsigned long, unsigned long, int); int i; @@ -191,7 +191,7 @@ filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) * are not filtered out. */ int __init -filter_memory(unsigned long start, unsigned long end, void *arg) +filter_memory(u64 start, u64 end, void *arg) { void (*func)(unsigned long, unsigned long, int); @@ -397,7 +397,7 @@ find_initrd (void) initrd_start = (unsigned long)__va(ia64_boot_param->initrd_start); initrd_end = initrd_start+ia64_boot_param->initrd_size; - printk(KERN_INFO "Initial ramdisk at: 0x%lx (%lu bytes)\n", + printk(KERN_INFO "Initial ramdisk at: 0x%lx (%llu bytes)\n", initrd_start, ia64_boot_param->initrd_size); } #endif @@ -505,9 +505,9 @@ static int __init parse_elfcorehdr(char *arg) } early_param("elfcorehdr", parse_elfcorehdr); -int __init reserve_elfcorehdr(unsigned long *start, unsigned long *end) +int __init reserve_elfcorehdr(u64 *start, u64 *end) { - unsigned long length; + u64 length; /* We get the address using the kernel command line, * but the size is extracted from the EFI tables. 
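reserve_elfcorehdr() also shows why prototypes and callers have to flip in one go: even on ia64, where unsigned long and unsigned long long are both 64 bits wide, pointers to them are incompatible types, so an out-parameter changed to u64 * forces every caller's local variables to change too. A minimal sketch with an invented function name:

#include <stdio.h>

typedef unsigned long long u64;

/* new-style out-parameter API, as in reserve_elfcorehdr(u64 *, u64 *) */
static int get_span(u64 *start, u64 *end)
{
	*start = 0x1000;
	*end = 0x9000;
	return 0;
}

int main(void)
{
	u64 start, end;		/* callers must be converted in lockstep */
	/* declaring these as unsigned long would make get_span(&start,
	 * &end) an incompatible-pointer-types error even though both
	 * integer types are 64 bits wide on ia64 */
	get_span(&start, &end);
	printf("%#llx-%#llx\n", start, end);
	return 0;
}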
@@ -588,7 +588,7 @@ setup_arch (char **cmdline_p) ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist); #else { - u64 num_phys_stacked; + unsigned long num_phys_stacked; if (ia64_pal_rse_info(&num_phys_stacked, 0) == 0 && num_phys_stacked > 96) ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist); @@ -872,9 +872,9 @@ static void __cpuinit get_cache_info(void) { unsigned long line_size, max = 1; - u64 l, levels, unique_caches; - pal_cache_config_info_t cci; - s64 status; + unsigned long l, levels, unique_caches; + pal_cache_config_info_t cci; + long status; status = ia64_pal_cache_summary(&levels, &unique_caches); if (status != 0) { @@ -892,9 +892,9 @@ get_cache_info(void) /* cache_type (data_or_unified)=2 */ status = ia64_pal_cache_config_info(l, 2, &cci); if (status != 0) { - printk(KERN_ERR - "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n", - __func__, l, status); + printk(KERN_ERR "%s: ia64_pal_cache_config_info" + "(l=%lu, 2) failed (status=%ld)\n", + __func__, l, status); max = SMP_CACHE_BYTES; /* The safest setup for "flush_icache_range()" */ cci.pcci_stride = I_CACHE_STRIDE_SHIFT; @@ -914,10 +914,10 @@ get_cache_info(void) /* cache_type (instruction)=1*/ status = ia64_pal_cache_config_info(l, 1, &cci); if (status != 0) { - printk(KERN_ERR - "%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n", + printk(KERN_ERR "%s: ia64_pal_cache_config_info" + "(l=%lu, 1) failed (status=%ld)\n", __func__, l, status); - /* The safest setup for "flush_icache_range()" */ + /* The safest setup for flush_icache_range() */ cci.pcci_stride = I_CACHE_STRIDE_SHIFT; } } diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 5230eaafd83..f0c521b0ba4 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -66,7 +66,7 @@ static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cachelin #define IPI_KDUMP_CPU_STOP 3 /* This needs to be cacheline aligned because it is written to by *other* CPUs. 
*/ -static DEFINE_PER_CPU_SHARED_ALIGNED(u64, ipi_operation); +static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, ipi_operation); extern void cpu_halt (void); diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 2a70af473b3..de100aa7ff0 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -865,7 +865,7 @@ init_smp_config(void) void __devinit identify_siblings(struct cpuinfo_ia64 *c) { - s64 status; + long status; u16 pltid; pal_logical_to_physical_t info; diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 604c1a35db3..4990495d753 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -385,7 +385,7 @@ ia64_init_itm (void) static cycle_t itc_get_cycles(struct clocksource *cs) { - u64 lcycle, now, ret; + unsigned long lcycle, now, ret; if (!itc_jitter_data.itc_jitter) return get_cycles(); diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index a8d61a3e9a9..bc80dff1df7 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -306,10 +306,10 @@ static void __cpuinit cpu_cache_sysfs_exit(unsigned int cpu) static int __cpuinit cpu_cache_sysfs_init(unsigned int cpu) { - u64 i, levels, unique_caches; + unsigned long i, levels, unique_caches; pal_cache_config_info_t cci; int j; - s64 status; + long status; struct cache_info *this_cache; int num_cache_leaves = 0; diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 8eff8c1d40a..e6ac3c332d1 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -249,8 +249,7 @@ EXPORT_SYMBOL(uncached_free_page); * Called at boot time to build a map of pages that can be used for * memory special operations. */ -static int __init uncached_build_memmap(unsigned long uc_start, - unsigned long uc_end, void *arg) +static int __init uncached_build_memmap(u64 uc_start, u64 uc_end, void *arg) { int nid = paddr_to_nid(uc_start - __IA64_UNCACHED_OFFSET); struct gen_pool *pool = uncached_pools[nid].pool; diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 0ee085efbe2..2f724d2bf29 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -107,10 +107,10 @@ unsigned long bootmap_start; * bootmap_start. This address must be page-aligned. 
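Many of the ia64 functions in this patch (filter_rsvd_memory, count_pages, find_largest_hole, find_bootmap_location below, and the efi_freemem_callback_t typedef) share a single callback shape: a walker hands each memory range's start and end to a callback along with an opaque argument, which is why one signature change ripples across so many files. A minimal userspace model of the pattern; the walker, the ranges, and the names are invented:

#include <assert.h>
#include <stdint.h>

typedef uint64_t u64;

/* callback shape shared by filter_rsvd_memory(), count_pages(),
 * find_largest_hole() and efi_freemem_callback_t after this patch */
typedef int (*range_cb_t)(u64 start, u64 end, void *arg);

static const struct { u64 start, end; } ranges[] = {
	{ 0x00100000, 0x00800000 },
	{ 0x01000000, 0x04000000 },
};

/* walker in the spirit of efi_memmap_walk(): hand each range to the
 * callback, stop early if it returns non-zero */
static void walk_ranges(range_cb_t cb, void *arg)
{
	unsigned int i;

	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
		if (cb(ranges[i].start, ranges[i].end, arg))
			break;
}

/* counting callback in the style of count_pages() */
static int count_bytes(u64 start, u64 end, void *arg)
{
	*(u64 *)arg += end - start;
	return 0;
}

int main(void)
{
	u64 total = 0;

	walk_ranges(count_bytes, &total);
	assert(total == 0x00700000 + 0x03000000);
	return 0;
}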
*/ static int __init -find_bootmap_location (unsigned long start, unsigned long end, void *arg) +find_bootmap_location (u64 start, u64 end, void *arg) { - unsigned long needed = *(unsigned long *)arg; - unsigned long range_start, range_end, free_start; + u64 needed = *(unsigned long *)arg; + u64 range_start, range_end, free_start; int i; #if IGNORE_PFN0 @@ -229,8 +229,7 @@ find_memory (void) alloc_per_cpu_data(); } -static int -count_pages (u64 start, u64 end, void *arg) +static int count_pages(u64 start, u64 end, void *arg) { unsigned long *count = arg; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index c0f3bee6904..b115b3bbf04 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -422,8 +422,7 @@ retry_pte: return hole_next_pfn - pgdat->node_start_pfn; } -int __init -create_mem_map_page_table (u64 start, u64 end, void *arg) +int __init create_mem_map_page_table(u64 start, u64 end, void *arg) { unsigned long address, start_page, end_page; struct page *map_start, *map_end; @@ -469,7 +468,7 @@ struct memmap_init_callback_data { }; static int __meminit -virtual_memmap_init (u64 start, u64 end, void *arg) +virtual_memmap_init(u64 start, u64 end, void *arg) { struct memmap_init_callback_data *args; struct page *map_start, *map_end; @@ -531,8 +530,7 @@ ia64_pfn_valid (unsigned long pfn) } EXPORT_SYMBOL(ia64_pfn_valid); -int __init -find_largest_hole (u64 start, u64 end, void *arg) +int __init find_largest_hole(u64 start, u64 end, void *arg) { u64 *max_gap = arg; @@ -548,8 +546,7 @@ find_largest_hole (u64 start, u64 end, void *arg) #endif /* CONFIG_VIRTUAL_MEM_MAP */ -int __init -register_active_ranges(u64 start, u64 len, int nid) +int __init register_active_ranges(u64 start, u64 len, int nid) { u64 end = start + len; @@ -567,7 +564,7 @@ register_active_ranges(u64 start, u64 len, int nid) } static int __init -count_reserved_pages (u64 start, u64 end, void *arg) +count_reserved_pages(u64 start, u64 end, void *arg) { unsigned long num_reserved = 0; unsigned long *count = arg; @@ -580,7 +577,7 @@ count_reserved_pages (u64 start, u64 end, void *arg) } int -find_max_min_low_pfn (unsigned long start, unsigned long end, void *arg) +find_max_min_low_pfn (u64 start, u64 end, void *arg) { unsigned long pfn_start, pfn_end; #ifdef CONFIG_FLATMEM diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index b9f3d7bbb33..f426dc78d95 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -34,7 +34,7 @@ #include static struct { - unsigned long mask; /* mask of supported purge page-sizes */ + u64 mask; /* mask of supported purge page-sizes */ unsigned long max_bits; /* log2 of largest supported purge page-size */ } purge; @@ -328,7 +328,7 @@ void __devinit ia64_tlb_init (void) { ia64_ptce_info_t uninitialized_var(ptce_info); /* GCC be quiet */ - unsigned long tr_pgbits; + u64 tr_pgbits; long status; pal_vm_info_1_u_t vm_info_1; pal_vm_info_2_u_t vm_info_2; diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 61f1af5c23c..e643373e470 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -163,7 +163,7 @@ add_io_space (struct pci_root_info *info, struct acpi_resource_address64 *addr) { struct resource *resource; char *name; - u64 base, min, max, base_port; + unsigned long base, min, max, base_port; unsigned int sparse = 0, space_nr, len; resource = kzalloc(sizeof(*resource), GFP_KERNEL); @@ -292,7 +292,7 @@ static __devinit acpi_status add_window(struct acpi_resource *res, void *data) window->offset = offset; if (insert_resource(root, &window->resource)) { - printk(KERN_ERR 
"alloc 0x%lx-0x%lx from %s for %s failed\n", + printk(KERN_ERR "alloc 0x%llx-0x%llx from %s for %s failed\n", window->resource.start, window->resource.end, root->name, info->name); } @@ -314,8 +314,8 @@ pcibios_setup_root_windows(struct pci_bus *bus, struct pci_controller *ctrl) (res->end - res->start < 16)) continue; if (j >= PCI_BUS_NUM_RESOURCES) { - printk("Ignoring range [%lx-%lx] (%lx)\n", res->start, - res->end, res->flags); + printk("Ignoring range [%#llx-%#llx] (%lx)\n", + res->start, res->end, res->flags); continue; } bus->resource[j++] = res; @@ -728,8 +728,8 @@ extern u8 pci_cache_line_size; */ static void __init set_pci_cacheline_size(void) { - u64 levels, unique_caches; - s64 status; + unsigned long levels, unique_caches; + long status; pal_cache_config_info_t cci; status = ia64_pal_cache_summary(&levels, &unique_caches); diff --git a/arch/ia64/sn/kernel/io_acpi_init.c b/arch/ia64/sn/kernel/io_acpi_init.c index d0223abbbbd..fd50ff94302 100644 --- a/arch/ia64/sn/kernel/io_acpi_init.c +++ b/arch/ia64/sn/kernel/io_acpi_init.c @@ -40,7 +40,7 @@ struct sn_pcidev_match { /* * Perform the early IO init in PROM. */ -static s64 +static long sal_ioif_init(u64 *result) { struct ia64_sal_retval isrv = {0,0,0,0}; @@ -492,7 +492,7 @@ void __init sn_io_acpi_init(void) { u64 result; - s64 status; + long status; /* SN Altix does not follow the IOSAPIC IRQ routing model */ acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c index 57f280dd9de..76645cf6ac5 100644 --- a/arch/ia64/sn/kernel/io_common.c +++ b/arch/ia64/sn/kernel/io_common.c @@ -342,7 +342,7 @@ sn_common_bus_fixup(struct pci_bus *bus, struct pcibus_bussoft *b = SN_PCIBUS_BUSSOFT(bus); printk(KERN_WARNING "Device ASIC=%u XID=%u PBUSNUM=%u " - "L_IO=%lx L_MEM=%lx BASE=%lx\n", + "L_IO=%llx L_MEM=%llx BASE=%llx\n", b->bs_asic_type, b->bs_xid, b->bs_persist_busnum, b->bs_legacy_io, b->bs_legacy_mem, b->bs_base); printk(KERN_WARNING "on node %d but only %d nodes online." 
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index 9e6491cf72b..4c7e7479095 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -414,7 +414,7 @@ static int sn_topology_show(struct seq_file *s, void *d) } seq_printf(s, "partition %u %s local " "shubtype %s, " - "nasid_mask 0x%016lx, " + "nasid_mask 0x%016llx, " "nasid_bits %d:%d, " "system_size %d, " "sharing_size %d, " @@ -683,7 +683,7 @@ static int sn_hwperf_map_err(int hwperf_err) * ioctl for "sn_hwperf" misc device */ static int -sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg) +sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, unsigned long arg) { struct sn_hwperf_ioctl_args a; struct cpuinfo_ia64 *cdata; diff --git a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c index 2526e5c783a..c76d8dc3aea 100644 --- a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c +++ b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c @@ -36,7 +36,7 @@ static int system_serial_number_open(struct inode *inode, struct file *file) static int licenseID_show(struct seq_file *s, void *p) { - seq_printf(s, "0x%lx\n", sn_partition_serial_number_val()); + seq_printf(s, "0x%llx\n", sn_partition_serial_number_val()); return 0; } diff --git a/arch/ia64/sn/kernel/tiocx.c b/arch/ia64/sn/kernel/tiocx.c index 3f864238566..c1bd1cfda32 100644 --- a/arch/ia64/sn/kernel/tiocx.c +++ b/arch/ia64/sn/kernel/tiocx.c @@ -368,7 +368,7 @@ static void tio_corelet_reset(nasid_t nasid, int corelet) static int is_fpga_tio(int nasid, int *bt) { u16 uninitialized_var(ioboard_type); /* GCC be quiet */ - s64 rc; + long rc; rc = ia64_sn_sysctl_ioboard_get(nasid, &ioboard_type); if (rc) { diff --git a/arch/ia64/sn/pci/pcibr/pcibr_provider.c b/arch/ia64/sn/pci/pcibr/pcibr_provider.c index 2c676cc0541..d13e5a22a55 100644 --- a/arch/ia64/sn/pci/pcibr/pcibr_provider.c +++ b/arch/ia64/sn/pci/pcibr/pcibr_provider.c @@ -79,7 +79,7 @@ static int sal_pcibr_error_interrupt(struct pcibus_info *soft) u16 sn_ioboard_to_pci_bus(struct pci_bus *pci_bus) { - s64 rc; + long rc; u16 uninitialized_var(ioboard); /* GCC be quiet */ nasid_t nasid = NASID_GET(SN_PCIBUS_BUSSOFT(pci_bus)->bs_base); diff --git a/arch/ia64/sn/pci/tioca_provider.c b/arch/ia64/sn/pci/tioca_provider.c index 79165122501..35b2a27d2e7 100644 --- a/arch/ia64/sn/pci/tioca_provider.c +++ b/arch/ia64/sn/pci/tioca_provider.c @@ -123,7 +123,7 @@ tioca_gart_init(struct tioca_kernel *tioca_kern) if (!tmp) { printk(KERN_ERR "%s: Could not allocate " - "%lu bytes (order %d) for GART\n", + "%llu bytes (order %d) for GART\n", __func__, tioca_kern->ca_gart_size, get_order(tioca_kern->ca_gart_size)); @@ -348,7 +348,7 @@ tioca_dma_d48(struct pci_dev *pdev, u64 paddr) agp_dma_extn = __sn_readq_relaxed(&ca_base->ca_agp_dma_addr_extn); if (node_upper != (agp_dma_extn >> CA_AGP_DMA_NODE_ID_SHFT)) { printk(KERN_ERR "%s: coretalk upper node (%u) " - "mismatch with ca_agp_dma_addr_extn (%lu)\n", + "mismatch with ca_agp_dma_addr_extn (%llu)\n", __func__, node_upper, (agp_dma_extn >> CA_AGP_DMA_NODE_ID_SHFT)); return 0; @@ -367,7 +367,7 @@ tioca_dma_d48(struct pci_dev *pdev, u64 paddr) * dma_addr_t is guaranteed to be contiguous in CA bus space. 
*/ static dma_addr_t -tioca_dma_mapped(struct pci_dev *pdev, u64 paddr, size_t req_size) +tioca_dma_mapped(struct pci_dev *pdev, unsigned long paddr, size_t req_size) { int i, ps, ps_shift, entry, entries, mapsize, last_entry; u64 xio_addr, end_xio_addr; diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c index 94e584527f4..012f3b82ee5 100644 --- a/arch/ia64/sn/pci/tioce_provider.c +++ b/arch/ia64/sn/pci/tioce_provider.c @@ -493,7 +493,7 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir) if (&map->ce_dmamap_list == &ce_kern->ce_dmamap_list) { printk(KERN_WARNING - "%s: %s - no map found for bus_addr 0x%lx\n", + "%s: %s - no map found for bus_addr 0x%llx\n", __func__, pci_name(pdev), bus_addr); } else if (--map->refcnt == 0) { for (i = 0; i < map->ate_count; i++) { @@ -642,7 +642,7 @@ dma_map_done: * in the address. */ static u64 -tioce_dma(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags) +tioce_dma(struct pci_dev *pdev, unsigned long paddr, size_t byte_count, int dma_flags) { return tioce_do_dma_map(pdev, paddr, byte_count, 0, dma_flags); } @@ -657,7 +657,7 @@ tioce_dma(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags) * in the address. */ static u64 -tioce_dma_consistent(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags) +tioce_dma_consistent(struct pci_dev *pdev, unsigned long paddr, size_t byte_count, int dma_flags) { return tioce_do_dma_map(pdev, paddr, byte_count, 1, dma_flags); } diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c index 183ac3fe44f..9c7e2343c39 100644 --- a/drivers/char/agp/hp-agp.c +++ b/drivers/char/agp/hp-agp.c @@ -518,8 +518,9 @@ zx1_gart_probe (acpi_handle obj, u32 depth, void *context, void **ret) if (hp_zx1_setup(sba_hpa + HP_ZX1_IOC_OFFSET, lba_hpa)) return AE_OK; - printk(KERN_INFO PFX "Detected HP ZX1 %s AGP chipset (ioc=%lx, lba=%lx)\n", - (char *) context, sba_hpa + HP_ZX1_IOC_OFFSET, lba_hpa); + printk(KERN_INFO PFX "Detected HP ZX1 %s AGP chipset " + "(ioc=%llx, lba=%llx)\n", (char *)context, + sba_hpa + HP_ZX1_IOC_OFFSET, lba_hpa); hp_zx1_gart_found = 1; return AE_CTRL_TERMINATE; /* we only support one bridge; quit looking */ diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c index 58e9f8e457f..51e0e2d8fac 100644 --- a/drivers/firmware/pcdp.c +++ b/drivers/firmware/pcdp.c @@ -28,10 +28,10 @@ setup_serial_console(struct pcdp_uart *uart) char parity; mmio = (uart->addr.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY); - p += sprintf(p, "uart8250,%s,0x%lx", + p += sprintf(p, "uart8250,%s,0x%llx", mmio ? "mmio" : "io", uart->addr.address); if (uart->baud) { - p += sprintf(p, ",%lu", uart->baud); + p += sprintf(p, ",%llu", uart->baud); if (uart->bits) { switch (uart->parity) { case 0x2: parity = 'e'; break; diff --git a/include/linux/efi.h b/include/linux/efi.h index bb66feb164b..ce4581fbc08 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -101,7 +101,7 @@ typedef struct { u64 attribute; } efi_memory_desc_t; -typedef int (*efi_freemem_callback_t) (unsigned long start, unsigned long end, void *arg); +typedef int (*efi_freemem_callback_t) (u64 start, u64 end, void *arg); /* * Types and defines for Time Services -- cgit v1.2.3-70-g09d2 From 14fa56917d73d823538151b0429d98211fa439c1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 21 May 2009 23:17:06 +0200 Subject: mfd: add U300 AB3100 core support This adds a core driver for the AB3100 mixed-signal circuit found in the ST-Ericsson U300 series platforms. 
This driver is a singleton proxy for all accesses to the AB3100; sub-drivers which will be merged on top of this one cover RTC, regulators, battery and system power control, vibrator, LEDs, and an ALSA codec. Signed-off-by: Linus Walleij Reviewed-by: Mike Rapoport Reviewed-by: Ben Dooks Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 14 + drivers/mfd/Makefile | 3 +- drivers/mfd/ab3100-core.c | 991 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/ab3100.h | 103 +++++ 4 files changed, 1110 insertions(+), 1 deletion(-) create mode 100644 drivers/mfd/ab3100-core.c create mode 100644 include/linux/mfd/ab3100.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index ee3927ab11e..61f0346650b 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -241,6 +241,20 @@ config PCF50633_GPIO Say yes here if you want to include support GPIO for pins on the PCF50633 chip. +config AB3100_CORE + tristate "ST-Ericsson AB3100 Mixed Signal Circuit core functions" + depends on I2C + default y if ARCH_U300 + help + Select this to enable the AB3100 Mixed Signal IC core + functionality. This connects to an AB3100 on the I2C bus + and exposes a number of symbols needed for dependent devices + to read and write registers and subscribe to events from + this multi-functional IC. This is needed to use other features + of the AB3100 such as battery-backed RTC, charging control, + LEDs, vibrator, system power and temperature, power management + and ALSA sound. + endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 3afb5192e4d..f5f337143e6 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -40,4 +40,5 @@ obj-$(CONFIG_PMIC_DA903X) += da903x.o obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o -obj-$(CONFIG_PCF50633_GPIO) += pcf50633-gpio.o \ No newline at end of file +obj-$(CONFIG_PCF50633_GPIO) += pcf50633-gpio.o +obj-$(CONFIG_AB3100_CORE) += ab3100-core.o diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c new file mode 100644 index 00000000000..13e7d7bfe85 --- /dev/null +++ b/drivers/mfd/ab3100-core.c @@ -0,0 +1,991 @@ +/* + * Copyright (C) 2007-2009 ST-Ericsson + * License terms: GNU General Public License (GPL) version 2 + * Low-level core for exclusive access to the AB3100 IC on the I2C bus + * and some basic chip-configuration. + * Author: Linus Walleij + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* These are the only registers inside AB3100 used in this main file */ + +/* Interrupt event registers */ +#define AB3100_EVENTA1 0x21 +#define AB3100_EVENTA2 0x22 +#define AB3100_EVENTA3 0x23 + +/* AB3100 DAC converter registers */ +#define AB3100_DIS 0x00 +#define AB3100_D0C 0x01 +#define AB3100_D1C 0x02 +#define AB3100_D2C 0x03 +#define AB3100_D3C 0x04 + +/* Chip ID register */ +#define AB3100_CID 0x20 + +/* AB3100 interrupt registers */ +#define AB3100_IMRA1 0x24 +#define AB3100_IMRA2 0x25 +#define AB3100_IMRA3 0x26 +#define AB3100_IMRB1 0x2B +#define AB3100_IMRB2 0x2C +#define AB3100_IMRB3 0x2D + +/* System Power Monitoring and control registers */ +#define AB3100_MCA 0x2E +#define AB3100_MCB 0x2F + +/* SIM power up */ +#define AB3100_SUP 0x50 + +/* + * I2C communication + * + * The AB3100 is usually assigned address 0x48 (7-bit) + * The chip is defined in the platform i2c_board_data section.
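As an aside to the comment above, the same access pattern can be sketched from userspace through i2c-dev, where separate write() and read() system calls each form a complete I2C transaction and therefore put a stop on the bus between the register-address write and the data read, just as the driver below achieves by calling i2c_master_send() and i2c_master_recv() back to back. The bus number and the masks are invented; 0x48 and the register numbers are taken from the comments above. A hypothetical sketch, not something to run against real hardware unchecked:

#include <fcntl.h>
#include <linux/i2c-dev.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	unsigned char cid = 0x20;	/* AB3100_CID, the chip ID register */
	unsigned char buf[2];
	int fd = open("/dev/i2c-0", O_RDWR);

	if (fd < 0 || ioctl(fd, I2C_SLAVE, 0x48) < 0)
		return 1;

	/* get_register: write the register number, then read one byte;
	 * each call is its own transaction, so a stop sits in between */
	if (write(fd, &cid, 1) != 1 || read(fd, &buf[1], 1) != 1)
		return 1;
	printf("chip id: 0x%02x\n", buf[1]);

	/* mask_and_set_register, here on AB3100_IMRA1 (0x24): read the
	 * current value, clear bits with an AND mask, set bits with an
	 * OR mask, write it back (masks invented for the example) */
	buf[0] = 0x24;
	if (write(fd, &buf[0], 1) != 1 || read(fd, &buf[1], 1) != 1)
		return 1;
	buf[1] = (buf[1] & 0xf0) | 0x01;
	if (write(fd, buf, 2) != 2)
		return 1;

	close(fd);
	return 0;
}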
+ */ +static unsigned short normal_i2c[] = { 0x48, I2C_CLIENT_END }; +I2C_CLIENT_INSMOD_1(ab3100); + +u8 ab3100_get_chip_type(struct ab3100 *ab3100) +{ + u8 chip = ABUNKNOWN; + + switch (ab3100->chip_id & 0xf0) { + case 0xa0: + chip = AB3000; + break; + case 0xc0: + chip = AB3100; + break; + } + return chip; +} +EXPORT_SYMBOL(ab3100_get_chip_type); + +int ab3100_set_register(struct ab3100 *ab3100, u8 reg, u8 regval) +{ + u8 regandval[2] = {reg, regval}; + int err; + + err = mutex_lock_interruptible(&ab3100->access_mutex); + if (err) + return err; + + /* + * A two-byte write message with the first byte containing the register + * number and the second byte containing the value to be written + * effectively sets a register in the AB3100. + */ + err = i2c_master_send(ab3100->i2c_client, regandval, 2); + if (err < 0) { + dev_err(ab3100->dev, + "write error (write register): %d\n", + err); + } else if (err != 2) { + dev_err(ab3100->dev, + "write error (write register) " + "%d bytes transferred (expected 2)\n", + err); + err = -EIO; + } else { + /* All is well */ + err = 0; + } + mutex_unlock(&ab3100->access_mutex); + return 0; +} +EXPORT_SYMBOL(ab3100_set_register); + +/* + * The test registers exist at an I2C bus address up one + * from the ordinary base. They are not supposed to be used + * in production code, but sometimes you have to do that + * anyway. It's currently only used from this file so declare + * it static and do not export. + */ +static int ab3100_set_test_register(struct ab3100 *ab3100, + u8 reg, u8 regval) +{ + u8 regandval[2] = {reg, regval}; + int err; + + err = mutex_lock_interruptible(&ab3100->access_mutex); + if (err) + return err; + + err = i2c_master_send(ab3100->testreg_client, regandval, 2); + if (err < 0) { + dev_err(ab3100->dev, + "write error (write test register): %d\n", + err); + } else if (err != 2) { + dev_err(ab3100->dev, + "write error (write test register) " + "%d bytes transferred (expected 2)\n", + err); + err = -EIO; + } else { + /* All is well */ + err = 0; + } + mutex_unlock(&ab3100->access_mutex); + + return err; +} + +int ab3100_get_register(struct ab3100 *ab3100, u8 reg, u8 *regval) +{ + int err; + + err = mutex_lock_interruptible(&ab3100->access_mutex); + if (err) + return err; + + /* + * The AB3100 requires an I2C "stop" command between each message, else + * it will not work. The only way of achieving this with the + * message transport layer is to send the read and write messages + * separately.
+ */ + err = i2c_master_send(ab3100->i2c_client, &reg, 1); + if (err < 0) { + dev_err(ab3100->dev, + "write error (send register address): %d\n", + err); + goto get_reg_out_unlock; + } else if (err != 1) { + dev_err(ab3100->dev, + "write error (send register address) " + "%d bytes transferred (expected 1)\n", + err); + err = -EIO; + goto get_reg_out_unlock; + } else { + /* All is well */ + err = 0; + } + + err = i2c_master_recv(ab3100->i2c_client, regval, 1); + if (err < 0) { + dev_err(ab3100->dev, + "write error (read register): %d\n", + err); + goto get_reg_out_unlock; + } else if (err != 1) { + dev_err(ab3100->dev, + "write error (read register) " + "%d bytes transferred (expected 1)\n", + err); + err = -EIO; + goto get_reg_out_unlock; + } else { + /* All is well */ + err = 0; + } + + get_reg_out_unlock: + mutex_unlock(&ab3100->access_mutex); + return err; +} +EXPORT_SYMBOL(ab3100_get_register); + +int ab3100_get_register_page(struct ab3100 *ab3100, + u8 first_reg, u8 *regvals, u8 numregs) +{ + int err; + + if (ab3100->chip_id == 0xa0 || + ab3100->chip_id == 0xa1) + /* These don't support paged reads */ + return -EIO; + + err = mutex_lock_interruptible(&ab3100->access_mutex); + if (err) + return err; + + /* + * Paged reads also require an I2C "stop" command. + */ + err = i2c_master_send(ab3100->i2c_client, &first_reg, 1); + if (err < 0) { + dev_err(ab3100->dev, + "write error (send first register address): %d\n", + err); + goto get_reg_page_out_unlock; + } else if (err != 1) { + dev_err(ab3100->dev, + "write error (send first register address) " + "%d bytes transferred (expected 1)\n", + err); + err = -EIO; + goto get_reg_page_out_unlock; + } + + err = i2c_master_recv(ab3100->i2c_client, regvals, numregs); + if (err < 0) { + dev_err(ab3100->dev, + "write error (read register page): %d\n", + err); + goto get_reg_page_out_unlock; + } else if (err != numregs) { + dev_err(ab3100->dev, + "write error (read register page) " + "%d bytes transferred (expected %d)\n", + err, numregs); + err = -EIO; + goto get_reg_page_out_unlock; + } + + /* All is well */ + err = 0; + + get_reg_page_out_unlock: + mutex_unlock(&ab3100->access_mutex); + return err; +} +EXPORT_SYMBOL(ab3100_get_register_page); + +int ab3100_mask_and_set_register(struct ab3100 *ab3100, + u8 reg, u8 andmask, u8 ormask) +{ + u8 regandval[2] = {reg, 0}; + int err; + + err = mutex_lock_interruptible(&ab3100->access_mutex); + if (err) + return err; + + /* First read out the target register */ + err = i2c_master_send(ab3100->i2c_client, &reg, 1); + if (err < 0) { + dev_err(ab3100->dev, + "write error (maskset send address): %d\n", + err); + goto get_maskset_unlock; + } else if (err != 1) { + dev_err(ab3100->dev, + "write error (maskset send address) " + "%d bytes transferred (expected 1)\n", + err); + err = -EIO; + goto get_maskset_unlock; + } + + err = i2c_master_recv(ab3100->i2c_client, &regandval[1], 1); + if (err < 0) { + dev_err(ab3100->dev, + "write error (maskset read register): %d\n", + err); + goto get_maskset_unlock; + } else if (err != 1) { + dev_err(ab3100->dev, + "write error (maskset read register) " + "%d bytes transferred (expected 1)\n", + err); + err = -EIO; + goto get_maskset_unlock; + } + + /* Modify the register */ + regandval[1] &= andmask; + regandval[1] |= ormask; + + /* Write the register */ + err = i2c_master_send(ab3100->i2c_client, regandval, 2); + if (err < 0) { + dev_err(ab3100->dev, + "write error (write register): %d\n", + err); + goto get_maskset_unlock; + } else if (err != 2) { + dev_err(ab3100->dev, + "write error
(write register) " + "%d bytes transferred (expected 2)\n", + err); + err = -EIO; + goto get_maskset_unlock; + } + + /* All is well */ + err = 0; + + get_maskset_unlock: + mutex_unlock(&ab3100->access_mutex); + return err; +} +EXPORT_SYMBOL(ab3100_mask_and_set_register); + +/* + * Register a simple callback for handling any AB3100 events. + */ +int ab3100_event_register(struct ab3100 *ab3100, + struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&ab3100->event_subscribers, + nb); +} +EXPORT_SYMBOL(ab3100_event_register); + +/* + * Remove a previously registered callback. + */ +int ab3100_event_unregister(struct ab3100 *ab3100, + struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&ab3100->event_subscribers, + nb); +} +EXPORT_SYMBOL(ab3100_event_unregister); + + +int ab3100_event_registers_startup_state_get(struct ab3100 *ab3100, + u32 *fatevent) +{ + if (!ab3100->startup_events_read) + return -EAGAIN; /* Try again later */ + *fatevent = ab3100->startup_events; + return 0; +} +EXPORT_SYMBOL(ab3100_event_registers_startup_state_get); + +/* Interrupt handling worker */ +static void ab3100_work(struct work_struct *work) +{ + struct ab3100 *ab3100 = container_of(work, struct ab3100, work); + u8 event_regs[3]; + u32 fatevent; + int err; + + err = ab3100_get_register_page(ab3100, AB3100_EVENTA1, + event_regs, 3); + if (err) + goto err_event_wq; + + fatevent = (event_regs[0] << 16) | + (event_regs[1] << 8) | + event_regs[2]; + + if (!ab3100->startup_events_read) { + ab3100->startup_events = fatevent; + ab3100->startup_events_read = true; + } + /* + * The notified parties will have to mask out the events + * they're interested in and react to them. They will be + * notified on all events, then they use the fatevent value + * to determine if they're interested. + */ + blocking_notifier_call_chain(&ab3100->event_subscribers, + fatevent, NULL); + + dev_dbg(ab3100->dev, + "IRQ Event: 0x%08x\n", fatevent); + + /* By now the IRQ should be acked and deasserted so enable it again */ + enable_irq(ab3100->i2c_client->irq); + return; + + err_event_wq: + dev_dbg(ab3100->dev, + "error in event workqueue\n"); + /* Enable the IRQ anyway, what choice do we have? */ + enable_irq(ab3100->i2c_client->irq); + return; +} + +static irqreturn_t ab3100_irq_handler(int irq, void *data) +{ + struct ab3100 *ab3100 = data; + /* + * Disable the IRQ and dispatch a worker to handle the + * event. Since the chip resides on I2C this is slow + * stuff and we will re-enable the interrupts once th + * worker has finished. 
+ */ + disable_irq(ab3100->i2c_client->irq); + schedule_work(&ab3100->work); + return IRQ_HANDLED; +} + +#ifdef CONFIG_DEBUG_FS +/* + * Some debugfs entries are only exposed if debugging is enabled + */ +static int ab3100_registers_print(struct seq_file *s, void *p) +{ + struct ab3100 *ab3100 = s->private; + u8 value; + u8 reg; + + seq_printf(s, "AB3100 registers:\n"); + + /* reg is a u8: stop before 0xff so the counter cannot wrap */ + for (reg = 0; reg < 0xff; reg++) { + ab3100_get_register(ab3100, reg, &value); + seq_printf(s, "[0x%x]: 0x%x\n", reg, value); + } + return 0; +} + +static int ab3100_registers_open(struct inode *inode, struct file *file) +{ + return single_open(file, ab3100_registers_print, inode->i_private); +} + +static const struct file_operations ab3100_registers_fops = { + .open = ab3100_registers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +struct ab3100_get_set_reg_priv { + struct ab3100 *ab3100; + bool mode; +}; + +static int ab3100_get_set_reg_open_file(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static int ab3100_get_set_reg(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ab3100_get_set_reg_priv *priv = file->private_data; + struct ab3100 *ab3100 = priv->ab3100; + char buf[32]; + int buf_size; + int regp; + unsigned long user_reg; + int err; + int i = 0; + + /* Get userspace string and ensure termination */ + buf_size = min(count, (sizeof(buf)-1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + buf[buf_size] = 0; + + /* + * The idea here is to parse a string which is either + * "0xnn" for reading a register, or "0xaa 0xbb" for + * writing 0xbb to the register 0xaa. First move past + * whitespace and then begin to parse the register. + */ + while ((i < buf_size) && (buf[i] == ' ')) + i++; + regp = i; + + /* + * Advance pointer to end of string then terminate + * the register string. This is needed to satisfy + * the strict_strtoul() function. + */ + while ((i < buf_size) && (buf[i] != ' ')) + i++; + buf[i] = '\0'; + + err = strict_strtoul(&buf[regp], 16, &user_reg); + if (err) + return err; + if (user_reg > 0xff) + return -EINVAL; + + /* Either we read or we write a register here */ + if (!priv->mode) { + /* Reading */ + u8 reg = (u8) user_reg; + u8 regvalue; + + ab3100_get_register(ab3100, reg, &regvalue); + + dev_info(ab3100->dev, + "debug read AB3100 reg[0x%02x]: 0x%02x\n", + reg, regvalue); + } else { + int valp; + unsigned long user_value; + u8 reg = (u8) user_reg; + u8 value; + u8 regvalue; + + /* + * Writing, we need some value to write to + * the register so keep parsing the string + * from userspace.
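 + *
 + * Usage sketch, assuming debugfs is mounted at /sys/kernel/debug:
 + *
 + *	echo '0x20' > /sys/kernel/debug/ab3100/get_reg
 + *	echo '0x25 0xff' > /sys/kernel/debug/ab3100/set_reg
 + *
 + * The first command dumps the chip ID register (0x20) to the kernel
 + * log, the second writes 0xff to IMRA2 (0x25) and logs the readback.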
+ */ + i++; + while ((i < buf_size) && (buf[i] == ' ')) + i++; + valp = i; + while ((i < buf_size) && (buf[i] != ' ')) + i++; + buf[i] = '\0'; + + err = strict_strtoul(&buf[valp], 16, &user_value); + if (err) + return err; + if (user_value > 0xff) + return -EINVAL; + + value = (u8) user_value; + ab3100_set_register(ab3100, reg, value); + ab3100_get_register(ab3100, reg, &regvalue); + + dev_info(ab3100->dev, + "debug write reg[0x%02x] with 0x%02x, " + "after readback: 0x%02x\n", + reg, value, regvalue); + } + return buf_size; +} + +static const struct file_operations ab3100_get_set_reg_fops = { + .open = ab3100_get_set_reg_open_file, + .write = ab3100_get_set_reg, +}; + +static struct dentry *ab3100_dir; +static struct dentry *ab3100_reg_file; +static struct ab3100_get_set_reg_priv ab3100_get_priv; +static struct dentry *ab3100_get_reg_file; +static struct ab3100_get_set_reg_priv ab3100_set_priv; +static struct dentry *ab3100_set_reg_file; + +static void ab3100_setup_debugfs(struct ab3100 *ab3100) +{ + int err; + + ab3100_dir = debugfs_create_dir("ab3100", NULL); + if (!ab3100_dir) + goto exit_no_debugfs; + + ab3100_reg_file = debugfs_create_file("registers", + S_IRUGO, ab3100_dir, ab3100, + &ab3100_registers_fops); + if (!ab3100_reg_file) { + err = -ENOMEM; + goto exit_destroy_dir; + } + + ab3100_get_priv.ab3100 = ab3100; + ab3100_get_priv.mode = false; + ab3100_get_reg_file = debugfs_create_file("get_reg", + S_IWUGO, ab3100_dir, &ab3100_get_priv, + &ab3100_get_set_reg_fops); + if (!ab3100_get_reg_file) { + err = -ENOMEM; + goto exit_destroy_reg; + } + + ab3100_set_priv.ab3100 = ab3100; + ab3100_set_priv.mode = true; + ab3100_set_reg_file = debugfs_create_file("set_reg", + S_IWUGO, ab3100_dir, &ab3100_set_priv, + &ab3100_get_set_reg_fops); + if (!ab3100_set_reg_file) { + err = -ENOMEM; + goto exit_destroy_get_reg; + } + return; + + exit_destroy_get_reg: + debugfs_remove(ab3100_get_reg_file); + exit_destroy_reg: + debugfs_remove(ab3100_reg_file); + exit_destroy_dir: + debugfs_remove(ab3100_dir); + exit_no_debugfs: + return; +} +static inline void ab3100_remove_debugfs(void) +{ + debugfs_remove(ab3100_set_reg_file); + debugfs_remove(ab3100_get_reg_file); + debugfs_remove(ab3100_reg_file); + debugfs_remove(ab3100_dir); +} +#else +static inline void ab3100_setup_debugfs(struct ab3100 *ab3100) +{ +} +static inline void ab3100_remove_debugfs(void) +{ +} +#endif + +/* + * Basic set-up, data structure creation/destruction and I2C interface. + * This sets up a default config in the AB3100 chip so that it + * will work as expected.
+ */ + +struct ab3100_init_setting { + u8 abreg; + u8 setting; +}; + +static const struct ab3100_init_setting __initdata +ab3100_init_settings[] = { + { + .abreg = AB3100_MCA, + .setting = 0x01 + }, { + .abreg = AB3100_MCB, + .setting = 0x30 + }, { + .abreg = AB3100_IMRA1, + .setting = 0x00 + }, { + .abreg = AB3100_IMRA2, + .setting = 0xFF + }, { + .abreg = AB3100_IMRA3, + .setting = 0x01 + }, { + .abreg = AB3100_IMRB1, + .setting = 0xFF + }, { + .abreg = AB3100_IMRB2, + .setting = 0xFF + }, { + .abreg = AB3100_IMRB3, + .setting = 0xFF + }, { + .abreg = AB3100_SUP, + .setting = 0x00 + }, { + .abreg = AB3100_DIS, + .setting = 0xF0 + }, { + .abreg = AB3100_D0C, + .setting = 0x00 + }, { + .abreg = AB3100_D1C, + .setting = 0x00 + }, { + .abreg = AB3100_D2C, + .setting = 0x00 + }, { + .abreg = AB3100_D3C, + .setting = 0x00 + }, +}; + +static int __init ab3100_setup(struct ab3100 *ab3100) +{ + int err = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(ab3100_init_settings); i++) { + err = ab3100_set_register(ab3100, + ab3100_init_settings[i].abreg, + ab3100_init_settings[i].setting); + if (err) + goto exit_no_setup; + } + + /* + * Special trick to make the AB3100 use the 32kHz clock (RTC): + * bit 3 in test register 0x02 is a special, undocumented test + * register bit that only exists in the AB3100 P1E variant. + */ + if (ab3100->chip_id == 0xc4) { + dev_warn(ab3100->dev, + "AB3100 P1E variant detected, " + "forcing chip to 32KHz\n"); + err = ab3100_set_test_register(ab3100, 0x02, 0x08); + } + + exit_no_setup: + return err; +} + +/* + * Here we define all the platform devices that appear + * as children of the AB3100. They are registered as plain + * platform devices (id -1, no IO resources); the subdrivers + * access the AB3100 registers through the accessor functions + * exported above rather than through resource ranges. + */ + +#define AB3100_DEVICE(devname, devid) \ +static struct platform_device ab3100_##devname##_device = { \ + .name = devid, \ + .id = -1, \ +} + +/* + * This lists all the subdevices.
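 + *
 + * For reference, AB3100_DEVICE(rtc, "ab3100-rtc") expands to:
 + *
 + *	static struct platform_device ab3100_rtc_device = {
 + *		.name = "ab3100-rtc",
 + *		.id = -1,
 + *	};
 + *
 + * i.e. one anonymous platform device per subdriver.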
+ */ +AB3100_DEVICE(dac, "ab3100-dac"); +AB3100_DEVICE(leds, "ab3100-leds"); +AB3100_DEVICE(power, "ab3100-power"); +AB3100_DEVICE(regulators, "ab3100-regulators"); +AB3100_DEVICE(sim, "ab3100-sim"); +AB3100_DEVICE(uart, "ab3100-uart"); +AB3100_DEVICE(rtc, "ab3100-rtc"); +AB3100_DEVICE(charger, "ab3100-charger"); +AB3100_DEVICE(boost, "ab3100-boost"); +AB3100_DEVICE(adc, "ab3100-adc"); +AB3100_DEVICE(fuelgauge, "ab3100-fuelgauge"); +AB3100_DEVICE(vibrator, "ab3100-vibrator"); +AB3100_DEVICE(otp, "ab3100-otp"); +AB3100_DEVICE(codec, "ab3100-codec"); + +static struct platform_device * +ab3100_platform_devs[] = { + &ab3100_dac_device, + &ab3100_leds_device, + &ab3100_power_device, + &ab3100_regulators_device, + &ab3100_sim_device, + &ab3100_uart_device, + &ab3100_rtc_device, + &ab3100_charger_device, + &ab3100_boost_device, + &ab3100_adc_device, + &ab3100_fuelgauge_device, + &ab3100_vibrator_device, + &ab3100_otp_device, + &ab3100_codec_device, +}; + +struct ab_family_id { + u8 id; + char *name; +}; + +static const struct ab_family_id ids[] __initdata = { + /* AB3100 */ + { + .id = 0xc0, + .name = "P1A" + }, { + .id = 0xc1, + .name = "P1B" + }, { + .id = 0xc2, + .name = "P1C" + }, { + .id = 0xc3, + .name = "P1D" + }, { + .id = 0xc4, + .name = "P1E" + }, { + .id = 0xc5, + .name = "P1F/R1A" + }, { + .id = 0xc6, + .name = "P1G/R1A" + }, { + .id = 0xc7, + .name = "P2A/R2A" + }, { + .id = 0xc8, + .name = "P2B/R2B" + }, + /* AB3000 variants, not supported */ + { + .id = 0xa0 + }, { + .id = 0xa1 + }, { + .id = 0xa2 + }, { + .id = 0xa3 + }, { + .id = 0xa4 + }, { + .id = 0xa5 + }, { + .id = 0xa6 + }, { + .id = 0xa7 + }, + /* Terminator */ + { + .id = 0x00, + }, +}; + +static int __init ab3100_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct ab3100 *ab3100; + int err; + int i; + + ab3100 = kzalloc(sizeof(struct ab3100), GFP_KERNEL); + if (!ab3100) { + dev_err(&client->dev, "could not allocate AB3100 device\n"); + return -ENOMEM; + } + + /* Initialize data structure */ + mutex_init(&ab3100->access_mutex); + BLOCKING_INIT_NOTIFIER_HEAD(&ab3100->event_subscribers); + + ab3100->i2c_client = client; + ab3100->dev = &ab3100->i2c_client->dev; + + i2c_set_clientdata(client, ab3100); + + /* Read chip ID register */ + err = ab3100_get_register(ab3100, AB3100_CID, + &ab3100->chip_id); + if (err) { + dev_err(&client->dev, + "could not communicate with the AB3100 analog " + "baseband chip\n"); + goto exit_no_detect; + } + + for (i = 0; ids[i].id != 0x0; i++) { + if (ids[i].id == ab3100->chip_id) { + if (ids[i].name != NULL) { + snprintf(&ab3100->chip_name[0], + sizeof(ab3100->chip_name) - 1, + "AB3100 %s", + ids[i].name); + break; + } else { + dev_err(&client->dev, + "AB3000 is not supported\n"); + goto exit_no_detect; + } + } + } + + if (ids[i].id == 0x0) { + dev_err(&client->dev, "unknown analog baseband chip id: 0x%x\n", + ab3100->chip_id); + dev_err(&client->dev, "accepting it anyway. 
Please update " + "the driver.\n"); + goto exit_no_detect; + } + + dev_info(&client->dev, "Detected chip: %s\n", + &ab3100->chip_name[0]); + + /* Attach a second dummy i2c_client to the test register address */ + ab3100->testreg_client = i2c_new_dummy(client->adapter, + client->addr + 1); + if (!ab3100->testreg_client) { + err = -ENOMEM; + goto exit_no_testreg_client; + } + + strlcpy(ab3100->testreg_client->name, id->name, + sizeof(ab3100->testreg_client->name)); + + err = ab3100_setup(ab3100); + if (err) + goto exit_no_setup; + + INIT_WORK(&ab3100->work, ab3100_work); + + /* This real unpredictable IRQ is of course sampled for entropy */ + err = request_irq(client->irq, ab3100_irq_handler, + IRQF_DISABLED | IRQF_SAMPLE_RANDOM, + "AB3100 IRQ", ab3100); + if (err) + goto exit_no_irq; + + /* Set parent and a pointer back to the container in device data */ + for (i = 0; i < ARRAY_SIZE(ab3100_platform_devs); i++) { + ab3100_platform_devs[i]->dev.parent = + &client->dev; + platform_set_drvdata(ab3100_platform_devs[i], ab3100); + } + + /* Register the platform devices */ + platform_add_devices(ab3100_platform_devs, + ARRAY_SIZE(ab3100_platform_devs)); + + ab3100_setup_debugfs(ab3100); + + return 0; + + exit_no_irq: + exit_no_setup: + i2c_unregister_device(ab3100->testreg_client); + exit_no_testreg_client: + exit_no_detect: + kfree(ab3100); + return err; +} + +static int __exit ab3100_remove(struct i2c_client *client) +{ + struct ab3100 *ab3100 = i2c_get_clientdata(client); + int i; + + /* Unregister subdevices */ + for (i = 0; i < ARRAY_SIZE(ab3100_platform_devs); i++) + platform_device_unregister(ab3100_platform_devs[i]); + + ab3100_remove_debugfs(); + i2c_unregister_device(ab3100->testreg_client); + + /* + * At this point, all subscribers should have unregistered + * their notifiers so deactivate IRQ + */ + free_irq(client->irq, ab3100); + kfree(ab3100); + return 0; +} + +static const struct i2c_device_id ab3100_id[] = { + { "ab3100", ab3100 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, ab3100_id); + +static struct i2c_driver ab3100_driver = { + .driver = { + .name = "ab3100", + .owner = THIS_MODULE, + }, + .id_table = ab3100_id, + .probe = ab3100_probe, + .remove = __exit_p(ab3100_remove), +}; + +static int __init ab3100_i2c_init(void) +{ + return i2c_add_driver(&ab3100_driver); +} + +static void __exit ab3100_i2c_exit(void) +{ + i2c_del_driver(&ab3100_driver); +} + +subsys_initcall(ab3100_i2c_init); +module_exit(ab3100_i2c_exit); + +MODULE_AUTHOR("Linus Walleij "); +MODULE_DESCRIPTION("AB3100 core driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/ab3100.h b/include/linux/mfd/ab3100.h new file mode 100644 index 00000000000..7a3f316e384 --- /dev/null +++ b/include/linux/mfd/ab3100.h @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2007-2009 ST-Ericsson AB + * License terms: GNU General Public License (GPL) version 2 + * AB3100 core access functions + * Author: Linus Walleij + */ + +#include + +#ifndef MFD_AB3100_H +#define MFD_AB3100_H + +#define ABUNKNOWN 0 +#define AB3000 1 +#define AB3100 2 + +/* + * AB3100, EVENTA1, A2 and A3 event register flags + * these are catenated into a single 32-bit flag in the code + * for event notification broadcasts. 
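 + *
 + * A subscriber's notifier callback receives this catenated word as its
 + * event argument and can test the bits directly. A minimal sketch
 + * (my_callback and handle_onswa() are hypothetical names):
 + *
 + *	static int my_callback(struct notifier_block *nb,
 + *			unsigned long event, void *data)
 + *	{
 + *		if (event & AB3100_EVENTA1_ONSWA)
 + *			handle_onswa();
 + *		return NOTIFY_OK;
 + *	}
 + *
 + * The block is registered with ab3100_event_register().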
+ */ +#define AB3100_EVENTA1_ONSWA (0x01<<16) +#define AB3100_EVENTA1_ONSWB (0x02<<16) +#define AB3100_EVENTA1_ONSWC (0x04<<16) +#define AB3100_EVENTA1_DCIO (0x08<<16) +#define AB3100_EVENTA1_OVER_TEMP (0x10<<16) +#define AB3100_EVENTA1_SIM_OFF (0x20<<16) +#define AB3100_EVENTA1_VBUS (0x40<<16) +#define AB3100_EVENTA1_VSET_USB (0x80<<16) + +#define AB3100_EVENTA2_READY_TX (0x01<<8) +#define AB3100_EVENTA2_READY_RX (0x02<<8) +#define AB3100_EVENTA2_OVERRUN_ERROR (0x04<<8) +#define AB3100_EVENTA2_FRAMING_ERROR (0x08<<8) +#define AB3100_EVENTA2_CHARG_OVERCURRENT (0x10<<8) +#define AB3100_EVENTA2_MIDR (0x20<<8) +#define AB3100_EVENTA2_BATTERY_REM (0x40<<8) +#define AB3100_EVENTA2_ALARM (0x80<<8) + +#define AB3100_EVENTA3_ADC_TRIG5 (0x01) +#define AB3100_EVENTA3_ADC_TRIG4 (0x02) +#define AB3100_EVENTA3_ADC_TRIG3 (0x04) +#define AB3100_EVENTA3_ADC_TRIG2 (0x08) +#define AB3100_EVENTA3_ADC_TRIGVBAT (0x10) +#define AB3100_EVENTA3_ADC_TRIGVTX (0x20) +#define AB3100_EVENTA3_ADC_TRIG1 (0x40) +#define AB3100_EVENTA3_ADC_TRIG0 (0x80) + +/* AB3100, STR register flags */ +#define AB3100_STR_ONSWA (0x01) +#define AB3100_STR_ONSWB (0x02) +#define AB3100_STR_ONSWC (0x04) +#define AB3100_STR_DCIO (0x08) +#define AB3100_STR_BOOT_MODE (0x10) +#define AB3100_STR_SIM_OFF (0x20) +#define AB3100_STR_BATT_REMOVAL (0x40) +#define AB3100_STR_VBUS (0x80) + +/** + * struct ab3100 + * @access_mutex: lock out concurrent accesses to the AB3100 registers + * @dev: pointer to the containing device + * @i2c_client: I2C client for this chip + * @testreg_client: secondary client for test registers + * @chip_name: name of this chip variant + * @chip_id: 8 bit chip ID for this chip variant + * @work: an event handling worker + * @event_subscribers: event subscribers are listed here + * @startup_events: a copy of the first reading of the event registers + * @startup_events_read: whether the first events have been read + * + * This struct is PRIVATE and devices using it should NOT + * access ANY fields. It is used as a token for calling the + * AB3100 functions. + */ +struct ab3100 { + struct mutex access_mutex; + struct device *dev; + struct i2c_client *i2c_client; + struct i2c_client *testreg_client; + char chip_name[32]; + u8 chip_id; + struct work_struct work; + struct blocking_notifier_head event_subscribers; + u32 startup_events; + bool startup_events_read; +}; + +int ab3100_set_register(struct ab3100 *ab3100, u8 reg, u8 regval); +int ab3100_get_register(struct ab3100 *ab3100, u8 reg, u8 *regval); +int ab3100_get_register_page(struct ab3100 *ab3100, + u8 first_reg, u8 *regvals, u8 numregs); +int ab3100_mask_and_set_register(struct ab3100 *ab3100, + u8 reg, u8 andmask, u8 ormask); +u8 ab3100_get_chip_type(struct ab3100 *ab3100); +int ab3100_event_register(struct ab3100 *ab3100, + struct notifier_block *nb); +int ab3100_event_unregister(struct ab3100 *ab3100, + struct notifier_block *nb); +int ab3100_event_registers_startup_state_get(struct ab3100 *ab3100, + u32 *fatevent); + +#endif -- cgit v1.2.3-70-g09d2 From 13a09f93d2bf3a20c748e1d6a30160a00fc58169 Mon Sep 17 00:00:00 2001 From: Daniel Ribeiro Date: Thu, 28 May 2009 15:43:37 -0300 Subject: mfd: add PCAP driver The PCAP Asic as present on EZX phones is a multi function device with voltage regulators, ADC, touch screen controller, RTC, USB transceiver, leds controller, and audio codec. 
It has two SPI ports, typically one is connected to the application processor and another to the baseband, this driver provides read/write functions to its registers, irq demultiplexer and ADC queueing/abstraction. This chip is used on a lot of Motorola phones, it was manufactured by TI as a custom product with the name PTWL93017, later this design evolved into the ATLAS PMIC from Freescale (MC13783). Signed-off-by: Daniel Ribeiro Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 7 + drivers/mfd/Makefile | 2 + drivers/mfd/ezx-pcap.c | 505 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/ezx-pcap.h | 256 ++++++++++++++++++++++ 4 files changed, 770 insertions(+) create mode 100644 drivers/mfd/ezx-pcap.c create mode 100644 include/linux/mfd/ezx-pcap.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 61f0346650b..287d47b78e7 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -255,6 +255,13 @@ config AB3100_CORE LEDs, vibrator, system power and temperature, power management and ALSA sound. +config EZX_PCAP + bool "PCAP Support" + depends on GENERIC_HARDIRQS && SPI_MASTER + help + This enables the PCAP ASIC present on EZX Phones. This is + needed for MMC, TouchScreen, Sound, USB, etc.. + endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f5f337143e6..6f8a9a1af20 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -26,6 +26,8 @@ obj-$(CONFIG_TWL4030_CORE) += twl4030-core.o twl4030-irq.o obj-$(CONFIG_MFD_CORE) += mfd-core.o +obj-$(CONFIG_EZX_PCAP) += ezx-pcap.o + obj-$(CONFIG_MCP) += mcp-core.o obj-$(CONFIG_MCP_SA11X0) += mcp-sa11x0.o obj-$(CONFIG_MCP_UCB1200) += ucb1x00-core.o diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c new file mode 100644 index 00000000000..671a7efe86a --- /dev/null +++ b/drivers/mfd/ezx-pcap.c @@ -0,0 +1,505 @@ +/* + * Driver for Motorola PCAP2 as present in EZX phones + * + * Copyright (C) 2006 Harald Welte + * Copyright (C) 2009 Daniel Ribeiro + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#define PCAP_ADC_MAXQ 8 +struct pcap_adc_request { + u8 bank; + u8 ch[2]; + u32 flags; + void (*callback)(void *, u16[]); + void *data; +}; + +struct pcap_adc_sync_request { + u16 res[2]; + struct completion completion; +}; + +struct pcap_chip { + struct spi_device *spi; + + /* IO */ + u32 buf; + struct mutex io_mutex; + + /* IRQ */ + unsigned int irq_base; + u32 msr; + struct work_struct isr_work; + struct work_struct msr_work; + struct workqueue_struct *workqueue; + + /* ADC */ + struct pcap_adc_request *adc_queue[PCAP_ADC_MAXQ]; + u8 adc_head; + u8 adc_tail; + struct mutex adc_mutex; +}; + +/* IO */ +static int ezx_pcap_putget(struct pcap_chip *pcap, u32 *data) +{ + struct spi_transfer t; + struct spi_message m; + int status; + + memset(&t, 0, sizeof t); + spi_message_init(&m); + t.len = sizeof(u32); + spi_message_add_tail(&t, &m); + + pcap->buf = *data; + t.tx_buf = (u8 *) &pcap->buf; + t.rx_buf = (u8 *) &pcap->buf; + status = spi_sync(pcap->spi, &m); + + if (status == 0) + *data = pcap->buf; + + return status; +} + +int ezx_pcap_write(struct pcap_chip *pcap, u8 reg_num, u32 value) +{ + int ret; + + mutex_lock(&pcap->io_mutex); + value &= PCAP_REGISTER_VALUE_MASK; + value |= PCAP_REGISTER_WRITE_OP_BIT + | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); + ret = ezx_pcap_putget(pcap, &value); + mutex_unlock(&pcap->io_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(ezx_pcap_write); + +int ezx_pcap_read(struct pcap_chip *pcap, u8 reg_num, u32 *value) +{ + int ret; + + mutex_lock(&pcap->io_mutex); + *value = PCAP_REGISTER_READ_OP_BIT + | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); + + ret = ezx_pcap_putget(pcap, value); + mutex_unlock(&pcap->io_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(ezx_pcap_read); + +/* IRQ */ +static inline unsigned int irq2pcap(struct pcap_chip *pcap, int irq) +{ + return 1 << (irq - pcap->irq_base); +} + +int pcap_to_irq(struct pcap_chip *pcap, int irq) +{ + return pcap->irq_base + irq; +} +EXPORT_SYMBOL_GPL(pcap_to_irq); + +static void pcap_mask_irq(unsigned int irq) +{ + struct pcap_chip *pcap = get_irq_chip_data(irq); + + pcap->msr |= irq2pcap(pcap, irq); + queue_work(pcap->workqueue, &pcap->msr_work); +} + +static void pcap_unmask_irq(unsigned int irq) +{ + struct pcap_chip *pcap = get_irq_chip_data(irq); + + pcap->msr &= ~irq2pcap(pcap, irq); + queue_work(pcap->workqueue, &pcap->msr_work); +} + +static struct irq_chip pcap_irq_chip = { + .name = "pcap", + .mask = pcap_mask_irq, + .unmask = pcap_unmask_irq, +}; + +static void pcap_msr_work(struct work_struct *work) +{ + struct pcap_chip *pcap = container_of(work, struct pcap_chip, msr_work); + + ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); +} + +static void pcap_isr_work(struct work_struct *work) +{ + struct pcap_chip *pcap = container_of(work, struct pcap_chip, isr_work); + struct pcap_platform_data *pdata = pcap->spi->dev.platform_data; + u32 msr, isr, int_sel, service; + int irq; + + ezx_pcap_read(pcap, PCAP_REG_MSR, &msr); + ezx_pcap_read(pcap, PCAP_REG_ISR, &isr); + + /* We cant service/ack irqs that are assigned to port 2 */ + if (!(pdata->config & PCAP_SECOND_PORT)) { + ezx_pcap_read(pcap, PCAP_REG_INT_SEL, &int_sel); + isr &= ~int_sel; + } + ezx_pcap_write(pcap, PCAP_REG_ISR, isr); + + local_irq_disable(); + service = isr & ~msr; + + for (irq = pcap->irq_base; service; service >>= 1, irq++) { + if (service & 1) { + struct irq_desc *desc = irq_to_desc(irq); + + if (WARN(!desc, KERN_WARNING + "Invalid PCAP IRQ %d\n", irq)) + 
break; + + if (desc->status & IRQ_DISABLED) + note_interrupt(irq, desc, IRQ_NONE); + else + desc->handle_irq(irq, desc); + } + } + local_irq_enable(); +} + +static void pcap_irq_handler(unsigned int irq, struct irq_desc *desc) +{ + struct pcap_chip *pcap = get_irq_data(irq); + + desc->chip->ack(irq); + queue_work(pcap->workqueue, &pcap->isr_work); + return; +} + +/* ADC */ +static void pcap_disable_adc(struct pcap_chip *pcap) +{ + u32 tmp; + + ezx_pcap_read(pcap, PCAP_REG_ADC, &tmp); + tmp &= ~(PCAP_ADC_ADEN|PCAP_ADC_BATT_I_ADC|PCAP_ADC_BATT_I_POLARITY); + ezx_pcap_write(pcap, PCAP_REG_ADC, tmp); +} + +static void pcap_adc_trigger(struct pcap_chip *pcap) +{ + u32 tmp; + u8 head; + + mutex_lock(&pcap->adc_mutex); + head = pcap->adc_head; + if (!pcap->adc_queue[head]) { + /* queue is empty, save power */ + pcap_disable_adc(pcap); + mutex_unlock(&pcap->adc_mutex); + return; + } + mutex_unlock(&pcap->adc_mutex); + + /* start conversion on requested bank */ + tmp = pcap->adc_queue[head]->flags | PCAP_ADC_ADEN; + + if (pcap->adc_queue[head]->bank == PCAP_ADC_BANK_1) + tmp |= PCAP_ADC_AD_SEL1; + + ezx_pcap_write(pcap, PCAP_REG_ADC, tmp); + ezx_pcap_write(pcap, PCAP_REG_ADR, PCAP_ADR_ASC); +} + +static irqreturn_t pcap_adc_irq(int irq, void *_pcap) +{ + struct pcap_chip *pcap = _pcap; + struct pcap_adc_request *req; + u16 res[2]; + u32 tmp; + + mutex_lock(&pcap->adc_mutex); + req = pcap->adc_queue[pcap->adc_head]; + + if (WARN(!req, KERN_WARNING "adc irq without pending request\n")) + return IRQ_HANDLED; + + /* read requested channels results */ + ezx_pcap_read(pcap, PCAP_REG_ADC, &tmp); + tmp &= ~(PCAP_ADC_ADA1_MASK | PCAP_ADC_ADA2_MASK); + tmp |= (req->ch[0] << PCAP_ADC_ADA1_SHIFT); + tmp |= (req->ch[1] << PCAP_ADC_ADA2_SHIFT); + ezx_pcap_write(pcap, PCAP_REG_ADC, tmp); + ezx_pcap_read(pcap, PCAP_REG_ADR, &tmp); + res[0] = (tmp & PCAP_ADR_ADD1_MASK) >> PCAP_ADR_ADD1_SHIFT; + res[1] = (tmp & PCAP_ADR_ADD2_MASK) >> PCAP_ADR_ADD2_SHIFT; + + pcap->adc_queue[pcap->adc_head] = NULL; + pcap->adc_head = (pcap->adc_head + 1) & (PCAP_ADC_MAXQ - 1); + mutex_unlock(&pcap->adc_mutex); + + /* pass the results and release memory */ + req->callback(req->data, res); + kfree(req); + + /* trigger next conversion (if any) on queue */ + pcap_adc_trigger(pcap); + + return IRQ_HANDLED; +} + +int pcap_adc_async(struct pcap_chip *pcap, u8 bank, u32 flags, u8 ch[], + void *callback, void *data) +{ + struct pcap_adc_request *req; + + /* This will be freed after we have a result */ + req = kmalloc(sizeof(struct pcap_adc_request), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->bank = bank; + req->flags = flags; + req->ch[0] = ch[0]; + req->ch[1] = ch[1]; + req->callback = callback; + req->data = data; + + mutex_lock(&pcap->adc_mutex); + if (pcap->adc_queue[pcap->adc_tail]) { + mutex_unlock(&pcap->adc_mutex); + kfree(req); + return -EBUSY; + } + pcap->adc_queue[pcap->adc_tail] = req; + pcap->adc_tail = (pcap->adc_tail + 1) & (PCAP_ADC_MAXQ - 1); + mutex_unlock(&pcap->adc_mutex); + + /* start conversion */ + pcap_adc_trigger(pcap); + + return 0; +} +EXPORT_SYMBOL_GPL(pcap_adc_async); + +static void pcap_adc_sync_cb(void *param, u16 res[]) +{ + struct pcap_adc_sync_request *req = param; + + req->res[0] = res[0]; + req->res[1] = res[1]; + complete(&req->completion); +} + +int pcap_adc_sync(struct pcap_chip *pcap, u8 bank, u32 flags, u8 ch[], + u16 res[]) +{ + struct pcap_adc_sync_request sync_data; + int ret; + + init_completion(&sync_data.completion); + ret = pcap_adc_async(pcap, bank, flags, ch, pcap_adc_sync_cb, + 
&sync_data); + if (ret) + return ret; + wait_for_completion(&sync_data.completion); + res[0] = sync_data.res[0]; + res[1] = sync_data.res[1]; + + return 0; +} +EXPORT_SYMBOL_GPL(pcap_adc_sync); + +/* subdevs */ +static int pcap_remove_subdev(struct device *dev, void *unused) +{ + platform_device_unregister(to_platform_device(dev)); + return 0; +} + +static int __devinit pcap_add_subdev(struct pcap_chip *pcap, + struct pcap_subdev *subdev) +{ + struct platform_device *pdev; + + pdev = platform_device_alloc(subdev->name, subdev->id); + pdev->dev.parent = &pcap->spi->dev; + pdev->dev.platform_data = subdev->platform_data; + platform_set_drvdata(pdev, pcap); + + return platform_device_add(pdev); +} + +static int __devexit ezx_pcap_remove(struct spi_device *spi) +{ + struct pcap_chip *pcap = dev_get_drvdata(&spi->dev); + struct pcap_platform_data *pdata = spi->dev.platform_data; + int i, adc_irq; + + /* remove all registered subdevs */ + device_for_each_child(&spi->dev, NULL, pcap_remove_subdev); + + /* cleanup ADC */ + adc_irq = pcap_to_irq(pcap, (pdata->config & PCAP_SECOND_PORT) ? + PCAP_IRQ_ADCDONE2 : PCAP_IRQ_ADCDONE); + free_irq(adc_irq, pcap); + mutex_lock(&pcap->adc_mutex); + for (i = 0; i < PCAP_ADC_MAXQ; i++) + kfree(pcap->adc_queue[i]); + mutex_unlock(&pcap->adc_mutex); + + /* cleanup irqchip */ + for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++) + set_irq_chip_and_handler(i, NULL, NULL); + + destroy_workqueue(pcap->workqueue); + + kfree(pcap); + + return 0; +} + +static int __devinit ezx_pcap_probe(struct spi_device *spi) +{ + struct pcap_platform_data *pdata = spi->dev.platform_data; + struct pcap_chip *pcap; + int i, adc_irq; + int ret = -ENODEV; + + /* platform data is required */ + if (!pdata) + goto ret; + + pcap = kzalloc(sizeof(*pcap), GFP_KERNEL); + if (!pcap) { + ret = -ENOMEM; + goto ret; + } + + mutex_init(&pcap->io_mutex); + mutex_init(&pcap->adc_mutex); + INIT_WORK(&pcap->isr_work, pcap_isr_work); + INIT_WORK(&pcap->msr_work, pcap_msr_work); + dev_set_drvdata(&spi->dev, pcap); + + /* setup spi */ + spi->bits_per_word = 32; + spi->mode = SPI_MODE_0 | (pdata->config & PCAP_CS_AH ? SPI_CS_HIGH : 0); + ret = spi_setup(spi); + if (ret) + goto free_pcap; + + pcap->spi = spi; + + /* setup irq */ + pcap->irq_base = pdata->irq_base; + pcap->workqueue = create_singlethread_workqueue("pcapd"); + if (!pcap->workqueue) { + dev_err(&spi->dev, "cant create pcap thread\n"); + goto free_pcap; + } + + /* redirect interrupts to AP, except adcdone2 */ + if (!(pdata->config & PCAP_SECOND_PORT)) + ezx_pcap_write(pcap, PCAP_REG_INT_SEL, + (1 << PCAP_IRQ_ADCDONE2)); + + /* setup irq chip */ + for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++) { + set_irq_chip_and_handler(i, &pcap_irq_chip, handle_simple_irq); + set_irq_chip_data(i, pcap); +#ifdef CONFIG_ARM + set_irq_flags(i, IRQF_VALID); +#else + set_irq_noprobe(i); +#endif + } + + /* mask/ack all PCAP interrupts */ + ezx_pcap_write(pcap, PCAP_REG_MSR, PCAP_MASK_ALL_INTERRUPT); + ezx_pcap_write(pcap, PCAP_REG_ISR, PCAP_CLEAR_INTERRUPT_REGISTER); + pcap->msr = PCAP_MASK_ALL_INTERRUPT; + + set_irq_type(spi->irq, IRQ_TYPE_EDGE_RISING); + set_irq_data(spi->irq, pcap); + set_irq_chained_handler(spi->irq, pcap_irq_handler); + set_irq_wake(spi->irq, 1); + + /* ADC */ + adc_irq = pcap_to_irq(pcap, (pdata->config & PCAP_SECOND_PORT) ? 
+ PCAP_IRQ_ADCDONE2 : PCAP_IRQ_ADCDONE); + + ret = request_irq(adc_irq, pcap_adc_irq, 0, "ADC", pcap); + if (ret) + goto free_irqchip; + + /* setup subdevs */ + for (i = 0; i < pdata->num_subdevs; i++) { + ret = pcap_add_subdev(pcap, &pdata->subdevs[i]); + if (ret) + goto remove_subdevs; + } + + /* board specific quirks */ + if (pdata->init) + pdata->init(pcap); + + return 0; + +remove_subdevs: + device_for_each_child(&spi->dev, NULL, pcap_remove_subdev); +/* free_adc: */ + free_irq(adc_irq, pcap); +free_irqchip: + for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++) + set_irq_chip_and_handler(i, NULL, NULL); +/* destroy_workqueue: */ + destroy_workqueue(pcap->workqueue); +free_pcap: + kfree(pcap); +ret: + return ret; +} + +static struct spi_driver ezxpcap_driver = { + .probe = ezx_pcap_probe, + .remove = __devexit_p(ezx_pcap_remove), + .driver = { + .name = "ezx-pcap", + .owner = THIS_MODULE, + }, +}; + +static int __init ezx_pcap_init(void) +{ + return spi_register_driver(&ezxpcap_driver); +} + +static void __exit ezx_pcap_exit(void) +{ + spi_unregister_driver(&ezxpcap_driver); +} + +module_init(ezx_pcap_init); +module_exit(ezx_pcap_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Ribeiro / Harald Welte"); +MODULE_DESCRIPTION("Motorola PCAP2 ASIC Driver"); diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h new file mode 100644 index 00000000000..c12c3c0932b --- /dev/null +++ b/include/linux/mfd/ezx-pcap.h @@ -0,0 +1,256 @@ +/* + * Copyright 2009 Daniel Ribeiro + * + * For further information, please see http://wiki.openezx.org/PCAP2 + */ + +#ifndef EZX_PCAP_H +#define EZX_PCAP_H + +struct pcap_subdev { + int id; + const char *name; + void *platform_data; +}; + +struct pcap_platform_data { + unsigned int irq_base; + unsigned int config; + void (*init) (void *); /* board specific init */ + int num_subdevs; + struct pcap_subdev *subdevs; +}; + +struct pcap_chip; + +int ezx_pcap_write(struct pcap_chip *, u8, u32); +int ezx_pcap_read(struct pcap_chip *, u8, u32 *); +int pcap_to_irq(struct pcap_chip *, int); +int pcap_adc_async(struct pcap_chip *, u8, u32, u8[], void *, void *); +int pcap_adc_sync(struct pcap_chip *, u8, u32, u8[], u16[]); + +#define PCAP_SECOND_PORT 1 +#define PCAP_CS_AH 2 + +#define PCAP_REGISTER_WRITE_OP_BIT 0x80000000 +#define PCAP_REGISTER_READ_OP_BIT 0x00000000 + +#define PCAP_REGISTER_VALUE_MASK 0x01ffffff +#define PCAP_REGISTER_ADDRESS_MASK 0x7c000000 +#define PCAP_REGISTER_ADDRESS_SHIFT 26 +#define PCAP_REGISTER_NUMBER 32 +#define PCAP_CLEAR_INTERRUPT_REGISTER 0x01ffffff +#define PCAP_MASK_ALL_INTERRUPT 0x01ffffff + +/* registers acessible by both pcap ports */ +#define PCAP_REG_ISR 0x0 /* Interrupt Status */ +#define PCAP_REG_MSR 0x1 /* Interrupt Mask */ +#define PCAP_REG_PSTAT 0x2 /* Processor Status */ +#define PCAP_REG_VREG2 0x6 /* Regulator Bank 2 Control */ +#define PCAP_REG_AUXVREG 0x7 /* Auxiliary Regulator Control */ +#define PCAP_REG_BATT 0x8 /* Battery Control */ +#define PCAP_REG_ADC 0x9 /* AD Control */ +#define PCAP_REG_ADR 0xa /* AD Result */ +#define PCAP_REG_CODEC 0xb /* Audio Codec Control */ +#define PCAP_REG_RX_AMPS 0xc /* RX Audio Amplifiers Control */ +#define PCAP_REG_ST_DAC 0xd /* Stereo DAC Control */ +#define PCAP_REG_BUSCTRL 0x14 /* Connectivity Control */ +#define PCAP_REG_PERIPH 0x15 /* Peripheral Control */ +#define PCAP_REG_LOWPWR 0x18 /* Regulator Low Power Control */ +#define PCAP_REG_TX_AMPS 0x1a /* TX Audio Amplifiers Control */ +#define PCAP_REG_GP 0x1b /* General Purpose */ +#define 
PCAP_REG_TEST1 0x1c +#define PCAP_REG_TEST2 0x1d +#define PCAP_REG_VENDOR_TEST1 0x1e +#define PCAP_REG_VENDOR_TEST2 0x1f + +/* registers acessible by pcap port 1 only (a1200, e2 & e6) */ +#define PCAP_REG_INT_SEL 0x3 /* Interrupt Select */ +#define PCAP_REG_SWCTRL 0x4 /* Switching Regulator Control */ +#define PCAP_REG_VREG1 0x5 /* Regulator Bank 1 Control */ +#define PCAP_REG_RTC_TOD 0xe /* RTC Time of Day */ +#define PCAP_REG_RTC_TODA 0xf /* RTC Time of Day Alarm */ +#define PCAP_REG_RTC_DAY 0x10 /* RTC Day */ +#define PCAP_REG_RTC_DAYA 0x11 /* RTC Day Alarm */ +#define PCAP_REG_MTRTMR 0x12 /* AD Monitor Timer */ +#define PCAP_REG_PWR 0x13 /* Power Control */ +#define PCAP_REG_AUXVREG_MASK 0x16 /* Auxiliary Regulator Mask */ +#define PCAP_REG_VENDOR_REV 0x17 +#define PCAP_REG_PERIPH_MASK 0x19 /* Peripheral Mask */ + +/* PCAP2 Interrupts */ +#define PCAP_NIRQS 23 +#define PCAP_IRQ_ADCDONE 0 /* ADC done port 1 */ +#define PCAP_IRQ_TS 1 /* Touch Screen */ +#define PCAP_IRQ_1HZ 2 /* 1HZ timer */ +#define PCAP_IRQ_WH 3 /* ADC above high limit */ +#define PCAP_IRQ_WL 4 /* ADC below low limit */ +#define PCAP_IRQ_TODA 5 /* Time of day alarm */ +#define PCAP_IRQ_USB4V 6 /* USB above 4V */ +#define PCAP_IRQ_ONOFF 7 /* On/Off button */ +#define PCAP_IRQ_ONOFF2 8 /* On/Off button 2 */ +#define PCAP_IRQ_USB1V 9 /* USB above 1V */ +#define PCAP_IRQ_MOBPORT 10 +#define PCAP_IRQ_MIC 11 /* Mic attach/HS button */ +#define PCAP_IRQ_HS 12 /* Headset attach */ +#define PCAP_IRQ_ST 13 +#define PCAP_IRQ_PC 14 /* Power Cut */ +#define PCAP_IRQ_WARM 15 +#define PCAP_IRQ_EOL 16 /* Battery End Of Life */ +#define PCAP_IRQ_CLK 17 +#define PCAP_IRQ_SYSRST 18 /* System Reset */ +#define PCAP_IRQ_DUMMY 19 +#define PCAP_IRQ_ADCDONE2 20 /* ADC done port 2 */ +#define PCAP_IRQ_SOFTRESET 21 +#define PCAP_IRQ_MNEXB 22 + +/* voltage regulators */ +#define V1 0 +#define V2 1 +#define V3 2 +#define V4 3 +#define V5 4 +#define V6 5 +#define V7 6 +#define V8 7 +#define V9 8 +#define V10 9 +#define VAUX1 10 +#define VAUX2 11 +#define VAUX3 12 +#define VAUX4 13 +#define VSIM 14 +#define VSIM2 15 +#define VVIB 16 +#define SW1 17 +#define SW2 18 +#define SW3 19 +#define SW1S 20 +#define SW2S 21 + +#define PCAP_BATT_DAC_MASK 0x000000ff +#define PCAP_BATT_DAC_SHIFT 0 +#define PCAP_BATT_B_FDBK (1 << 8) +#define PCAP_BATT_EXT_ISENSE (1 << 9) +#define PCAP_BATT_V_COIN_MASK 0x00003c00 +#define PCAP_BATT_V_COIN_SHIFT 10 +#define PCAP_BATT_I_COIN (1 << 14) +#define PCAP_BATT_COIN_CH_EN (1 << 15) +#define PCAP_BATT_EOL_SEL_MASK 0x000e0000 +#define PCAP_BATT_EOL_SEL_SHIFT 17 +#define PCAP_BATT_EOL_CMP_EN (1 << 20) +#define PCAP_BATT_BATT_DET_EN (1 << 21) +#define PCAP_BATT_THERMBIAS_CTRL (1 << 22) + +#define PCAP_ADC_ADEN (1 << 0) +#define PCAP_ADC_RAND (1 << 1) +#define PCAP_ADC_AD_SEL1 (1 << 2) +#define PCAP_ADC_AD_SEL2 (1 << 3) +#define PCAP_ADC_ADA1_MASK 0x00000070 +#define PCAP_ADC_ADA1_SHIFT 4 +#define PCAP_ADC_ADA2_MASK 0x00000380 +#define PCAP_ADC_ADA2_SHIFT 7 +#define PCAP_ADC_ATO_MASK 0x00003c00 +#define PCAP_ADC_ATO_SHIFT 10 +#define PCAP_ADC_ATOX (1 << 14) +#define PCAP_ADC_MTR1 (1 << 15) +#define PCAP_ADC_MTR2 (1 << 16) +#define PCAP_ADC_TS_M_MASK 0x000e0000 +#define PCAP_ADC_TS_M_SHIFT 17 +#define PCAP_ADC_TS_REF_LOWPWR (1 << 20) +#define PCAP_ADC_TS_REFENB (1 << 21) +#define PCAP_ADC_BATT_I_POLARITY (1 << 22) +#define PCAP_ADC_BATT_I_ADC (1 << 23) + +#define PCAP_ADC_BANK_0 0 +#define PCAP_ADC_BANK_1 1 +/* ADC bank 0 */ +#define PCAP_ADC_CH_COIN 0 +#define PCAP_ADC_CH_BATT 1 +#define PCAP_ADC_CH_BPLUS 2 +#define 
PCAP_ADC_CH_MOBPORTB 3 +#define PCAP_ADC_CH_TEMPERATURE 4 +#define PCAP_ADC_CH_CHARGER_ID 5 +#define PCAP_ADC_CH_AD6 6 +/* ADC bank 1 */ +#define PCAP_ADC_CH_AD7 0 +#define PCAP_ADC_CH_AD8 1 +#define PCAP_ADC_CH_AD9 2 +#define PCAP_ADC_CH_TS_X1 3 +#define PCAP_ADC_CH_TS_X2 4 +#define PCAP_ADC_CH_TS_Y1 5 +#define PCAP_ADC_CH_TS_Y2 6 + +#define PCAP_ADC_T_NOW 0 +#define PCAP_ADC_T_IN_BURST 1 +#define PCAP_ADC_T_OUT_BURST 2 + +#define PCAP_ADC_ATO_IN_BURST 6 +#define PCAP_ADC_ATO_OUT_BURST 0 + +#define PCAP_ADC_TS_M_XY 1 +#define PCAP_ADC_TS_M_PRESSURE 2 +#define PCAP_ADC_TS_M_PLATE_X 3 +#define PCAP_ADC_TS_M_PLATE_Y 4 +#define PCAP_ADC_TS_M_STANDBY 5 +#define PCAP_ADC_TS_M_NONTS 6 + +#define PCAP_ADR_ADD1_MASK 0x000003ff +#define PCAP_ADR_ADD1_SHIFT 0 +#define PCAP_ADR_ADD2_MASK 0x000ffc00 +#define PCAP_ADR_ADD2_SHIFT 10 +#define PCAP_ADR_ADINC1 (1 << 20) +#define PCAP_ADR_ADINC2 (1 << 21) +#define PCAP_ADR_ASC (1 << 22) +#define PCAP_ADR_ONESHOT (1 << 23) + +#define PCAP_BUSCTRL_FSENB (1 << 0) +#define PCAP_BUSCTRL_USB_SUSPEND (1 << 1) +#define PCAP_BUSCTRL_USB_PU (1 << 2) +#define PCAP_BUSCTRL_USB_PD (1 << 3) +#define PCAP_BUSCTRL_VUSB_EN (1 << 4) +#define PCAP_BUSCTRL_USB_PS (1 << 5) +#define PCAP_BUSCTRL_VUSB_MSTR_EN (1 << 6) +#define PCAP_BUSCTRL_VBUS_PD_ENB (1 << 7) +#define PCAP_BUSCTRL_CURRLIM (1 << 8) +#define PCAP_BUSCTRL_RS232ENB (1 << 9) +#define PCAP_BUSCTRL_RS232_DIR (1 << 10) +#define PCAP_BUSCTRL_SE0_CONN (1 << 11) +#define PCAP_BUSCTRL_USB_PDM (1 << 12) +#define PCAP_BUSCTRL_BUS_PRI_ADJ (1 << 24) + +/* leds */ +#define PCAP_LED0 0 +#define PCAP_LED1 1 +#define PCAP_BL0 2 +#define PCAP_BL1 3 +#define PCAP_VIB 4 +#define PCAP_LED_3MA 0 +#define PCAP_LED_4MA 1 +#define PCAP_LED_5MA 2 +#define PCAP_LED_9MA 3 +#define PCAP_LED_GPIO_VAL_MASK 0x00ffffff +#define PCAP_LED_GPIO_EN 0x01000000 +#define PCAP_LED_GPIO_INVERT 0x02000000 +#define PCAP_LED_T_MASK 0xf +#define PCAP_LED_C_MASK 0x3 +#define PCAP_BL_MASK 0x1f +#define PCAP_BL0_SHIFT 0 +#define PCAP_LED0_EN (1 << 5) +#define PCAP_LED1_EN (1 << 6) +#define PCAP_LED0_T_SHIFT 7 +#define PCAP_LED1_T_SHIFT 11 +#define PCAP_LED0_C_SHIFT 15 +#define PCAP_LED1_C_SHIFT 17 +#define PCAP_BL1_SHIFT 20 +#define PCAP_VIB_MASK 0x3 +#define PCAP_VIB_SHIFT 20 +#define PCAP_VIB_EN (1 << 19) + +/* RTC */ +#define PCAP_RTC_DAY_MASK 0x3fff +#define PCAP_RTC_TOD_MASK 0xffff +#define PCAP_RTC_PC_MASK 0x7 +#define SEC_PER_DAY 86400 + +#endif -- cgit v1.2.3-70-g09d2 From 6483c1b5e1a6e3489640a1376e951395982e9615 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 5 Jun 2009 18:31:01 +0200 Subject: mfd: asic3: add asic3_set_register common operation Used to configure single bits of the SDHWCTRL_SDCONF and EXTCF_RESET/SELECT registers needed for DS1WM, MMC/SDIO and PCMCIA functionality. 
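As a usage sketch (the call site and pointer names are assumptions, not taken from this patch), a subfunction driver can then flip a single configuration bit atomically:

	/* Hypothetical example: power up the SD cell inside the ASIC3 */
	asic3_set_register(asic, ASIC3_SDHWCTRL_BASE + ASIC3_SDHWCTRL_SDCONF,
			   ASIC3_SDHWCTRL_SDPWR, true);

The read-modify-write runs under the asic->lock spinlock, so concurrent subdrivers cannot clobber each other's bits.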
Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- drivers/mfd/asic3.c | 15 +++++++++++++++ include/linux/mfd/asic3.h | 10 +++++----- 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index 9e485459f63..ad3c5913599 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -52,6 +52,21 @@ static inline u32 asic3_read_register(struct asic3 *asic, (reg >> asic->bus_shift)); } +void asic3_set_register(struct asic3 *asic, u32 reg, u32 bits, bool set) +{ + unsigned long flags; + u32 val; + + spin_lock_irqsave(&asic->lock, flags); + val = asic3_read_register(asic, reg); + if (set) + val |= bits; + else + val &= ~bits; + asic3_write_register(asic, reg, val); + spin_unlock_irqrestore(&asic->lock, flags); +} + /* IRQs */ #define MAX_ASIC_ISR_LOOPS 20 #define ASIC3_GPIO_BASE_INCR \ diff --git a/include/linux/mfd/asic3.h b/include/linux/mfd/asic3.h index 322cd6deb9f..6b427ec4b24 100644 --- a/include/linux/mfd/asic3.h +++ b/include/linux/mfd/asic3.h @@ -227,8 +227,8 @@ struct asic3_platform_data { /* Basic control of the SD ASIC */ -#define ASIC3_SDHWCTRL_Base 0x0E00 -#define ASIC3_SDHWCTRL_SDConf 0x00 +#define ASIC3_SDHWCTRL_BASE 0x0E00 +#define ASIC3_SDHWCTRL_SDCONF 0x00 #define ASIC3_SDHWCTRL_SUSPEND (1 << 0) /* 1=suspend all SD operations */ #define ASIC3_SDHWCTRL_CLKSEL (1 << 1) /* 1=SDICK, 0=HCLK */ @@ -242,10 +242,10 @@ struct asic3_platform_data { /* SD card power supply ctrl 1=enable */ #define ASIC3_SDHWCTRL_SDPWR (1 << 6) -#define ASIC3_EXTCF_Base 0x1100 +#define ASIC3_EXTCF_BASE 0x1100 -#define ASIC3_EXTCF_Select 0x00 -#define ASIC3_EXTCF_Reset 0x04 +#define ASIC3_EXTCF_SELECT 0x00 +#define ASIC3_EXTCF_RESET 0x04 #define ASIC3_EXTCF_SMOD0 (1 << 0) /* slot number of mode 0 */ #define ASIC3_EXTCF_SMOD1 (1 << 1) /* slot number of mode 1 */ -- cgit v1.2.3-70-g09d2 From 01906d6d780e84c51537f6b56da472959e846314 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 5 Jun 2009 18:31:03 +0200 Subject: mfd: add ASIC3 IRQ numbers IRQ number definitions for PWM, LED, SPI and OWM (ds1wm). Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- include/linux/mfd/asic3.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/asic3.h b/include/linux/mfd/asic3.h index 6b427ec4b24..d2ecaafaefe 100644 --- a/include/linux/mfd/asic3.h +++ b/include/linux/mfd/asic3.h @@ -30,6 +30,13 @@ struct asic3_platform_data { #define ASIC3_NUM_GPIOS 64 #define ASIC3_NR_IRQS ASIC3_NUM_GPIOS + 6 +#define ASIC3_IRQ_LED0 64 +#define ASIC3_IRQ_LED1 65 +#define ASIC3_IRQ_LED2 66 +#define ASIC3_IRQ_SPI 67 +#define ASIC3_IRQ_SMBUS 68 +#define ASIC3_IRQ_OWM 69 + #define ASIC3_TO_GPIO(gpio) (NR_BUILTIN_GPIO + (gpio)) #define ASIC3_GPIO_BANK_A 0 -- cgit v1.2.3-70-g09d2 From 1b89040c3aa4500c97859bad714e010441e9c477 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 5 Jun 2009 18:31:05 +0200 Subject: mfd: asic3: remove SD/SDIO controller register definitions Only the base addresses remain, as they are needed to set up the IOMEM resources. 
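A minimal sketch of the kind of IOMEM setup the surviving base addresses serve (the resource length here is an assumption for illustration; real code offsets these against the ASIC3 mapping):

	static struct resource asic3_mmc_resources[] = {
		{
			.start	= ASIC3_SD_CTRL_BASE,
			.end	= ASIC3_SD_CTRL_BASE + 0x3ff,
			.flags	= IORESOURCE_MEM,
		},
	};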
Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- include/linux/mfd/asic3.h | 219 +--------------------------------------------- 1 file changed, 3 insertions(+), 216 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/asic3.h b/include/linux/mfd/asic3.h index d2ecaafaefe..de3c4ad19af 100644 --- a/include/linux/mfd/asic3.h +++ b/include/linux/mfd/asic3.h @@ -286,222 +286,9 @@ struct asic3_platform_data { * SDIO_CTRL Control registers for SDIO operations * *****************************************************************************/ -#define ASIC3_SD_CONFIG_Base 0x0400 /* Assumes 32 bit addressing */ - -#define ASIC3_SD_CONFIG_Command 0x08 /* R/W: Command */ - -/* [0:8] SD Control Register Base Address */ -#define ASIC3_SD_CONFIG_Addr0 0x20 - -/* [9:31] SD Control Register Base Address */ -#define ASIC3_SD_CONFIG_Addr1 0x24 - -/* R/O: interrupt assigned to pin */ -#define ASIC3_SD_CONFIG_IntPin 0x78 - -/* - * Set to 0x1f to clock SD controller, 0 otherwise. - * At 0x82 - Gated Clock Ctrl - */ -#define ASIC3_SD_CONFIG_ClkStop 0x80 - -/* Control clock of SD controller */ -#define ASIC3_SD_CONFIG_ClockMode 0x84 -#define ASIC3_SD_CONFIG_SDHC_PinStatus 0x88 /* R/0: SD pins status */ -#define ASIC3_SD_CONFIG_SDHC_Power1 0x90 /* Power1 - manual pwr ctrl */ - -/* auto power up after card inserted */ -#define ASIC3_SD_CONFIG_SDHC_Power2 0x92 - -/* auto power down when card removed */ -#define ASIC3_SD_CONFIG_SDHC_Power3 0x94 -#define ASIC3_SD_CONFIG_SDHC_CardDetect 0x98 -#define ASIC3_SD_CONFIG_SDHC_Slot 0xA0 /* R/O: support slot number */ -#define ASIC3_SD_CONFIG_SDHC_ExtGateClk1 0x1E0 /* Not used */ -#define ASIC3_SD_CONFIG_SDHC_ExtGateClk2 0x1E2 /* Not used*/ - -/* GPIO Output Reg. , at 0x1EA - GPIO Output Enable Reg. */ -#define ASIC3_SD_CONFIG_SDHC_GPIO_OutAndEnable 0x1E8 -#define ASIC3_SD_CONFIG_SDHC_GPIO_Status 0x1EC /* GPIO Status Reg. 
*/ - -/* Bit 1: double buffer/single buffer */ -#define ASIC3_SD_CONFIG_SDHC_ExtGateClk3 0x1F0 - -/* Memory access enable (set to 1 to access SD Controller) */ -#define SD_CONFIG_COMMAND_MAE (1<<1) - -#define SD_CONFIG_CLK_ENABLE_ALL 0x1f - -#define SD_CONFIG_POWER1_PC_33V 0x0200 /* Set for 3.3 volts */ -#define SD_CONFIG_POWER1_PC_OFF 0x0000 /* Turn off power */ - - /* two bits - number of cycles for card detection */ -#define SD_CONFIG_CARDDETECTMODE_CLK ((x) & 0x3) - - -#define ASIC3_SD_CTRL_Base 0x1000 - -#define ASIC3_SD_CTRL_Cmd 0x00 -#define ASIC3_SD_CTRL_Arg0 0x08 -#define ASIC3_SD_CTRL_Arg1 0x0C -#define ASIC3_SD_CTRL_StopInternal 0x10 -#define ASIC3_SD_CTRL_TransferSectorCount 0x14 -#define ASIC3_SD_CTRL_Response0 0x18 -#define ASIC3_SD_CTRL_Response1 0x1C -#define ASIC3_SD_CTRL_Response2 0x20 -#define ASIC3_SD_CTRL_Response3 0x24 -#define ASIC3_SD_CTRL_Response4 0x28 -#define ASIC3_SD_CTRL_Response5 0x2C -#define ASIC3_SD_CTRL_Response6 0x30 -#define ASIC3_SD_CTRL_Response7 0x34 -#define ASIC3_SD_CTRL_CardStatus 0x38 -#define ASIC3_SD_CTRL_BufferCtrl 0x3C -#define ASIC3_SD_CTRL_IntMaskCard 0x40 -#define ASIC3_SD_CTRL_IntMaskBuffer 0x44 -#define ASIC3_SD_CTRL_CardClockCtrl 0x48 -#define ASIC3_SD_CTRL_MemCardXferDataLen 0x4C -#define ASIC3_SD_CTRL_MemCardOptionSetup 0x50 -#define ASIC3_SD_CTRL_ErrorStatus0 0x58 -#define ASIC3_SD_CTRL_ErrorStatus1 0x5C -#define ASIC3_SD_CTRL_DataPort 0x60 -#define ASIC3_SD_CTRL_TransactionCtrl 0x68 -#define ASIC3_SD_CTRL_SoftwareReset 0x1C0 - -#define SD_CTRL_SOFTWARE_RESET_CLEAR (1<<0) - -#define SD_CTRL_TRANSACTIONCONTROL_SET (1<<8) - -#define SD_CTRL_CARDCLOCKCONTROL_FOR_SD_CARD (1<<15) -#define SD_CTRL_CARDCLOCKCONTROL_ENABLE_CLOCK (1<<8) -#define SD_CTRL_CARDCLOCKCONTROL_CLK_DIV_512 (1<<7) -#define SD_CTRL_CARDCLOCKCONTROL_CLK_DIV_256 (1<<6) -#define SD_CTRL_CARDCLOCKCONTROL_CLK_DIV_128 (1<<5) -#define SD_CTRL_CARDCLOCKCONTROL_CLK_DIV_64 (1<<4) -#define SD_CTRL_CARDCLOCKCONTROL_CLK_DIV_32 (1<<3) -#define SD_CTRL_CARDCLOCKCONTROL_CLK_DIV_16 (1<<2) -#define SD_CTRL_CARDCLOCKCONTROL_CLK_DIV_8 (1<<1) -#define SD_CTRL_CARDCLOCKCONTROL_CLK_DIV_4 (1<<0) -#define SD_CTRL_CARDCLOCKCONTROL_CLK_DIV_2 (0<<0) - -#define MEM_CARD_OPTION_REQUIRED 0x000e -#define MEM_CARD_OPTION_DATA_RESPONSE_TIMEOUT(x) (((x) & 0x0f) << 4) -#define MEM_CARD_OPTION_C2_MODULE_NOT_PRESENT (1<<14) -#define MEM_CARD_OPTION_DATA_XFR_WIDTH_1 (1<<15) -#define MEM_CARD_OPTION_DATA_XFR_WIDTH_4 0 - -#define SD_CTRL_COMMAND_INDEX(x) ((x) & 0x3f) -#define SD_CTRL_COMMAND_TYPE_CMD (0 << 6) -#define SD_CTRL_COMMAND_TYPE_ACMD (1 << 6) -#define SD_CTRL_COMMAND_TYPE_AUTHENTICATION (2 << 6) -#define SD_CTRL_COMMAND_RESPONSE_TYPE_NORMAL (0 << 8) -#define SD_CTRL_COMMAND_RESPONSE_TYPE_EXT_R1 (4 << 8) -#define SD_CTRL_COMMAND_RESPONSE_TYPE_EXT_R1B (5 << 8) -#define SD_CTRL_COMMAND_RESPONSE_TYPE_EXT_R2 (6 << 8) -#define SD_CTRL_COMMAND_RESPONSE_TYPE_EXT_R3 (7 << 8) -#define SD_CTRL_COMMAND_DATA_PRESENT (1 << 11) -#define SD_CTRL_COMMAND_TRANSFER_READ (1 << 12) -#define SD_CTRL_COMMAND_TRANSFER_WRITE (0 << 12) -#define SD_CTRL_COMMAND_MULTI_BLOCK (1 << 13) -#define SD_CTRL_COMMAND_SECURITY_CMD (1 << 14) - -#define SD_CTRL_STOP_INTERNAL_ISSSUE_CMD12 (1 << 0) -#define SD_CTRL_STOP_INTERNAL_AUTO_ISSUE_CMD12 (1 << 8) - -#define SD_CTRL_CARDSTATUS_RESPONSE_END (1 << 0) -#define SD_CTRL_CARDSTATUS_RW_END (1 << 2) -#define SD_CTRL_CARDSTATUS_CARD_REMOVED_0 (1 << 3) -#define SD_CTRL_CARDSTATUS_CARD_INSERTED_0 (1 << 4) -#define SD_CTRL_CARDSTATUS_SIGNAL_STATE_PRESENT_0 (1 << 5) -#define 
SD_CTRL_CARDSTATUS_WRITE_PROTECT (1 << 7) -#define SD_CTRL_CARDSTATUS_CARD_REMOVED_3 (1 << 8) -#define SD_CTRL_CARDSTATUS_CARD_INSERTED_3 (1 << 9) -#define SD_CTRL_CARDSTATUS_SIGNAL_STATE_PRESENT_3 (1 << 10) - -#define SD_CTRL_BUFFERSTATUS_CMD_INDEX_ERROR (1 << 0) -#define SD_CTRL_BUFFERSTATUS_CRC_ERROR (1 << 1) -#define SD_CTRL_BUFFERSTATUS_STOP_BIT_END_ERROR (1 << 2) -#define SD_CTRL_BUFFERSTATUS_DATA_TIMEOUT (1 << 3) -#define SD_CTRL_BUFFERSTATUS_BUFFER_OVERFLOW (1 << 4) -#define SD_CTRL_BUFFERSTATUS_BUFFER_UNDERFLOW (1 << 5) -#define SD_CTRL_BUFFERSTATUS_CMD_TIMEOUT (1 << 6) -#define SD_CTRL_BUFFERSTATUS_UNK7 (1 << 7) -#define SD_CTRL_BUFFERSTATUS_BUFFER_READ_ENABLE (1 << 8) -#define SD_CTRL_BUFFERSTATUS_BUFFER_WRITE_ENABLE (1 << 9) -#define SD_CTRL_BUFFERSTATUS_ILLEGAL_FUNCTION (1 << 13) -#define SD_CTRL_BUFFERSTATUS_CMD_BUSY (1 << 14) -#define SD_CTRL_BUFFERSTATUS_ILLEGAL_ACCESS (1 << 15) - -#define SD_CTRL_INTMASKCARD_RESPONSE_END (1 << 0) -#define SD_CTRL_INTMASKCARD_RW_END (1 << 2) -#define SD_CTRL_INTMASKCARD_CARD_REMOVED_0 (1 << 3) -#define SD_CTRL_INTMASKCARD_CARD_INSERTED_0 (1 << 4) -#define SD_CTRL_INTMASKCARD_SIGNAL_STATE_PRESENT_0 (1 << 5) -#define SD_CTRL_INTMASKCARD_UNK6 (1 << 6) -#define SD_CTRL_INTMASKCARD_WRITE_PROTECT (1 << 7) -#define SD_CTRL_INTMASKCARD_CARD_REMOVED_3 (1 << 8) -#define SD_CTRL_INTMASKCARD_CARD_INSERTED_3 (1 << 9) -#define SD_CTRL_INTMASKCARD_SIGNAL_STATE_PRESENT_3 (1 << 10) - -#define SD_CTRL_INTMASKBUFFER_CMD_INDEX_ERROR (1 << 0) -#define SD_CTRL_INTMASKBUFFER_CRC_ERROR (1 << 1) -#define SD_CTRL_INTMASKBUFFER_STOP_BIT_END_ERROR (1 << 2) -#define SD_CTRL_INTMASKBUFFER_DATA_TIMEOUT (1 << 3) -#define SD_CTRL_INTMASKBUFFER_BUFFER_OVERFLOW (1 << 4) -#define SD_CTRL_INTMASKBUFFER_BUFFER_UNDERFLOW (1 << 5) -#define SD_CTRL_INTMASKBUFFER_CMD_TIMEOUT (1 << 6) -#define SD_CTRL_INTMASKBUFFER_UNK7 (1 << 7) -#define SD_CTRL_INTMASKBUFFER_BUFFER_READ_ENABLE (1 << 8) -#define SD_CTRL_INTMASKBUFFER_BUFFER_WRITE_ENABLE (1 << 9) -#define SD_CTRL_INTMASKBUFFER_ILLEGAL_FUNCTION (1 << 13) -#define SD_CTRL_INTMASKBUFFER_CMD_BUSY (1 << 14) -#define SD_CTRL_INTMASKBUFFER_ILLEGAL_ACCESS (1 << 15) - -#define SD_CTRL_DETAIL0_RESPONSE_CMD_ERROR (1 << 0) -#define SD_CTRL_DETAIL0_END_BIT_ERROR_FOR_RESPONSE_NON_CMD12 (1 << 2) -#define SD_CTRL_DETAIL0_END_BIT_ERROR_FOR_RESPONSE_CMD12 (1 << 3) -#define SD_CTRL_DETAIL0_END_BIT_ERROR_FOR_READ_DATA (1 << 4) -#define SD_CTRL_DETAIL0_END_BIT_ERROR_FOR_WRITE_CRC_STATUS (1 << 5) -#define SD_CTRL_DETAIL0_CRC_ERROR_FOR_RESPONSE_NON_CMD12 (1 << 8) -#define SD_CTRL_DETAIL0_CRC_ERROR_FOR_RESPONSE_CMD12 (1 << 9) -#define SD_CTRL_DETAIL0_CRC_ERROR_FOR_READ_DATA (1 << 10) -#define SD_CTRL_DETAIL0_CRC_ERROR_FOR_WRITE_CMD (1 << 11) - -#define SD_CTRL_DETAIL1_NO_CMD_RESPONSE (1 << 0) -#define SD_CTRL_DETAIL1_TIMEOUT_READ_DATA (1 << 4) -#define SD_CTRL_DETAIL1_TIMEOUT_CRS_STATUS (1 << 5) -#define SD_CTRL_DETAIL1_TIMEOUT_CRC_BUSY (1 << 6) - -#define ASIC3_SDIO_CTRL_Base 0x1200 - -#define ASIC3_SDIO_CTRL_Cmd 0x00 -#define ASIC3_SDIO_CTRL_CardPortSel 0x04 -#define ASIC3_SDIO_CTRL_Arg0 0x08 -#define ASIC3_SDIO_CTRL_Arg1 0x0C -#define ASIC3_SDIO_CTRL_TransferBlockCount 0x14 -#define ASIC3_SDIO_CTRL_Response0 0x18 -#define ASIC3_SDIO_CTRL_Response1 0x1C -#define ASIC3_SDIO_CTRL_Response2 0x20 -#define ASIC3_SDIO_CTRL_Response3 0x24 -#define ASIC3_SDIO_CTRL_Response4 0x28 -#define ASIC3_SDIO_CTRL_Response5 0x2C -#define ASIC3_SDIO_CTRL_Response6 0x30 -#define ASIC3_SDIO_CTRL_Response7 0x34 -#define ASIC3_SDIO_CTRL_CardStatus 0x38 -#define 
ASIC3_SDIO_CTRL_BufferCtrl 0x3C -#define ASIC3_SDIO_CTRL_IntMaskCard 0x40 -#define ASIC3_SDIO_CTRL_IntMaskBuffer 0x44 -#define ASIC3_SDIO_CTRL_CardXferDataLen 0x4C -#define ASIC3_SDIO_CTRL_CardOptionSetup 0x50 -#define ASIC3_SDIO_CTRL_ErrorStatus0 0x54 -#define ASIC3_SDIO_CTRL_ErrorStatus1 0x58 -#define ASIC3_SDIO_CTRL_DataPort 0x60 -#define ASIC3_SDIO_CTRL_TransactionCtrl 0x68 -#define ASIC3_SDIO_CTRL_CardIntCtrl 0x6C -#define ASIC3_SDIO_CTRL_ClocknWaitCtrl 0x70 -#define ASIC3_SDIO_CTRL_HostInformation 0x74 -#define ASIC3_SDIO_CTRL_ErrorCtrl 0x78 -#define ASIC3_SDIO_CTRL_LEDCtrl 0x7C -#define ASIC3_SDIO_CTRL_SoftwareReset 0x1C0 +#define ASIC3_SD_CONFIG_BASE 0x0400 /* Assumes 32 bit addressing */ +#define ASIC3_SD_CTRL_BASE 0x1000 +#define ASIC3_SDIO_CTRL_BASE 0x1200 #define ASIC3_MAP_SIZE_32BIT 0x2000 #define ASIC3_MAP_SIZE_16BIT 0x1000 -- cgit v1.2.3-70-g09d2 From 4d3792e054f706f73837769a0e5607b3b7ad25a2 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Mon, 15 Jun 2009 15:43:31 +0200 Subject: mfd: fix tmio related warnings We can not have .driver_data as const since platform_set_drvdata() doesn't take a const. The hclk mmc_data field can be const though. Signed-off-by: Samuel Ortiz --- drivers/mfd/t7l66xb.c | 2 +- drivers/mfd/tc6387xb.c | 2 +- drivers/mfd/tc6393xb.c | 2 +- include/linux/mfd/tmio.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/t7l66xb.c b/drivers/mfd/t7l66xb.c index 875f7a87573..0a255c1f1ce 100644 --- a/drivers/mfd/t7l66xb.c +++ b/drivers/mfd/t7l66xb.c @@ -108,7 +108,7 @@ static int t7l66xb_mmc_disable(struct platform_device *mmc) /*--------------------------------------------------------------------------*/ -static const struct tmio_mmc_data t7166xb_mmc_data = { +static struct tmio_mmc_data t7166xb_mmc_data = { .hclk = 24000000, }; diff --git a/drivers/mfd/tc6387xb.c b/drivers/mfd/tc6387xb.c index c3993ac2054..3280ab33f88 100644 --- a/drivers/mfd/tc6387xb.c +++ b/drivers/mfd/tc6387xb.c @@ -75,7 +75,7 @@ static int tc6387xb_mmc_disable(struct platform_device *mmc) /*--------------------------------------------------------------------------*/ -const static struct tmio_mmc_data tc6387xb_mmc_data = { +static struct tmio_mmc_data tc6387xb_mmc_data = { .hclk = 24000000, }; diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c index 9d2abb5d6e2..1429a7341a9 100644 --- a/drivers/mfd/tc6393xb.c +++ b/drivers/mfd/tc6393xb.c @@ -136,7 +136,7 @@ static int tc6393xb_nand_enable(struct platform_device *nand) return 0; } -const static struct tmio_mmc_data tc6393xb_mmc_data = { +static struct tmio_mmc_data tc6393xb_mmc_data = { .hclk = 24000000, }; diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index c377118884e..6b9c5d06690 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -22,7 +22,7 @@ * data for the MMC controller */ struct tmio_mmc_data { - unsigned int hclk; + const unsigned int hclk; }; /* -- cgit v1.2.3-70-g09d2 From 44549dff82753b6a5ffabcefeead34be63e95d96 Mon Sep 17 00:00:00 2001 From: Mike Sager Date: Wed, 1 Apr 2009 09:21:47 -0400 Subject: nfs41: define NFS4_MAX_MINOR_VERSION based on CONFIG_NFS_V4_1 If 4.1 isn't supported, NFS4_MAX_MINOR_VERSION will be 0.
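A minimal illustrative sketch of how such a compile-time ceiling might be consumed; the validate_minorversion() helper below is hypothetical and not part of this patch:

/*
 * Hypothetical helper, sketch only: reject minor versions the kernel
 * was not built to support (assumes linux/nfs4.h and errno values).
 */
static int validate_minorversion(unsigned int minorversion)
{
        if (minorversion > NFS4_MAX_MINOR_VERSION)
                return -EINVAL; /* e.g. v4.1 requested without CONFIG_NFS_V4_1 */
        return 0;
}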
Signed-off-by: Mike Sager Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index e3f0cbcbd0d..7c36fcf2dfb 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -462,6 +462,13 @@ enum lock_type4 { #define NFSPROC4_NULL 0 #define NFSPROC4_COMPOUND 1 #define NFS4_MINOR_VERSION 0 + +#if defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MINOR_VERSION 1 +#else +#define NFS4_MAX_MINOR_VERSION 0 +#endif /* CONFIG_NFS_V4_1 */ + #define NFS4_DEBUG 1 /* Index of predefined Linux client operations */ -- cgit v1.2.3-70-g09d2 From 94a417f3d7a02478a2d7842e693a61339fb54ea4 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:21:49 -0400 Subject: nfs41: nfs_client.cl_minorversion This field is set to the nfsv4 minor version for this mount. Signed-off-by: Benny Halevy Note: This patch sets the referral to the same minorversion as the current mount. Revisit in future patch. Signed-off-by: Andy Adamson [removed cl_minorversion assignment in nfs_set_client] Signed-off-by: Benny Halevy [always define nfs_client.cl_minorversion] Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 9 ++++++--- include/linux/nfs_fs_sb.h | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 75c9cd2aa11..0efcb55c2ca 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1101,7 +1101,8 @@ static int nfs4_set_client(struct nfs_server *server, const size_t addrlen, const char *ip_addr, rpc_authflavor_t authflavour, - int proto, const struct rpc_timeout *timeparms) + int proto, const struct rpc_timeout *timeparms, + u32 minorversion) { struct nfs_client_initdata cl_init = { .hostname = hostname, @@ -1164,7 +1165,8 @@ static int nfs4_init_server(struct nfs_server *server, data->client_address, data->auth_flavors[0], data->nfs_server.protocol, - &timeparms); + &timeparms, + data->minorversion); if (error < 0) goto error; @@ -1282,7 +1284,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, parent_client->cl_ipaddr, data->authflavor, parent_server->client->cl_xprt->prot, - parent_server->client->cl_timeout); + parent_server->client->cl_timeout, + parent_client->cl_minorversion); if (error < 0) goto error; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 6ad75948cbf..e9a51fe46aa 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -32,6 +32,7 @@ struct nfs_client { const struct nfs_rpc_ops *rpc_ops; /* NFS protocol vector */ int cl_proto; /* Network transport protocol */ + u32 cl_minorversion;/* NFSv4 minorversion */ struct rpc_cred *cl_machine_cred; #ifdef CONFIG_NFS_V4 @@ -63,7 +64,7 @@ struct nfs_client { */ char cl_ipaddr[48]; unsigned char cl_id_uniquifier; -#endif +#endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ -- cgit v1.2.3-70-g09d2 From 9ff71c3a9827b99699510076dffa0bbe7c36bfd4 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:21:52 -0400 Subject: nfs41: client xdr definitions Define stubs for sequence args and res data structures and embed them in all other nfs4 and nfs41 xdr types. They are needed for sending any op in a nfs41 compound rpc. 
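To make the embedding pattern concrete, here is a sketch with placeholder struct names (foo_args/foo_res are not real NFS types): every per-operation argument and result structure gains a trailing sequence member, so a SEQUENCE operation can later be attached to any compound without touching individual call sites.

/*
 * Sketch of the embedding pattern used in the diff below;
 * foo_args/foo_res are placeholders.
 */
struct foo_args {
        /* ... operation-specific fields ... */
        struct nfs4_sequence_args seq_args;     /* nfs41: SEQUENCE arguments */
};

struct foo_res {
        /* ... operation-specific fields ... */
        struct nfs4_sequence_res seq_res;       /* nfs41: SEQUENCE result */
};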
Signed-off-by: Andy Adamson [moved new args/res definitions away, to where they're first used] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b89c34e40bc..f2c5700c7b6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -145,6 +145,15 @@ struct nfs4_change_info { }; struct nfs_seqid; + +struct nfs4_sequence_args { + /* stub */ +}; + +struct nfs4_sequence_res { + /* stub */ +}; + /* * Arguments to the open call. */ @@ -165,6 +174,7 @@ struct nfs_openargs { const struct nfs_server *server; /* Needed for ID mapping */ const u32 * bitmask; __u32 claim; + struct nfs4_sequence_args seq_args; }; struct nfs_openres { @@ -181,6 +191,7 @@ struct nfs_openres { __u32 do_recall; __u64 maxsize; __u32 attrset[NFS4_BITMAP_SIZE]; + struct nfs4_sequence_res seq_res; }; /* @@ -206,6 +217,7 @@ struct nfs_closeargs { struct nfs_seqid * seqid; fmode_t fmode; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs_closeres { @@ -213,6 +225,7 @@ struct nfs_closeres { struct nfs_fattr * fattr; struct nfs_seqid * seqid; const struct nfs_server *server; + struct nfs4_sequence_res seq_res; }; /* * * Arguments to the lock,lockt, and locku call. @@ -233,12 +246,14 @@ struct nfs_lock_args { unsigned char block : 1; unsigned char reclaim : 1; unsigned char new_lock_owner : 1; + struct nfs4_sequence_args seq_args; }; struct nfs_lock_res { nfs4_stateid stateid; struct nfs_seqid * lock_seqid; struct nfs_seqid * open_seqid; + struct nfs4_sequence_res seq_res; }; struct nfs_locku_args { @@ -246,32 +261,38 @@ struct nfs_locku_args { struct file_lock * fl; struct nfs_seqid * seqid; nfs4_stateid * stateid; + struct nfs4_sequence_args seq_args; }; struct nfs_locku_res { nfs4_stateid stateid; struct nfs_seqid * seqid; + struct nfs4_sequence_res seq_res; }; struct nfs_lockt_args { struct nfs_fh * fh; struct file_lock * fl; struct nfs_lowner lock_owner; + struct nfs4_sequence_args seq_args; }; struct nfs_lockt_res { struct file_lock * denied; /* LOCK, LOCKT failed */ + struct nfs4_sequence_res seq_res; }; struct nfs4_delegreturnargs { const struct nfs_fh *fhandle; const nfs4_stateid *stateid; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_delegreturnres { struct nfs_fattr * fattr; const struct nfs_server *server; + struct nfs4_sequence_res seq_res; }; /* @@ -284,12 +305,14 @@ struct nfs_readargs { __u32 count; unsigned int pgbase; struct page ** pages; + struct nfs4_sequence_args seq_args; }; struct nfs_readres { struct nfs_fattr * fattr; __u32 count; int eof; + struct nfs4_sequence_res seq_res; }; /* @@ -304,6 +327,7 @@ struct nfs_writeargs { unsigned int pgbase; struct page ** pages; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs_writeverf { @@ -316,6 +340,7 @@ struct nfs_writeres { struct nfs_writeverf * verf; __u32 count; const struct nfs_server *server; + struct nfs4_sequence_res seq_res; }; /* @@ -325,12 +350,14 @@ struct nfs_removeargs { const struct nfs_fh *fh; struct qstr name; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs_removeres { const struct nfs_server *server; struct nfs4_change_info cinfo; struct nfs_fattr dir_attr; + struct nfs4_sequence_res seq_res; }; /* @@ -383,6 +410,7 @@ struct nfs_setattrargs { struct iattr * iap; const struct nfs_server * server; /* Needed for name mapping 
*/ const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs_setaclargs { @@ -390,6 +418,7 @@ struct nfs_setaclargs { size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; + struct nfs4_sequence_args seq_args; }; struct nfs_getaclargs { @@ -397,11 +426,13 @@ struct nfs_getaclargs { size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; + struct nfs4_sequence_args seq_args; }; struct nfs_setattrres { struct nfs_fattr * fattr; const struct nfs_server * server; + struct nfs4_sequence_res seq_res; }; struct nfs_linkargs { @@ -583,6 +614,7 @@ struct nfs4_accessargs { const struct nfs_fh * fh; const u32 * bitmask; u32 access; + struct nfs4_sequence_args seq_args; }; struct nfs4_accessres { @@ -590,6 +622,7 @@ struct nfs4_accessres { struct nfs_fattr * fattr; u32 supported; u32 access; + struct nfs4_sequence_res seq_res; }; struct nfs4_create_arg { @@ -609,6 +642,7 @@ struct nfs4_create_arg { const struct iattr * attrs; const struct nfs_fh * dir_fh; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_create_res { @@ -617,21 +651,25 @@ struct nfs4_create_res { struct nfs_fattr * fattr; struct nfs4_change_info dir_cinfo; struct nfs_fattr * dir_fattr; + struct nfs4_sequence_res seq_res; }; struct nfs4_fsinfo_arg { const struct nfs_fh * fh; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_getattr_arg { const struct nfs_fh * fh; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_getattr_res { const struct nfs_server * server; struct nfs_fattr * fattr; + struct nfs4_sequence_res seq_res; }; struct nfs4_link_arg { @@ -639,6 +677,7 @@ struct nfs4_link_arg { const struct nfs_fh * dir_fh; const struct qstr * name; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_link_res { @@ -646,6 +685,7 @@ struct nfs4_link_res { struct nfs_fattr * fattr; struct nfs4_change_info cinfo; struct nfs_fattr * dir_attr; + struct nfs4_sequence_res seq_res; }; @@ -653,21 +693,25 @@ struct nfs4_lookup_arg { const struct nfs_fh * dir_fh; const struct qstr * name; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_lookup_res { const struct nfs_server * server; struct nfs_fattr * fattr; struct nfs_fh * fh; + struct nfs4_sequence_res seq_res; }; struct nfs4_lookup_root_arg { const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_pathconf_arg { const struct nfs_fh * fh; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_readdir_arg { @@ -678,11 +722,13 @@ struct nfs4_readdir_arg { struct page ** pages; /* zero-copy data */ unsigned int pgbase; /* zero-copy data */ const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_readdir_res { nfs4_verifier verifier; unsigned int pgbase; + struct nfs4_sequence_res seq_res; }; struct nfs4_readlink { @@ -690,6 +736,7 @@ struct nfs4_readlink { unsigned int pgbase; unsigned int pglen; /* zero-copy data */ struct page ** pages; /* zero-copy data */ + struct nfs4_sequence_args seq_args; }; struct nfs4_rename_arg { @@ -698,6 +745,7 @@ struct nfs4_rename_arg { const struct qstr * old_name; const struct qstr * new_name; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_rename_res { @@ -706,6 +754,7 @@ struct nfs4_rename_res { struct nfs_fattr * old_fattr; struct nfs4_change_info new_cinfo; struct nfs_fattr * new_fattr; + struct nfs4_sequence_res seq_res; }; #define NFS4_SETCLIENTID_NAMELEN (127) @@ -724,6 +773,7 @@ struct nfs4_setclientid { struct nfs4_statfs_arg { 
const struct nfs_fh * fh; const u32 * bitmask; + struct nfs4_sequence_args seq_args; }; struct nfs4_server_caps_res { @@ -731,6 +781,7 @@ u32 acl_bitmask; u32 has_links; u32 has_symlinks; + struct nfs4_sequence_res seq_res; }; struct nfs4_string { @@ -765,6 +816,7 @@ struct nfs4_fs_locations_arg { const struct qstr *name; struct page *page; const u32 *bitmask; + struct nfs4_sequence_args seq_args; }; #endif /* CONFIG_NFS_V4 */ -- cgit v1.2.3-70-g09d2 From 557134a39c8d2ab79d8b8d53438e03e29feb5ec4 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 1 Apr 2009 09:21:53 -0400 Subject: nfs41: sessions client infrastructure NFSv4.1 Sessions basic data types, initialization, and destruction. The session is always associated with a struct nfs_client that holds the exchange_id results. Signed-off-by: Rahul Iyer Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [remove extraneous rpc_clnt pointer, use the struct nfs_client cl_rpcclient. remove the rpc_clnt parameter from nfs4 nfs4_init_session] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [Use the presence of a session to determine behaviour instead of the minorversion number.] Signed-off-by: Andy Adamson [constified nfs4_has_session's struct nfs_client parameter] Signed-off-by: Benny Halevy [Rename nfs4_put_session() to nfs4_destroy_session() and call it from nfs4_free_client() not nfs4_free_server(). Also get rid of nfs4_get_session() and the ref_count in nfs4_session struct as keeping track of nfs_client should be sufficient] Signed-off-by: Alexandros Batsakis [nfs41: pass rsize and wsize into nfs4_init_session] Signed-off-by: Andy Adamson [separated out removal of rpc_clnt parameter from nfs4_init_session to a patch of its own] Signed-off-by: Benny Halevy [Pass the nfs_client pointer into nfs4_alloc_session] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: don't assign to session->clp->cl_session in nfs4_destroy_session] [nfs41: fixup nfs4_clear_client_minor_version] [introduce nfs4_clear_client_minor_version() in this patch] Signed-off-by: Benny Halevy [Refactor nfs4_init_session] Moved session allocation into nfs4_init_client_minor_version, called from nfs4_init_client. Leave rsize and wsize initialization in nfs4_init_session, called from nfs4_init_server. Reverted moving of nfs_fsid definition to nfs_fs_sb.h Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: Move NFS4_MAX_SLOT_TABLE define from under CONFIG_NFS_V4_1] [Fix compile error when CONFIG_NFS_V4_1 is not set.] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [moved nfs4_init_slot_table definition to "create_session operation"] Signed-off-by: Benny Halevy [nfs41: alloc session with GFP_KERNEL] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/internal.h | 12 ++++++++++ fs/nfs/nfs4_fs.h | 4 ++++ fs/nfs/nfs4proc.c | 34 +++++++++++++++++++++++++++ include/linux/nfs_fs_sb.h | 49 ++++++++++++++++++++++++++++++++++++++ include/linux/nfs_xdr.h | 15 ++++++++++++ 6 files changed, 174 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a736160046c..f1506f14852 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -183,6 +183,20 @@ static void nfs4_shutdown_client(struct nfs_client *clp) #endif } +/* + * Clears/puts all minor version specific parts from an nfs_client struct + * reverting it to minorversion 0.
+ */ +static void nfs4_clear_client_minor_version(struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (nfs4_has_session(clp)) { + nfs4_destroy_session(clp->cl_session); + clp->cl_session = NULL; + } +#endif /* CONFIG_NFS_V4_1 */ +} + /* * Destroy a shared client record */ @@ -190,6 +204,7 @@ static void nfs_free_client(struct nfs_client *clp) { dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); + nfs4_clear_client_minor_version(clp); nfs4_shutdown_client(clp); nfs_fscache_release_client_cookie(clp); @@ -1053,6 +1068,30 @@ error: } #ifdef CONFIG_NFS_V4 +/* + * Initialize the minor version specific parts of an NFS4 client record + */ +static int nfs4_init_client_minor_version(struct nfs_client *clp) +{ +#if defined(CONFIG_NFS_V4_1) + if (clp->cl_minorversion) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. + * When a SEQUENCE operation encounters the expired session + * it will do session recovery to initialize it. + */ + session = nfs4_alloc_session(clp); + if (!session) + return -ENOMEM; + + clp->cl_session = session; + } +#endif /* CONFIG_NFS_V4_1 */ + + return 0; +} + /* * Initialise an NFS4 client record */ @@ -1087,6 +1126,10 @@ static int nfs4_init_client(struct nfs_client *clp, } __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); + error = nfs4_init_client_minor_version(clp); + if (error < 0) + goto error; + nfs_mark_client_ready(clp, NFS_CS_READY); return 0; @@ -1143,6 +1186,21 @@ error: return error; } +/* + * Initialize a session. + * Note: save the mount rsize and wsize for create_server negotiation. + */ +static void nfs4_init_session(struct nfs_client *clp, + unsigned int wsize, unsigned int rsize) +{ +#if defined(CONFIG_NFS_V4_1) + if (nfs4_has_session(clp)) { + clp->cl_session->fc_attrs.max_rqst_sz = wsize; + clp->cl_session->fc_attrs.max_resp_sz = rsize; + } +#endif /* CONFIG_NFS_V4_1 */ +} + /* * Create a version 4 volume record */ @@ -1221,6 +1279,8 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, BUG_ON(!server->nfs_client->rpc_ops); BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + nfs4_init_session(server->nfs_client, server->wsize, server->rsize); + /* Probe the root fh to retrieve its FSID */ error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); if (error < 0) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ffa6bd54d43..7cef45db925 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -206,6 +206,18 @@ extern int nfs4_path_walk(struct nfs_server *server, const char *path); #endif +/* + * Determine if sessions are in use. 
+ */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (clp->cl_session) + return 1; +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + /* * Determine the device name as a string */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 84345deab26..acac6f8c3d3 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -202,6 +202,10 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops; +#if defined(CONFIG_NFS_V4_1) +extern void nfs4_destroy_session(struct nfs4_session *session); +extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +#endif /* CONFIG_NFS_V4_1 */ extern const u32 nfs4_fattr_bitmap[2]; extern const u32 nfs4_statfs_bitmap[2]; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4674f8092da..cdd8e74c47d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3723,6 +3723,40 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, return status; } +#ifdef CONFIG_NFS_V4_1 +/* Destroy the slot table */ +static void nfs4_destroy_slot_table(struct nfs4_session *session) +{ + if (session->fc_slot_table.slots == NULL) + return; + kfree(session->fc_slot_table.slots); + session->fc_slot_table.slots = NULL; + return; +} + +struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) +{ + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + + session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); + if (!session) + return NULL; + tbl = &session->fc_slot_table; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "Slot table"); + session->clp = clp; + return session; +} + +void nfs4_destroy_session(struct nfs4_session *session) +{ + nfs4_destroy_slot_table(session); + kfree(session); +} + +#endif /* CONFIG_NFS_V4_1 */ + struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index e9a51fe46aa..b47c0fc55d4 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -4,9 +4,12 @@ #include #include #include +#include +#include #include +struct nfs4_session; struct nfs_iostats; struct nlm_host; @@ -66,6 +69,10 @@ struct nfs_client { unsigned char cl_id_uniquifier; #endif /* CONFIG_NFS_V4 */ +#ifdef CONFIG_NFS_V4_1 + struct nfs4_session *cl_session; /* shared session */ +#endif /* CONFIG_NFS_V4_1 */ + #ifdef CONFIG_NFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif @@ -146,4 +153,46 @@ struct nfs_server { #define NFS_CAP_ACLS (1U << 3) #define NFS_CAP_ATOMIC_OPEN (1U << 4) + +/* maximum number of slots to use */ +#define NFS4_MAX_SLOT_TABLE RPC_MAX_SLOT_TABLE + +#if defined(CONFIG_NFS_V4_1) + +/* Sessions */ +#define SLOT_TABLE_SZ (NFS4_MAX_SLOT_TABLE/(8*sizeof(long))) +struct nfs4_slot_table { + struct nfs4_slot *slots; /* seqid per slot */ + unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ + spinlock_t slot_tbl_lock; + struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + int max_slots; /* # slots in table */ + int highest_used_slotid; /* sent to server on each SEQ.
+ * op for dynamic resizing */ }; + +static inline int slot_idx(struct nfs4_slot_table *tbl, struct nfs4_slot *sp) +{ + return sp - tbl->slots; +} + +/* + * Session related parameters + */ +struct nfs4_session { + struct nfs4_sessionid sess_id; + u32 flags; + unsigned long session_state; + u32 hash_alg; + u32 ssv_len; + + /* The fore and back channel */ + struct nfs4_channel_attrs fc_attrs; + struct nfs4_slot_table fc_slot_table; + struct nfs4_channel_attrs bc_attrs; + /* back channel has one slot */ + struct nfs_client *clp; +}; + +#endif /* CONFIG_NFS_V4_1 */ #endif diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index f2c5700c7b6..8f8c026c558 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -146,6 +146,21 @@ struct nfs4_change_info { struct nfs_seqid; +/* nfs41 sessions channel attributes */ +struct nfs4_channel_attrs { + u32 headerpadsz; + u32 max_rqst_sz; + u32 max_resp_sz; + u32 max_resp_sz_cached; + u32 max_ops; + u32 max_reqs; +}; + +/* nfs41 sessions slot seqid */ +struct nfs4_slot { + u32 seq_nr; +}; + struct nfs4_sequence_args { /* stub */ }; -- cgit v1.2.3-70-g09d2 From 43652ad55342d9146d8035932101a5814b22315a Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:21:54 -0400 Subject: nfs41: use nfs4_server_caps_arg In preparation for nfs41 sequence processing. Signed-off-by: Andy Adamson [define nfs4_server_caps_arg] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ++++- fs/nfs/nfs4xdr.c | 5 +++-- include/linux/nfs_xdr.h | 5 +++++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cdd8e74c47d..5ef1022b7e6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1584,10 +1584,13 @@ void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { + struct nfs4_server_caps_arg args = { + .fhandle = fhandle, + }; struct nfs4_server_caps_res res = {}; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS], - .rpc_argp = fhandle, + .rpc_argp = &args, .rpc_resp = &res, }; int status; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1690f0e44b9..91305861037 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1900,7 +1900,8 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs /* * GETATTR_BITMAP request */ -static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle) +static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, + struct nfs4_server_caps_arg *args) { struct xdr_stream xdr; struct compound_hdr hdr = { @@ -1909,7 +1910,7 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struc xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - encode_putfh(&xdr, fhandle, &hdr); + encode_putfh(&xdr, args->fhandle, &hdr); encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| FATTR4_WORD0_LINK_SUPPORT| FATTR4_WORD0_SYMLINK_SUPPORT| diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 8f8c026c558..a7b7f2a059c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -791,6 +791,11 @@ struct nfs4_statfs_arg { struct nfs4_sequence_args seq_args; }; +struct nfs4_server_caps_arg { + struct nfs_fh *fhandle; + struct nfs4_sequence_args seq_args; +}; + struct nfs4_server_caps_res { u32 attr_bitmask[2]; u32 acl_bitmask; -- cgit v1.2.3-70-g09d2
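Returning briefly to the slot table introduced by the sessions patch above: given the used_slots bitmap, slot allocation reduces to a find-first-zero-bit scan. The sketch below is illustrative only, not this series' code (the real nfs4_find_slot arrives in a later patch), and assumes linux/bitops.h:

/*
 * Illustrative sketch: find and claim a free slot in the table's
 * used_slots bitmap; the caller is assumed to hold slot_tbl_lock.
 */
static int sketch_find_slot(struct nfs4_slot_table *tbl)
{
        int slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);

        if (slotid >= tbl->max_slots)
                return -1;      /* table full: caller waits on slot_tbl_waitq */
        __set_bit(slotid, tbl->used_slots);
        if (slotid > tbl->highest_used_slotid)
                tbl->highest_used_slotid = slotid;
        return slotid;
}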
From f50c7000817e7cb4e676ac5d911a82c0f3fd226f Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:21:55 -0400 Subject: nfs41: use nfs4_readlink_res In preparation for nfs41 sequence processing. Signed-off-by: Andy Adamson [define nfs4_readlink_res] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- fs/nfs/nfs4xdr.c | 3 ++- include/linux/nfs_xdr.h | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5ef1022b7e6..b0ec8ff96eb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1960,10 +1960,11 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page, .pglen = pglen, .pages = &page, }; + struct nfs4_readlink_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK], .rpc_argp = &args, - .rpc_resp = NULL, + .rpc_resp = &res, }; return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 91305861037..1e41420916a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4235,7 +4235,8 @@ out: /* * Decode READLINK response */ -static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res) +static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, + struct nfs4_readlink_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index a7b7f2a059c..f71260aeb80 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -754,6 +754,10 @@ struct nfs4_readlink { struct nfs4_sequence_args seq_args; }; +struct nfs4_readlink_res { + struct nfs4_sequence_res seq_res; +}; + struct nfs4_rename_arg { const struct nfs_fh * old_dir; const struct nfs_fh * new_dir; -- cgit v1.2.3-70-g09d2 From 24ad148a0ff74b1e703a8bc5b3e0793dc7d4e3a9 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:21:56 -0400 Subject: nfs41: use nfs4_statfs_res In preparation for nfs41 sequence processing.
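This and the neighbouring res-wrapping conversions all take the same shape; a placeholder sketch (the wrapper name below is hypothetical, not kernel API): a result pointer that used to be passed bare as rpc_resp is wrapped in a struct that also carries the sequence result, so generic nfs41 code can locate seq_res later.

/*
 * Placeholder sketch of the wrapping pattern, using statfs as the
 * example operation.
 */
struct sketch_statfs_res {
        struct nfs_fsstat *fsstat;              /* the old bare rpc_resp */
        struct nfs4_sequence_res seq_res;       /* nfs41: SEQUENCE result */
};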
Signed-off-by: Andy Adamson [define nfs4_statfs_res] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ++++- fs/nfs/nfs4xdr.c | 5 +++-- include/linux/nfs_xdr.h | 5 +++++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b0ec8ff96eb..b715f605761 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2426,10 +2426,13 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, .fh = fhandle, .bitmask = server->attr_bitmask, }; + struct nfs4_statfs_res res = { + .fsstat = fsstat, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS], .rpc_argp = &args, - .rpc_resp = fsstat, + .rpc_resp = &res, }; nfs_fattr_init(fsstat->fattr); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1e41420916a..b7871ad82aa 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4387,7 +4387,8 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pat /* * STATFS request */ -static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat) +static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, + struct nfs4_statfs_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -4398,7 +4399,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fssta if (!status) status = decode_putfh(&xdr); if (!status) - status = decode_statfs(&xdr, fsstat); + status = decode_statfs(&xdr, res->fsstat); return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index f71260aeb80..4dac59ef6f4 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -795,6 +795,11 @@ struct nfs4_statfs_arg { struct nfs4_sequence_args seq_args; }; +struct nfs4_statfs_res { + struct nfs_fsstat *fsstat; + struct nfs4_sequence_res seq_res; +}; + struct nfs4_server_caps_arg { struct nfs_fh *fhandle; struct nfs4_sequence_args seq_args; -- cgit v1.2.3-70-g09d2 From 3dda5e434721f942870ee30bc6103761618d410f Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:21:57 -0400 Subject: nfs41: use nfs4_fsinfo_res In preparation for nfs41 sequence processing.
Signed-off-by: Andy Adamson [define nfs4_fsinfo_res] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ++++- fs/nfs/nfs4xdr.c | 5 +++-- include/linux/nfs_xdr.h | 5 +++++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b715f605761..b8915ef533d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2458,10 +2458,13 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, .fh = fhandle, .bitmask = server->attr_bitmask, }; + struct nfs4_fsinfo_res res = { + .fsinfo = fsinfo, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO], .rpc_argp = &args, - .rpc_resp = fsinfo, + .rpc_resp = &res, }; return rpc_call_sync(server->client, &msg, 0); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b7871ad82aa..d9ab8209c28 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4351,7 +4351,8 @@ out: /* * FSINFO request */ -static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) +static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, + struct nfs4_fsinfo_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -4362,7 +4363,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf if (!status) status = decode_putfh(&xdr); if (!status) - status = decode_fsinfo(&xdr, fsinfo); + status = decode_fsinfo(&xdr, res->fsinfo); return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4dac59ef6f4..7d64913cbb1 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -675,6 +675,11 @@ struct nfs4_fsinfo_arg { struct nfs4_sequence_args seq_args; }; +struct nfs4_fsinfo_res { + struct nfs_fsinfo *fsinfo; + struct nfs4_sequence_res seq_res; +}; + struct nfs4_getattr_arg { const struct nfs_fh * fh; const u32 * bitmask; -- cgit v1.2.3-70-g09d2 From d45b2989a7956ae9e71d584ceac942278c0371c7 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:21:58 -0400 Subject: nfs41: use nfs4_pathconf_res In preparation for nfs41 sequence processing.
Signed-off-by: Andy Adamson [define nfs4_pathconf_res] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ++++- fs/nfs/nfs4xdr.c | 5 +++-- include/linux/nfs_xdr.h | 5 +++++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b8915ef533d..aea2e83d393 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2496,10 +2496,13 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle .fh = fhandle, .bitmask = server->attr_bitmask, }; + struct nfs4_pathconf_res res = { + .pathconf = pathconf, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF], .rpc_argp = &args, - .rpc_resp = pathconf, + .rpc_resp = &res, }; /* None of the pathconf attributes are mandatory to implement */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d9ab8209c28..a77ee3dd0b3 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4370,7 +4370,8 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, /* * PATHCONF request */ -static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf) +static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, + struct nfs4_pathconf_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -4381,7 +4382,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pat if (!status) status = decode_putfh(&xdr); if (!status) - status = decode_pathconf(&xdr, pathconf); + status = decode_pathconf(&xdr, res->pathconf); return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7d64913cbb1..56523319e14 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -734,6 +734,11 @@ struct nfs4_pathconf_arg { struct nfs4_sequence_args seq_args; }; +struct nfs4_pathconf_res { + struct nfs_pathconf *pathconf; + struct nfs4_sequence_res seq_res; +}; + struct nfs4_readdir_arg { const struct nfs_fh * fh; u64 cookie; -- cgit v1.2.3-70-g09d2 From 663c79b3cd8f5fe21fe7d7565fec0072e3234ddc Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:21:59 -0400 Subject: nfs41: use nfs4_getaclres In preparation for nfs41 sequence processing.
Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: embed resp_len in nfs_getaclres] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 20 +++++++++++--------- fs/nfs/nfs4xdr.c | 5 +++-- include/linux/nfs_xdr.h | 5 +++++ 3 files changed, 19 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index aea2e83d393..20c9acf689f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2755,12 +2755,14 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .acl_pages = pages, .acl_len = buflen, }; - size_t resp_len = buflen; + struct nfs_getaclres res = { + .acl_len = buflen, + }; void *resp_buf; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], .rpc_argp = &args, - .rpc_resp = &resp_len, + .rpc_resp = &res, }; struct page *localpage = NULL; int ret; @@ -2774,7 +2776,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu return -ENOMEM; args.acl_pages[0] = localpage; args.acl_pgbase = 0; - resp_len = args.acl_len = PAGE_SIZE; + args.acl_len = PAGE_SIZE; } else { resp_buf = buf; buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); @@ -2782,18 +2784,18 @@ ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (ret) goto out_free; - if (resp_len > args.acl_len) - nfs4_write_cached_acl(inode, NULL, resp_len); + if (res.acl_len > args.acl_len) + nfs4_write_cached_acl(inode, NULL, res.acl_len); else - nfs4_write_cached_acl(inode, resp_buf, resp_len); + nfs4_write_cached_acl(inode, resp_buf, res.acl_len); if (buf) { ret = -ERANGE; - if (resp_len > buflen) + if (res.acl_len > buflen) goto out_free; if (localpage) - memcpy(buf, resp_buf, resp_len); + memcpy(buf, resp_buf, res.acl_len); } - ret = resp_len; + ret = res.acl_len; out_free: if (localpage) __free_page(localpage); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a77ee3dd0b3..3e777893e2b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4018,7 +4018,8 @@ out: * Decode GETACL response */ static int -nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len) +nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, + struct nfs_getaclres *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -4031,7 +4032,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len) status = decode_putfh(&xdr); if (status) goto out; - status = decode_getacl(&xdr, rqstp, acl_len); + status = decode_getacl(&xdr, rqstp, &res->acl_len); out: return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 56523319e14..6e9ee284860 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -444,6 +444,11 @@ struct nfs_getaclargs { struct nfs4_sequence_args seq_args; }; +struct nfs_getaclres { + size_t acl_len; + struct nfs4_sequence_res seq_res; +}; + struct nfs_setattrres { struct nfs_fattr * fattr; const struct nfs_server * server; -- cgit v1.2.3-70-g09d2 From 73c403a9a93743b068103c13c05ed136dc687d05 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:22:01 -0400 Subject: nfs41: use nfs4_setaclres In preparation for nfs41 sequence processing.
Signed-off-by: Andy Adamson [define nfs_setaclres] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- fs/nfs/nfs4xdr.c | 3 ++- include/linux/nfs_xdr.h | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 20c9acf689f..62bbe25d942 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2842,10 +2842,11 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl .acl_pages = pages, .acl_len = buflen, }; + struct nfs_setaclres res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], .rpc_argp = &arg, - .rpc_resp = NULL, + .rpc_resp = &res, }; int ret; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 27dd25d9ad4..aa350d5bf20 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3996,7 +3996,8 @@ nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args * Decode SETACL response */ static int -nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res) +nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, + struct nfs_setaclres *res) { struct xdr_stream xdr; struct compound_hdr hdr; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6e9ee284860..0f2dc8f4cc3 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -436,6 +436,10 @@ struct nfs_setaclargs { struct nfs4_sequence_args seq_args; }; +struct nfs_setaclres { + struct nfs4_sequence_res seq_res; +}; + struct nfs_getaclargs { struct nfs_fh * fh; size_t acl_len; -- cgit v1.2.3-70-g09d2 From 22958463d5dca8548e19430779f379e66fd6e4a4 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:22:02 -0400 Subject: nfs41: use nfs4_fs_locations_res In preparation for nfs41 sequence processing.
Signed-off-by: Andy Adamson [define nfs4_fs_locations_res] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ++++- fs/nfs/nfs4xdr.c | 6 ++++-- include/linux/nfs_xdr.h | 5 +++++ 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 62bbe25d942..e08edc99faa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3722,10 +3722,13 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, .page = page, .bitmask = bitmask, }; + struct nfs4_fs_locations_res res = { + .fs_locations = fs_locations, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], .rpc_argp = &args, - .rpc_resp = fs_locations, + .rpc_resp = &res, }; int status; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index aa350d5bf20..e448e33b4d0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4504,7 +4504,8 @@ out: /* * FS_LOCATIONS request */ -static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res) +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, + struct nfs4_fs_locations_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -4519,7 +4520,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs if ((status = decode_lookup(&xdr)) != 0) goto out; xdr_enter_page(&xdr, PAGE_SIZE); - status = decode_getfattr(&xdr, &res->fattr, res->server); + status = decode_getfattr(&xdr, &res->fs_locations->fattr, + res->fs_locations->server); out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 0f2dc8f4cc3..d837f10c49e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -867,6 +867,11 @@ struct nfs4_fs_locations_arg { struct nfs4_sequence_args seq_args; }; +struct nfs4_fs_locations_res { + struct nfs4_fs_locations *fs_locations; + struct nfs4_sequence_res seq_res; +}; + #endif /* CONFIG_NFS_V4 */ struct nfs_page; -- cgit v1.2.3-70-g09d2 From cccef3b96a4759ae0790452280c00ea505412157 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 1 Apr 2009 09:22:03 -0400 Subject: nfs41: introduce nfs4_call_sync Use nfs4_call_sync rather than rpc_call_sync to provide for a nfs41 sessions-enabled interface for sessions manipulation. The nfs41 rpc logic uses the rpc_call_prepare method to recover and create the session, as well as selecting a free slot id and the rpc_call_done to free the slot and update slot table related metadata. In the coming patches we'll add rpc prepare and done routines for setting up the sequence op and processing the sequence result. Signed-off-by: Benny Halevy [nfs41: nfs4_call_sync] As per 11-14-08 review. Squash into "nfs41: introduce nfs4_call_sync" and "nfs41: nfs4_setup_sequence" Define two functions, one for v4 and one for v4.1, and add a pointer in struct nfs4_client to the correct one.
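The dispatch idea can be sketched as follows, with simplified placeholder types (the actual declarations appear in the diff below): one per-client function pointer selects the v4.0 or v4.1 synchronous call path once, at client init time, keeping every call site minorversion-agnostic.

/*
 * Simplified sketch, not the kernel's declarations.
 */
typedef int (*call_sync_sketch_t)(void *server, void *msg,
                                  void *seq_args, void *seq_res,
                                  int cache_reply);

struct client_sketch {
        unsigned int minorversion;
        call_sync_sketch_t call_sync;   /* plain path or the session path */
};

static void init_call_sync(struct client_sketch *clp,
                           call_sync_sketch_t v0, call_sync_sketch_t v41)
{
        /* chosen once here, so callers never test the minorversion */
        clp->call_sync = clp->minorversion ? v41 : v0;
}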
Signed-off-by: Andy Adamson [added BUG() in _nfs4_call_sync_session if !CONFIG_NFS_V4_1] Signed-off-by: Benny Halevy [nfs41: check for session not minorversion] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [group minorversion specific stuff together] Signed-off-by: Alexandros Batsakis Signed-off-by: Benny Halevy Signed-off-by: Andy Adamson [nfs41: fixup nfs4_clear_client_minor_version] [introduce nfs4_init_client_minor_version() in this patch] Signed-off-by: Benny Halevy [cleaned-up patch: got rid of nfs_call_sync_t, dprintks, cosmetics, extra server defs] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 5 ++++ fs/nfs/internal.h | 12 +++++++++ fs/nfs/nfs4proc.c | 67 +++++++++++++++++++++++++++++++++-------------- include/linux/nfs_fs_sb.h | 8 ++++++ 4 files changed, 73 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f1506f14852..a9828baaa44 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -194,6 +194,8 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) nfs4_destroy_session(clp->cl_session); clp->cl_session = NULL; } + + clp->cl_call_sync = _nfs4_call_sync; #endif /* CONFIG_NFS_V4_1 */ } @@ -1073,6 +1075,8 @@ error: */ static int nfs4_init_client_minor_version(struct nfs_client *clp) { + clp->cl_call_sync = _nfs4_call_sync; + #if defined(CONFIG_NFS_V4_1) if (clp->cl_minorversion) { struct nfs4_session *session = NULL; @@ -1086,6 +1090,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) return -ENOMEM; clp->cl_session = session; + clp->cl_call_sync = _nfs4_call_sync_session; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7cef45db925..8d67c2865dc 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -206,6 +206,18 @@ extern int nfs4_path_walk(struct nfs_server *server, const char *path); #endif +/* nfs4proc.c */ +extern int _nfs4_call_sync(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); +extern int _nfs4_call_sync_session(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); + /* * Determine if sessions are in use. 
*/ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e08edc99faa..4fc5b385f61 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -271,6 +271,33 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp spin_unlock(&clp->cl_lock); } +#if defined(CONFIG_NFS_V4_1) + +int _nfs4_call_sync_session(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + /* in preparation for setting up the sequence op */ + return rpc_call_sync(server->client, msg, 0); +} + +#endif /* CONFIG_NFS_V4_1 */ + +int _nfs4_call_sync(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + return rpc_call_sync(server->client, msg, 0); +} + +#define nfs4_call_sync(server, msg, args, res, cache_reply) \ + (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ + &(res)->seq_res, (cache_reply)) + static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { struct nfs_inode *nfsi = NFS_I(dir); @@ -1269,7 +1296,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server, &msg, &arg, &res, 1); if (status == 0 && state != NULL) renew_lease(server, timestamp); return status; @@ -1595,7 +1622,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f }; int status; - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server, &msg, &args, &res, 0); if (status == 0) { memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) @@ -1609,6 +1636,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->acl_bitmask = res.acl_bitmask; } + return status; } @@ -1641,7 +1669,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; nfs_fattr_init(info->fattr); - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server, &msg, &args, &res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -1731,7 +1759,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(fattr); - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server, &msg, &args, &res, 0); } static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) @@ -1815,7 +1843,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d nfs_fattr_init(fattr); dprintk("NFS call lookupfh %s\n", name->name); - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server, &msg, &args, &res, 0); dprintk("NFS reply lookupfh: %d\n", status); return status; } @@ -1901,7 +1929,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry args.access |= NFS4_ACCESS_EXECUTE; } nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + status = nfs4_call_sync(server, &msg, &args, &res, 0); if (!status) { entry->mask = 0; if (res.access & NFS4_ACCESS_READ) @@ -1967,7 +1995,7 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page, 
.rpc_resp = &res, }; - return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); } static int nfs4_proc_readlink(struct inode *inode, struct page *page, @@ -2061,7 +2089,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) int status; nfs_fattr_init(&res.dir_attr); - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server, &msg, &args, &res, 1); if (status == 0) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, &res.dir_attr); @@ -2129,7 +2157,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, nfs_fattr_init(res.old_fattr); nfs_fattr_init(res.new_fattr); - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server, &msg, &arg, &res, 1); if (!status) { update_changeattr(old_dir, &res.old_cinfo); @@ -2178,7 +2206,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * nfs_fattr_init(res.fattr); nfs_fattr_init(res.dir_attr); - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server, &msg, &arg, &res, 1); if (!status) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, res.dir_attr); @@ -2239,7 +2267,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) { - int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg, + &data->arg, &data->res, 1); if (status == 0) { update_changeattr(dir, &data->res.dir_cinfo); nfs_post_op_update_inode(dir, data->res.dir_fattr); @@ -2348,7 +2377,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, (unsigned long long)cookie); nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); res.pgbase = args.pgbase; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); if (status == 0) memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); @@ -2436,7 +2465,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(fsstat->fattr); - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server, &msg, &args, &res, 0); } static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) @@ -2467,7 +2496,7 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server, &msg, &args, &res, 0); } static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) @@ -2512,7 +2541,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle } nfs_fattr_init(pathconf->fattr); - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server, &msg, &args, &res, 0); } static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, @@ -2781,7 +2810,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu resp_buf = buf; buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); } - ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); if (ret) goto out_free; if (res.acl_len > args.acl_len) @@ -2854,7 +2883,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, 
const void *buf, size_t bufl return -EOPNOTSUPP; nfs_inode_return_delegation(inode); buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); - ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + ret = nfs4_call_sync(server, &msg, &arg, &res, 1); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); return ret; @@ -3143,7 +3172,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock goto out; lsp = request->fl_u.nfs4_fl.owner; arg.lock_owner.id = lsp->ls_id.id; - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server, &msg, &arg, &res, 1); switch (status) { case 0: request->fl_type = F_UNLCK; @@ -3736,7 +3765,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server, &msg, &args, &res, 0); nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b47c0fc55d4..206485e5082 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -12,6 +12,9 @@ struct nfs4_session; struct nfs_iostats; struct nlm_host; +struct nfs4_sequence_args; +struct nfs4_sequence_res; +struct nfs_server; /* * The nfs_client identifies our client state to the server. @@ -67,6 +70,11 @@ struct nfs_client { */ char cl_ipaddr[48]; unsigned char cl_id_uniquifier; + int (* cl_call_sync)(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_1 -- cgit v1.2.3-70-g09d2 From f3752975caa716709c5ea0b0820b86111d921df4 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:22:04 -0400 Subject: nfs41: pass *session in seq_args and seq_res To be used for getting the rpc's minorversion and for nfs41 xdr {en,de}coding of the sequence operation. Reset the seq session ptrs for minorversion=0 rpc calls. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 + include/linux/nfs_xdr.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4fc5b385f61..49cf969417c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -291,6 +291,7 @@ int _nfs4_call_sync(struct nfs_server *server, struct nfs4_sequence_res *res, int cache_reply) { + args->sa_session = res->sr_session = NULL; return rpc_call_sync(server->client, msg, 0); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d837f10c49e..f5675063f95 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -162,11 +162,11 @@ struct nfs4_slot { }; struct nfs4_sequence_args { - /* stub */ + struct nfs4_session *sa_session; }; struct nfs4_sequence_res { - /* stub */ + struct nfs4_session *sr_session; }; -- cgit v1.2.3-70-g09d2 From 5f7dbd5c752d88310d8fe1feedefd5c6496eff48 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 1 Apr 2009 09:22:05 -0400 Subject: nfs41: set up seq_res.sr_slotid Initialize nfs4_sequence_res sr_slotid to NFS4_MAX_SLOT_TABLE.
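A small sketch of the sentinel convention, using placeholder names rather than the patch's code: NFS4_MAX_SLOT_TABLE is one past the largest valid slot id, so it can double as a "no slot allocated yet" marker that teardown paths can test.

/*
 * Sketch only: a slotid equal to the table size means the request
 * holds no slot, so there is nothing to free.
 */
enum { SKETCH_MAX_SLOTS = 128 };

struct seq_res_sketch {
        unsigned char sr_slotid;
};

static void seq_res_sketch_init(struct seq_res_sketch *res)
{
        res->sr_slotid = SKETCH_MAX_SLOTS;      /* sentinel: no slot held */
}

static int seq_res_sketch_has_slot(const struct seq_res_sketch *res)
{
        return res->sr_slotid != SKETCH_MAX_SLOTS;
}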
[was nfs41: sequence res use slotid] Signed-off-by: Andy Adamson [pulled definition of struct nfs4_sequence_res.sr_slotid to here] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 +++++ fs/nfs/read.c | 1 + fs/nfs/unlink.c | 1 + fs/nfs/write.c | 2 ++ include/linux/nfs_xdr.h | 1 + 5 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 49cf969417c..5878930b4c3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -371,6 +371,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; + p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; if (flags & O_EXCL) { u32 *s = (u32 *) p->o_arg.u.verifier.data; s[0] = jiffies; @@ -1463,6 +1464,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; + calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; calldata->path.mnt = mntget(path->mnt); calldata->path.dentry = dget(path->dentry); @@ -3088,6 +3090,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co memcpy(&data->stateid, stateid, sizeof(data->stateid)); data->res.fattr = &data->fattr; data->res.server = server; + data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; nfs_fattr_init(data->res.fattr); data->timestamp = jiffies; data->rpc_status = 0; @@ -3240,6 +3243,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.fl = &p->fl; p->arg.seqid = seqid; p->res.seqid = seqid; + p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; p->arg.stateid = &lsp->ls_stateid; p->lsp = lsp; atomic_inc(&lsp->ls_count); @@ -3412,6 +3416,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_id.id; p->res.lock_seqid = p->arg.lock_seqid; + p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; p->lsp = lsp; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 4ace3c50a8e..70ba2b4cb9a 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -46,6 +46,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; + p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index ecc29534777..83d0ce2600a 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -241,6 +241,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) status = PTR_ERR(data->cred); goto out_free; } + data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; status = -EBUSY; spin_lock(&dentry->d_lock); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e560a78995a..035e6fb9f57 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -52,6 +52,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void) if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); + p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; } return p; } @@ -71,6 +72,7 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; + p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = 
p->page_array; else { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index f5675063f95..db0d1236aae 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -167,6 +167,7 @@ struct nfs4_sequence_args { struct nfs4_sequence_res { struct nfs4_session *sr_session; + u8 sr_slotid; /* slot used to send request */ }; /* -- cgit v1.2.3-70-g09d2 From fbcd4abcb3841f85578985c09c6df85aa41b0ae8 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 1 Apr 2009 09:22:15 -0400 Subject: nfs41: setup_sequence method Allocate a slot in the session slot table and set the sequence op arguments. Called at the rpc prepare stage. Add a status to nfs41_sequence_res, initialize it to one so that we catch rpc level failures which do not go through decode_sequence which sets the new status field. Note that upon an rpc level failure, we don't know if the server processed the sequence operation or not. Proceed as if the server did process the sequence operation. Signed-off-by: Rahul Iyer [nfs41: sequence args use slotid] [nfs41: find slot return slotid] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: remove SEQ4_STATUS_USE_TK_STATUS] As per 11-14-08 review [move extern declaration from nfs41: sequence setup/done support] [removed sa_session definition, changed sa_cache_this into a u8 to reduce footprint] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: rpc_sleep_on slot_tbl_waitq must be called under slot_tbl_lock] Otherwise there's a race (we've hit) with nfs4_free_slot where nfs41_setup_sequence sees a full slot table, unlocks slot_tbl_lock, nfs4_free_slots happen concurrently and call rpc_wake_up_next where there's nobody to wake up yet, context goes back to nfs41_setup_sequence which goes to sleep when the slot table is actually empty now and there's no-one to wake it up anymore. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 10 ++++++++++ fs/nfs/nfs4proc.c | 34 ++++++++++++++++++++++++++++++++-- include/linux/nfs_xdr.h | 4 ++++ 3 files changed, 46 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index acac6f8c3d3..eccf4e93e7d 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -203,8 +203,18 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops; #if defined(CONFIG_NFS_V4_1) +extern int nfs4_setup_sequence(struct nfs_client *clp, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +#else /* CONFIG_NFS_v4_1 */ +static inline int nfs4_setup_sequence(struct nfs_client *clp, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ extern const u32 nfs4_fattr_bitmap[2]; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a0946a0d116..c9618080317 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -280,6 +280,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * If found, we mark the slot as used, update the highest_used_slotid, * and respectively set up the sequence operation args. * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise. 
+ * + * Note: must be called under the slot_tbl_lock. */ static u8 nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task) @@ -288,7 +290,6 @@ nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task) u8 ret_id = NFS4_MAX_SLOT_TABLE; BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE); - spin_lock(&tbl->slot_tbl_lock); dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n", __func__, tbl->used_slots[0], tbl->highest_used_slotid, tbl->max_slots); @@ -302,7 +303,6 @@ nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task) out: dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id); - spin_unlock(&tbl->slot_tbl_lock); return ret_id; } @@ -312,12 +312,42 @@ static int nfs41_setup_sequence(struct nfs4_session *session, int cache_reply, struct rpc_task *task) { + struct nfs4_slot *slot; + struct nfs4_slot_table *tbl; + u8 slotid; + + dprintk("--> %s\n", __func__); /* slot already allocated? */ if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) return 0; memset(res, 0, sizeof(*res)); res->sr_slotid = NFS4_MAX_SLOT_TABLE; + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + slotid = nfs4_find_slot(tbl, task); + if (slotid == NFS4_MAX_SLOT_TABLE) { + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + dprintk("<-- %s: no free slots\n", __func__); + return -EAGAIN; + } + spin_unlock(&tbl->slot_tbl_lock); + + slot = tbl->slots + slotid; + args->sa_slotid = slotid; + args->sa_cache_this = cache_reply; + + dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); + + res->sr_slotid = slotid; + res->sr_renewal_time = jiffies; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. + */ + res->sr_status = 1; return 0; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index db0d1236aae..4ac14b40efc 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -163,11 +163,15 @@ struct nfs4_slot { struct nfs4_sequence_args { struct nfs4_session *sa_session; + u8 sa_slotid; + u8 sa_cache_this; }; struct nfs4_sequence_res { struct nfs4_session *sr_session; u8 sr_slotid; /* slot used to send request */ + unsigned long sr_renewal_time; + int sr_status; /* sequence operation status */ }; /* -- cgit v1.2.3-70-g09d2 From 99fe60d062cfecf382c036065b3278b82b6c5eff Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 1 Apr 2009 09:22:29 -0400 Subject: nfs41: exchange_id operation Implement the exchange_id operation conforming to http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-26 Unlike NFSv4.0, NFSv4.1 requires machine credentials. RPC_AUTH_GSS machine credentials will be passed into the kernel at mount time to be available for the exchange_id operation. RPC_AUTH_UNIX root mounts can use the UNIX root credential. Store the root credential in the nfs_client struct. Without a credential, NFSv4.1 state renewal fails.
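Stepping back to nfs41_setup_sequence above: the "must be called under the slot_tbl_lock" rule exists because the task has to be queued on slot_tbl_waitq before slot_tbl_lock is dropped, or a concurrent nfs4_free_slot() can run rpc_wake_up_next() between the failed slot search and the sleep, losing the wakeup. A condensed sketch of the pattern; nfs41_reserve_slot is a hypothetical name, the logic is lifted from the hunk above:

static int nfs41_reserve_slot(struct nfs4_slot_table *tbl,
			      struct rpc_task *task, u8 *slotid)
{
	spin_lock(&tbl->slot_tbl_lock);
	*slotid = nfs4_find_slot(tbl, task);	/* runs under the lock */
	if (*slotid == NFS4_MAX_SLOT_TABLE) {
		/* Queue the task while still holding the lock so that a
		 * concurrent nfs4_free_slot() cannot issue its wakeup
		 * while the waitqueue is still empty. */
		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
		spin_unlock(&tbl->slot_tbl_lock);
		return -EAGAIN;
	}
	spin_unlock(&tbl->slot_tbl_lock);
	return 0;
}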
[nfs41: establish clientid via exchange id only if cred != NULL] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfsd41: move nfstime4 from under CONFIG_NFS_V4_1] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: do not wait a lease time in exchange id] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: pass *session in seq_args and seq_res] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust [nfs41: Ignoring impid in decode_exchange_id is missing a READ_BUF] Signed-off-by: Benny Halevy [nfs41: fix Xcode_exchange_id's xdr Xcoding pointer type] [nfs41: get rid of unused struct nfs41_exchange_id_res members] Signed-off-by: Benny Halevy --- fs/nfs/nfs4proc.c | 61 ++++++++++++++++++++ fs/nfs/nfs4xdr.c | 139 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs4.h | 1 + include/linux/nfs_fs_sb.h | 6 ++ include/linux/nfs_xdr.h | 40 +++++++++++++ include/linux/nfsd/state.h | 1 - 6 files changed, 247 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dc0feb5837b..6f384e29075 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "nfs4_fs.h" #include "delegation.h" @@ -433,11 +434,13 @@ static int nfs41_setup_sequence(struct nfs4_session *session, spin_unlock(&tbl->slot_tbl_lock); slot = tbl->slots + slotid; + args->sa_session = session; args->sa_slotid = slotid; args->sa_cache_this = cache_reply; dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); + res->sr_session = session; res->sr_slotid = slotid; res->sr_renewal_time = jiffies; /* @@ -4128,6 +4131,64 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, } #ifdef CONFIG_NFS_V4_1 +/* + * nfs4_proc_exchange_id() + * + * Since the clientid has expired, all compounds using sessions + * associated with the stale clientid will be returning + * NFS4ERR_BADSESSION in the sequence operation, and will therefore + * be in some phase of session reset. 
+ */ +static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +{ + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .client = clp, + .flags = clp->cl_exchange_flags, + }; + struct nfs41_exchange_id_res res = { + .client = clp, + }; + int status; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + __be32 *p; + + dprintk("--> %s\n", __func__); + BUG_ON(clp == NULL); + p = (u32 *)verifier.data; + *p++ = htonl((u32)clp->cl_boot_time.tv_sec); + *p = htonl((u32)clp->cl_boot_time.tv_nsec); + args.verifier = &verifier; + + while (1) { + args.id_len = scnprintf(args.id, sizeof(args.id), + "%s/%s %u", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + clp->cl_id_uniquifier); + + status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + + if (status != NFS4ERR_CLID_INUSE) + break; + + if (signalled()) + break; + + if (++clp->cl_id_uniquifier == 0) + break; + } + + dprintk("<-- %s status= %d\n", __func__, status); + return status; +} + /* Destroy the slot table */ static void nfs4_destroy_slot_table(struct nfs4_session *session) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5b944cd5721..783c4214dcc 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -246,6 +246,27 @@ static int nfs4_stat_to_errno(int); (0) #if defined(CONFIG_NFS_V4_1) +#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ + encode_verifier_maxsz + \ + 1 /* co_ownerid.len */ + \ + XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ + 1 /* flags */ + \ + 1 /* spa_how */ + \ + 0 /* SP4_NONE (for now) */ + \ + 1 /* zero implementation id array */) #define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ + 2 /* eir_clientid */ + \ + 1 /* eir_sequenceid */ + \ + 1 /* eir_flags */ + \ + 1 /* spr_how */ + \ + 0 /* SP4_NONE (for now) */ + \ + 2 /* eir_server_owner.so_minor_id */ + \ + /* eir_server_owner.so_major_id<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + /* eir_server_scope<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + 1 /* eir_server_impl_id array length */ + \ + 0 /* ignored eir_server_impl_id contents */) #define encode_sequence_maxsz 0 /* stub */ #define decode_sequence_maxsz 0 /* stub */ #else /* CONFIG_NFS_V4_1 */ @@ -594,6 +615,14 @@ static int nfs4_stat_to_errno(int); decode_putfh_maxsz + \ decode_lookup_maxsz + \ decode_fs_locations_maxsz) +#if defined(CONFIG_NFS_V4_1) +#define NFS4_enc_exchange_id_sz \ + (compound_encode_hdr_maxsz + \ + encode_exchange_id_maxsz) +#define NFS4_dec_exchange_id_sz \ + (compound_decode_hdr_maxsz + \ + decode_exchange_id_maxsz) +#endif /* CONFIG_NFS_V4_1 */ static const umode_t nfs_type2fmt[] = { [NF4BAD] = 0, @@ -1455,7 +1484,29 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state hdr->replen += decode_delegreturn_maxsz; } +#if defined(CONFIG_NFS_V4_1) /* NFSv4.1 operations */ +static void encode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + RESERVE_SPACE(4 + sizeof(args->verifier->data)); + WRITE32(OP_EXCHANGE_ID); + WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); + + encode_string(xdr, args->id_len, args->id); + + RESERVE_SPACE(12); + WRITE32(args->flags); + WRITE32(0); /* zero length state_protect4_a */ + WRITE32(0); /* zero length implementation id array */ + hdr->nops++; + hdr->replen += decode_exchange_id_maxsz; +} +#endif /* CONFIG_NFS_V4_1 */ + static void encode_sequence(struct xdr_stream
*xdr, const struct nfs4_sequence_args *args, struct compound_hdr *hdr) @@ -2162,6 +2213,26 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs return 0; } +#if defined(CONFIG_NFS_V4_1) +/* + * EXCHANGE_ID request + */ +static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, + struct nfs41_exchange_id_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = args->client->cl_minorversion, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_exchange_id(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} +#endif /* CONFIG_NFS_V4_1 */ + /* * START OF "GENERIC" DECODE ROUTINES. * These may look a little ugly since they are imported from a "generic" @@ -3877,6 +3948,52 @@ static int decode_delegreturn(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_DELEGRETURN); } +#if defined(CONFIG_NFS_V4_1) +static int decode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_res *res) +{ + __be32 *p; + uint32_t dummy; + int status; + struct nfs_client *clp = res->client; + + status = decode_op_hdr(xdr, OP_EXCHANGE_ID); + if (status) + return status; + + READ_BUF(8); + READ64(clp->cl_ex_clid); + READ_BUF(12); + READ32(clp->cl_seqid); + READ32(clp->cl_exchange_flags); + + /* We ask for SP4_NONE */ + READ32(dummy); + if (dummy != SP4_NONE) + return -EIO; + + /* Throw away minor_id */ + READ_BUF(8); + + /* Throw away Major id */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + + /* Throw away server_scope */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + + /* Throw away Implementation id array */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + + return 0; +} +#endif /* CONFIG_NFS_V4_1 */ + static int decode_sequence(struct xdr_stream *xdr, struct nfs4_sequence_res *res, struct rpc_rqst *rqstp) @@ -4774,6 +4891,25 @@ out: return status; } +#if defined(CONFIG_NFS_V4_1) +/* + * EXCHANGE_ID request + */ +static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, + void *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) + status = decode_exchange_id(&xdr, res); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ + __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) { uint32_t bitmap[2] = {0}; @@ -4943,6 +5079,9 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(GETACL, enc_getacl, dec_getacl), PROC(SETACL, enc_setacl, dec_setacl), PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), +#if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), +#endif /* CONFIG_NFS_V4_1 */ }; struct rpc_version nfs_version4 = { diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 7c36fcf2dfb..ad65709ed8d 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -21,6 +21,7 @@ #define NFS4_FHSIZE 128 #define NFS4_MAXPATHLEN PATH_MAX #define NFS4_MAXNAMLEN NAME_MAX +#define NFS4_OPAQUE_LIMIT 1024 #define NFS4_MAX_SESSIONID_LEN 16 #define NFS4_ACCESS_READ 0x0001 diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 206485e5082..435ed556efb 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -78,6 +78,12 @@ struct nfs_client { #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_1 + /* clientid returned from EXCHANGE_ID, used by session operations */ + u64 cl_ex_clid; + /* The sequence id to use for the next CREATE_SESSION */ + u32 
cl_seqid; + /* The flags used for obtaining the clientid during EXCHANGE_ID */ + u32 cl_exchange_flags; struct nfs4_session *cl_session; /* shared session */ #endif /* CONFIG_NFS_V4_1 */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4ac14b40efc..5d70b924af5 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -879,6 +879,46 @@ struct nfs4_fs_locations_res { #endif /* CONFIG_NFS_V4 */ +struct nfstime4 { + u64 seconds; + u32 nseconds; +}; + +#ifdef CONFIG_NFS_V4_1 +struct nfs_impl_id4 { + u32 domain_len; + char *domain; + u32 name_len; + char *name; + struct nfstime4 date; +}; + +#define NFS4_EXCHANGE_ID_LEN (48) +struct nfs41_exchange_id_args { + struct nfs_client *client; + nfs4_verifier *verifier; + unsigned int id_len; + char id[NFS4_EXCHANGE_ID_LEN]; + u32 flags; +}; + +struct server_owner { + uint64_t minor_id; + uint32_t major_id_sz; + char major_id[NFS4_OPAQUE_LIMIT]; +}; + +struct server_scope { + uint32_t server_scope_sz; + char server_scope[NFS4_OPAQUE_LIMIT]; +}; + +struct nfs41_exchange_id_res { + struct nfs_client *client; + u32 flags; +}; +#endif /* CONFIG_NFS_V4_1 */ + struct nfs_page; #define NFS_PAGEVEC_SIZE (8U) diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 4d61c873fee..7ef4b7ad121 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -41,7 +41,6 @@ #include #include -#define NFS4_OPAQUE_LIMIT 1024 typedef struct { u32 cl_boot; u32 cl_id; -- cgit v1.2.3-70-g09d2 From 2050f0cc0703aab7cee798b3cb47037754f368bc Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 1 Apr 2009 09:22:30 -0400 Subject: nfs41: get_lease_time get_lease_time uses the FSINFO rpc operation to get the lease time attribute. nfs4_get_lease_time() is only called from the state manager on session setup so don't recover from clientid or sequence level errors. We do need to recover from NFS4ERR_DELAY or NFS4ERR_GRACE. Use NFS4_POLL_RETRY_MIN - the Linux server returns NFS4ERR_DELAY when an upcall is needed to resolve an uncached export referenced by a file handle.
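The NFS4ERR_DELAY/NFS4ERR_GRACE recovery described above comes down to a small rpc_call_done idiom, visible in nfs4_get_lease_time_done() below; a stripped-down sketch (example_done is a hypothetical name, the calls are the ones the patch uses):

static void example_done(struct rpc_task *task, void *calldata)
{
	switch (task->tk_status) {
	case -NFS4ERR_DELAY:
	case -NFS4ERR_GRACE:
		rpc_delay(task, NFS4_POLL_RETRY_MIN);	/* back off first */
		task->tk_status = 0;			/* forget the error */
		rpc_restart_call(task);			/* re-drive the RPC */
		return;
	}
	/* normal completion handling would follow here */
}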
[nfs41: sequence res use slotid] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: remove extraneous rpc_clnt pointer] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: have get_lease_time work on nfs_client] Signed-off-by: Benny Halevy [nfs41: get_lease_time recover from NFS4ERR_DELAY] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: pass *session in seq_args and seq_res] [define nfs4_get_lease_time_{args,res}] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4xdr.c | 51 +++++++++++++++++++++++++++ include/linux/nfs_xdr.h | 9 +++++ 3 files changed, 154 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6f384e29075..eafc99afd35 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4189,6 +4189,100 @@ static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) return status; } +struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; + struct nfs4_get_lease_time_res *res; + struct nfs_client *clp; +}; + +static void nfs4_get_lease_time_prepare(struct rpc_task *task, + void *calldata) +{ + int ret; + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + /* just setup sequence, do not trigger session recovery + since we're invoked within one */ + ret = nfs41_setup_sequence(data->clp->cl_session, + &data->args->la_seq_args, + &data->res->lr_seq_res, 0, task); + + BUG_ON(ret == -EAGAIN); + rpc_call_start(task); + dprintk("<-- %s\n", __func__); +} + +/* + * Called from nfs4_state_manager thread for session setup, so don't recover + * from sequence operation or clientid errors. 
+ */ +static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); + rpc_delay(task, NFS4_POLL_RETRY_MIN); + task->tk_status = 0; + rpc_restart_call(task); + return; + } + nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res); + dprintk("<-- %s\n", __func__); +} + +struct rpc_call_ops nfs4_get_lease_time_ops = { + .rpc_call_prepare = nfs4_get_lease_time_prepare, + .rpc_call_done = nfs4_get_lease_time_done, +}; + +int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) +{ + struct rpc_task *task; + struct nfs4_get_lease_time_args args; + struct nfs4_get_lease_time_res res = { + .lr_fsinfo = fsinfo, + }; + struct nfs4_get_lease_time_data data = { + .args = &args, + .res = &res, + .clp = clp, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_get_lease_time_ops, + .callback_data = &data + }; + int status; + + res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; + dprintk("--> %s\n", __func__); + task = rpc_run_task(&task_setup); + + if (IS_ERR(task)) + status = PTR_ERR(task); + else { + status = task->tk_status; + rpc_put_task(task); + } + dprintk("<-- %s return %d\n", __func__, status); + + return status; +} + /* Destroy the slot table */ static void nfs4_destroy_slot_table(struct nfs4_session *session) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 783c4214dcc..85ee1d17a46 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -622,6 +622,14 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_exchange_id_sz \ (compound_decode_hdr_maxsz + \ decode_exchange_id_maxsz) +#define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_get_lease_time_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_fsinfo_maxsz) #endif /* CONFIG_NFS_V4_1 */ static const umode_t nfs_type2fmt[] = { @@ -2231,6 +2239,27 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, encode_nops(&hdr); return 0; } + +/* + * a GET_LEASE_TIME request + */ +static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, + struct nfs4_get_lease_time_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; + const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->la_seq_args, &hdr); + encode_putrootfh(&xdr, &hdr); + encode_fsinfo(&xdr, lease_bitmap, &hdr); + encode_nops(&hdr); + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -4908,6 +4937,27 @@ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, status = decode_exchange_id(&xdr, res); return status; } + +/* + * a GET_LEASE_TIME request + */ +static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_get_lease_time_res *res) +{ + struct xdr_stream xdr; + 
struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) + status = decode_sequence(&xdr, &res->lr_seq_res, rqstp); + if (!status) + status = decode_putrootfh(&xdr); + if (!status) + status = decode_fsinfo(&xdr, res->lr_fsinfo); + return status; +} #endif /* CONFIG_NFS_V4_1 */ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) @@ -5081,6 +5131,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 5d70b924af5..ca643aa87d4 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -174,6 +174,15 @@ struct nfs4_sequence_res { int sr_status; /* sequence operation status */ }; +struct nfs4_get_lease_time_args { + struct nfs4_sequence_args la_seq_args; +}; + +struct nfs4_get_lease_time_res { + struct nfs_fsinfo *lr_fsinfo; + struct nfs4_sequence_res lr_seq_res; +}; + /* * Arguments to the open call. */ -- cgit v1.2.3-70-g09d2 From fc931582c260e53ca5ca23bd70ccc9b2265cca9f Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 1 Apr 2009 09:22:31 -0400 Subject: nfs41: create_session operation Implement the create_session operation conforming to http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-26 Set the real fore channel max operations to preserve server resources. Note: If the server returns < NFS4_MAX_OPS, the client will very soon get an NFS4ERR_TOO_MANY_OPS. A later patch will handle this. Set the max_rqst_sz and max_resp_sz to PAGE_SIZE - we preallocate the buffers. Set the back channel max_resp_sz_cached to zero to force the client to always set csa_cachethis to FALSE because the current implementation of the back channel DRC only supports caching the CB_SEQUENCE operation. The client back channel server supports one slot, and desires 2 operations per compound. Signed-off-by: Ricardo Labiaga Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: remove extraneous rpc_clnt pointer] Use the struct nfs_client cl_rpcclient. 
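A quick size check for the RESERVE_SPACE(2*28) call in encode_create_session() below: each nfs4_channel_attrs goes on the wire as seven 32-bit XDR words (headerpadsz, max_rqst_sz, max_resp_sz, max_resp_sz_cached, max_ops, max_reqs, plus a zero ca_rdma_ird array length), i.e. 28 bytes per channel. A sketch of that accounting, with hypothetical macro names that are not part of the patch:

#define EXAMPLE_CHANNEL_ATTRS_SZ	(7 * 4)		/* 7 XDR words = 28 bytes */
#define EXAMPLE_BOTH_CHANNELS_SZ	(2 * EXAMPLE_CHANNEL_ATTRS_SZ)	/* = 2*28 */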
Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: nfs4_init_channel_attrs, just use nfs41_create_session_args] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: use rsize and wsize for session channel attributes] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: set channel max operations] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: set back channel attributes] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: obliterate nfs4_adjust_channel_attrs] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: have create_session work on nfs_client] Signed-off-by: Benny Halevy [nfs41: move CONFIG_NFS_V4_1 endif] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: pass *session in seq_args and seq_res] [moved nfs4_init_slot_table definition here] Signed-off-by: Benny Halevy [nfs41: use kcalloc to allocate slot table] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust [nfs41: fix Xcode_create_session's xdr Xcoding pointer type] [nfs41: refactor decoding of channel attributes] Signed-off-by: Benny Halevy --- fs/nfs/nfs4proc.c | 172 ++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4xdr.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs4.h | 10 +++ include/linux/nfs_xdr.h | 12 ++++ 4 files changed, 379 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index eafc99afd35..7d81d6e5753 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -54,6 +54,7 @@ #include "delegation.h" #include "internal.h" #include "iostat.h" +#include "callback.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -4283,6 +4284,50 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } +/* + * Initialize slot table + */ +static int nfs4_init_slot_table(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl = &session->fc_slot_table; + int i, max_slots = session->fc_attrs.max_reqs; + struct nfs4_slot *slot; + int ret = -ENOMEM; + + BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE); + + dprintk("--> %s: max_reqs=%u\n", __func__, + session->fc_attrs.max_reqs); + + slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); + if (!slot) + goto out; + for (i = 0; i < max_slots; ++i) + slot[i].seq_nr = 1; + ret = 0; + + spin_lock(&tbl->slot_tbl_lock); + if (tbl->slots != NULL) { + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s: slot table already initialized. tbl=%p slots=%p\n", + __func__, tbl, tbl->slots); + WARN_ON(1); + goto out_free; + } + tbl->max_slots = max_slots; + tbl->slots = slot; + tbl->highest_used_slotid = -1; /* no slot is currently used */ + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +out_free: + kfree(slot); + goto out; +} + /* Destroy the slot table */ static void nfs4_destroy_slot_table(struct nfs4_session *session) { @@ -4314,6 +4359,133 @@ void nfs4_destroy_session(struct nfs4_session *session) kfree(session); } +/* + * Initialize the values to be used by the client in CREATE_SESSION + * If nfs4_init_session set the fore channel request and response sizes, + * use them. + * + * Set the back channel max_resp_sz_cached to zero to force the client to + * always set csa_cachethis to FALSE because the current implementation + * of the back channel DRC only supports caching the CB_SEQUENCE operation. 
+ */ +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +{ + struct nfs4_session *session = args->client->cl_session; + unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz, + mxresp_sz = session->fc_attrs.max_resp_sz; + + if (mxrqst_sz == 0) + mxrqst_sz = NFS_MAX_FILE_IO_SIZE; + if (mxresp_sz == 0) + mxresp_sz = NFS_MAX_FILE_IO_SIZE; + /* Fore channel attributes */ + args->fc_attrs.headerpadsz = 0; + args->fc_attrs.max_rqst_sz = mxrqst_sz; + args->fc_attrs.max_resp_sz = mxresp_sz; + args->fc_attrs.max_resp_sz_cached = mxresp_sz; + args->fc_attrs.max_ops = NFS4_MAX_OPS; + args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; + + dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", + __func__, + args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, + args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops, + args->fc_attrs.max_reqs); + + /* Back channel attributes */ + args->bc_attrs.headerpadsz = 0; + args->bc_attrs.max_rqst_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz_cached = 0; + args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; + args->bc_attrs.max_reqs = 1; + + dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", + __func__, + args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz, + args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops, + args->bc_attrs.max_reqs); +} + +static int _nfs4_proc_create_session(struct nfs_client *clp) +{ + struct nfs4_session *session = clp->cl_session; + struct nfs41_create_session_args args = { + .client = clp, + .cb_program = NFS4_CALLBACK, + }; + struct nfs41_create_session_res res = { + .client = clp, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + nfs4_init_channel_attrs(&args); + args.flags = (SESSION4_PERSIST); + + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0); + + /* Set the negotiated values in the session's channel_attrs struct */ + + if (!status) { + /* Increment the clientid slot sequence id */ + clp->cl_seqid++; + } + + return status; +} + +/* + * Issues a CREATE_SESSION operation to the server. + * It is the responsibility of the caller to verify the session is + * expired before calling this routine. 
+ */ +int nfs4_proc_create_session(struct nfs_client *clp) +{ + int status; + unsigned *ptr; + struct nfs_fsinfo fsinfo; + struct nfs4_session *session = clp->cl_session; + + dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); + + status = _nfs4_proc_create_session(clp); + if (status) + goto out; + + /* Init the fore channel */ + status = nfs4_init_slot_table(session); + dprintk("fore channel slot table initialization returned %d\n", status); + if (status) + goto out; + + ptr = (unsigned *)&session->sess_id.data[0]; + dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, + clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); + + /* Get the lease time */ + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = jiffies; + clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + spin_unlock(&clp->cl_lock); + + nfs4_schedule_state_renewal(clp); + } +out: + dprintk("<-- %s\n", __func__); + return status; +} + #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 85ee1d17a46..7a243a2cf0b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -246,6 +246,8 @@ static int nfs4_stat_to_errno(int); (0) #if defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MACHINE_NAME_LEN (64) + #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ encode_verifier_maxsz + \ 1 /* co_ownerid.len */ + \ @@ -267,6 +269,31 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ 1 /* eir_server_impl_id array length */ + \ 0 /* ignored eir_server_impl_id contents */) +#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */) +#define decode_channel_attrs_maxsz (6 + \ + 1 /* ca_rdma_ird.len */ + \ + 1 /* ca_rdma_ird */) +#define encode_create_session_maxsz (op_encode_hdr_maxsz + \ + 2 /* csa_clientid */ + \ + 1 /* csa_sequence */ + \ + 1 /* csa_flags */ + \ + encode_channel_attrs_maxsz + \ + encode_channel_attrs_maxsz + \ + 1 /* csa_cb_program */ + \ + 1 /* csa_sec_parms.len (1) */ + \ + 1 /* cb_secflavor (AUTH_SYS) */ + \ + 1 /* stamp */ + \ + 1 /* machinename.len */ + \ + XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \ + 1 /* uid */ + \ + 1 /* gid */ + \ + 1 /* gids.len (0) */) +#define decode_create_session_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* csr_sequence */ + \ + 1 /* csr_flags */ + \ + decode_channel_attrs_maxsz + \ + decode_channel_attrs_maxsz) #define encode_sequence_maxsz 0 /* stub */ #define decode_sequence_maxsz 0 /* stub */ #else /* CONFIG_NFS_V4_1 */ @@ -622,6 +649,12 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_exchange_id_sz \ (compound_decode_hdr_maxsz + \ decode_exchange_id_maxsz) +#define NFS4_enc_create_session_sz \ + (compound_encode_hdr_maxsz + \ + encode_create_session_maxsz) +#define NFS4_dec_create_session_sz \ + (compound_decode_hdr_maxsz + \ + decode_create_session_maxsz) #define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putrootfh_maxsz + \ @@ -712,6 +745,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, static void encode_nops(struct compound_hdr *hdr) { + BUG_ON(hdr->nops > NFS4_MAX_OPS); *hdr->nops_p = htonl(hdr->nops); } @@ -1513,6 +1547,68 @@ static void encode_exchange_id(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_exchange_id_maxsz; } + +static void encode_create_session(struct 
xdr_stream *xdr, + struct nfs41_create_session_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; + uint32_t len; + struct nfs_client *clp = args->client; + + RESERVE_SPACE(4); + WRITE32(OP_CREATE_SESSION); + + RESERVE_SPACE(8); + WRITE64(clp->cl_ex_clid); + + RESERVE_SPACE(8); + WRITE32(clp->cl_seqid); /*Sequence id */ + WRITE32(args->flags); /*flags */ + + RESERVE_SPACE(2*28); /* 2 channel_attrs */ + /* Fore Channel */ + WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ + WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ + WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ + WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + WRITE32(args->fc_attrs.max_ops); /* max operations */ + WRITE32(args->fc_attrs.max_reqs); /* max requests */ + WRITE32(0); /* rdmachannel_attrs */ + + /* Back Channel */ + WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ + WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ + WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ + WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + WRITE32(args->bc_attrs.max_ops); /* max operations */ + WRITE32(args->bc_attrs.max_reqs); /* max requests */ + WRITE32(0); /* rdmachannel_attrs */ + + RESERVE_SPACE(4); + WRITE32(args->cb_program); /* cb_program */ + + RESERVE_SPACE(4); /* # of security flavors */ + WRITE32(1); + + RESERVE_SPACE(4); + WRITE32(RPC_AUTH_UNIX); /* auth_sys */ + + /* authsys_parms rfc1831 */ + RESERVE_SPACE(4); + WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ + len = scnprintf(machine_name, sizeof(machine_name), "%s", + clp->cl_ipaddr); + RESERVE_SPACE(16 + len); + WRITE32(len); + WRITEMEM(machine_name, len); + WRITE32(0); /* UID */ + WRITE32(0); /* GID */ + WRITE32(0); /* No more gids */ + hdr->nops++; + hdr->replen += decode_create_session_maxsz; +} #endif /* CONFIG_NFS_V4_1 */ static void encode_sequence(struct xdr_stream *xdr, @@ -2240,6 +2336,24 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, return 0; } +/* + * a CREATE_SESSION request + */ +static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p, + struct nfs41_create_session_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = args->client->cl_minorversion, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_create_session(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + /* * a GET_LEASE_TIME request */ @@ -4021,6 +4135,59 @@ static int decode_exchange_id(struct xdr_stream *xdr, return 0; } + +static int decode_chan_attrs(struct xdr_stream *xdr, + struct nfs4_channel_attrs *attrs) +{ + __be32 *p; + u32 nr_attrs; + + READ_BUF(28); + READ32(attrs->headerpadsz); + READ32(attrs->max_rqst_sz); + READ32(attrs->max_resp_sz); + READ32(attrs->max_resp_sz_cached); + READ32(attrs->max_ops); + READ32(attrs->max_reqs); + READ32(nr_attrs); + if (unlikely(nr_attrs > 1)) { + printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", + __func__, nr_attrs); + return -EINVAL; + } + if (nr_attrs == 1) + READ_BUF(4); /* skip rdma_attrs */ + return 0; +} + +static int decode_create_session(struct xdr_stream *xdr, + struct nfs41_create_session_res *res) +{ + __be32 *p; + int status; + struct nfs_client *clp = res->client; + struct nfs4_session *session = clp->cl_session; + + status = decode_op_hdr(xdr, OP_CREATE_SESSION); + + if (status) + return status; + + /* sessionid */ + 
READ_BUF(NFS4_MAX_SESSIONID_LEN); + COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN); + + /* seqid, flags */ + READ_BUF(8); + READ32(clp->cl_seqid); + READ32(session->flags); + + /* Channel attributes */ + status = decode_chan_attrs(xdr, &session->fc_attrs); + if (!status) + status = decode_chan_attrs(xdr, &session->bc_attrs); + return status; +} #endif /* CONFIG_NFS_V4_1 */ static int decode_sequence(struct xdr_stream *xdr, @@ -4938,6 +5105,23 @@ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, return status; } +/* + * a CREATE_SESSION request + */ +static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs41_create_session_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) + status = decode_create_session(&xdr, res); + return status; +} + /* * a GET_LEASE_TIME request */ @@ -5131,6 +5315,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index ad65709ed8d..bd2eba53066 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -131,6 +131,16 @@ #define NFS4_MAX_UINT64 (~(u64)0) +/* An NFS4 sessions server must support at least NFS4_MAX_OPS operations. + * If a compound requires more operations, adjust NFS4_MAX_OPS accordingly. + */ +#define NFS4_MAX_OPS 8 + +/* Our NFS4 client back channel server only wants the cb_sequence and the + * actual operation per compound + */ +#define NFS4_MAX_BACK_CHANNEL_OPS 2 + enum nfs4_acl_whotype { NFS4_ACL_WHO_NAMED = 0, NFS4_ACL_WHO_OWNER, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ca643aa87d4..62f63fb0c4c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -926,6 +926,18 @@ struct nfs41_exchange_id_res { struct nfs_client *client; u32 flags; }; + +struct nfs41_create_session_args { + struct nfs_client *client; + uint32_t flags; + uint32_t cb_program; + struct nfs4_channel_attrs fc_attrs; /* Fore Channel */ + struct nfs4_channel_attrs bc_attrs; /* Back Channel */ +}; + +struct nfs41_create_session_res { + struct nfs_client *client; +}; #endif /* CONFIG_NFS_V4_1 */ struct nfs_page; -- cgit v1.2.3-70-g09d2 From 76db6d9500caeaa774a3e32a997eba30bbdc176b Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 1 Apr 2009 09:22:38 -0400 Subject: nfs41: add session setup to the state manager At mount, nfs_alloc_client sets the cl_state NFS4CLNT_LEASE_EXPIRED bit and nfs4_alloc_session sets the NFS4CLNT_SESSION_SETUP bit, so both bits are set when nfs4_lookup_root calls nfs4_recover_expired_lease which schedules the nfs4_state_manager and waits for it to complete. Place the session setup after the clientid establishment in nfs4_state_manager so that the session is set up right after the clientid has been established without rescheduling the state manager. Unlike NFSv4.0, the nfs_client struct is not ready to use until the session has been established. Postpone marking the nfs_client struct NFS_CS_READY until after a successful CREATE_SESSION call so that other threads cannot use the client until the session is established.
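The ordering the message above describes can be read straight off the state manager loop in the diff below; as a sketch with the error paths stripped (example_establish_session is a hypothetical name, the called functions are the patch's own):

static void example_establish_session(struct nfs_client *clp)
{
	if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
		nfs4_reclaim_lease(clp);	/* EXCHANGE_ID first */
	if (nfs4_has_session(clp) &&
	    test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
		nfs4_initialize_session(clp);	/* then CREATE_SESSION; only
						 * success marks the client
						 * NFS_CS_READY */
}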
If the EXCHANGE_ID call fails and the session has not been set up (the NFS4CLNT_SESSION_SETUP bit is set), mark the client with the error and return. If the session setup CREATE_SESSION call fails with NFS4ERR_STALE_CLIENTID, which could occur due to a server reboot or network partition in between the EXCHANGE_ID and CREATE_SESSION calls, reset the NFS4CLNT_LEASE_EXPIRED and NFS4CLNT_SESSION_SETUP bits and try again. If the CREATE_SESSION call fails with other errors, mark the client with the error and return. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: NFS_CS_SESSION_SETUP cl_cons_state for back channel setup] On session setup, the CREATE_SESSION reply races with the server back channel probe which needs to succeed to set up the back channel. Set a new cl_cons_state NFS_CS_SESSION_SETUP just prior to the CREATE_SESSION call and add it as a valid state to nfs_find_client so that the client back channel can find the nfs_client struct and won't drop the server backchannel probe. Use a new cl_cons_state so that NFSv4.0 back channel behaviour which only sets NFS_CS_READY is unchanged. Adjust waiting on the nfs_client_active_wq accordingly. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [nfs41: rename NFS_CS_SESSION_SETUP to NFS_CS_SESSION_INITING] Signed-off-by: Andy Adamson [nfs41: set NFS_CL_SESSION_INITING in alloc_session] Signed-off-by: Andy Adamson [nfs41: move session setup into a function] Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy [moved nfs4_proc_create_session declaration here] Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 10 ++++++---- fs/nfs/internal.h | 1 + fs/nfs/nfs4_fs.h | 2 ++ fs/nfs/nfs4proc.c | 10 ++++++++++ fs/nfs/nfs4state.c | 35 ++++++++++++++++++++++++++++++++++- include/linux/nfs_fs_sb.h | 1 + 6 files changed, 54 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index bb7432d83b5..d28a987f569 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -366,7 +366,8 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ - if (clp->cl_cons_state != NFS_CS_READY) + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) continue; /* Different NFS versions cannot share the same nfs_client */ @@ -499,7 +500,7 @@ found_client: nfs_free_client(new); error = wait_event_killable(nfs_client_active_wq, - clp->cl_cons_state != NFS_CS_INITING); + clp->cl_cons_state < NFS_CS_INITING); if (error < 0) { nfs_put_client(clp); return ERR_PTR(-ERESTARTSYS); } @@ -520,7 +521,7 @@ found_client: /* * Mark a server as ready or failed */ -static void nfs_mark_client_ready(struct nfs_client *clp, int state) +void nfs_mark_client_ready(struct nfs_client *clp, int state) { clp->cl_cons_state = state; wake_up_all(&nfs_client_active_wq); } @@ -1135,7 +1136,8 @@ static int nfs4_init_client(struct nfs_client *clp, if (error < 0) goto error; - nfs_mark_client_ready(clp, NFS_CS_READY); + if (!nfs4_has_session(clp)) + nfs_mark_client_ready(clp, NFS_CS_READY); return 0; error: diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f62bc522615..f3b310e8ea0 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -100,6 +100,7 @@ extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); +extern void
nfs_mark_client_ready(struct nfs_client *clp, int state); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index eccf4e93e7d..288717abadd 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -44,6 +44,7 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_REBOOT, NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_SETUP, }; /* @@ -208,6 +209,7 @@ extern int nfs4_setup_sequence(struct nfs_client *clp, int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern int nfs4_proc_create_session(struct nfs_client *, int reset); #else /* CONFIG_NFS_v4_1 */ static inline int nfs4_setup_sequence(struct nfs_client *clp, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0b121474024..7fc0c9c8f5e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4382,6 +4382,16 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); if (!session) return NULL; + + set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); + /* + * The create session reply races with the server back + * channel probe. Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + clp->cl_cons_state = NFS_CS_SESSION_INITING; + tbl = &session->fc_slot_table; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "Slot table"); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index bc683ed477e..df5b4807daa 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1113,6 +1113,27 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) return status; } +#ifdef CONFIG_NFS_V4_1 + +static int nfs4_initialize_session(struct nfs_client *clp) +{ + int status; + + status = nfs4_proc_create_session(clp, 0); + if (!status) { + nfs_mark_client_ready(clp, NFS_CS_READY); + } else if (status == -NFS4ERR_STALE_CLIENTID) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); + } else { + nfs_mark_client_ready(clp, status); + } + return status; +} +#else /* CONFIG_NFS_V4_1 */ +static int nfs4_initialize_session(struct nfs_client *clp) { return 0; } +#endif /* CONFIG_NFS_V4_1 */ + static void nfs4_state_manager(struct nfs_client *clp) { int status = 0; @@ -1126,6 +1147,9 @@ static void nfs4_state_manager(struct nfs_client *clp) set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); if (status == -EAGAIN) continue; + if (clp->cl_cons_state == + NFS_CS_SESSION_INITING) + nfs_mark_client_ready(clp, status); goto out_error; } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); @@ -1136,7 +1160,16 @@ static void nfs4_state_manager(struct nfs_client *clp) if (status != 0) continue; } - + /* Setup the session */ + if (nfs4_has_session(clp) && + test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { + status = nfs4_initialize_session(clp); + if (status) { + if (status == -NFS4ERR_STALE_CLIENTID) + continue; + goto out_error; + } + } /* First recover reboot state... 
*/ if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 435ed556efb..d0902ccec9c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -24,6 +24,7 @@ struct nfs_client { int cl_cons_state; /* current construction state (-ve: init error) */ #define NFS_CS_READY 0 /* ready to be used */ #define NFS_CS_INITING 1 /* busy initialising */ +#define NFS_CS_SESSION_INITING 2 /* busy initialising session */ unsigned long cl_res_state; /* NFS resources state */ #define NFS_CS_CALLBACK 1 /* - callback started */ #define NFS_CS_IDMAP 2 /* - idmap started */ -- cgit v1.2.3-70-g09d2 From aae2006e9b0c294114915c13022fa348e1a88023 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 1 Apr 2009 09:22:40 -0400 Subject: nfs41: sunrpc: Export the call prepare state for session reset Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + include/linux/sunrpc/sched.h | 1 + net/sunrpc/clnt.c | 13 +++++++++++++ net/sunrpc/sched.c | 2 +- 4 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index c39a21040dc..37881f1a0bd 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -143,6 +143,7 @@ int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags); struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags); +void rpc_restart_call_prepare(struct rpc_task *); void rpc_restart_call(struct rpc_task *); void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); size_t rpc_max_payload(struct rpc_clnt *); diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 64981a2f1ca..177376880fa 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -237,6 +237,7 @@ void rpc_show_tasks(void); int rpc_init_mempool(void); void rpc_destroy_mempool(void); extern struct workqueue_struct *rpciod_workqueue; +void rpc_prepare_task(struct rpc_task *task); static inline void rpc_exit(struct rpc_task *task, int status) { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 5abab094441..d00e8135f86 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -694,6 +694,19 @@ void rpc_force_rebind(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_force_rebind); +/* + * Restart an (async) RPC call from the call_prepare state. + * Usually called from within the exit handler. + */ +void +rpc_restart_call_prepare(struct rpc_task *task) +{ + if (RPC_ASSASSINATED(task)) + return; + task->tk_action = rpc_prepare_task; +} +EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); + /* * Restart an (async) RPC call. Usually called from within the * exit handler. 
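Exporting rpc_prepare_task in the sched.c hunk below is what allows rpc_restart_call_prepare() to re-drive a task from the prepare stage, where the nfs41 sequence setup runs; a hypothetical consumer, not code from this patch, might look like:

static void example_rpc_done(struct rpc_task *task, void *calldata)
{
	if (task->tk_status == -NFS4ERR_BADSESSION) {
		/* Session is being reset: retry from ->rpc_call_prepare
		 * so a fresh slot and sequence are set up before the
		 * request is retransmitted. */
		task->tk_status = 0;
		rpc_restart_call_prepare(task);
	}
}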
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index ff50a054686..1102ce1251f 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -569,7 +569,7 @@ EXPORT_SYMBOL_GPL(rpc_delay); /* * Helper to call task->tk_ops->rpc_call_prepare */ -static void rpc_prepare_task(struct rpc_task *task) +void rpc_prepare_task(struct rpc_task *task) { task->tk_ops->rpc_call_prepare(task, task->tk_calldata); } -- cgit v1.2.3-70-g09d2 From 56632b5bff5af10eb12d7e9499b5ffcadcb7a7b2 Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga Date: Wed, 1 Apr 2009 09:22:58 -0400 Subject: nfs41: client callback structures Adds new list of rpc_xprt structures, and a readers/writers lock to protect the list. The list is used to preallocate resources for the backchannel during backchannel requests. Callbacks are not expected to cause significant latency, so only one callback will be allowed at this time. It also adds a pointer to the NFS callback service so that requests can be directed to it for processing. New callback members added to svc_serv. The NFSv4.1 callback service will sleep on the svc_serv->svc_cb_waitq until new callback requests arrive. The request will be queued in svc_serv->svc_cb_list. This patch adds this list, the sleep queue and spinlock to svc_serv. [nfs41: NFSv4.1 callback support] Signed-off-by: Ricardo Labiaga Signed-off-by: Benny Halevy --- include/linux/sunrpc/svc.h | 8 ++++++++ include/linux/sunrpc/xprt.h | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 2a30775959e..4a8afbd6200 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -96,6 +96,14 @@ struct svc_serv { svc_thread_fn sv_function; /* main function for threads */ unsigned int sv_drc_max_pages; /* Total pages for DRC */ unsigned int sv_drc_pages_used;/* DRC pages used */ +#if defined(CONFIG_NFS_V4_1) + struct list_head sv_cb_list; /* queue for callback requests + * that arrive over the same + * connection */ + spinlock_t sv_cb_lock; /* protects the svc_cb_list */ + wait_queue_head_t sv_cb_waitq; /* sleep here if there are no + * entries in the svc_cb_list */ +#endif /* CONFIG_NFS_V4_1 */ }; /* diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 08afe43118f..703af7ebf6c 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -97,6 +97,12 @@ struct rpc_rqst { unsigned long rq_xtime; /* when transmitted */ int rq_ntrans; + +#if defined(CONFIG_NFS_V4_1) + struct list_head rq_bc_list; /* Callback service list */ + unsigned long rq_bc_pa_state; /* Backchannel prealloc state */ + struct list_head rq_bc_pa_list; /* Backchannel prealloc list */ +#endif /* CONFIG_NFS_V4_1 */ }; #define rq_svec rq_snd_buf.head #define rq_slen rq_snd_buf.len @@ -174,6 +180,14 @@ struct rpc_xprt { spinlock_t reserve_lock; /* lock slot table */ u32 xid; /* Next XID value to use */ struct rpc_task * snd_task; /* Task blocked in send */ +#if defined(CONFIG_NFS_V4_1) + struct svc_serv *bc_serv; /* The RPC service which will */ + /* process the callback */ + spinlock_t bc_pa_lock; /* Protects the preallocated + * items */ + struct list_head bc_pa_list; /* List of preallocated + * backchannel rpc_rqst's */ +#endif /* CONFIG_NFS_V4_1 */ struct list_head recv; struct { @@ -192,6 +206,14 @@ struct rpc_xprt { const char *address_strings[RPC_DISPLAY_MAX]; }; +#if defined(CONFIG_NFS_V4_1) +/* + * Backchannel flags + */ +#define RPC_BC_PA_IN_USE 0x0001 /* Preallocated backchannel */ + /* buffer 
in use */ +#endif /* CONFIG_NFS_V4_1 */ + struct xprt_create { int ident; /* XPRT_TRANSPORT identifier */ struct sockaddr * srcaddr; /* optional local address */ -- cgit v1.2.3-70-g09d2 From fb7a0b9addbdbbb13b7bc02abf55ee524ea19ce1 Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga Date: Wed, 1 Apr 2009 09:23:00 -0400 Subject: nfs41: New backchannel helper routines This patch introduces support to set up the callback xprt on the client side. It allocates/destroys the preallocated memory structures used to process backchannel requests. At setup time, xprt_setup_backchannel() is invoked to allocate one or more rpc_rqst structures and substructures. This ensures that they are available when an RPC callback arrives. The rpc_rqst structures are maintained in a linked list attached to the rpc_xprt structure. We keep track of the number of allocations so that they can be correctly removed when the channel is destroyed. When an RPC callback arrives, xprt_alloc_bc_request() is invoked to obtain a preallocated rpc_rqst structure. An rpc_rqst structure is returned, and its RPC_BC_PREALLOC_IN_USE bit is set in rpc_xprt->bc_flags. The structure is removed from the list since it is now in use, and it will be later added back when its user is done with it. After the RPC callback replies, the rpc_rqst structure is returned by invoking xprt_free_bc_request(). This clears the RPC_BC_PREALLOC_IN_USE bit and adds it back to the list, allowing it to be reused by a subsequent RPC callback request. To be consistent with the reception of RPC messages, the backchannel requests should be placed into the 'struct rpc_rqst' rq_rcv_buf, which is then in turn copied to the 'struct rpc_rqst' rq_private_buf. [nfs41: Preallocate rpc_rqst receive buffer for handling callbacks] Signed-off-by: Ricardo Labiaga Signed-off-by: Benny Halevy [Update copyright notice and explain page allocation] Signed-off-by: Ricardo Labiaga Signed-off-by: Benny Halevy --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/Makefile | 1 + net/sunrpc/backchannel_rqst.c | 278 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 280 insertions(+) create mode 100644 net/sunrpc/backchannel_rqst.c (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 703af7ebf6c..beae030e80b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -183,6 +183,7 @@ struct rpc_xprt { #if defined(CONFIG_NFS_V4_1) struct svc_serv *bc_serv; /* The RPC service which will */ /* process the callback */ + unsigned int bc_alloc_count; /* Total number of preallocs */ spinlock_t bc_pa_lock; /* Protects the preallocated * items */ struct list_head bc_pa_list; /* List of preallocated diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 5369aa369b3..4a01f9684b8 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -13,5 +13,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o \ svc_xprt.o +sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c new file mode 100644 index 00000000000..f56e18a2349 --- /dev/null +++ b/net/sunrpc/backchannel_rqst.c @@ -0,0 +1,278 @@ +/****************************************************************************** + +(c) 2007 Network Appliance, Inc. All Rights Reserved. +(c) 2009 NetApp. All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+#include
+#include
+
+#ifdef RPC_DEBUG
+#define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Helper routines that track the number of preallocation elements
+ * on the transport.
+ */
+static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
+{
+ return xprt->bc_alloc_count > 0;
+}
+
+static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
+{
+ xprt->bc_alloc_count += n;
+}
+
+static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
+{
+ return xprt->bc_alloc_count -= n;
+}
+
+/*
+ * Free the preallocated rpc_rqst structure and the memory
+ * buffers hanging off of it.
+ */
+static void xprt_free_allocation(struct rpc_rqst *req)
+{
+ struct xdr_buf *xbufp;
+
+ dprintk("RPC: free allocations for req= %p\n", req);
+ BUG_ON(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
+ xbufp = &req->rq_private_buf;
+ free_page((unsigned long)xbufp->head[0].iov_base);
+ xbufp = &req->rq_snd_buf;
+ free_page((unsigned long)xbufp->head[0].iov_base);
+ list_del(&req->rq_bc_pa_list);
+ kfree(req);
+}
+
+/*
+ * Preallocate up to min_reqs structures and related buffers for use
+ * by the backchannel. This function can be called multiple times
+ * when creating new sessions that use the same rpc_xprt. The
+ * preallocated buffers are added to the pool of resources used by
+ * the rpc_xprt. Any one of these resources may be used by an
+ * incoming callback request. It's up to the higher levels in the
+ * stack to enforce that the maximum number of session slots is not
+ * being exceeded.
+ *
+ * Some callback arguments can be large. For example, a pNFS server
+ * using multiple deviceids. The list can be unbounded, but the client
+ * has the ability to tell the server the maximum size of the callback
+ * requests. Each deviceID is 16 bytes, so allocate one page
+ * for the arguments to have enough room to receive a number of these
+ * deviceIDs. The NFS client indicates to the pNFS server that its
+ * callback requests can be up to 4096 bytes in size.
+ */
+int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
+{
+ struct page *page_rcv = NULL, *page_snd = NULL;
+ struct xdr_buf *xbufp = NULL;
+ struct rpc_rqst *req, *tmp;
+ struct list_head tmp_list;
+ int i;
+
+ dprintk("RPC: setup backchannel transport\n");
+
+ /*
+ * We use a temporary list to keep track of the preallocated
+ * buffers. Once we're done building the list we splice it
+ * into the backchannel preallocation list off of the rpc_xprt
+ * struct.
This helps minimize the amount of time the list
+ * lock is held on the rpc_xprt struct. It also makes cleanup
+ * easier in case of memory allocation errors.
+ */
+ INIT_LIST_HEAD(&tmp_list);
+ for (i = 0; i < min_reqs; i++) {
+ /* Pre-allocate one backchannel rpc_rqst */
+ req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+ if (req == NULL) {
+ printk(KERN_ERR "Failed to create bc rpc_rqst\n");
+ goto out_free;
+ }
+
+ /* Add the allocated buffer to the tmp list */
+ dprintk("RPC: adding req= %p\n", req);
+ list_add(&req->rq_bc_pa_list, &tmp_list);
+
+ req->rq_xprt = xprt;
+ INIT_LIST_HEAD(&req->rq_list);
+ INIT_LIST_HEAD(&req->rq_bc_list);
+
+ /* Preallocate one XDR receive buffer */
+ page_rcv = alloc_page(GFP_KERNEL);
+ if (page_rcv == NULL) {
+ printk(KERN_ERR "Failed to create bc receive xbuf\n");
+ goto out_free;
+ }
+ xbufp = &req->rq_rcv_buf;
+ xbufp->head[0].iov_base = page_address(page_rcv);
+ xbufp->head[0].iov_len = PAGE_SIZE;
+ xbufp->tail[0].iov_base = NULL;
+ xbufp->tail[0].iov_len = 0;
+ xbufp->page_len = 0;
+ xbufp->len = PAGE_SIZE;
+ xbufp->buflen = PAGE_SIZE;
+
+ /* Preallocate one XDR send buffer */
+ page_snd = alloc_page(GFP_KERNEL);
+ if (page_snd == NULL) {
+ printk(KERN_ERR "Failed to create bc snd xbuf\n");
+ goto out_free;
+ }
+
+ xbufp = &req->rq_snd_buf;
+ xbufp->head[0].iov_base = page_address(page_snd);
+ xbufp->head[0].iov_len = 0;
+ xbufp->tail[0].iov_base = NULL;
+ xbufp->tail[0].iov_len = 0;
+ xbufp->page_len = 0;
+ xbufp->len = 0;
+ xbufp->buflen = PAGE_SIZE;
+ }
+
+ /*
+ * Add the temporary list to the backchannel preallocation list
+ */
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_splice(&tmp_list, &xprt->bc_pa_list);
+ xprt_inc_alloc_count(xprt, min_reqs);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+
+ dprintk("RPC: setup backchannel transport done\n");
+ return 0;
+
+out_free:
+ /*
+ * Memory allocation failed, free the temporary list
+ */
+ list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list)
+ xprt_free_allocation(req);
+
+ dprintk("RPC: setup backchannel transport failed\n");
+ return -1;
+}
+EXPORT_SYMBOL(xprt_setup_backchannel);
+
+/*
+ * Destroys the backchannel preallocated structures.
+ * Since these structures may have been allocated by multiple calls
+ * to xprt_setup_backchannel, we only destroy up to the maximum number
+ * of reqs specified by the caller.
+ * @xprt: the transport holding the preallocated structures
+ * @max_reqs: the maximum number of preallocated structures to destroy
+ */
+void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs)
+{
+ struct rpc_rqst *req = NULL, *tmp = NULL;
+
+ dprintk("RPC: destroy backchannel transport\n");
+
+ BUG_ON(max_reqs == 0);
+ spin_lock_bh(&xprt->bc_pa_lock);
+ xprt_dec_alloc_count(xprt, max_reqs);
+ list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
+ dprintk("RPC: req=%p\n", req);
+ xprt_free_allocation(req);
+ if (--max_reqs == 0)
+ break;
+ }
+ spin_unlock_bh(&xprt->bc_pa_lock);
+
+ dprintk("RPC: backchannel list empty= %s\n",
+ list_empty(&xprt->bc_pa_list) ? "true" : "false");
+}
+EXPORT_SYMBOL(xprt_destroy_backchannel);
+
+/*
+ * One or more rpc_rqst structures have been preallocated during the
+ * backchannel setup. Buffer space for the send and private XDR buffers
+ * has been preallocated as well. Use xprt_alloc_bc_request() to obtain
+ * one of these preallocated structures; use xprt_free_bc_request() to
+ * return it.
+ *
+ * Return an available rpc_rqst, otherwise NULL if none are available.
+ */
+struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt)
+{
+ struct rpc_rqst *req;
+
+ dprintk("RPC: allocate a backchannel request\n");
+ spin_lock_bh(&xprt->bc_pa_lock);
+ if (!list_empty(&xprt->bc_pa_list)) {
+ req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
+ rq_bc_pa_list);
+ list_del(&req->rq_bc_pa_list);
+ } else {
+ req = NULL;
+ }
+ spin_unlock_bh(&xprt->bc_pa_lock);
+
+ if (req != NULL) {
+ set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+ req->rq_received = 0;
+ req->rq_bytes_sent = 0;
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+ sizeof(req->rq_private_buf));
+ }
+ dprintk("RPC: backchannel req=%p\n", req);
+ return req;
+}
+
+/*
+ * Return the preallocated rpc_rqst structure and XDR buffers
+ * associated with this request.
+ */
+void xprt_free_bc_request(struct rpc_rqst *req)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ dprintk("RPC: free backchannel req=%p\n", req);
+
+ smp_mb__before_clear_bit();
+ BUG_ON(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
+ clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+ smp_mb__after_clear_bit();
+
+ if (!xprt_need_to_requeue(xprt)) {
+ /*
+ * The last remaining session was destroyed while this
+ * entry was in use. Free the entry and don't attempt
+ * to add back to the list because there is no need to
+ * have any more preallocated entries.
+ */
+ dprintk("RPC: Last session removed req=%p\n", req);
+ xprt_free_allocation(req);
+ return;
+ }
+
+ /*
+ * Return it to the list of preallocations so that it
+ * may be reused by a new callback request.
+ */
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+#endif /* CONFIG_NFS_V4_1 */
-- cgit v1.2.3-70-g09d2

From 4a8d70bfef01f8e6b27785e2625e88e9a80924a5 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga
Date: Wed, 1 Apr 2009 09:23:01 -0400
Subject: nfs41: New include/linux/sunrpc/bc_xprt.h

Contains prototypes for the backchannel helper routines.

Signed-off-by: Ricardo Labiaga
Signed-off-by: Benny Halevy
[nfs41: xprt_setup_backchannel v4.0 only inline]
Fix compile error when CONFIG_NFS_V4_1 is not set.
Signed-off-by: Andy Adamson
Signed-off-by: Benny Halevy
[Update Copyright notice and fix formatting]
Signed-off-by: Ricardo Labiaga
Signed-off-by: Benny Halevy
--- include/linux/sunrpc/bc_xprt.h | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 include/linux/sunrpc/bc_xprt.h (limited to 'include/linux')

diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h new file mode 100644 index 00000000000..5965ae4f902
--- /dev/null +++ b/include/linux/sunrpc/bc_xprt.h
@@ -0,0 +1,46 @@
+/******************************************************************************
+
+(c) 2008 NetApp. All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+/*
+ * Functions to create and manage the backchannel
+ */
+
+#ifndef _LINUX_SUNRPC_BC_XPRT_H
+#define _LINUX_SUNRPC_BC_XPRT_H
+
+#include
+#include
+
+#ifdef CONFIG_NFS_V4_1
+struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt);
+void xprt_free_bc_request(struct rpc_rqst *req);
+int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
+void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
+#else /* CONFIG_NFS_V4_1 */
+static inline int xprt_setup_backchannel(struct rpc_xprt *xprt,
+ unsigned int min_reqs)
+{
+ return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+#endif /* _LINUX_SUNRPC_BC_XPRT_H */
+
-- cgit v1.2.3-70-g09d2

From 55ae1aabfb108106dd095de2578ceef1c755a8b8 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga
Date: Wed, 1 Apr 2009 09:23:03 -0400
Subject: nfs41: Add backchannel processing support to RPC state machine

Adds rpc_run_bc_task(), which is called by the NFS callback service to process backchannel requests. It performs similar work to rpc_run_task(), though it "schedules" the backchannel task to be executed starting at the call_transmit state in the RPC state machine. It also introduces some miscellaneous updates to the argument validation, call_transmit, and transport cleanup functions to take into account that there are now forechannel and backchannel tasks.

Backchannel requests do not carry an RPC message structure, since the payload has already been XDR encoded using the existing NFSv4 callback mechanism.

Introduce a new transmit state that the client uses to reply to backchannel requests. This new state simply reserves the transport and issues the reply. In case of a connection-related error, it disconnects the transport and drops the reply. It requires the forechannel to re-establish the connection and the server to retransmit the request, as stated in NFSv4.1 section 2.9.2 "Client and Server Transport Behavior".

Note: There is no need to loop attempting to reserve the transport. If EAGAIN is returned by xprt_prepare_transmit(), return with tk_status == 0, setting tk_action to call_bc_transmit. rpc_execute() will invoke it again after the task is taken off the sleep queue.
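As a rough usage sketch (this mirrors what a caller such as bc_send(), added later in this series, is expected to do; the callback ops and error handling here are illustrative, not part of this patch):

    /* Drive a preallocated backchannel reply through the state
     * machine; rpc_run_bc_task() starts the task at call_bc_transmit
     * instead of call_start. */
    task = rpc_run_bc_task(req, &callback_ops);
    if (task == NULL)
        return -ENOMEM;        /* req was freed by rpc_run_bc_task() */
    status = task->tk_status;  /* reply status after rpc_execute() */
    rpc_put_task(task);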
[nfs41: rpc_run_bc_task() need not be exported outside RPC module]
[nfs41: New call_bc_transmit RPC state]
Signed-off-by: Ricardo Labiaga
Signed-off-by: Benny Halevy
[nfs41: Backchannel: No need to loop in call_bc_transmit()]
Signed-off-by: Andy Adamson
Signed-off-by: Ricardo Labiaga
Signed-off-by: Benny Halevy
[rpc_count_iostats incorrectly exits early]
Signed-off-by: Ricardo Labiaga
Signed-off-by: Benny Halevy
[Convert rpc_reply_expected() to inline function]
[Remove unnecessary BUG_ON()]
[Rename variable]
Signed-off-by: Ricardo Labiaga
Signed-off-by: Benny Halevy
--- include/linux/sunrpc/sched.h | 2 + include/linux/sunrpc/xprt.h | 12 +++++ net/sunrpc/clnt.c | 117 ++++++++++++++++++++++++++++++++++++++++++- net/sunrpc/stats.c | 6 ++- net/sunrpc/sunrpc.h | 37 ++++++++++++++ net/sunrpc/xprt.c | 38 +++++++++++--- 6 files changed, 203 insertions(+), 9 deletions(-) create mode 100644 net/sunrpc/sunrpc.h (limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 177376880fa..401097781fc 100644
--- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h
@@ -210,6 +210,8 @@ struct rpc_wait_queue { */
 struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
+ const struct rpc_call_ops *ops);
 void rpc_put_task(struct rpc_task *);
 void rpc_exit_task(struct rpc_task *);
 void rpc_release_calldata(const struct rpc_call_ops *, void *);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index beae030e80b..55c6c37e249 100644
--- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h
@@ -215,6 +215,18 @@ struct rpc_xprt { /* buffer in use */ #endif /* CONFIG_NFS_V4_1 */
+#if defined(CONFIG_NFS_V4_1)
+static inline int bc_prealloc(struct rpc_rqst *req)
+{
+ return test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+}
+#else
+static inline int bc_prealloc(struct rpc_rqst *req)
+{
+ return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 struct xprt_create { int ident; /* XPRT_TRANSPORT identifier */ struct sockaddr * srcaddr; /* optional local address */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index aca3ab6fc14..f3e93b8eb90 100644
--- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c
@@ -36,7 +36,9 @@
 #include
 #include
 #include
+#include
+#include "sunrpc.h"
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY RPCDBG_CALL
@@ -63,6 +65,9 @@ static void call_decode(struct rpc_task *task);
 static void call_bind(struct rpc_task *task);
 static void call_bind_status(struct rpc_task *task);
 static void call_transmit(struct rpc_task *task);
+#if defined(CONFIG_NFS_V4_1)
+static void call_bc_transmit(struct rpc_task *task);
+#endif /* CONFIG_NFS_V4_1 */
 static void call_status(struct rpc_task *task);
 static void call_transmit_status(struct rpc_task *task);
 static void call_refresh(struct rpc_task *task);
@@ -613,6 +618,50 @@ rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags, }
 EXPORT_SYMBOL_GPL(rpc_call_async);
+#if defined(CONFIG_NFS_V4_1)
+/**
+ * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
+ * rpc_execute against it
+ * @req: RPC request to send
+ * @tk_ops: RPC call ops
+ */
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
+ const struct rpc_call_ops *tk_ops)
+{
+ struct rpc_task *task;
+ struct xdr_buf *xbufp = &req->rq_snd_buf;
+ struct rpc_task_setup task_setup_data = {
+ .callback_ops = tk_ops,
+ };
+
+ dprintk("RPC: rpc_run_bc_task req= %p\n", req);
+ /*
+ * Create an rpc_task to send
the data + */ + task = rpc_new_task(&task_setup_data); + if (!task) { + xprt_free_bc_request(req); + goto out; + } + task->tk_rqstp = req; + + /* + * Set up the xdr_buf length. + * This also indicates that the buffer is XDR encoded already. + */ + xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + + xbufp->tail[0].iov_len; + + task->tk_action = call_bc_transmit; + atomic_inc(&task->tk_count); + BUG_ON(atomic_read(&task->tk_count) != 2); + rpc_execute(task); + +out: + dprintk("RPC: rpc_run_bc_task: task= %p\n", task); + return task; +} +#endif /* CONFIG_NFS_V4_1 */ + void rpc_call_start(struct rpc_task *task) { @@ -1098,7 +1147,7 @@ call_transmit(struct rpc_task *task) * in order to allow access to the socket to other RPC requests. */ call_transmit_status(task); - if (task->tk_msg.rpc_proc->p_decode != NULL) + if (rpc_reply_expected(task)) return; task->tk_action = rpc_exit_task; rpc_wake_up_queued_task(&task->tk_xprt->pending, task); @@ -1133,6 +1182,72 @@ call_transmit_status(struct rpc_task *task) } } +#if defined(CONFIG_NFS_V4_1) +/* + * 5b. Send the backchannel RPC reply. On error, drop the reply. In + * addition, disconnect on connectivity errors. + */ +static void +call_bc_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + BUG_ON(task->tk_status != 0); + task->tk_status = xprt_prepare_transmit(task); + if (task->tk_status == -EAGAIN) { + /* + * Could not reserve the transport. Try again after the + * transport is released. + */ + task->tk_status = 0; + task->tk_action = call_bc_transmit; + return; + } + + task->tk_action = rpc_exit_task; + if (task->tk_status < 0) { + printk(KERN_NOTICE "RPC: Could not send backchannel reply " + "error: %d\n", task->tk_status); + return; + } + + xprt_transmit(task); + xprt_end_transmit(task); + dprint_status(task); + switch (task->tk_status) { + case 0: + /* Success */ + break; + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -ETIMEDOUT: + /* + * Problem reaching the server. Disconnect and let the + * forechannel reestablish the connection. The server will + * have to retransmit the backchannel request and we'll + * reprocess it. Since these ops are idempotent, there's no + * need to cache our reply at this time. + */ + printk(KERN_NOTICE "RPC: Could not send backchannel reply " + "error: %d\n", task->tk_status); + xprt_conditional_disconnect(task->tk_xprt, + req->rq_connect_cookie); + break; + default: + /* + * We were unable to reply and will have to drop the + * request. The server should reconnect and retransmit. + */ + BUG_ON(task->tk_status == -EAGAIN); + printk(KERN_NOTICE "RPC: Could not send backchannel reply " + "error: %d\n", task->tk_status); + break; + } + rpc_wake_up_queued_task(&req->rq_xprt->pending, task); +} +#endif /* CONFIG_NFS_V4_1 */ + /* * 6. 
Sort out the RPC call status */ diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 1ef6e46d9da..8487aa0f1f5 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -141,12 +141,14 @@ EXPORT_SYMBOL_GPL(rpc_free_iostats); void rpc_count_iostats(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - struct rpc_iostats *stats = task->tk_client->cl_metrics; + struct rpc_iostats *stats; struct rpc_iostats *op_metrics; long rtt, execute, queue; - if (!stats || !req) + if (!task->tk_client || !task->tk_client->cl_metrics || !req) return; + + stats = task->tk_client->cl_metrics; op_metrics = &stats[task->tk_msg.rpc_proc->p_statidx]; op_metrics->om_ops++; diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h new file mode 100644 index 00000000000..5d9dd742264 --- /dev/null +++ b/net/sunrpc/sunrpc.h @@ -0,0 +1,37 @@ +/****************************************************************************** + +(c) 2008 NetApp. All Rights Reserved. + +NetApp provides this source code under the GPL v2 License. +The GPL v2 license is available at +http://opensource.org/licenses/gpl-license.php. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ + +/* + * Functions and macros used internally by RPC + */ + +#ifndef _NET_SUNRPC_SUNRPC_H +#define _NET_SUNRPC_SUNRPC_H + +static inline int rpc_reply_expected(struct rpc_task *task) +{ + return (task->tk_msg.rpc_proc != NULL) && + (task->tk_msg.rpc_proc->p_decode != NULL); +} + +#endif /* _NET_SUNRPC_SUNRPC_H */ + diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 52739f82df1..0eea2bfe111 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -12,8 +12,9 @@ * - Next, the caller puts together the RPC message, stuffs it into * the request struct, and calls xprt_transmit(). * - xprt_transmit sends the message and installs the caller on the - * transport's wait list. At the same time, it installs a timer that - * is run after the packet's timeout has expired. + * transport's wait list. At the same time, if a reply is expected, + * it installs a timer that is run after the packet's timeout has + * expired. * - When a packet arrives, the data_ready handler walks the list of * pending requests for that transport. If a matching XID is found, the * caller is woken up, and the timer removed. 
@@ -46,6 +47,8 @@ #include #include +#include "sunrpc.h" + /* * Local variables */ @@ -873,7 +876,10 @@ void xprt_transmit(struct rpc_task *task) dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); if (!req->rq_received) { - if (list_empty(&req->rq_list)) { + if (list_empty(&req->rq_list) && rpc_reply_expected(task)) { + /* + * Add to the list only if we're expecting a reply + */ spin_lock_bh(&xprt->transport_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, @@ -908,8 +914,13 @@ void xprt_transmit(struct rpc_task *task) /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; - else if (!req->rq_received) + else if (!req->rq_received && rpc_reply_expected(task)) { + /* + * Sleep on the pending queue since + * we're expecting a reply. + */ rpc_sleep_on(&xprt->pending, task, xprt_timer); + } spin_unlock_bh(&xprt->transport_lock); } @@ -982,11 +993,17 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) */ void xprt_release(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt; struct rpc_rqst *req; + int is_bc_request; if (!(req = task->tk_rqstp)) return; + + /* Preallocated backchannel request? */ + is_bc_request = bc_prealloc(req); + + xprt = req->rq_xprt; rpc_count_iostats(task); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); @@ -999,10 +1016,19 @@ void xprt_release(struct rpc_task *task) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); - xprt->ops->buf_free(req->rq_buffer); + if (!bc_prealloc(req)) + xprt->ops->buf_free(req->rq_buffer); task->tk_rqstp = NULL; if (req->rq_release_snd_buf) req->rq_release_snd_buf(req); + + /* + * Early exit if this is a backchannel preallocated request. + * There is no need to have it added to the RPC slot list. + */ + if (is_bc_request) + return; + memset(req, 0, sizeof(*req)); /* mark unused */ dprintk("RPC: %5u release request %p\n", task->tk_pid, req); -- cgit v1.2.3-70-g09d2 From 0d90ba1cd416525c4825c111db862d8b15a02e9b Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga Date: Wed, 1 Apr 2009 09:23:04 -0400 Subject: nfs41: Backchannel callback service helper routines Executes the backchannel task on the RPC state machine using the existing open connection previously established by the client. Signed-off-by: Ricardo Labiaga nfs41: Add bc_svc.o to sunrpc Makefile. [nfs41: bc_send() does not need to be exported outside RPC module] [nfs41: xprt_free_bc_request() need not be exported outside RPC module] Signed-off-by: Ricardo Labiaga Signed-off-by: Benny Halevy [Update copyright] Signed-off-by: Ricardo Labiaga Signed-off-by: Benny Halevy --- include/linux/sunrpc/bc_xprt.h | 3 ++ net/sunrpc/Makefile | 2 +- net/sunrpc/bc_svc.c | 81 ++++++++++++++++++++++++++++++++++++++++++ net/sunrpc/xprtsock.c | 3 ++ 4 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 net/sunrpc/bc_svc.c (limited to 'include/linux') diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index 5965ae4f902..6508f0dc0ef 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -29,12 +29,15 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #ifdef CONFIG_NFS_V4_1 struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt); void xprt_free_bc_request(struct rpc_rqst *req); int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); void xprt_destroy_backchannel(struct rpc_xprt *, int max_reqs); +void bc_release_request(struct rpc_task *); +int bc_send(struct rpc_rqst *req); #else /* CONFIG_NFS_V4_1 */ static inline int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 4a01f9684b8..db73fd2a3f0 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -13,6 +13,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o \ svc_xprt.o -sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o +sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c new file mode 100644 index 00000000000..13f214f5312 --- /dev/null +++ b/net/sunrpc/bc_svc.c @@ -0,0 +1,81 @@ +/****************************************************************************** + +(c) 2007 Network Appliance, Inc. All Rights Reserved. +(c) 2009 NetApp. All Rights Reserved. + +NetApp provides this source code under the GPL v2 License. +The GPL v2 license is available at +http://opensource.org/licenses/gpl-license.php. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ + +/* + * The NFSv4.1 callback service helper routines. + * They implement the transport level processing required to send the + * reply over an existing open connection previously established by the client. 
+ */
+
+#if defined(CONFIG_NFS_V4_1)
+
+#include
+
+#include
+#include
+#include
+
+#define RPCDBG_FACILITY RPCDBG_SVCDSP
+
+void bc_release_request(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ dprintk("RPC: bc_release_request: task= %p\n", task);
+
+ /*
+ * Release this request only if it's a backchannel
+ * preallocated request
+ */
+ if (!bc_prealloc(req))
+ return;
+ xprt_free_bc_request(req);
+}
+
+/* Empty callback ops */
+static const struct rpc_call_ops nfs41_callback_ops = {
+};
+
+
+/*
+ * Send the callback reply
+ */
+int bc_send(struct rpc_rqst *req)
+{
+ struct rpc_task *task;
+ int ret;
+
+ dprintk("RPC: bc_send req= %p\n", req);
+ task = rpc_run_bc_task(req, &nfs41_callback_ops);
+ if (IS_ERR(task))
+ ret = PTR_ERR(task);
+ else {
+ BUG_ON(atomic_read(&task->tk_count) != 1);
+ ret = task->tk_status;
+ rpc_put_task(task);
+ }
+ dprintk("RPC: bc_send ret= %d\n", ret);
+ return ret;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e3e3a57116f..8a721867b60 100644
--- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c
@@ -2183,6 +2183,9 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 .buf_free = rpc_free,
 .send_request = xs_tcp_send_request,
 .set_retrans_timeout = xprt_set_retrans_timeout_def,
+#if defined(CONFIG_NFS_V4_1)
+ .release_request = bc_release_request,
+#endif /* CONFIG_NFS_V4_1 */
 .close = xs_tcp_close,
 .destroy = xs_destroy,
 .print_stats = xs_tcp_print_stats,
-- cgit v1.2.3-70-g09d2

From 4d6bbb6233c9cf23822a2f66f8470c9f40854b77 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga
Date: Wed, 1 Apr 2009 09:23:07 -0400
Subject: nfs41: Backchannel bc_svc_process()

Implement the NFSv4.1 backchannel service. Invokes the common callback processing logic svc_process_common() to authenticate the call and dispatch the appropriate NFSv4.1 XDR decoder and operation procedure. It then invokes bc_send() to send the reply over the same connection. bc_send() is implemented in a separate patch.

At this time there is no slot validation or reply cache handling.
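For orientation, a callback service thread built on the sv_cb_list/sv_cb_waitq members added earlier in this series could feed bc_svc_process() roughly as follows (an illustrative sketch only; the actual NFSv4.1 callback thread is added on the NFS side):

    for (;;) {
        /* Sleep until a backchannel request is queued */
        wait_event_interruptible(serv->sv_cb_waitq,
                                 !list_empty(&serv->sv_cb_list));

        spin_lock_bh(&serv->sv_cb_lock);
        req = list_first_entry(&serv->sv_cb_list,
                               struct rpc_rqst, rq_bc_list);
        list_del(&req->rq_bc_list);
        spin_unlock_bh(&serv->sv_cb_lock);

        /* Authenticate, dispatch, and reply via bc_send() */
        error = bc_svc_process(serv, req, rqstp);
    }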
[nfs41: Preallocate rpc_rqst receive buffer for handling callbacks]
Signed-off-by: Ricardo Labiaga
Signed-off-by: Benny Halevy
[Move bc_svc_process() declaration to correct patch]
Signed-off-by: Ricardo Labiaga
Signed-off-by: Benny Halevy
--- include/linux/sunrpc/svc.h | 2 ++ net/sunrpc/svc.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4a8afbd6200..16043c4a8bf 100644
--- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h
@@ -419,6 +419,8 @@ int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void svc_destroy(struct svc_serv *);
 int svc_process(struct svc_rqst *);
+int bc_svc_process(struct svc_serv *, struct rpc_rqst *,
+ struct svc_rqst *);
 int svc_register(const struct svc_serv *, const int, const unsigned short, const unsigned short);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index bfda66db2f4..06b52e465f4 100644
--- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #define RPCDBG_FACILITY RPCDBG_SVCDSP
@@ -1239,6 +1240,54 @@ svc_process(struct svc_rqst *rqstp) return svc_send(rqstp); }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Process a backchannel RPC request that arrived over an existing
+ * outbound connection
+ */
+int
+bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
+ struct svc_rqst *rqstp)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ int error;
+
+ /* Build the svc_rqst used by the common processing routine */
+ rqstp->rq_xid = req->rq_xid;
+ rqstp->rq_prot = req->rq_xprt->prot;
+ rqstp->rq_server = serv;
+
+ rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
+ memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen);
+ memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg));
+ memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res));
+
+ /* reset result send buffer "put" position */
+ resv->iov_len = 0;
+
+ if (rqstp->rq_prot != IPPROTO_TCP) {
+ printk(KERN_ERR "No support for Non-TCP transports!\n");
+ BUG();
+ }
+
+ /*
+ * Skip the next two words because they've already been
+ * processed in the transport
+ */
+ svc_getu32(argv); /* XID */
+ svc_getnl(argv); /* CALLDIR */
+
+ error = svc_process_common(rqstp, argv, resv);
+ if (error <= 0)
+ return error;
+
+ memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
+ return bc_send(req);
+}
+EXPORT_SYMBOL(bc_svc_process);
+#endif /* CONFIG_NFS_V4_1 */
+
 /* * Return (transport-specific) limit on the rpc payload. */
-- cgit v1.2.3-70-g09d2

From 7652e5a09ba319241607b22d9055ce93fd5b8039 Mon Sep 17 00:00:00 2001
From: Benny Halevy
Date: Wed, 1 Apr 2009 09:23:09 -0400
Subject: nfs41: sunrpc: provide functions to create and destroy a svc_xprt for backchannel use

For nfs41 callbacks we need an svc_xprt to process requests coming up the backchannel socket as rpc_rqst's that are transformed into svc_rqst's that need a rq_xprt to be processed. The svc_{udp,tcp}_create methods are too heavy for this job, as svc_create_socket creates an actual socket to listen on, while for nfs41 we're "reusing" the forechannel's socket.
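A sketch of the intended use (the serv->bc_xprt pointer shown here is added by a following patch in this series; only TCP is expected for the backchannel):

    /* Create a pseudo svc_xprt; no listening socket is set up,
     * since the forechannel's connection is reused. */
    serv->bc_xprt = svc_sock_create(serv, IPPROTO_TCP);
    if (serv->bc_xprt == NULL)
        return -ENOMEM;

    /* At shutdown, release it again */
    svc_sock_destroy(serv->bc_xprt);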
Signed-off-by: Benny Halevy
--- include/linux/sunrpc/svcsock.h | 2 ++ net/sunrpc/svcsock.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 483e10380aa..6bb1ec4ae31 100644
--- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h
@@ -42,6 +42,8 @@ int svc_sock_names(char *buf, struct svc_serv *serv, char *toclose);
 int svc_addsock(struct svc_serv *serv, int fd, char *name_return);
 void svc_init_xprt_sock(void);
 void svc_cleanup_xprt_sock(void);
+struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot);
+void svc_sock_destroy(struct svc_xprt *);
 /* * svc_makesock socket characteristics
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 9d504234af4..a2a03e50053 100644
--- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c
@@ -1327,3 +1327,42 @@ static void svc_sock_free(struct svc_xprt *xprt) sock_release(svsk->sk_sock); kfree(svsk); }
+
+/*
+ * Create a svc_xprt.
+ *
+ * For internal use only (e.g. nfsv4.1 backchannel).
+ * Callers should typically use the xpo_create() method.
+ */
+struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot)
+{
+ struct svc_sock *svsk;
+ struct svc_xprt *xprt = NULL;
+
+ dprintk("svc: %s\n", __func__);
+ svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
+ if (!svsk)
+ goto out;
+
+ xprt = &svsk->sk_xprt;
+ if (prot == IPPROTO_TCP)
+ svc_xprt_init(&svc_tcp_class, xprt, serv);
+ else if (prot == IPPROTO_UDP)
+ svc_xprt_init(&svc_udp_class, xprt, serv);
+ else
+ BUG();
+out:
+ dprintk("svc: %s return %p\n", __func__, xprt);
+ return xprt;
+}
+EXPORT_SYMBOL_GPL(svc_sock_create);
+
+/*
+ * Destroy a svc_sock.
+ */
+void svc_sock_destroy(struct svc_xprt *xprt)
+{
+ if (xprt)
+ kfree(container_of(xprt, struct svc_sock, sk_xprt));
+}
+EXPORT_SYMBOL_GPL(svc_sock_destroy);
-- cgit v1.2.3-70-g09d2

From 9c9f3f5fa62cc4959e4d4d1cf1ec74f2d6ac1197 Mon Sep 17 00:00:00 2001
From: Andy Adamson
Date: Wed, 1 Apr 2009 09:23:10 -0400
Subject: nfs41: sunrpc: add a struct svc_xprt pointer to struct svc_serv for backchannel use

This svc_xprt is passed on to the callback service thread to be used later to process incoming svc_rqst's.

Signed-off-by: Benny Halevy
--- include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 16043c4a8bf..ea8009695c6 100644
--- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h
@@ -103,6 +103,7 @@ struct svc_serv {
 spinlock_t sv_cb_lock; /* protects the sv_cb_list */
 wait_queue_head_t sv_cb_waitq; /* sleep here if there are no * entries in the sv_cb_list */
+ struct svc_xprt *bc_xprt;
 #endif /* CONFIG_NFS_V4_1 */
 };
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 06b52e465f4..b35048fabe2 100644
--- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c
@@ -487,6 +487,10 @@ svc_destroy(struct svc_serv *serv)
 if (svc_serv_is_pooled(serv))
 svc_pool_map_put();
+#if defined(CONFIG_NFS_V4_1)
+ svc_sock_destroy(serv->bc_xprt);
+#endif /* CONFIG_NFS_V4_1 */
+
 svc_unregister(serv);
 kfree(serv->sv_pools);
 kfree(serv);
-- cgit v1.2.3-70-g09d2

From dd2b63d049480979016b959abc2d141cdddb1389 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga
Date: Wed, 1 Apr 2009 09:23:28 -0400
Subject: nfs41: Rename rq_received to rq_reply_bytes_recvd

The 'rq_received' member of 'struct rpc_rqst' is used to track when we have received a reply to our request.
With v4.1, the backchannel can now accept callback requests over the existing connection. Rename this field to make it clear that it is only used for tracking reply bytes and not all bytes received on the connection. Signed-off-by: Ricardo Labiaga Signed-off-by: Benny Halevy --- include/linux/sunrpc/xprt.h | 3 ++- net/sunrpc/backchannel_rqst.c | 2 +- net/sunrpc/clnt.c | 8 ++++---- net/sunrpc/stats.c | 2 +- net/sunrpc/xprt.c | 15 ++++++++------- 5 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 55c6c37e249..1175d58efc2 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -67,7 +67,8 @@ struct rpc_rqst { struct rpc_task * rq_task; /* RPC task data */ __be32 rq_xid; /* request XID */ int rq_cong; /* has incremented xprt->cong */ - int rq_received; /* receive completed */ + int rq_reply_bytes_recvd; /* number of reply */ + /* bytes received */ u32 rq_seqno; /* gss seq no. used on req. */ int rq_enc_pages_num; struct page **rq_enc_pages; /* scratch pages for use by diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index f56e18a2349..5a7d342e308 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -230,7 +230,7 @@ struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt) if (req != NULL) { set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); - req->rq_received = 0; + req->rq_reply_bytes_recvd = 0; req->rq_bytes_sent = 0; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f3e93b8eb90..5bc2f45bddf 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1258,8 +1258,8 @@ call_status(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; int status; - if (req->rq_received > 0 && !req->rq_bytes_sent) - task->tk_status = req->rq_received; + if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent) + task->tk_status = req->rq_reply_bytes_recvd; dprint_status(task); @@ -1376,7 +1376,7 @@ call_decode(struct rpc_task *task) /* * Ensure that we see all writes made by xprt_complete_rqst() - * before it changed req->rq_received. + * before it changed req->rq_reply_bytes_recvd. 
*/ smp_rmb(); req->rq_rcv_buf.len = req->rq_private_buf.len; @@ -1417,7 +1417,7 @@ out_retry: task->tk_status = 0; /* Note: rpc_verify_header() may have freed the RPC slot */ if (task->tk_rqstp == req) { - req->rq_received = req->rq_rcv_buf.len = 0; + req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) xprt_conditional_disconnect(task->tk_xprt, req->rq_connect_cookie); diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 8487aa0f1f5..1b4e6791ecf 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -156,7 +156,7 @@ void rpc_count_iostats(struct rpc_task *task) op_metrics->om_timeouts += task->tk_timeouts; op_metrics->om_bytes_sent += task->tk_bytes_sent; - op_metrics->om_bytes_recv += req->rq_received; + op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; queue = (long)req->rq_xtime - task->tk_start; if (queue < 0) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index c144611223f..f412a852bc7 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -806,9 +806,10 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) list_del_init(&req->rq_list); req->rq_private_buf.len = copied; - /* Ensure all writes are done before we update req->rq_received */ + /* Ensure all writes are done before we update */ + /* req->rq_reply_bytes_recvd */ smp_wmb(); - req->rq_received = copied; + req->rq_reply_bytes_recvd = copied; rpc_wake_up_queued_task(&xprt->pending, task); } EXPORT_SYMBOL_GPL(xprt_complete_rqst); @@ -823,7 +824,7 @@ static void xprt_timer(struct rpc_task *task) dprintk("RPC: %5u xprt_timer\n", task->tk_pid); spin_lock_bh(&xprt->transport_lock); - if (!req->rq_received) { + if (!req->rq_reply_bytes_recvd) { if (xprt->ops->timer) xprt->ops->timer(task); } else @@ -845,8 +846,8 @@ int xprt_prepare_transmit(struct rpc_task *task) dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); spin_lock_bh(&xprt->transport_lock); - if (req->rq_received && !req->rq_bytes_sent) { - err = req->rq_received; + if (req->rq_reply_bytes_recvd && !req->rq_bytes_sent) { + err = req->rq_reply_bytes_recvd; goto out_unlock; } if (!xprt->ops->reserve_xprt(task)) @@ -875,7 +876,7 @@ void xprt_transmit(struct rpc_task *task) dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); - if (!req->rq_received) { + if (!req->rq_reply_bytes_recvd) { if (list_empty(&req->rq_list) && rpc_reply_expected(task)) { /* * Add to the list only if we're expecting a reply @@ -914,7 +915,7 @@ void xprt_transmit(struct rpc_task *task) /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; - else if (!req->rq_received && rpc_reply_expected(task)) { + else if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) { /* * Sleep on the pending queue since * we're expecting a reply. -- cgit v1.2.3-70-g09d2 From f8625a6a4bb76207302be58453603d8e324df490 Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga Date: Wed, 1 Apr 2009 09:23:33 -0400 Subject: nfs41: Backchannel: Add a backchannel slot table to the session Defines a new 'struct nfs4_slot_table' in the 'struct nfs4_session' for use by the backchannel. Initializes, resets, and destroys the backchannel slot table in the same manner the forechannel slot table is initialized, reset, and destroyed. The sequenceid for each slot in the backchannel slot table is initialized to 0, whereas the forechannel slotid's sequenceid is set to 1. 
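In other words, session setup now initializes both tables, differing only in the initial sequence id (condensed from the diff below):

    status = nfs4_init_slot_table(&session->fc_slot_table,
                                  session->fc_attrs.max_reqs, 1);
    if (status == 0)
        status = nfs4_init_slot_table(&session->bc_slot_table,
                                      session->bc_attrs.max_reqs, 0);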
Signed-off-by: Ricardo Labiaga Signed-off-by: Benny Halevy --- fs/nfs/nfs4proc.c | 48 +++++++++++++++++++++++++++++++++++------------ include/linux/nfs_fs_sb.h | 2 +- 2 files changed, 37 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c3019ad8589..57dabb8a048 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4414,9 +4414,30 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session) session->fc_attrs.max_reqs, session->fc_slot_table.max_slots, 1); + if (status) + return status; + + status = nfs4_reset_slot_table(&session->bc_slot_table, + session->bc_attrs.max_reqs, + session->bc_slot_table.max_slots, + 0); return status; } +/* Destroy the slot table */ +static void nfs4_destroy_slot_tables(struct nfs4_session *session) +{ + if (session->fc_slot_table.slots != NULL) { + kfree(session->fc_slot_table.slots); + session->fc_slot_table.slots = NULL; + } + if (session->bc_slot_table.slots != NULL) { + kfree(session->bc_slot_table.slots); + session->bc_slot_table.slots = NULL; + } + return; +} + /* * Initialize slot table */ @@ -4470,17 +4491,15 @@ static int nfs4_init_slot_tables(struct nfs4_session *session) status = nfs4_init_slot_table(&session->fc_slot_table, session->fc_attrs.max_reqs, 1); - return status; -} + if (status) + return status; -/* Destroy the slot table */ -static void nfs4_destroy_slot_table(struct nfs4_session *session) -{ - if (session->fc_slot_table.slots == NULL) - return; - kfree(session->fc_slot_table.slots); - session->fc_slot_table.slots = NULL; - return; + status = nfs4_init_slot_table(&session->bc_slot_table, + session->bc_attrs.max_reqs, 0); + if (status) + nfs4_destroy_slot_tables(session); + + return status; } struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) @@ -4503,7 +4522,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) tbl = &session->fc_slot_table; spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_wait_queue(&tbl->slot_tbl_waitq, "Slot table"); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + + tbl = &session->bc_slot_table; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + session->clp = clp; return session; } @@ -4515,7 +4539,7 @@ void nfs4_destroy_session(struct nfs4_session *session) __func__, session->clp->cl_rpcclient->cl_xprt); xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt, NFS41_BC_MIN_CALLBACKS); - nfs4_destroy_slot_table(session); + nfs4_destroy_slot_tables(session); kfree(session); } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index d0902ccec9c..19fe15d1204 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -205,7 +205,7 @@ struct nfs4_session { struct nfs4_channel_attrs fc_attrs; struct nfs4_slot_table fc_slot_table; struct nfs4_channel_attrs bc_attrs; - /* back channel has one slot */ + struct nfs4_slot_table bc_slot_table; struct nfs_client *clp; }; -- cgit v1.2.3-70-g09d2 From cdc2ae6d6a30df8fd92c5e300d0e3005e13eb6b0 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Thu, 18 Jun 2009 08:46:47 +1000 Subject: md: fix some comments. 1/ Raid5 has learned to take over also raid4 and raid6 arrays. 2/ new_chunk in mdp_superblock_1 is in sectors, not bytes. 
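Readers of new_chunk therefore need to convert when a byte count is wanted; e.g. (illustrative snippet, 512-byte sectors):

    /* new_chunk is stored in 512-byte sectors, not bytes */
    unsigned int new_chunk_bytes = le32_to_cpu(sb->new_chunk) << 9;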
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 -- include/linux/raid/md_p.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 72e8a324dcf..00934415675 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5231,8 +5231,6 @@ static void *raid5_takeover(mddev_t *mddev) * raid1 - if there are two drives. We need to know the chunk size * raid4 - trivial - just use a raid4 layout. * raid6 - Providing it is a *_6 layout - * - * For now, just do raid1 */ if (mddev->level == 1) diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 6ba830fa853..ffa2efbbe38 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -232,7 +232,7 @@ struct mdp_superblock_1 { __le64 reshape_position; /* next address in array-space for reshape */ __le32 delta_disks; /* change in number of raid_disks */ __le32 new_layout; /* new layout */ - __le32 new_chunk; /* new chunk size (bytes) */ + __le32 new_chunk; /* new chunk size (512byte sectors) */ __u8 pad1[128-124]; /* set to 0 when written */ /* constant this-device information - 64 bytes */ -- cgit v1.2.3-70-g09d2 From 6c9dc4255108bab4ef5c177d369b99c3c23492a7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 17 Jun 2009 18:02:10 -0700 Subject: lockd: Update NSM state from SM_MON replies When rpc.statd starts up in user space at boot time, it attempts to write the latest NSM local state number into /proc/sys/fs/nfs/nsm_local_state. If lockd.ko isn't loaded yet (as is the case in most configurations), that file doesn't exist, thus the kernel's NSM state remains set to its initial value of zero during lockd operation. This is a problem because rpc.statd and lockd use the NSM state number to prevent repeated lock recovery on rebooted hosts. If lockd sends a zero NSM state, but then a delayed SM_NOTIFY with a real NSM state number is received, there is no way for lockd or rpc.statd to distinguish that stale SM_NOTIFY from an actual reboot. Thus lock recovery could be performed after the rebooted host has already started reclaiming locks, and those locks will be lost. We could change /etc/init.d/nfslock so it always modprobes lockd.ko before starting rpc.statd. However, if lockd.ko is ever unloaded and reloaded, we are back at square one, since the NSM state is not preserved across an unload/reload cycle. This may happen frequently on clients that use automounter. A period of NFS inactivity causes lockd.ko to be unloaded, and the kernel loses its NSM state setting. Instead, let's use the fact that rpc.statd plants the local system's NSM state in every SM_MON (and SM_UNMON) reply. lockd performs a synchronous SM_MON upcall to the local rpc.statd _before_ sending its first NLM request to a new remote. This would permit rpc.statd to provide the current NSM state to lockd, even after lockd.ko had been unloaded and reloaded. Note that NLMPROC_LOCK arguments are constructed before the nsm_monitor() call, so we have to rearrange argument construction very slightly to make this all work out. And, the kernel appears to treat NSM state as a u32 (see struct nlm_args and nsm_res). Make nsm_local_state a u32 as well, to ensure we don't get bogus comparison results. 
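The resulting ordering in nlmclnt_lock() is (condensed sketch of the diff below; error handling elided):

    /* The SM_MON reply refreshes nsm_local_state before the
     * first NLM request goes to a new remote... */
    if (nsm_monitor(host) < 0)
        goto out;
    /* ...so the NLM arguments are stamped only afterwards */
    req->a_args.state = nsm_local_state;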
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 2 +- fs/lockd/mon.c | 18 ++++++++++++------ include/linux/lockd/lockd.h | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 273e229353f..f2fdcbce143 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -126,7 +126,6 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) struct nlm_lock *lock = &argp->lock; nlmclnt_next_cookie(&argp->cookie); - argp->state = nsm_local_state; memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh)); lock->caller = utsname()->nodename; lock->oh.data = req->a_owner; @@ -521,6 +520,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) if (nsm_monitor(host) < 0) goto out; + req->a_args.state = nsm_local_state; fl->fl_flags |= FL_ACCESS; status = do_vfs_lock(fl); diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 6d5d4a4169e..38385336614 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(nsm_lock); /* * Local NSM state */ -int __read_mostly nsm_local_state; +u32 __read_mostly nsm_local_state; int __read_mostly nsm_use_hostnames; static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) @@ -184,13 +184,19 @@ int nsm_monitor(const struct nlm_host *host) nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); - if (res.status != 0) + if (unlikely(res.status != 0)) status = -EIO; - if (status < 0) + if (unlikely(status < 0)) { printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); - else - nsm->sm_monitored = 1; - return status; + return status; + } + + nsm->sm_monitored = 1; + if (unlikely(nsm_local_state != res.state)) { + nsm_local_state = res.state; + dprintk("lockd: NSM state changed to %d\n", nsm_local_state); + } + return 0; } /** diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 51855dfd8ad..c325b187966 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -195,7 +195,7 @@ extern struct svc_procedure nlmsvc_procedures4[]; extern int nlmsvc_grace_period; extern unsigned long nlmsvc_timeout; extern int nsm_use_hostnames; -extern int nsm_local_state; +extern u32 nsm_local_state; /* * Lockd client functions -- cgit v1.2.3-70-g09d2 From 2ad780978b7c0c3e7877949f098cbd06e7c73839 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 17 Jun 2009 18:02:11 -0700 Subject: NFS: Clean up MNT program definitions Clean up: Relocate MNT program procedure number definitions to the only file that uses them. Relocate the version number definitions, which are shared, to nfs.h. Remove duplicate program number definitions. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 32 ++++++++++++++++++++++++++++---- fs/nfs/nfsroot.c | 3 +++ include/linux/nfs.h | 5 +++-- include/linux/nfs2.h | 7 ------- include/linux/nfs3.h | 5 ----- 5 files changed, 34 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index ca905a5bb1b..af45a374d56 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -20,6 +20,30 @@ # define NFSDBG_FACILITY NFSDBG_MOUNT #endif +/* + * Defined by RFC 1094, section A.5 + */ +enum { + MOUNTPROC_NULL = 0, + MOUNTPROC_MNT = 1, + MOUNTPROC_DUMP = 2, + MOUNTPROC_UMNT = 3, + MOUNTPROC_UMNTALL = 4, + MOUNTPROC_EXPORT = 5, +}; + +/* + * Defined by RFC 1813, section 5.2 + */ +enum { + MOUNTPROC3_NULL = 0, + MOUNTPROC3_MNT = 1, + MOUNTPROC3_DUMP = 2, + MOUNTPROC3_UMNT = 3, + MOUNTPROC3_UMNTALL = 4, + MOUNTPROC3_EXPORT = 5, +}; + static struct rpc_program mnt_program; struct mnt_fhstatus { @@ -68,7 +92,7 @@ int nfs_mount(struct nfs_mount_request *info) if (info->version == NFS_MNT3_VERSION) msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; else - msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; status = rpc_call_sync(mnt_clnt, &msg, 0); rpc_shutdown_client(mnt_clnt); @@ -145,13 +169,13 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, #define MNT_fhstatus3_sz (1 + 16) static struct rpc_procinfo mnt_procedures[] = { - [MNTPROC_MNT] = { - .p_proc = MNTPROC_MNT, + [MOUNTPROC_MNT] = { + .p_proc = MOUNTPROC_MNT, .p_encode = (kxdrproc_t) xdr_encode_dirpath, .p_decode = (kxdrproc_t) xdr_decode_fhstatus, .p_arglen = MNT_dirpath_sz, .p_replen = MNT_fhstatus_sz, - .p_statidx = MNTPROC_MNT, + .p_statidx = MOUNTPROC_MNT, .p_name = "MOUNT", }, }; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index e3ed5908820..24c1b93874c 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -92,6 +92,9 @@ #undef NFSROOT_DEBUG #define NFSDBG_FACILITY NFSDBG_ROOT +/* Default port to use if server is not running a portmapper */ +#define NFS_MNT_PORT 627 + /* Default path we try to mount. "%s" gets replaced by our IP address */ #define NFS_ROOT "/tftpboot/%s" diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 214d499718f..f387919bbc5 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -25,8 +25,9 @@ #define NFSMODE_SOCK 0140000 #define NFSMODE_FIFO 0010000 -#define NFS_MNT_PROGRAM 100005 -#define NFS_MNT_PORT 627 +#define NFS_MNT_PROGRAM 100005 +#define NFS_MNT_VERSION 1 +#define NFS_MNT3_VERSION 3 /* * NFS stats. 
The good thing with these values is that NFSv3 errors are diff --git a/include/linux/nfs2.h b/include/linux/nfs2.h index 0ed9517138f..fde24b30cc9 100644 --- a/include/linux/nfs2.h +++ b/include/linux/nfs2.h @@ -64,11 +64,4 @@ struct nfs2_fh { #define NFSPROC_READDIR 16 #define NFSPROC_STATFS 17 -#define NFS_MNT_PROGRAM 100005 -#define NFS_MNT_VERSION 1 -#define MNTPROC_NULL 0 -#define MNTPROC_MNT 1 -#define MNTPROC_UMNT 3 -#define MNTPROC_UMNTALL 4 - #endif /* _LINUX_NFS2_H */ diff --git a/include/linux/nfs3.h b/include/linux/nfs3.h index 539f3b550ea..ac33806ec7f 100644 --- a/include/linux/nfs3.h +++ b/include/linux/nfs3.h @@ -88,12 +88,7 @@ struct nfs3_fh { #define NFS3PROC_PATHCONF 20 #define NFS3PROC_COMMIT 21 -#define NFS_MNT3_PROGRAM 100005 #define NFS_MNT3_VERSION 3 -#define MOUNTPROC3_NULL 0 -#define MOUNTPROC3_MNT 1 -#define MOUNTPROC3_UMNT 3 -#define MOUNTPROC3_UMNTALL 4 #if defined(__KERNEL__) -- cgit v1.2.3-70-g09d2 From 275582031f9b3597a1b973f3ff617adfe639faa2 Mon Sep 17 00:00:00 2001 From: Alexander Chiang Date: Wed, 10 Jun 2009 19:55:14 +0000 Subject: ACPI: Introduce acpi_is_root_bridge() Returns whether an ACPI CA node is a PCI root bridge or not. This API is generically useful, and shouldn't just be a hotplug function. The implementation becomes much simpler as well. Signed-off-by: Alex Chiang Acked-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/pci_root.c | 24 +++++++++++++++++++++++ drivers/pci/hotplug/acpi_pcihp.c | 40 ++------------------------------------ drivers/pci/hotplug/acpiphp_glue.c | 2 +- include/acpi/acpi_bus.h | 1 + include/linux/pci_hotplug.h | 1 - 5 files changed, 28 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index ca8dba3b40b..888cb9f5c5f 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -142,6 +142,30 @@ acpi_handle acpi_get_pci_rootbridge_handle(unsigned int seg, unsigned int bus) EXPORT_SYMBOL_GPL(acpi_get_pci_rootbridge_handle); +/** + * acpi_is_root_bridge - determine whether an ACPI CA node is a PCI root bridge + * @handle - the ACPI CA node in question. + * + * Note: we could make this API take a struct acpi_device * instead, but + * for now, it's more convenient to operate on an acpi_handle. 
+ */ +int acpi_is_root_bridge(acpi_handle handle) +{ + int ret; + struct acpi_device *device; + + ret = acpi_bus_get_device(handle, &device); + if (ret) + return 0; + + ret = acpi_match_device_ids(device, root_device_ids); + if (ret) + return 0; + else + return 1; +} +EXPORT_SYMBOL_GPL(acpi_is_root_bridge); + static acpi_status get_root_bridge_busnr_callback(struct acpi_resource *resource, void *data) { diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index fbc63d5e459..eb159587d0b 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -354,7 +354,7 @@ acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus, status = acpi_run_hpp(handle, hpp); if (ACPI_SUCCESS(status)) break; - if (acpi_root_bridge(handle)) + if (acpi_is_root_bridge(handle)) break; status = acpi_get_parent(handle, &phandle); if (ACPI_FAILURE(status)) @@ -428,7 +428,7 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *pdev, u32 flags) status = acpi_run_oshp(handle); if (ACPI_SUCCESS(status)) goto got_one; - if (acpi_root_bridge(handle)) + if (acpi_is_root_bridge(handle)) break; chandle = handle; status = acpi_get_parent(chandle, &handle); @@ -449,42 +449,6 @@ got_one: } EXPORT_SYMBOL(acpi_get_hp_hw_control_from_firmware); -/* acpi_root_bridge - check to see if this acpi object is a root bridge - * - * @handle - the acpi object in question. - */ -int acpi_root_bridge(acpi_handle handle) -{ - acpi_status status; - struct acpi_device_info *info; - struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; - int i; - - status = acpi_get_object_info(handle, &buffer); - if (ACPI_SUCCESS(status)) { - info = buffer.pointer; - if ((info->valid & ACPI_VALID_HID) && - !strcmp(PCI_ROOT_HID_STRING, - info->hardware_id.value)) { - kfree(buffer.pointer); - return 1; - } - if (info->valid & ACPI_VALID_CID) { - for (i=0; i < info->compatibility_id.count; i++) { - if (!strcmp(PCI_ROOT_HID_STRING, - info->compatibility_id.id[i].value)) { - kfree(buffer.pointer); - return 1; - } - } - } - kfree(buffer.pointer); - } - return 0; -} -EXPORT_SYMBOL_GPL(acpi_root_bridge); - - static int is_ejectable(acpi_handle handle) { acpi_status status; diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 3a6064bce56..fc6636e3300 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -1631,7 +1631,7 @@ find_root_bridges(acpi_handle handle, u32 lvl, void *context, void **rv) { int *count = (int *)context; - if (acpi_root_bridge(handle)) { + if (acpi_is_root_bridge(handle)) { acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY, handle_hotplug_event_bridge, NULL); (*count)++; diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index c34b1102290..96d593ee485 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -369,6 +369,7 @@ struct device *acpi_get_physical_pci_device(acpi_handle); /* helper */ acpi_handle acpi_get_child(acpi_handle, acpi_integer); +int acpi_is_root_bridge(acpi_handle); acpi_handle acpi_get_pci_rootbridge_handle(unsigned int, unsigned int); #define DEVICE_ACPI_HANDLE(dev) ((acpi_handle)((dev)->archdata.acpi_handle)) diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 20998746518..a3576ef9fc7 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -226,7 +226,6 @@ struct hotplug_params { extern acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus, struct hotplug_params *hpp); int 
acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags); -int acpi_root_bridge(acpi_handle handle); int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle); int acpi_pci_detect_ejectable(struct pci_bus *pbus); #endif -- cgit v1.2.3-70-g09d2 From 7522060c95395f479ee4a6af3bbf9e097e92e48f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 18 Jun 2009 08:00:17 +0200 Subject: perf report: Add validation of call-chain entries Add boundary checks for call-chain events. In case of corrupted entries we could crash otherwise. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 20 ++++++------ tools/perf/builtin-report.c | 74 +++++++++++++++++++++++++++----------------- 2 files changed, 56 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index eccae437fe3..a7d3a61a59b 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -337,6 +337,16 @@ enum perf_event_type { */ }; +#define MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + __u16 nr; + __u16 hv; + __u16 kernel; + __u16 user; + __u64 ip[MAX_STACK_DEPTH]; +}; + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -652,16 +662,6 @@ extern void perf_counter_fork(struct task_struct *tsk); extern void perf_counter_task_migration(struct task_struct *task, int cpu); -#define MAX_STACK_DEPTH 255 - -struct perf_callchain_entry { - u16 nr; - u16 hv; - u16 kernel; - u16 user; - u64 ip[MAX_STACK_DEPTH]; -}; - extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_paranoid; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 986834623b4..e14e9867617 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -39,6 +39,8 @@ static int dump_trace = 0; #define cdprintf(x...) do { if (dump_trace) color_fprintf(stdout, color, x); } while (0) static int verbose; +#define eprintf(x...) 
do { if (verbose) fprintf(stderr, x); } while (0) + static int full_paths; static unsigned long page_size; @@ -47,14 +49,6 @@ static unsigned long mmap_window = 32; static char *parent_pattern = "^sys_|^do_page_fault"; static regex_t parent_regex; -struct ip_chain_event { - __u16 nr; - __u16 hv; - __u16 kernel; - __u16 user; - __u64 ips[]; -}; - struct ip_event { struct perf_event_header header; __u64 ip; @@ -131,15 +125,11 @@ static struct dso *dsos__findnew(const char *name) nr = dso__load(dso, NULL, verbose); if (nr < 0) { - if (verbose) - fprintf(stderr, "Failed to open: %s\n", name); + eprintf("Failed to open: %s\n", name); goto out_delete_dso; } - if (!nr && verbose) { - fprintf(stderr, - "No symbols found in: %s, maybe install a debug package?\n", - name); - } + if (!nr) + eprintf("No symbols found in: %s, maybe install a debug package?\n", name); dsos__add(dso); @@ -844,7 +834,7 @@ static struct symbol *call__match(struct symbol *sym) static int hist_entry__add(struct thread *thread, struct map *map, struct dso *dso, - struct symbol *sym, __u64 ip, struct ip_chain_event *chain, + struct symbol *sym, __u64 ip, struct perf_callchain_entry *chain, char level, __u64 count) { struct rb_node **p = &hist.rb_node; @@ -868,7 +858,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso, __u64 ip; for (i = 0; i < chain->kernel; i++) { - ip = chain->ips[nr + i]; + ip = chain->ip[nr + i]; dso = kernel_dso; sym = resolve_symbol(thread, NULL, &dso, &ip); entry.parent = call__match(sym); @@ -878,7 +868,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso, nr += i; for (i = 0; i < chain->user; i++) { - ip = chain->ips[nr + i]; + ip = chain->ip[nr + i]; sym = resolve_symbol(thread, NULL, NULL, &ip); entry.parent = call__match(sym); if (entry.parent) @@ -1080,6 +1070,30 @@ static unsigned long total = 0, total_fork = 0, total_unknown = 0; +static int validate_chain(struct perf_callchain_entry *chain, event_t *event) +{ + unsigned int chain_size; + + if (chain->nr > MAX_STACK_DEPTH) + return -1; + if (chain->hv > MAX_STACK_DEPTH) + return -1; + if (chain->kernel > MAX_STACK_DEPTH) + return -1; + if (chain->user > MAX_STACK_DEPTH) + return -1; + if (chain->hv + chain->kernel + chain->user != chain->nr) + return -1; + + chain_size = event->header.size; + chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event; + + if (chain->nr*sizeof(__u64) > chain_size) + return -1; + + return 0; +} + static int process_overflow_event(event_t *event, unsigned long offset, unsigned long head) { @@ -1091,7 +1105,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head) __u64 period = 1; struct map *map = NULL; void *more_data = event->ip.__more_data; - struct ip_chain_event *chain = NULL; + struct perf_callchain_entry *chain = NULL; if (event->header.type & PERF_SAMPLE_PERIOD) { period = *(__u64 *)more_data; @@ -1111,21 +1125,26 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head) chain = (void *)more_data; - if (dump_trace) { - dprintf("... chain: u:%d, k:%d, nr:%d\n", - chain->user, - chain->kernel, - chain->nr); + dprintf("... chain: u:%d, k:%d, nr:%d\n", + chain->user, + chain->kernel, + chain->nr); + if (validate_chain(chain, event) < 0) { + eprintf("call-chain problem with event, skipping it.\n"); + return 0; + } + + if (dump_trace) { for (i = 0; i < chain->nr; i++) - dprintf("..... %2d: %016Lx\n", i, chain->ips[i]); + dprintf("..... %2d: %016Lx\n", i, chain->ip[i]); } } dprintf(" ... 
thread: %s:%d\n", thread->comm, thread->pid); if (thread == NULL) { - fprintf(stderr, "problem processing %d event, skipping it.\n", + eprintf("problem processing %d event, skipping it.\n", event->header.type); return -1; } @@ -1153,8 +1172,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head) struct symbol *sym = resolve_symbol(thread, &map, &dso, &ip); if (hist_entry__add(thread, map, dso, sym, ip, chain, level, period)) { - fprintf(stderr, - "problem incrementing symbol count, skipping event\n"); + eprintf("problem incrementing symbol count, skipping event\n"); return -1; } } -- cgit v1.2.3-70-g09d2 From 31278e71471399beaff9280737e52b47db4dc345 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 Jun 2009 01:12:19 +0000 Subject: net: group address list and its count This patch is inspired by patch recently posted by Johannes Berg. Basically what my patch does is to group list and a count of addresses into newly introduced structure netdev_hw_addr_list. This brings us two benefits: 1) struct net_device becames a bit nicer. 2) in the future there will be a possibility to operate with lists independently on netdevices (with exporting right functions). I wanted to introduce this patch before I'll post a multicast lists conversion. Signed-off-by: Jiri Pirko drivers/net/bnx2.c | 4 +- drivers/net/e1000/e1000_main.c | 4 +- drivers/net/ixgbe/ixgbe_main.c | 6 +- drivers/net/mv643xx_eth.c | 2 +- drivers/net/niu.c | 4 +- drivers/net/virtio_net.c | 10 ++-- drivers/s390/net/qeth_l2_main.c | 2 +- include/linux/netdevice.h | 17 +++-- net/core/dev.c | 130 ++++++++++++++++++-------------------- 9 files changed, 89 insertions(+), 90 deletions(-) Signed-off-by: David S. Miller --- drivers/net/bnx2.c | 4 +- drivers/net/e1000/e1000_main.c | 4 +- drivers/net/ixgbe/ixgbe_main.c | 6 +- drivers/net/mv643xx_eth.c | 2 +- drivers/net/niu.c | 4 +- drivers/net/virtio_net.c | 10 ++-- drivers/s390/net/qeth_l2_main.c | 2 +- include/linux/netdevice.h | 17 ++++-- net/core/dev.c | 130 +++++++++++++++++++--------------------- 9 files changed, 89 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 7e3738112c4..38f1c3375d7 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -3552,14 +3552,14 @@ bnx2_set_rx_mode(struct net_device *dev) sort_mode |= BNX2_RPM_SORT_USER0_MC_HSH_EN; } - if (dev->uc_count > BNX2_MAX_UNICAST_ADDRESSES) { + if (dev->uc.count > BNX2_MAX_UNICAST_ADDRESSES) { rx_mode |= BNX2_EMAC_RX_MODE_PROMISCUOUS; sort_mode |= BNX2_RPM_SORT_USER0_PROM_EN | BNX2_RPM_SORT_USER0_PROM_VLAN; } else if (!(dev->flags & IFF_PROMISC)) { /* Add all entries into to the match filter list */ i = 0; - list_for_each_entry(ha, &dev->uc_list, list) { + list_for_each_entry(ha, &dev->uc.list, list) { bnx2_set_mac_addr(bp, ha->addr, i + BNX2_START_UNICAST_ADDRESS_INDEX); sort_mode |= (1 << diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 8d36743c814..5e3356f8eb5 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2370,7 +2370,7 @@ static void e1000_set_rx_mode(struct net_device *netdev) rctl |= E1000_RCTL_VFE; } - if (netdev->uc_count > rar_entries - 1) { + if (netdev->uc.count > rar_entries - 1) { rctl |= E1000_RCTL_UPE; } else if (!(netdev->flags & IFF_PROMISC)) { rctl &= ~E1000_RCTL_UPE; @@ -2394,7 +2394,7 @@ static void e1000_set_rx_mode(struct net_device *netdev) */ i = 1; if (use_uc) - list_for_each_entry(ha, &netdev->uc_list, list) { + 
list_for_each_entry(ha, &netdev->uc.list, list) { if (i == rar_entries) break; e1000_rar_set(hw, ha->addr, i++); diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index a551a96ce67..e756e220db3 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -2321,7 +2321,7 @@ static void ixgbe_set_rx_mode(struct net_device *netdev) IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctrl); /* reprogram secondary unicast list */ - hw->mac.ops.update_uc_addr_list(hw, &netdev->uc_list); + hw->mac.ops.update_uc_addr_list(hw, &netdev->uc.list); /* reprogram multicast list */ addr_count = netdev->mc_count; @@ -5261,7 +5261,7 @@ static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd) /** * ixgbe_add_sanmac_netdev - Add the SAN MAC address to the corresponding - * netdev->dev_addr_list + * netdev->dev_addrs * @netdev: network interface device structure * * Returns non-zero on failure @@ -5282,7 +5282,7 @@ static int ixgbe_add_sanmac_netdev(struct net_device *dev) /** * ixgbe_del_sanmac_netdev - Removes the SAN MAC address to the corresponding - * netdev->dev_addr_list + * netdev->dev_addrs * @netdev: network interface device structure * * Returns non-zero on failure diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index b4e18a58cb1..745ae8b4a2e 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -1729,7 +1729,7 @@ static u32 uc_addr_filter_mask(struct net_device *dev) return 0; nibbles = 1 << (dev->dev_addr[5] & 0x0f); - list_for_each_entry(ha, &dev->uc_list, list) { + list_for_each_entry(ha, &dev->uc.list, list) { if (memcmp(dev->dev_addr, ha->addr, 5)) return 0; if ((dev->dev_addr[5] ^ ha->addr[5]) & 0xf0) diff --git a/drivers/net/niu.c b/drivers/net/niu.c index fa61a12c5e1..d2146d4a10f 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -6376,7 +6376,7 @@ static void niu_set_rx_mode(struct net_device *dev) if ((dev->flags & IFF_ALLMULTI) || (dev->mc_count > 0)) np->flags |= NIU_FLAGS_MCAST; - alt_cnt = dev->uc_count; + alt_cnt = dev->uc.count; if (alt_cnt > niu_num_alt_addr(np)) { alt_cnt = 0; np->flags |= NIU_FLAGS_PROMISC; @@ -6385,7 +6385,7 @@ static void niu_set_rx_mode(struct net_device *dev) if (alt_cnt) { int index = 0; - list_for_each_entry(ha, &dev->uc_list, list) { + list_for_each_entry(ha, &dev->uc.list, list) { err = niu_set_alt_mac(np, index, ha->addr); if (err) printk(KERN_WARNING PFX "%s: Error %d " diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 52198f6797a..2a6e81d5b57 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -709,7 +709,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) allmulti ? 
"en" : "dis"); /* MAC filter - use one buffer for both lists */ - mac_data = buf = kzalloc(((dev->uc_count + dev->mc_count) * ETH_ALEN) + + mac_data = buf = kzalloc(((dev->uc.count + dev->mc_count) * ETH_ALEN) + (2 * sizeof(mac_data->entries)), GFP_ATOMIC); if (!buf) { dev_warn(&dev->dev, "No memory for MAC address buffer\n"); @@ -719,16 +719,16 @@ static void virtnet_set_rx_mode(struct net_device *dev) sg_init_table(sg, 2); /* Store the unicast list and count in the front of the buffer */ - mac_data->entries = dev->uc_count; + mac_data->entries = dev->uc.count; i = 0; - list_for_each_entry(ha, &dev->uc_list, list) + list_for_each_entry(ha, &dev->uc.list, list) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); sg_set_buf(&sg[0], mac_data, - sizeof(mac_data->entries) + (dev->uc_count * ETH_ALEN)); + sizeof(mac_data->entries) + (dev->uc.count * ETH_ALEN)); /* multicast list and count fill the end */ - mac_data = (void *)&mac_data->macs[dev->uc_count][0]; + mac_data = (void *)&mac_data->macs[dev->uc.count][0]; mac_data->entries = dev->mc_count; addr = dev->mc_list; diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index ecd3d06c0d5..3607d107f49 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -655,7 +655,7 @@ static void qeth_l2_set_multicast_list(struct net_device *dev) for (dm = dev->mc_list; dm; dm = dm->next) qeth_l2_add_mc(card, dm->da_addr, 0); - list_for_each_entry(ha, &dev->uc_list, list) + list_for_each_entry(ha, &dev->uc.list, list) qeth_l2_add_mc(card, ha->addr, 1); spin_unlock_bh(&card->mclock); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ea8d6dfe54..d4a4d986779 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -224,6 +224,11 @@ struct netdev_hw_addr { struct rcu_head rcu_head; }; +struct netdev_hw_addr_list { + struct list_head list; + int count; +}; + struct hh_cache { struct hh_cache *hh_next; /* Next entry */ @@ -776,9 +781,8 @@ struct net_device unsigned char addr_len; /* hardware address length */ unsigned short dev_id; /* for shared network cards */ - struct list_head uc_list; /* Secondary unicast mac - addresses */ - int uc_count; /* Number of installed ucasts */ + struct netdev_hw_addr_list uc; /* Secondary unicast + mac addresses */ int uc_promisc; spinlock_t addr_list_lock; struct dev_addr_list *mc_list; /* Multicast mac addresses */ @@ -810,7 +814,8 @@ struct net_device because most packets are unicast) */ - struct list_head dev_addr_list; /* list of device hw addresses */ + struct netdev_hw_addr_list dev_addrs; /* list of device + hw addresses */ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ @@ -1806,11 +1811,11 @@ static inline void netif_addr_unlock_bh(struct net_device *dev) } /* - * dev_addr_list walker. Should be used only for read access. Call with + * dev_addrs walker. Should be used only for read access. Call with * rcu_read_lock held. */ #define for_each_dev_addr(dev, ha) \ - list_for_each_entry_rcu(ha, &dev->dev_addr_list, list) + list_for_each_entry_rcu(ha, &dev->dev_addrs.list, list) /* These functions live elsewhere (drivers/net/net_init.c, but related) */ diff --git a/net/core/dev.c b/net/core/dev.c index 576a61574a9..baf2dc13a34 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3461,10 +3461,10 @@ void __dev_set_rx_mode(struct net_device *dev) /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. 
*/ - if (dev->uc_count > 0 && !dev->uc_promisc) { + if (dev->uc.count > 0 && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1); dev->uc_promisc = 1; - } else if (dev->uc_count == 0 && dev->uc_promisc) { + } else if (dev->uc.count == 0 && dev->uc_promisc) { __dev_set_promiscuity(dev, -1); dev->uc_promisc = 0; } @@ -3483,9 +3483,8 @@ void dev_set_rx_mode(struct net_device *dev) /* hw addresses list handling functions */ -static int __hw_addr_add(struct list_head *list, int *delta, - unsigned char *addr, int addr_len, - unsigned char addr_type) +static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) { struct netdev_hw_addr *ha; int alloc_size; @@ -3493,7 +3492,7 @@ static int __hw_addr_add(struct list_head *list, int *delta, if (addr_len > MAX_ADDR_LEN) return -EINVAL; - list_for_each_entry(ha, list, list) { + list_for_each_entry(ha, &list->list, list) { if (!memcmp(ha->addr, addr, addr_len) && ha->type == addr_type) { ha->refcount++; @@ -3512,9 +3511,8 @@ static int __hw_addr_add(struct list_head *list, int *delta, ha->type = addr_type; ha->refcount = 1; ha->synced = false; - list_add_tail_rcu(&ha->list, list); - if (delta) - (*delta)++; + list_add_tail_rcu(&ha->list, &list->list); + list->count++; return 0; } @@ -3526,120 +3524,121 @@ static void ha_rcu_free(struct rcu_head *head) kfree(ha); } -static int __hw_addr_del(struct list_head *list, int *delta, - unsigned char *addr, int addr_len, - unsigned char addr_type) +static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) { struct netdev_hw_addr *ha; - list_for_each_entry(ha, list, list) { + list_for_each_entry(ha, &list->list, list) { if (!memcmp(ha->addr, addr, addr_len) && (ha->type == addr_type || !addr_type)) { if (--ha->refcount) return 0; list_del_rcu(&ha->list); call_rcu(&ha->rcu_head, ha_rcu_free); - if (delta) - (*delta)--; + list->count--; return 0; } } return -ENOENT; } -static int __hw_addr_add_multiple(struct list_head *to_list, int *to_delta, - struct list_head *from_list, int addr_len, +static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) { int err; struct netdev_hw_addr *ha, *ha2; unsigned char type; - list_for_each_entry(ha, from_list, list) { + list_for_each_entry(ha, &from_list->list, list) { type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, to_delta, ha->addr, - addr_len, type); + err = __hw_addr_add(to_list, ha->addr, addr_len, type); if (err) goto unroll; } return 0; unroll: - list_for_each_entry(ha2, from_list, list) { + list_for_each_entry(ha2, &from_list->list, list) { if (ha2 == ha) break; type = addr_type ? addr_type : ha2->type; - __hw_addr_del(to_list, to_delta, ha2->addr, - addr_len, type); + __hw_addr_del(to_list, ha2->addr, addr_len, type); } return err; } -static void __hw_addr_del_multiple(struct list_head *to_list, int *to_delta, - struct list_head *from_list, int addr_len, +static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) { struct netdev_hw_addr *ha; unsigned char type; - list_for_each_entry(ha, from_list, list) { + list_for_each_entry(ha, &from_list->list, list) { type = addr_type ? 
addr_type : ha->type; - __hw_addr_del(to_list, to_delta, ha->addr, - addr_len, addr_type); + __hw_addr_del(to_list, ha->addr, addr_len, addr_type); } } -static int __hw_addr_sync(struct list_head *to_list, int *to_delta, - struct list_head *from_list, int *from_delta, +static int __hw_addr_sync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; - list_for_each_entry_safe(ha, tmp, from_list, list) { + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (!ha->synced) { - err = __hw_addr_add(to_list, to_delta, ha->addr, + err = __hw_addr_add(to_list, ha->addr, addr_len, ha->type); if (err) break; ha->synced = true; ha->refcount++; } else if (ha->refcount == 1) { - __hw_addr_del(to_list, to_delta, ha->addr, - addr_len, ha->type); - __hw_addr_del(from_list, from_delta, ha->addr, - addr_len, ha->type); + __hw_addr_del(to_list, ha->addr, addr_len, ha->type); + __hw_addr_del(from_list, ha->addr, addr_len, ha->type); } } return err; } -static void __hw_addr_unsync(struct list_head *to_list, int *to_delta, - struct list_head *from_list, int *from_delta, +static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, int addr_len) { struct netdev_hw_addr *ha, *tmp; - list_for_each_entry_safe(ha, tmp, from_list, list) { + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (ha->synced) { - __hw_addr_del(to_list, to_delta, ha->addr, + __hw_addr_del(to_list, ha->addr, addr_len, ha->type); ha->synced = false; - __hw_addr_del(from_list, from_delta, ha->addr, + __hw_addr_del(from_list, ha->addr, addr_len, ha->type); } } } - -static void __hw_addr_flush(struct list_head *list) +static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; - list_for_each_entry_safe(ha, tmp, list, list) { + list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); call_rcu(&ha->rcu_head, ha_rcu_free); } + list->count = 0; +} + +static void __hw_addr_init(struct netdev_hw_addr_list *list) +{ + INIT_LIST_HEAD(&list->list); + list->count = 0; } /* Device addresses handling functions */ @@ -3648,7 +3647,7 @@ static void dev_addr_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ - __hw_addr_flush(&dev->dev_addr_list); + __hw_addr_flush(&dev->dev_addrs); dev->dev_addr = NULL; } @@ -3660,16 +3659,16 @@ static int dev_addr_init(struct net_device *dev) /* rtnl_mutex must be held here */ - INIT_LIST_HEAD(&dev->dev_addr_list); + __hw_addr_init(&dev->dev_addrs); memset(addr, 0, sizeof(addr)); - err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, sizeof(addr), + err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* * Get the first (previously created) address from the list * and set dev_addr pointer to this location. */ - ha = list_first_entry(&dev->dev_addr_list, + ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); dev->dev_addr = ha->addr; } @@ -3694,8 +3693,7 @@ int dev_addr_add(struct net_device *dev, unsigned char *addr, ASSERT_RTNL(); - err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, dev->addr_len, - addr_type); + err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; @@ -3725,11 +3723,12 @@ int dev_addr_del(struct net_device *dev, unsigned char *addr, * We can not remove the first address from the list because * dev->dev_addr points to that. 
*/ - ha = list_first_entry(&dev->dev_addr_list, struct netdev_hw_addr, list); + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); if (ha->addr == dev->dev_addr && ha->refcount == 1) return -ENOENT; - err = __hw_addr_del(&dev->dev_addr_list, NULL, addr, dev->addr_len, + err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); @@ -3757,8 +3756,7 @@ int dev_addr_add_multiple(struct net_device *to_dev, if (from_dev->addr_len != to_dev->addr_len) return -EINVAL; - err = __hw_addr_add_multiple(&to_dev->dev_addr_list, NULL, - &from_dev->dev_addr_list, + err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, to_dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); @@ -3784,15 +3782,14 @@ int dev_addr_del_multiple(struct net_device *to_dev, if (from_dev->addr_len != to_dev->addr_len) return -EINVAL; - __hw_addr_del_multiple(&to_dev->dev_addr_list, NULL, - &from_dev->dev_addr_list, + __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, to_dev->addr_len, addr_type); call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); return 0; } EXPORT_SYMBOL(dev_addr_del_multiple); -/* unicast and multicast addresses handling functions */ +/* multicast addresses handling functions */ int __dev_addr_delete(struct dev_addr_list **list, int *count, void *addr, int alen, int glbl) @@ -3868,8 +3865,8 @@ int dev_unicast_delete(struct net_device *dev, void *addr) ASSERT_RTNL(); - err = __hw_addr_del(&dev->uc_list, &dev->uc_count, addr, - dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); + err = __hw_addr_del(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); return err; @@ -3892,8 +3889,8 @@ int dev_unicast_add(struct net_device *dev, void *addr) ASSERT_RTNL(); - err = __hw_addr_add(&dev->uc_list, &dev->uc_count, addr, - dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); + err = __hw_addr_add(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); return err; @@ -3966,8 +3963,7 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - err = __hw_addr_sync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count, to->addr_len); + err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); return err; @@ -3990,8 +3986,7 @@ void dev_unicast_unsync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return; - __hw_addr_unsync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count, to->addr_len); + __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); } EXPORT_SYMBOL(dev_unicast_unsync); @@ -4000,15 +3995,14 @@ static void dev_unicast_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ - __hw_addr_flush(&dev->uc_list); - dev->uc_count = 0; + __hw_addr_flush(&dev->uc); } static void dev_unicast_init(struct net_device *dev) { /* rtnl_mutex must be held here */ - INIT_LIST_HEAD(&dev->uc_list); + __hw_addr_init(&dev->uc); } -- cgit v1.2.3-70-g09d2 From 43a21ea81a2400992561146327c4785ce7f7be38 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 19:39:37 +0100 Subject: perf_counter: Add event overlow handling Alternative method of mmap() data output handling that provides better overflow management and a more reliable data stream. 
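The consumer side of the new protocol can be sketched in a few lines of
user-space code: read records up to data_head, then publish the new
position in data_tail so the kernel knows that space may be reused. This
is only an illustration: the field layout follows perf_counter_mmap_page
and perf_event_header from the patch below, but the consume() helper, the
GCC barrier intrinsic and the (elided) handling of records that wrap the
buffer edge are assumptions, not part of the patch:

#include <stdint.h>

struct mmap_ctrl {			/* stand-in for perf_counter_mmap_page */
	volatile uint64_t data_head;	/* head in the data section */
	volatile uint64_t data_tail;	/* user-space written tail */
};

struct event_header {			/* mirrors struct perf_event_header */
	uint32_t type;
	uint16_t misc;
	uint16_t size;
};

static void consume(struct mmap_ctrl *ctrl, const uint8_t *data,
		    uint64_t mask, void (*handle)(const struct event_header *))
{
	uint64_t head = ctrl->data_head;

	__sync_synchronize();		/* the rmb() the control page asks for */

	uint64_t tail = ctrl->data_tail;
	while (tail != head) {
		const struct event_header *hdr =
			(const void *)(data + (tail & mask));

		handle(hdr);		/* records wrapping the edge elided */
		tail += hdr->size;
	}

	__sync_synchronize();		/* finish all reads before ... */
	ctrl->data_tail = tail;		/* ... letting the kernel overwrite */
}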
Unlike the previous method, that didn't have any user->kernel feedback and relied on userspace keeping up, this method relies on userspace writing its last read position into the control page. It will ensure new output doesn't overwrite not-yet read events, new events for which there is no space left are lost and the overflow counter is incremented, providing exact event loss numbers. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 40 +++++++--- kernel/perf_counter.c | 185 ++++++++++++++++++++++++++++++------------- 2 files changed, 158 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a7d3a61a59b..0765e8e6984 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -236,10 +236,16 @@ struct perf_counter_mmap_page { /* * Control data for the mmap() data buffer. * - * User-space reading this value should issue an rmb(), on SMP capable - * platforms, after reading this value -- see perf_counter_wakeup(). + * User-space reading the @data_head value should issue an rmb(), on + * SMP capable platforms, after reading this value -- see + * perf_counter_wakeup(). + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data. In this case + * the kernel will not over-write unread data. */ __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ }; #define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) @@ -273,6 +279,15 @@ enum perf_event_type { */ PERF_EVENT_MMAP = 1, + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * }; + */ + PERF_EVENT_LOST = 2, + /* * struct { * struct perf_event_header header; @@ -313,26 +328,26 @@ enum perf_event_type { /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field - * will be PERF_RECORD_* + * will be PERF_SAMPLE_* * * struct { * struct perf_event_header header; * - * { u64 ip; } && PERF_RECORD_IP - * { u32 pid, tid; } && PERF_RECORD_TID - * { u64 time; } && PERF_RECORD_TIME - * { u64 addr; } && PERF_RECORD_ADDR - * { u64 config; } && PERF_RECORD_CONFIG - * { u32 cpu, res; } && PERF_RECORD_CPU + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 config; } && PERF_SAMPLE_CONFIG + * { u32 cpu, res; } && PERF_SAMPLE_CPU * * { u64 nr; - * { u64 id, val; } cnt[nr]; } && PERF_RECORD_GROUP + * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP * * { u16 nr, * hv, * kernel, * user; - * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN * }; */ }; @@ -424,6 +439,7 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; /* nr of data pages */ + int writable; /* are we writable */ int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ @@ -433,8 +449,8 @@ struct perf_mmap_data { atomic_long_t done_head; /* completed head */ atomic_t lock; /* concurrent writes */ - atomic_t wakeup; /* needs a wakeup */ + atomic_t lost; /* nr records lost */ struct perf_counter_mmap_page *user_page; void *data_pages[0]; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 109a9572385..7e9108efd30 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct 
*vma, struct vm_fault *vmf) struct perf_mmap_data *data; int ret = VM_FAULT_SIGBUS; + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + rcu_read_lock(); data = rcu_dereference(counter->data); if (!data) @@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if ((unsigned)nr > data->nr_pages) goto unlock; + if (vmf->flags & FAULT_FLAG_WRITE) + goto unlock; + vmf->page = virt_to_page(data->data_pages[nr]); } + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + ret = 0; unlock: rcu_read_unlock(); @@ -1862,6 +1875,14 @@ fail: return -ENOMEM; } +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page(addr); + + page->mapping = NULL; + __free_page(page); +} + static void __perf_mmap_data_free(struct rcu_head *rcu_head) { struct perf_mmap_data *data; @@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head) data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - free_page((unsigned long)data->user_page); + perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) - free_page((unsigned long)data->data_pages[i]); + perf_mmap_free_page((unsigned long)data->data_pages[i]); + kfree(data); } @@ -1908,9 +1930,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) } static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) @@ -1924,7 +1947,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) long user_extra, extra; int ret = 0; - if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; @@ -1983,10 +2006,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; counter->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + counter->data->writable = 1; + unlock: mutex_unlock(&counter->mmap_mutex); - vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; @@ -2163,11 +2188,38 @@ struct perf_output_handle { unsigned long head; unsigned long offset; int nmi; - int overflow; + int sample; int locked; unsigned long flags; }; +static bool perf_output_space(struct perf_mmap_data *data, + unsigned int offset, unsigned int head) +{ + unsigned long tail; + unsigned long mask; + + if (!data->writable) + return true; + + mask = (data->nr_pages << PAGE_SHIFT) - 1; + /* + * Userspace could choose to issue a mb() before updating the tail + * pointer. So that all reads will be completed before the write is + * issued. 
+ */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->data->poll, POLL_IN); @@ -2258,12 +2310,57 @@ out: local_irq_restore(handle->flags); } +static void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, - int nmi, int overflow) + int nmi, int sample) { struct perf_mmap_data *data; unsigned int offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; /* * For inherited counters we send all the output towards the parent. @@ -2276,19 +2373,25 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->overflow = overflow; + handle->data = data; + handle->counter = counter; + handle->nmi = nmi; + handle->sample = sample; if (!data->nr_pages) goto fail; + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + perf_output_lock(handle); do { offset = head = atomic_long_read(&data->head); head += size; + if (unlikely(!perf_output_space(data, offset, head))) + goto fail; } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; @@ -2297,55 +2400,27 @@ static int perf_output_begin(struct perf_output_handle *handle, if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) atomic_set(&data->wakeup, 1); + if (have_lost) { + lost_event.header.type = PERF_EVENT_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = counter->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + return 0; fail: - perf_output_wakeup(handle); + atomic_inc(&data->lost); + perf_output_unlock(handle); out: rcu_read_unlock(); return -ENOSPC; } -static void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - - do { - unsigned int page_offset; - int nr; - - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); - - memcpy(pages[nr] + page_offset, buf, size); - - len -= size; - buf += size; - offset += size; - } 
while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); -} - -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - static void perf_output_end(struct perf_output_handle *handle) { struct perf_counter *counter = handle->counter; @@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle) int wakeup_events = counter->attr.wakeup_events; - if (handle->overflow && wakeup_events) { + if (handle->sample && wakeup_events) { int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &data->events); @@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) } /* - * Generic counter overflow handling. + * Generic counter overflow handling, sampling. */ int perf_counter_overflow(struct perf_counter *counter, int nmi, -- cgit v1.2.3-70-g09d2 From aa0ce5bbc2dbb1853bd0c6d13f17716fcc38ac5a Mon Sep 17 00:00:00 2001 From: Keika Kobayashi Date: Wed, 17 Jun 2009 16:25:52 -0700 Subject: softirq: introduce statistics for softirq Statistics for softirq doesn't exist. It will be helpful like statistics for interrupts. This patch introduces counting the number of softirq, which will be exported in /proc/softirqs. When softirq handler consumes much CPU time, /proc/stat is like the following. $ while :; do cat /proc/stat | head -n1 ; sleep 10 ; done cpu 88 0 408 739665 583 28 2 0 0 cpu 450 0 1090 740970 594 28 1294 0 0 ^^^^ softirq In such a situation, /proc/softirqs shows us which softirq handler is invoked. We can see the increase rate of softirqs. $ cat /proc/softirqs CPU0 CPU1 CPU2 CPU3 HI 0 0 0 0 TIMER 462850 462805 462782 462718 NET_TX 0 0 0 365 NET_RX 2472 2 2 40 BLOCK 0 0 381 1164 TASKLET 0 0 0 224 SCHED 462654 462689 462698 462427 RCU 3046 2423 3367 3173 $ cat /proc/softirqs CPU0 CPU1 CPU2 CPU3 HI 0 0 0 0 TIMER 463361 465077 465056 464991 NET_TX 53 0 1 365 NET_RX 3757 2 2 40 BLOCK 0 0 398 1170 TASKLET 0 0 0 224 SCHED 463074 464318 464612 463330 RCU 3505 2948 3947 3673 When CPU TIME of softirq is high, the rates of increase is the following. TIMER : 220/sec : CPU1-3 NET_TX : 5/sec : CPU0 NET_RX : 120/sec : CPU0 SCHED : 40-200/sec : all CPU RCU : 45-58/sec : all CPU The rates of increase in an idle mode is the following. TIMER : 250/sec SCHED : 250/sec RCU : 2/sec It seems many softirqs for receiving packets and rcu are invoked. This gives us help for checking system. 
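The sampling loop above can be automated; here is a small user-space
sketch that prints per-softirq rates from the first CPU column. The file
format is taken from the example output above, while the 16-row limit,
the ten-second interval and the single-column parsing are illustrative
simplifications:

#include <stdio.h>
#include <unistd.h>

static int snap(char names[16][16], unsigned long v[16])
{
	FILE *f = fopen("/proc/softirqs", "r");
	char line[256];
	int n = 0;

	if (!f)
		return -1;
	if (!fgets(line, sizeof(line), f)) {	/* skip the CPUn header row */
		fclose(f);
		return -1;
	}
	while (n < 16 && fgets(line, sizeof(line), f)) {
		if (sscanf(line, " %15[^:]: %lu", names[n], &v[n]) == 2)
			n++;
	}
	fclose(f);
	return n;
}

int main(void)
{
	char names[16][16];
	unsigned long a[16], b[16];
	int i, n = snap(names, a);

	sleep(10);				/* same interval as above */
	if (n <= 0 || snap(names, b) != n)
		return 1;
	for (i = 0; i < n; i++)
		printf("%-8s: %lu/sec\n", names[i], (b[i] - a[i]) / 10);
	return 0;
}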
Signed-off-by: Keika Kobayashi Reviewed-by: Hiroshi Shimamoto Reviewed-by: KOSAKI Motohiro Cc: Ingo Molnar Cc: Eric Dumazet Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel_stat.h | 12 ++++++++++++ kernel/softirq.c | 1 + 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index a77c6007dc9..348fa8874b5 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,7 @@ struct kernel_stat { #ifndef CONFIG_GENERIC_HARDIRQS unsigned int irqs[NR_IRQS]; #endif + unsigned int softirqs[NR_SOFTIRQS]; }; DECLARE_PER_CPU(struct kernel_stat, kstat); @@ -67,6 +69,16 @@ extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); #endif +static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) +{ + kstat_this_cpu.softirqs[irq]++; +} + +static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) +{ + return kstat_cpu(cpu).softirqs[irq]; +} + /* * Number of interrupts per specific IRQ source, since bootup */ diff --git a/kernel/softirq.c b/kernel/softirq.c index b41fb710e11..3a94905fa5d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -213,6 +213,7 @@ restart: do { if (pending & 1) { int prev_count = preempt_count(); + kstat_incr_softirqs_this_cpu(h - softirq_vec); trace_softirq_entry(h, softirq_vec); h->action(h); -- cgit v1.2.3-70-g09d2 From 7d0771970c51e736758525dd71fb82dd036b823a Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 17 Jun 2009 16:26:03 -0700 Subject: spi: move common spi_setup() functionality into core Start moving some spi_setup() functionality into the SPI core from the various spi_master controller drivers: - Make that function stop being an inline; - Move two common idioms from drivers into that new function: * Default bits_per_word to 8 if that field isn't set * Issue a standardized dev_dbg() message This is a net minor source code shrink, and supports enhancments found in some follow-up patches. 
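From a protocol driver's point of view the change looks roughly like
this (a minimal kernel-context sketch; my_device_probe() and its error
handling are illustrative, only spi_setup() and the 8-bit default come
from this patch):

static int my_device_probe(struct spi_device *spi)
{
	int status;

	spi->mode = SPI_MODE_0;
	/* bits_per_word left at zero: spi_setup() now defaults it to 8
	 * and issues the standardized dev_dbg() message */
	status = spi_setup(spi);
	if (status < 0)
		dev_err(&spi->dev, "spi_setup() failed: %d\n", status);
	return status;
}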
Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/atmel_spi.c | 2 -- drivers/spi/au1550_spi.c | 2 -- drivers/spi/omap2_mcspi.c | 4 +--- drivers/spi/omap_uwire.c | 2 -- drivers/spi/orion_spi.c | 3 --- drivers/spi/pxa2xx_spi.c | 11 ++-------- drivers/spi/spi.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++- drivers/spi/spi_bfin5xx.c | 4 ---- drivers/spi/spi_bitbang.c | 7 +----- drivers/spi/spi_imx.c | 5 +---- drivers/spi/spi_mpc83xx.c | 6 ------ drivers/spi/spi_s3c24xx.c | 7 ------ drivers/spi/spi_txx9.c | 2 +- drivers/spi/xilinx_spi.c | 6 ------ include/linux/spi/spi.h | 25 +-------------------- 15 files changed, 61 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c index 12e443cc4ac..9f9ff3af72d 100644 --- a/drivers/spi/atmel_spi.c +++ b/drivers/spi/atmel_spi.c @@ -555,8 +555,6 @@ static int atmel_spi_setup(struct spi_device *spi) return -EINVAL; } - if (bits == 0) - bits = 8; if (bits < 8 || bits > 16) { dev_dbg(&spi->dev, "setup: invalid bits_per_word %u (8 to 16)\n", diff --git a/drivers/spi/au1550_spi.c b/drivers/spi/au1550_spi.c index b02f25c702f..6a407e60f05 100644 --- a/drivers/spi/au1550_spi.c +++ b/drivers/spi/au1550_spi.c @@ -291,8 +291,6 @@ static int au1550_spi_setup(struct spi_device *spi) { struct au1550_spi *hw = spi_master_get_devdata(spi->master); - if (spi->bits_per_word == 0) - spi->bits_per_word = 8; if (spi->bits_per_word < 4 || spi->bits_per_word > 24) { dev_err(&spi->dev, "setup: invalid bits_per_word=%d\n", spi->bits_per_word); diff --git a/drivers/spi/omap2_mcspi.c b/drivers/spi/omap2_mcspi.c index d6d0c5d241c..b4f3b753d0f 100644 --- a/drivers/spi/omap2_mcspi.c +++ b/drivers/spi/omap2_mcspi.c @@ -619,9 +619,7 @@ static int omap2_mcspi_setup(struct spi_device *spi) return -EINVAL; } - if (spi->bits_per_word == 0) - spi->bits_per_word = 8; - else if (spi->bits_per_word < 4 || spi->bits_per_word > 32) { + if (spi->bits_per_word < 4 || spi->bits_per_word > 32) { dev_dbg(&spi->dev, "setup: unsupported %d bit words\n", spi->bits_per_word); return -EINVAL; diff --git a/drivers/spi/omap_uwire.c b/drivers/spi/omap_uwire.c index fe8b9ac0cce..747d29be45d 100644 --- a/drivers/spi/omap_uwire.c +++ b/drivers/spi/omap_uwire.c @@ -339,8 +339,6 @@ static int uwire_setup_transfer(struct spi_device *spi, struct spi_transfer *t) bits = spi->bits_per_word; if (t != NULL && t->bits_per_word) bits = t->bits_per_word; - if (!bits) - bits = 8; if (bits > 16) { pr_debug("%s: wordsize %d?\n", dev_name(&spi->dev), bits); diff --git a/drivers/spi/orion_spi.c b/drivers/spi/orion_spi.c index c8b0babdc2a..6d5e33bb4b4 100644 --- a/drivers/spi/orion_spi.c +++ b/drivers/spi/orion_spi.c @@ -369,9 +369,6 @@ static int orion_spi_setup(struct spi_device *spi) orion_spi_setbits(orion_spi, ORION_SPI_IF_CONFIG_REG, (1 << 14)); - if (spi->bits_per_word == 0) - spi->bits_per_word = 8; - if ((spi->max_speed_hz == 0) || (spi->max_speed_hz > orion_spi->max_speed)) spi->max_speed_hz = orion_spi->max_speed; diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c index 3f3c08c6ba4..c7365e0b22d 100644 --- a/drivers/spi/pxa2xx_spi.c +++ b/drivers/spi/pxa2xx_spi.c @@ -1236,9 +1236,6 @@ static int setup(struct spi_device *spi) uint tx_thres = TX_THRESH_DFLT; uint rx_thres = RX_THRESH_DFLT; - if (!spi->bits_per_word) - spi->bits_per_word = 8; - if (drv_data->ssp_type != PXA25x_SSP && (spi->bits_per_word < 4 || spi->bits_per_word > 32)) { dev_err(&spi->dev, "failed setup: 
ssp_type=%d, bits/wrd=%d " @@ -1328,18 +1325,14 @@ static int setup(struct spi_device *spi) /* NOTE: PXA25x_SSP _could_ use external clocking ... */ if (drv_data->ssp_type != PXA25x_SSP) - dev_dbg(&spi->dev, "%d bits/word, %ld Hz, mode %d, %s\n", - spi->bits_per_word, + dev_dbg(&spi->dev, "%ld Hz actual, %s\n", clk_get_rate(ssp->clk) / (1 + ((chip->cr0 & SSCR0_SCR) >> 8)), - spi->mode & 0x3, chip->enable_dma ? "DMA" : "PIO"); else - dev_dbg(&spi->dev, "%d bits/word, %ld Hz, mode %d, %s\n", - spi->bits_per_word, + dev_dbg(&spi->dev, "%ld Hz actual, %s\n", clk_get_rate(ssp->clk) / 2 / (1 + ((chip->cr0 & SSCR0_SCR) >> 8)), - spi->mode & 0x3, chip->enable_dma ? "DMA" : "PIO"); if (spi->bits_per_word <= 8) { diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8eba98c8ed1..0276bc37e25 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -265,7 +265,7 @@ int spi_add_device(struct spi_device *spi) * normally rely on the device being setup. Devices * using SPI_CS_HIGH can't coexist well otherwise... */ - status = spi->master->setup(spi); + status = spi_setup(spi); if (status < 0) { dev_err(dev, "can't %s %s, status %d\n", "setup", dev_name(&spi->dev), status); @@ -583,6 +583,59 @@ EXPORT_SYMBOL_GPL(spi_busnum_to_master); /*-------------------------------------------------------------------------*/ +/* Core methods for SPI master protocol drivers. Some of the + * other core methods are currently defined as inline functions. + */ + +/** + * spi_setup - setup SPI mode and clock rate + * @spi: the device whose settings are being modified + * Context: can sleep, and no requests are queued to the device + * + * SPI protocol drivers may need to update the transfer mode if the + * device doesn't work with its default. They may likewise need + * to update clock rates or word sizes from initial values. This function + * changes those settings, and must be called from a context that can sleep. + * Except for SPI_CS_HIGH, which takes effect immediately, the changes take + * effect the next time the device is selected and data is transferred to + * or from it. When this function returns, the spi device is deselected. + * + * Note that this call will fail if the protocol driver specifies an option + * that the underlying controller or its driver does not support. For + * example, not all hardware supports wire transfers using nine bit words, + * LSB-first wire encoding, or active-high chipselects. + */ +int spi_setup(struct spi_device *spi) +{ + int status; + + if (!spi->bits_per_word) + spi->bits_per_word = 8; + + status = spi->master->setup(spi); + + dev_dbg(&spi->dev, "setup mode %d, %s%s%s%s" + "%u bits/w, %u Hz max --> %d\n", + (int) (spi->mode & (SPI_CPOL | SPI_CPHA)), + (spi->mode & SPI_CS_HIGH) ? "cs_high, " : "", + (spi->mode & SPI_LSB_FIRST) ? "lsb, " : "", + (spi->mode & SPI_3WIRE) ? "3wire, " : "", + (spi->mode & SPI_LOOP) ? "loopback, " : "", + spi->bits_per_word, spi->max_speed_hz, + status); + + return status; +} +EXPORT_SYMBOL_GPL(spi_setup); + + +/*-------------------------------------------------------------------------*/ + +/* Utility methods for SPI master protocol drivers, layered on + * top of the core. Some other utility methods are defined as + * inline functions. 
+ */ + static void spi_complete(void *arg) { complete(arg); diff --git a/drivers/spi/spi_bfin5xx.c b/drivers/spi/spi_bfin5xx.c index 2e5fd097733..d54058a903b 100644 --- a/drivers/spi/spi_bfin5xx.c +++ b/drivers/spi/spi_bfin5xx.c @@ -1016,10 +1016,6 @@ static int bfin_spi_setup(struct spi_device *spi) return -EINVAL; } - /* Zero (the default) here means 8 bits */ - if (!spi->bits_per_word) - spi->bits_per_word = 8; - if (spi->bits_per_word != 8 && spi->bits_per_word != 16) return -EINVAL; diff --git a/drivers/spi/spi_bitbang.c b/drivers/spi/spi_bitbang.c index 85e61f45121..855b0b06e62 100644 --- a/drivers/spi/spi_bitbang.c +++ b/drivers/spi/spi_bitbang.c @@ -201,9 +201,6 @@ int spi_bitbang_setup(struct spi_device *spi) spi->controller_state = cs; } - if (!spi->bits_per_word) - spi->bits_per_word = 8; - /* per-word shift register access, in hardware or bitbanging */ cs->txrx_word = bitbang->txrx_word[spi->mode & (SPI_CPOL|SPI_CPHA)]; if (!cs->txrx_word) @@ -213,9 +210,7 @@ int spi_bitbang_setup(struct spi_device *spi) if (retval < 0) return retval; - dev_dbg(&spi->dev, "%s, mode %d, %u bits/w, %u nsec/bit\n", - __func__, spi->mode & (SPI_CPOL | SPI_CPHA), - spi->bits_per_word, 2 * cs->nsecs); + dev_dbg(&spi->dev, "%s, %u nsec/bit\n", __func__, 2 * cs->nsecs); /* NOTE we _need_ to call chipselect() early, ideally with adapter * setup, unless the hardware defaults cooperate to avoid confusion diff --git a/drivers/spi/spi_imx.c b/drivers/spi/spi_imx.c index 0671aeef579..26d5ef06dbd 100644 --- a/drivers/spi/spi_imx.c +++ b/drivers/spi/spi_imx.c @@ -1286,10 +1286,7 @@ static int setup(struct spi_device *spi) /* SPI word width */ tmp = spi->bits_per_word; - if (tmp == 0) { - tmp = 8; - spi->bits_per_word = 8; - } else if (tmp > 16) { + if (tmp > 16) { status = -EINVAL; dev_err(&spi->dev, "setup - " diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index a32ccb44065..0926a3e293e 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -447,9 +447,6 @@ static int mpc83xx_spi_setup(struct spi_device *spi) } mpc83xx_spi = spi_master_get_devdata(spi->master); - if (!spi->bits_per_word) - spi->bits_per_word = 8; - hw_mode = cs->hw_mode; /* Save orginal settings */ cs->hw_mode = mpc83xx_spi_read_reg(&mpc83xx_spi->base->mode); /* mask out bits we are going to set */ @@ -471,9 +468,6 @@ static int mpc83xx_spi_setup(struct spi_device *spi) return retval; } - dev_dbg(&spi->dev, "%s, mode %d, %u bits/w, %u Hz\n", - __func__, spi->mode & (SPI_CPOL | SPI_CPHA), - spi->bits_per_word, spi->max_speed_hz); #if 0 /* Don't think this is needed */ /* NOTE we _need_ to call chipselect() early, ideally with adapter * setup, unless the hardware defaults cooperate to avoid confusion diff --git a/drivers/spi/spi_s3c24xx.c b/drivers/spi/spi_s3c24xx.c index b3ebc1d0f85..18a4c7f5438 100644 --- a/drivers/spi/spi_s3c24xx.c +++ b/drivers/spi/spi_s3c24xx.c @@ -153,9 +153,6 @@ static int s3c24xx_spi_setup(struct spi_device *spi) { int ret; - if (!spi->bits_per_word) - spi->bits_per_word = 8; - if (spi->mode & ~MODEBITS) { dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", spi->mode & ~MODEBITS); @@ -168,10 +165,6 @@ static int s3c24xx_spi_setup(struct spi_device *spi) return ret; } - dev_dbg(&spi->dev, "%s: mode %d, %u bpw, %d hz\n", - __func__, spi->mode, spi->bits_per_word, - spi->max_speed_hz); - return 0; } diff --git a/drivers/spi/spi_txx9.c b/drivers/spi/spi_txx9.c index 29cbb065618..8e36b2153d9 100644 --- a/drivers/spi/spi_txx9.c +++ b/drivers/spi/spi_txx9.c @@ -126,7 +126,7 @@ static 
int txx9spi_setup(struct spi_device *spi) || spi->max_speed_hz < c->min_speed_hz) return -EINVAL; - bits_per_word = spi->bits_per_word ? : 8; + bits_per_word = spi->bits_per_word; if (bits_per_word != 8 && bits_per_word != 16) return -EINVAL; diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c index 494d3f756e2..2d7e6b81fb4 100644 --- a/drivers/spi/xilinx_spi.c +++ b/drivers/spi/xilinx_spi.c @@ -170,9 +170,6 @@ static int xilinx_spi_setup(struct spi_device *spi) xspi = spi_master_get_devdata(spi->master); bitbang = &xspi->bitbang; - if (!spi->bits_per_word) - spi->bits_per_word = 8; - if (spi->mode & ~MODEBITS) { dev_err(&spi->dev, "%s, unsupported mode bits %x\n", __func__, spi->mode & ~MODEBITS); @@ -183,9 +180,6 @@ static int xilinx_spi_setup(struct spi_device *spi) if (retval < 0) return retval; - dev_dbg(&spi->dev, "%s, mode %d, %u bits/w, %u nsec/bit\n", - __func__, spi->mode & MODEBITS, spi->bits_per_word, 0); - return 0; } diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index a0faa18f7b1..0db5d64fc5e 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -523,30 +523,7 @@ static inline void spi_message_free(struct spi_message *m) kfree(m); } -/** - * spi_setup - setup SPI mode and clock rate - * @spi: the device whose settings are being modified - * Context: can sleep, and no requests are queued to the device - * - * SPI protocol drivers may need to update the transfer mode if the - * device doesn't work with its default. They may likewise need - * to update clock rates or word sizes from initial values. This function - * changes those settings, and must be called from a context that can sleep. - * Except for SPI_CS_HIGH, which takes effect immediately, the changes take - * effect the next time the device is selected and data is transferred to - * or from it. When this function returns, the spi device is deselected. - * - * Note that this call will fail if the protocol driver specifies an option - * that the underlying controller or its driver does not support. For - * example, not all hardware supports wire transfers using nine bit words, - * LSB-first wire encoding, or active-high chipselects. - */ -static inline int -spi_setup(struct spi_device *spi) -{ - return spi->master->setup(spi); -} - +extern int spi_setup(struct spi_device *spi); /** * spi_async - asynchronous SPI transfer -- cgit v1.2.3-70-g09d2 From e7db06b5d5afcef15c4c3e61c3a7441ed7ad1407 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 17 Jun 2009 16:26:04 -0700 Subject: spi: move more spi_setup() functionality into core Move some common spi_setup() error checks into the SPI framework from the spi_master controller drivers: - Add a new "mode_bits" field to spi_master - Use that in spi_setup to validate the spi->mode value being requested. Setting this new field is now mandatory for any controller supporting more than vanilla SPI_MODE_0. - Update all spi_master drivers to: * Initialize that field * Remove current spi_setup() checks using that value. This is a net minor code shrink. 
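The centralized validation reads roughly as follows (a sketch of the
core-side check; the bad_bits temporary and the exact message are
assumptions, while the mode_bits field is what this patch introduces and
the bits_per_word default comes from the previous patch):

int spi_setup(struct spi_device *spi)
{
	unsigned bad_bits = spi->mode & ~spi->master->mode_bits;

	if (bad_bits) {
		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
			bad_bits);
		return -EINVAL;
	}

	if (!spi->bits_per_word)	/* default added by the previous patch */
		spi->bits_per_word = 8;

	return spi->master->setup(spi);
}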
Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/atmel_spi.c | 12 +++--------- drivers/spi/au1550_spi.c | 12 +++--------- drivers/spi/mpc52xx_psc_spi.c | 12 +++--------- drivers/spi/omap2_mcspi.c | 12 +++--------- drivers/spi/omap_uwire.c | 12 +++--------- drivers/spi/orion_spi.c | 9 +++------ drivers/spi/pxa2xx_spi.c | 12 +++--------- drivers/spi/spi.c | 11 +++++++++++ drivers/spi/spi_bfin5xx.c | 9 +++------ drivers/spi/spi_bitbang.c | 9 +++------ drivers/spi/spi_imx.c | 12 +++--------- drivers/spi/spi_mpc83xx.c | 14 ++++---------- drivers/spi/spi_s3c24xx.c | 12 +++--------- drivers/spi/spi_txx9.c | 9 +++------ drivers/spi/xilinx_spi.c | 12 +++--------- include/linux/spi/spi.h | 3 +++ 16 files changed, 57 insertions(+), 115 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c index 9f9ff3af72d..f5b3fdbb1e2 100644 --- a/drivers/spi/atmel_spi.c +++ b/drivers/spi/atmel_spi.c @@ -530,9 +530,6 @@ atmel_spi_interrupt(int irq, void *dev_id) return ret; } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH) - static int atmel_spi_setup(struct spi_device *spi) { struct atmel_spi *as; @@ -562,12 +559,6 @@ static int atmel_spi_setup(struct spi_device *spi) return -EINVAL; } - if (spi->mode & ~MODEBITS) { - dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode & ~MODEBITS); - return -EINVAL; - } - /* see notes above re chipselect */ if (!atmel_spi_is_v2() && spi->chip_select == 0 @@ -773,6 +764,9 @@ static int __init atmel_spi_probe(struct platform_device *pdev) if (!master) goto out_free; + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; + master->bus_num = pdev->id; master->num_chipselect = 4; master->setup = atmel_spi_setup; diff --git a/drivers/spi/au1550_spi.c b/drivers/spi/au1550_spi.c index 6a407e60f05..76cbc1a6659 100644 --- a/drivers/spi/au1550_spi.c +++ b/drivers/spi/au1550_spi.c @@ -284,9 +284,6 @@ static int au1550_spi_setupxfer(struct spi_device *spi, struct spi_transfer *t) return 0; } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST) - static int au1550_spi_setup(struct spi_device *spi) { struct au1550_spi *hw = spi_master_get_devdata(spi->master); @@ -297,12 +294,6 @@ static int au1550_spi_setup(struct spi_device *spi) return -EINVAL; } - if (spi->mode & ~MODEBITS) { - dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode & ~MODEBITS); - return -EINVAL; - } - if (spi->max_speed_hz == 0) spi->max_speed_hz = hw->freq_max; if (spi->max_speed_hz > hw->freq_max @@ -779,6 +770,9 @@ static int __init au1550_spi_probe(struct platform_device *pdev) goto err_nomem; } + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST; + hw = spi_master_get_devdata(master); hw->master = spi_master_get(master); diff --git a/drivers/spi/mpc52xx_psc_spi.c b/drivers/spi/mpc52xx_psc_spi.c index 68c77a91159..bdae9f90897 100644 --- a/drivers/spi/mpc52xx_psc_spi.c +++ b/drivers/spi/mpc52xx_psc_spi.c @@ -261,9 +261,6 @@ static void mpc52xx_psc_spi_work(struct work_struct *work) spin_unlock_irq(&mps->lock); } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST) - static int mpc52xx_psc_spi_setup(struct spi_device *spi) { struct mpc52xx_psc_spi *mps = 
spi_master_get_devdata(spi->master); @@ -273,12 +270,6 @@ static int mpc52xx_psc_spi_setup(struct spi_device *spi) if (spi->bits_per_word%8) return -EINVAL; - if (spi->mode & ~MODEBITS) { - dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode & ~MODEBITS); - return -EINVAL; - } - if (!cs) { cs = kzalloc(sizeof *cs, GFP_KERNEL); if (!cs) @@ -385,6 +376,9 @@ static int __init mpc52xx_psc_spi_do_probe(struct device *dev, u32 regaddr, dev_set_drvdata(dev, master); mps = spi_master_get_devdata(master); + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST; + mps->irq = irq; if (pdata == NULL) { dev_warn(dev, "probe called without platform data, no " diff --git a/drivers/spi/omap2_mcspi.c b/drivers/spi/omap2_mcspi.c index b4f3b753d0f..eee4b6e0af2 100644 --- a/drivers/spi/omap2_mcspi.c +++ b/drivers/spi/omap2_mcspi.c @@ -603,9 +603,6 @@ static int omap2_mcspi_request_dma(struct spi_device *spi) return 0; } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH) - static int omap2_mcspi_setup(struct spi_device *spi) { int ret; @@ -613,12 +610,6 @@ static int omap2_mcspi_setup(struct spi_device *spi) struct omap2_mcspi_dma *mcspi_dma; struct omap2_mcspi_cs *cs = spi->controller_state; - if (spi->mode & ~MODEBITS) { - dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode & ~MODEBITS); - return -EINVAL; - } - if (spi->bits_per_word < 4 || spi->bits_per_word > 32) { dev_dbg(&spi->dev, "setup: unsupported %d bit words\n", spi->bits_per_word); @@ -982,6 +973,9 @@ static int __init omap2_mcspi_probe(struct platform_device *pdev) return -ENOMEM; } + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; + if (pdev->id != -1) master->bus_num = pdev->id; diff --git a/drivers/spi/omap_uwire.c b/drivers/spi/omap_uwire.c index 747d29be45d..aa90ddb3706 100644 --- a/drivers/spi/omap_uwire.c +++ b/drivers/spi/omap_uwire.c @@ -447,19 +447,10 @@ done: return status; } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH) - static int uwire_setup(struct spi_device *spi) { struct uwire_state *ust = spi->controller_state; - if (spi->mode & ~MODEBITS) { - dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode & ~MODEBITS); - return -EINVAL; - } - if (ust == NULL) { ust = kzalloc(sizeof(*ust), GFP_KERNEL); if (ust == NULL) @@ -520,6 +511,9 @@ static int __init uwire_probe(struct platform_device *pdev) uwire_write_reg(UWIRE_SR3, 1); + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; + master->bus_num = 2; /* "official" */ master->num_chipselect = 4; master->setup = uwire_setup; diff --git a/drivers/spi/orion_spi.c b/drivers/spi/orion_spi.c index 6d5e33bb4b4..3aea50da7b2 100644 --- a/drivers/spi/orion_spi.c +++ b/drivers/spi/orion_spi.c @@ -358,12 +358,6 @@ static int orion_spi_setup(struct spi_device *spi) orion_spi = spi_master_get_devdata(spi->master); - if (spi->mode) { - dev_err(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode); - return -EINVAL; - } - /* Fix ac timing if required. 
*/ if (orion_spi->spi_info->enable_clock_fix) orion_spi_setbits(orion_spi, ORION_SPI_IF_CONFIG_REG, @@ -473,6 +467,9 @@ static int __init orion_spi_probe(struct platform_device *pdev) if (pdev->id != -1) master->bus_num = pdev->id; + /* we support only mode 0, and no options */ + master->mode_bits = 0; + master->setup = orion_spi_setup; master->transfer = orion_spi_transfer; master->num_chipselect = ORION_NUM_CHIPSELECTS; diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c index c7365e0b22d..9c311dc4771 100644 --- a/drivers/spi/pxa2xx_spi.c +++ b/drivers/spi/pxa2xx_spi.c @@ -1185,9 +1185,6 @@ static int transfer(struct spi_device *spi, struct spi_message *msg) return 0; } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA) - static int setup_cs(struct spi_device *spi, struct chip_data *chip, struct pxa2xx_spi_chip *chip_info) { @@ -1252,12 +1249,6 @@ static int setup(struct spi_device *spi) return -EINVAL; } - if (spi->mode & ~MODEBITS) { - dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode & ~MODEBITS); - return -EINVAL; - } - /* Only alloc on first setup */ chip = spi_get_ctldata(spi); if (!chip) { @@ -1493,6 +1484,9 @@ static int __init pxa2xx_spi_probe(struct platform_device *pdev) drv_data->pdev = pdev; drv_data->ssp = ssp; + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA; + master->bus_num = pdev->id; master->num_chipselect = platform_info->num_chipselect; master->dma_alignment = DMA_ALIGNMENT; diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 0276bc37e25..a525a3a848c 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -607,8 +607,19 @@ EXPORT_SYMBOL_GPL(spi_busnum_to_master); */ int spi_setup(struct spi_device *spi) { + unsigned bad_bits; int status; + /* help drivers fail *cleanly* when they need options + * that aren't supported with their current master + */ + bad_bits = spi->mode & ~spi->master->mode_bits; + if (bad_bits) { + dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", + bad_bits); + return -EINVAL; + } + if (!spi->bits_per_word) spi->bits_per_word = 8; diff --git a/drivers/spi/spi_bfin5xx.c b/drivers/spi/spi_bfin5xx.c index d54058a903b..73e24ef5a2f 100644 --- a/drivers/spi/spi_bfin5xx.c +++ b/drivers/spi/spi_bfin5xx.c @@ -1010,12 +1010,6 @@ static int bfin_spi_setup(struct spi_device *spi) struct driver_data *drv_data = spi_master_get_devdata(spi->master); int ret; - /* Abort device setup if requested features are not supported */ - if (spi->mode & ~(SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST)) { - dev_err(&spi->dev, "requested mode not fully supported\n"); - return -EINVAL; - } - if (spi->bits_per_word != 8 && spi->bits_per_word != 16) return -EINVAL; @@ -1283,6 +1277,9 @@ static int __init bfin_spi_probe(struct platform_device *pdev) drv_data->pdev = pdev; drv_data->pin_req = platform_info->pin_req; + /* the spi->mode bits supported by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST; + master->bus_num = pdev->id; master->num_chipselect = platform_info->num_chipselect; master->cleanup = bfin_spi_cleanup; diff --git a/drivers/spi/spi_bitbang.c b/drivers/spi/spi_bitbang.c index 855b0b06e62..2a5abc08e85 100644 --- a/drivers/spi/spi_bitbang.c +++ b/drivers/spi/spi_bitbang.c @@ -188,12 +188,6 @@ int spi_bitbang_setup(struct spi_device *spi) bitbang = spi_master_get_devdata(spi->master); - /* Bitbangers can support SPI_CS_HIGH, SPI_3WIRE, and so on; - * add those to master->flags, and provide the other support. 
- */ - if ((spi->mode & ~(SPI_CPOL|SPI_CPHA|bitbang->flags)) != 0) - return -EINVAL; - if (!cs) { cs = kzalloc(sizeof *cs, GFP_KERNEL); if (!cs) @@ -452,6 +446,9 @@ int spi_bitbang_start(struct spi_bitbang *bitbang) spin_lock_init(&bitbang->lock); INIT_LIST_HEAD(&bitbang->queue); + if (!bitbang->master->mode_bits) + bitbang->master->mode_bits = SPI_CPOL | SPI_CPHA | bitbang->flags; + if (!bitbang->master->transfer) bitbang->master->transfer = spi_bitbang_transfer; if (!bitbang->txrx_bufs) { diff --git a/drivers/spi/spi_imx.c b/drivers/spi/spi_imx.c index 26d5ef06dbd..c195e45f7f3 100644 --- a/drivers/spi/spi_imx.c +++ b/drivers/spi/spi_imx.c @@ -1171,9 +1171,6 @@ msg_rejected: return -EINVAL; } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH) - /* On first setup bad values must free chip_data memory since will cause spi_new_device to fail. Bad value setup from protocol driver are simply not applied and notified to the calling driver. */ @@ -1186,12 +1183,6 @@ static int setup(struct spi_device *spi) u32 tmp; int status = 0; - if (spi->mode & ~MODEBITS) { - dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode & ~MODEBITS); - return -EINVAL; - } - /* Get controller data */ chip_info = spi->controller_data; @@ -1478,6 +1469,9 @@ static int __init spi_imx_probe(struct platform_device *pdev) drv_data->master_info = platform_info; drv_data->pdev = pdev; + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; + master->bus_num = pdev->id; master->num_chipselect = platform_info->num_chipselect; master->dma_alignment = DMA_ALIGNMENT; diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index 0926a3e293e..ce61be98e06 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -419,10 +419,6 @@ static void mpc83xx_spi_work(struct work_struct *work) spin_unlock_irq(&mpc83xx_spi->lock); } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH \ - | SPI_LSB_FIRST | SPI_LOOP) - static int mpc83xx_spi_setup(struct spi_device *spi) { struct mpc83xx_spi *mpc83xx_spi; @@ -430,12 +426,6 @@ static int mpc83xx_spi_setup(struct spi_device *spi) u32 hw_mode; struct spi_mpc83xx_cs *cs = spi->controller_state; - if (spi->mode & ~MODEBITS) { - dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode & ~MODEBITS); - return -EINVAL; - } - if (!spi->max_speed_hz) return -EINVAL; @@ -562,6 +552,10 @@ mpc83xx_spi_probe(struct device *dev, struct resource *mem, unsigned int irq) dev_set_drvdata(dev, master); + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH + | SPI_LSB_FIRST | SPI_LOOP; + master->setup = mpc83xx_spi_setup; master->transfer = mpc83xx_spi_transfer; master->cleanup = mpc83xx_spi_cleanup; diff --git a/drivers/spi/spi_s3c24xx.c b/drivers/spi/spi_s3c24xx.c index 18a4c7f5438..e0d44af4745 100644 --- a/drivers/spi/spi_s3c24xx.c +++ b/drivers/spi/spi_s3c24xx.c @@ -146,19 +146,10 @@ static int s3c24xx_spi_setupxfer(struct spi_device *spi, return 0; } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH) - static int s3c24xx_spi_setup(struct spi_device *spi) { int ret; - if (spi->mode & ~MODEBITS) { - dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n", - spi->mode & ~MODEBITS); - return -EINVAL; - } - ret = s3c24xx_spi_setupxfer(spi, NULL); if (ret < 0) { dev_err(&spi->dev, "setupxfer 
returned %d\n", ret); @@ -283,6 +274,9 @@ static int __init s3c24xx_spi_probe(struct platform_device *pdev) /* setup the master state. */ + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; + master->num_chipselect = hw->pdata->num_cs; master->bus_num = pdata->bus_num; diff --git a/drivers/spi/spi_txx9.c b/drivers/spi/spi_txx9.c index 8e36b2153d9..96057de133a 100644 --- a/drivers/spi/spi_txx9.c +++ b/drivers/spi/spi_txx9.c @@ -110,17 +110,11 @@ static void txx9spi_cs_func(struct spi_device *spi, struct txx9spi *c, ndelay(cs_delay); /* CS Setup Time / CS Recovery Time */ } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CS_HIGH|SPI_CPOL|SPI_CPHA) - static int txx9spi_setup(struct spi_device *spi) { struct txx9spi *c = spi_master_get_devdata(spi->master); u8 bits_per_word; - if (spi->mode & ~MODEBITS) - return -EINVAL; - if (!spi->max_speed_hz || spi->max_speed_hz > c->max_speed_hz || spi->max_speed_hz < c->min_speed_hz) @@ -414,6 +408,9 @@ static int __init txx9spi_probe(struct platform_device *dev) (unsigned long long)res->start, irq, (c->baseclk + 500000) / 1000000); + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CS_HIGH | SPI_CPOL | SPI_CPHA; + master->bus_num = dev->id; master->setup = txx9spi_setup; master->transfer = txx9spi_transfer; diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c index 2d7e6b81fb4..46b8c5c2f45 100644 --- a/drivers/spi/xilinx_spi.c +++ b/drivers/spi/xilinx_spi.c @@ -158,9 +158,6 @@ static int xilinx_spi_setup_transfer(struct spi_device *spi, return 0; } -/* the spi->mode bits understood by this driver: */ -#define MODEBITS (SPI_CPOL | SPI_CPHA) - static int xilinx_spi_setup(struct spi_device *spi) { struct spi_bitbang *bitbang; @@ -170,12 +167,6 @@ static int xilinx_spi_setup(struct spi_device *spi) xspi = spi_master_get_devdata(spi->master); bitbang = &xspi->bitbang; - if (spi->mode & ~MODEBITS) { - dev_err(&spi->dev, "%s, unsupported mode bits %x\n", - __func__, spi->mode & ~MODEBITS); - return -EINVAL; - } - retval = xilinx_spi_setup_transfer(spi, NULL); if (retval < 0) return retval; @@ -327,6 +318,9 @@ static int __init xilinx_spi_of_probe(struct of_device *ofdev, goto put_master; } + /* the spi->mode bits understood by this driver: */ + master->mode_bits = SPI_CPOL | SPI_CPHA; + xspi = spi_master_get_devdata(master); xspi->bitbang.master = spi_master_get(master); xspi->bitbang.chipselect = xilinx_spi_chipselect; diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 0db5d64fc5e..9c4cd27f468 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -245,6 +245,9 @@ struct spi_master { */ u16 dma_alignment; + /* spi_device.mode flags understood by this controller driver */ + u16 mode_bits; + /* Setup mode and clock, etc (spi driver may call many times). * * IMPORTANT: this may be called when transfers to another -- cgit v1.2.3-70-g09d2 From 7390284290b184a7f4bb648ca15dc62c3dea3e75 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 17 Jun 2009 16:26:05 -0700 Subject: mpc52xx_psc_spi: convert to cs_control callback mpc52xx_psc_spi driver is the last user of the legacy activate_cs and deactivate_cs callbacks, so convert the driver to the cs_control hook and remove the legacy callbacks from fsl_spi_platform_data struct. 
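With the conversion, board code supplies a single hook of the following shape (a hypothetical example; the GPIO number is invented, and the driver has already folded the SPI_CS_HIGH polarity into the 'on' argument):

	/* hypothetical board code: drive the chipselect line via a GPIO */
	static void board_cs_control(struct spi_device *spi, bool on)
	{
		gpio_set_value(BOARD_CS_GPIO, on);	/* BOARD_CS_GPIO is made up */
	}

	static struct fsl_spi_platform_data board_pdata = {
		.cs_control	= board_cs_control,
	};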
Signed-off-by: Anton Vorontsov Cc: Grant Likely Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/mpc52xx_psc_spi.c | 22 +++++++++------------- include/linux/fsl_devices.h | 4 ---- 2 files changed, 9 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/mpc52xx_psc_spi.c b/drivers/spi/mpc52xx_psc_spi.c index bdae9f90897..1b74d5ca03f 100644 --- a/drivers/spi/mpc52xx_psc_spi.c +++ b/drivers/spi/mpc52xx_psc_spi.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -30,8 +31,7 @@ struct mpc52xx_psc_spi { /* fsl_spi_platform data */ - void (*activate_cs)(u8, u8); - void (*deactivate_cs)(u8, u8); + void (*cs_control)(struct spi_device *spi, bool on); u32 sysclk; /* driver internal data */ @@ -111,18 +111,16 @@ static void mpc52xx_psc_spi_activate_cs(struct spi_device *spi) out_be16((u16 __iomem *)&psc->ccr, ccr); mps->bits_per_word = cs->bits_per_word; - if (mps->activate_cs) - mps->activate_cs(spi->chip_select, - (spi->mode & SPI_CS_HIGH) ? 1 : 0); + if (mps->cs_control) + mps->cs_control(spi, (spi->mode & SPI_CS_HIGH) ? 1 : 0); } static void mpc52xx_psc_spi_deactivate_cs(struct spi_device *spi) { struct mpc52xx_psc_spi *mps = spi_master_get_devdata(spi->master); - if (mps->deactivate_cs) - mps->deactivate_cs(spi->chip_select, - (spi->mode & SPI_CS_HIGH) ? 1 : 0); + if (mps->cs_control) + mps->cs_control(spi, (spi->mode & SPI_CS_HIGH) ? 0 : 1); } #define MPC52xx_PSC_BUFSIZE (MPC52xx_PSC_RFNUM_MASK + 1) @@ -382,15 +380,13 @@ static int __init mpc52xx_psc_spi_do_probe(struct device *dev, u32 regaddr, mps->irq = irq; if (pdata == NULL) { dev_warn(dev, "probe called without platform data, no " - "(de)activate_cs function will be called\n"); - mps->activate_cs = NULL; - mps->deactivate_cs = NULL; + "cs_control function will be called\n"); + mps->cs_control = NULL; mps->sysclk = 0; master->bus_num = bus_num; master->num_chipselect = 255; } else { - mps->activate_cs = pdata->activate_cs; - mps->deactivate_cs = pdata->deactivate_cs; + mps->cs_control = pdata->cs_control; mps->sysclk = pdata->sysclk; master->bus_num = pdata->bus_num; master->num_chipselect = pdata->max_chipselect; diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index 244677cc082..43fc95d822d 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -79,10 +79,6 @@ struct fsl_spi_platform_data { u16 max_chipselect; void (*cs_control)(struct spi_device *spi, bool on); u32 sysclk; - - /* Legacy hooks, used by mpc52xx_psc_spi driver. */ - void (*activate_cs)(u8 cs, u8 polarity); - void (*deactivate_cs)(u8 cs, u8 polarity); }; struct mpc8xx_pcmcia_ops { -- cgit v1.2.3-70-g09d2 From 77906a546127e056dbdac618a7afb5b92d00c058 Mon Sep 17 00:00:00 2001 From: Daniel Silverstone Date: Wed, 17 Jun 2009 16:26:15 -0700 Subject: pca953x: support GPIOLIB GPIO naming Add support to the PCA953x driver to use the GPIOLIB naming facility for GPIOs. 
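The intended use is board code of roughly this shape (a hypothetical example; the names and GPIO base are invented):

	static char *board_gpio_names[] = {
		"lcd_reset",	/* pin 0 */
		"wifi_power",	/* pin 1 */
		NULL,		/* pin 2 left unnamed */
		"status_led",	/* pin 3 */
	};

	static struct pca953x_platform_data board_pca953x = {
		.gpio_base	= 240,
		.names		= board_gpio_names,
	};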
Signed-off-by: Daniel Silverstone Cc: Ben Gardner Cc: Jean Delvare Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpio/pca953x.c | 4 ++++ include/linux/i2c/pca953x.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/pca953x.c b/drivers/gpio/pca953x.c index 8dc0164bd51..2dae94fbabf 100644 --- a/drivers/gpio/pca953x.c +++ b/drivers/gpio/pca953x.c @@ -50,6 +50,7 @@ struct pca953x_chip { struct i2c_client *client; struct gpio_chip gpio_chip; + char **names; }; static int pca953x_write_reg(struct pca953x_chip *chip, int reg, uint16_t val) @@ -192,6 +193,7 @@ static void pca953x_setup_gpio(struct pca953x_chip *chip, int gpios) gc->label = chip->client->name; gc->dev = &chip->client->dev; gc->owner = THIS_MODULE; + gc->names = chip->names; } static int __devinit pca953x_probe(struct i2c_client *client, @@ -215,6 +217,8 @@ static int __devinit pca953x_probe(struct i2c_client *client, chip->gpio_start = pdata->gpio_base; + chip->names = pdata->names; + /* initialize cached registers from their original values. * we can't share this chip with another i2c master. */ diff --git a/include/linux/i2c/pca953x.h b/include/linux/i2c/pca953x.h index 3c7361217df..81736d6a8db 100644 --- a/include/linux/i2c/pca953x.h +++ b/include/linux/i2c/pca953x.h @@ -15,4 +15,5 @@ struct pca953x_platform_data { int (*teardown)(struct i2c_client *client, unsigned gpio, unsigned ngpio, void *context); + char **names; }; -- cgit v1.2.3-70-g09d2 From 1d965fe0eb435b3f9a10538815f6a68de0aef03c Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 17 Jun 2009 16:26:29 -0700 Subject: reiserfs: fix warnings with gcc 4.4 Several code paths in reiserfs have a construct like: if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) ... which, in addition to being ugly, ends up causing compiler warnings with gcc 4.4.0. Previous compilers didn't issue a warning. fs/reiserfs/do_balan.c:1273: warning: operation on `aux_ih' may be undefined fs/reiserfs/lbalance.c:393: warning: operation on `ih' may be undefined fs/reiserfs/lbalance.c:421: warning: operation on `ih' may be undefined fs/reiserfs/lbalance.c:777: warning: operation on `ih' may be undefined I believe this is due to ih being passed to macros which evaluate the argument more than once. This is old code and we haven't seen any problems with it, but this patch eliminates the warnings. It converts the multiple-evaluation macros to static inlines and does a preassignment for the cases that were causing the warnings, because that code is just ugly.
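To see where the warnings come from, consider a reduced, hypothetical macro (not the actual reiserfs one) that uses its argument as two separate function arguments:

	#define is_foo(ih)	check(version_of(ih), &(ih)->key)

	if (is_foo(ih = head_of(src, n)))
		...

	/* expands to check(version_of(ih = head_of(src, n)),
	 *                  &(ih = head_of(src, n))->key):
	 * two assignments to 'ih' with no sequence point between the
	 * argument evaluations, which is what gcc 4.4 flags */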
Reported-by: Chris Mason Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/do_balan.c | 5 ++--- fs/reiserfs/lbalance.c | 10 ++++++---- include/linux/reiserfs_fs.h | 47 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 47 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 4beb964a2a3..128d3f7c8aa 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -1270,9 +1270,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h RFALSE(ih, "PAP-12210: ih must be 0"); - if (is_direntry_le_ih - (aux_ih = - B_N_PITEM_HEAD(tbS0, item_pos))) { + aux_ih = B_N_PITEM_HEAD(tbS0, item_pos); + if (is_direntry_le_ih(aux_ih)) { /* we append to directory item */ int entry_count; diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 381750a155f..03d85cbf90b 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -390,7 +390,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, if (last_first == FIRST_TO_LAST) { /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) + ih = B_N_PITEM_HEAD(src, item_num); + if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes); else { @@ -418,7 +419,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, } } else { /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) + ih = B_N_PITEM_HEAD(src, item_num); + if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, item_num, I_ENTRY_COUNT(ih) - cpy_bytes, @@ -774,8 +776,8 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, leaf_delete_items_entirely(cur_bi, first + 1, del_num - 1); - if (is_direntry_le_ih - (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1))) + ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1); + if (is_direntry_le_ih(ih)) /* the last item is directory */ /* len = numbers of directory entries in this item */ len = ih_entry_count(ih); diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 2245c78d587..dd31e7bae35 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -660,23 +660,54 @@ static inline void set_le_key_k_type(int version, struct reiserfs_key *key, cpu_to_le32(type2uniqueness(type))) : (void)(set_offset_v2_k_type(&(key->u.k_offset_v2), type)); } + static inline void set_le_ih_k_type(struct item_head *ih, int type) { set_le_key_k_type(ih_version(ih), &(ih->ih_key), type); } -#define is_direntry_le_key(version,key) (le_key_k_type (version, key) == TYPE_DIRENTRY) -#define is_direct_le_key(version,key) (le_key_k_type (version, key) == TYPE_DIRECT) -#define is_indirect_le_key(version,key) (le_key_k_type (version, key) == TYPE_INDIRECT) -#define is_statdata_le_key(version,key) (le_key_k_type (version, key) == TYPE_STAT_DATA) +static inline int is_direntry_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_DIRENTRY; +} + +static inline int is_direct_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_DIRECT; +} + +static inline int is_indirect_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_INDIRECT; +} + +static inline int is_statdata_le_key(int version, struct reiserfs_key *key) +{ 
+ return le_key_k_type(version, key) == TYPE_STAT_DATA; +} // // item header has version. // -#define is_direntry_le_ih(ih) is_direntry_le_key (ih_version (ih), &((ih)->ih_key)) -#define is_direct_le_ih(ih) is_direct_le_key (ih_version (ih), &((ih)->ih_key)) -#define is_indirect_le_ih(ih) is_indirect_le_key (ih_version(ih), &((ih)->ih_key)) -#define is_statdata_le_ih(ih) is_statdata_le_key (ih_version (ih), &((ih)->ih_key)) +static inline int is_direntry_le_ih(struct item_head *ih) +{ + return is_direntry_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_direct_le_ih(struct item_head *ih) +{ + return is_direct_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_indirect_le_ih(struct item_head *ih) +{ + return is_indirect_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_statdata_le_ih(struct item_head *ih) +{ + return is_statdata_le_key(ih_version(ih), &ih->ih_key); +} // // key is pointer to cpu key, result is cpu -- cgit v1.2.3-70-g09d2 From d69b042f3d7406ddba560143b1796020df760800 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 17 Jun 2009 16:26:34 -0700 Subject: memcg: add file-based RSS accounting Add file RSS tracking per memory cgroup. We currently don't track file RSS; the RSS we report is actually anon RSS. All the file-mapped pages come in through the page cache and get accounted there. This patch adds support for accounting file RSS pages. It should: 1. help improve the metrics reported by the memory resource controller, and 2. form the basis for a future shared memory accounting heuristic that has been proposed by Kamezawa. Unfortunately, we cannot rename the existing "rss" keyword used in memory.stat to "anon_rss". We do, however, add "mapped_file" data and hope to educate the end user through documentation. [hugh.dickins@tiscali.co.uk: fix mem_cgroup_update_mapped_file_stat oops] Signed-off-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Paul Menage Cc: Dhaval Giani Cc: Daisuke Nishimura Cc: YAMAMOTO Takashi Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ++++- mm/memcontrol.c | 66 +++++++++++++++++++++++++++++++++++++++++++++- mm/page_cgroup.c | 2 ++ mm/rmap.c | 5 +++- 4 files changed, 77 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 45add35dda1..e46a0734ab6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -117,7 +117,7 @@ static inline bool mem_cgroup_disabled(void) } extern bool mem_cgroup_oom_called(struct task_struct *task); - +void mem_cgroup_update_mapped_file_stat(struct page *page, int val); #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -271,6 +271,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } +static inline void mem_cgroup_update_mapped_file_stat(struct page *page, + int val) +{ +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 70db6e0a5ee..6f682901deb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,7 +62,8 @@ enum mem_cgroup_stat_index { * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ + MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ + MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ @@ -900,6 +901,44 @@ static void record_last_oom(struct mem_cgroup *mem) mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); } +/* + * Currently used to update mapped file statistics, but the routine can be + * generalized to update other statistics as well. + */ +void mem_cgroup_update_mapped_file_stat(struct page *page, int val) +{ + struct mem_cgroup *mem; + struct mem_cgroup_stat *stat; + struct mem_cgroup_stat_cpu *cpustat; + int cpu; + struct page_cgroup *pc; + + if (!page_is_file_cache(page)) + return; + + pc = lookup_page_cgroup(page); + if (unlikely(!pc)) + return; + + lock_page_cgroup(pc); + mem = pc->mem_cgroup; + if (!mem) + goto done; + + if (!PageCgroupUsed(pc)) + goto done; + + /* + * Preemption is already disabled, we don't need get_cpu() + */ + cpu = smp_processor_id(); + stat = &mem->stat; + cpustat = &stat->cpustat[cpu]; + + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); +done: + unlock_page_cgroup(pc); +} /* * Unlike exported interface, "oom" parameter is added. if oom==true, @@ -1098,6 +1137,10 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup_per_zone *from_mz, *to_mz; int nid, zid; int ret = -EBUSY; + struct page *page; + int cpu; + struct mem_cgroup_stat *stat; + struct mem_cgroup_stat_cpu *cpustat; VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); @@ -1118,6 +1161,23 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); + + page = pc->page; + if (page_is_file_cache(page) && page_mapped(page)) { + cpu = smp_processor_id(); + /* Update mapped_file data for mem_cgroup "from" */ + stat = &from->stat; + cpustat = &stat->cpustat[cpu]; + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, + -1); + + /* Update mapped_file data for mem_cgroup "to" */ + stat = &to->stat; + cpustat = &stat->cpustat[cpu]; + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, + 1); + } + if (do_swap_account) res_counter_uncharge(&from->memsw, PAGE_SIZE); css_put(&from->css); @@ -2046,6 +2106,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) enum { MCS_CACHE, MCS_RSS, + MCS_MAPPED_FILE, MCS_PGPGIN, MCS_PGPGOUT, MCS_INACTIVE_ANON, @@ -2066,6 +2127,7 @@ struct { } memcg_stat_strings[NR_MCS_STAT] = { {"cache", "total_cache"}, {"rss", "total_rss"}, + {"mapped_file", "total_mapped_file"}, {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, {"inactive_anon", "total_inactive_anon"}, @@ -2086,6 +2148,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s->stat[MCS_CACHE] += val * PAGE_SIZE; val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); s->stat[MCS_RSS] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); + s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); s->stat[MCS_PGPGIN] += val; val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 4f31c9b3e94..672089d5819 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -99,6 +99,8 @@ 
struct page_cgroup *lookup_page_cgroup(struct page *page) unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); + if (!section->page_cgroup) + return NULL; return section->page_cgroup + pfn; } diff --git a/mm/rmap.c b/mm/rmap.c index c9ccc1a72dc..836c6c63e1f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -703,8 +703,10 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - if (atomic_inc_and_test(&page->_mapcount)) + if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_update_mapped_file_stat(page, 1); + } } #ifdef CONFIG_DEBUG_VM @@ -753,6 +755,7 @@ void page_remove_rmap(struct page *page) mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + mem_cgroup_update_mapped_file_stat(page, -1); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap -- cgit v1.2.3-70-g09d2 From 302362c5abdda80b5c2e4e57be610c2e3c2ab3c5 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 17 Jun 2009 16:27:15 -0700 Subject: memcg: remove mem_cgroup_cache_charge_swapin() mem_cgroup_cache_charge_swapin() isn't used any more, so remove the no-op definition of it in the header file. Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0cedf31af0b..dca9b9999ae 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -423,12 +423,6 @@ static inline swp_entry_t get_swap_page(void) #define has_swap_token(x) 0 #define disable_swap_token() do { } while(0) -static inline int mem_cgroup_cache_charge_swapin(struct page *page, - struct mm_struct *mm, gfp_t mask, bool locked) -{ - return 0; -} - static inline void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) { -- cgit v1.2.3-70-g09d2 From 20ebcdda78a282d1d5266887ddf8a2d670182576 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 17 Jun 2009 16:27:16 -0700 Subject: memcg: remove unneeded forward declaration from sched.h This forward declaration seems pointless. Signed-off-by: Li Zefan Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 02042e7f219..d0342101756 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -92,7 +92,6 @@ struct sched_param { #include -struct mem_cgroup; struct exec_domain; struct futex_pi_state; struct robust_list_head; -- cgit v1.2.3-70-g09d2 From 8a9478ca7f4bcb8945cec7f95d52dae2d5e50cbd Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 17 Jun 2009 16:27:17 -0700 Subject: memcg: fix swap accounting This patch fixes mis-accounting of swap usage in memcg. In the current implementation, memcg's swap account is uncharged only when swap is completely freed. But there are several cases where swap cannot be freed cleanly. To handle that, this patch changes memcg to uncharge the swap account when the swap entry has no references other than the swap cache. By this, memcg's swap entry accounting can be fully synchronous with the application's behavior. This patch also changes memcg's hooks for swap-out.
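Condensed, the new swapcache_free() logic in the swapfile.c hunk below amounts to:

	ret = swap_entry_free(p, entry, SWAP_CACHE);
	if (page) {
		/* ret != 0: other swap users remain, this really is a swap-out;
		 * ret == 0: the entry died with the cache, swap was never used */
		mem_cgroup_uncharge_swapcache(page, entry, ret != 0);
	}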
Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Acked-by: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++-- mm/memcontrol.c | 17 ++++++++++++----- mm/swapfile.c | 16 ++++++++++++---- 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index dca9b9999ae..c88b36665f7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -319,10 +319,11 @@ static inline void disable_swap_token(void) } #ifdef CONFIG_CGROUP_MEM_RES_CTLR -extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent); +extern void +mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); #else static inline void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) +mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) { } #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3468c38adde..a83e0395444 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -189,6 +189,7 @@ enum charge_type { MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ + MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ NR_CHARGE_TYPE, }; @@ -1493,6 +1494,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) switch (ctype) { case MEM_CGROUP_CHARGE_TYPE_MAPPED: + case MEM_CGROUP_CHARGE_TYPE_DROP: if (page_mapped(page)) goto unlock_out; break; @@ -1556,18 +1558,23 @@ void mem_cgroup_uncharge_cache_page(struct page *page) * called after __delete_from_swap_cache() and drop "page" account. * memcg information is recorded to swap_cgroup of "ent" */ -void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) +void +mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) { struct mem_cgroup *memcg; + int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; + + if (!swapout) /* this was a swap cache but the swap is unused ! */ + ctype = MEM_CGROUP_CHARGE_TYPE_DROP; + + memcg = __mem_cgroup_uncharge_common(page, ctype); - memcg = __mem_cgroup_uncharge_common(page, - MEM_CGROUP_CHARGE_TYPE_SWAPOUT); /* record memcg information */ - if (do_swap_account && memcg) { + if (do_swap_account && swapout && memcg) { swap_cgroup_record(ent, css_id(&memcg->css)); mem_cgroup_get(memcg); } - if (memcg) + if (swapout && memcg) css_put(&memcg->css); } #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 28faa01cf57..d1ade1a48ee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -583,8 +583,9 @@ static int swap_entry_free(struct swap_info_struct *p, swap_list.next = p - swap_info; nr_swap_pages++; p->inuse_pages--; - mem_cgroup_uncharge_swap(ent); } + if (!swap_count(count)) + mem_cgroup_uncharge_swap(ent); return count; } @@ -609,12 +610,19 @@ void swap_free(swp_entry_t entry) void swapcache_free(swp_entry_t entry, struct page *page) { struct swap_info_struct *p; + int ret; - if (page) - mem_cgroup_uncharge_swapcache(page, entry); p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry, SWAP_CACHE); + ret = swap_entry_free(p, entry, SWAP_CACHE); + if (page) { + bool swapout; + if (ret) + swapout = true; /* the end of swap out */ + else + swapout = false; /* no more swap users! 
*/ + mem_cgroup_uncharge_swapcache(page, entry, swapout); + } spin_unlock(&swap_lock); } return; -- cgit v1.2.3-70-g09d2 From c5b947b28828e82814605824e5db0bc58d66d8c0 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 17 Jun 2009 16:27:20 -0700 Subject: memcg: add interface to reset limits We currently have no interface to reset mem.limit or memsw.limit. This patch allows either limit to be reset by writing -1 to it. Signed-off-by: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 1 + include/linux/res_counter.h | 2 ++ kernel/res_counter.c | 12 +++++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index af48135bd9b..23d1262c077 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -209,6 +209,7 @@ We can alter the memory limit: NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, mega or gigabytes. +NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). # cat /cgroups/0/memory.limit_in_bytes 4194304 diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 4c5bcf6ca7e..511f42fc681 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -49,6 +49,8 @@ struct res_counter { struct res_counter *parent; }; +#define RESOURCE_MAX (unsigned long long)LLONG_MAX + /** * Helpers to interact with userspace * res_counter_read_u64() - returns the value of the specified member. diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bf8e7534c80..e1338f07431 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -18,7 +18,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); - counter->limit = (unsigned long long)LLONG_MAX; + counter->limit = RESOURCE_MAX; counter->parent = parent; } @@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf, unsigned long long *res) { char *end; + + /* return RESOURCE_MAX(unlimited) if "-1" is specified */ + if (*buf == '-') { + *res = simple_strtoull(buf + 1, &end, 10); + if (*res != 1 || *end != '\0') + return -EINVAL; + *res = RESOURCE_MAX; + return 0; + } + /* FIXME - make memparse() take const char* args */ *res = memparse((char *)buf, &end); if (*end != '\0') -- cgit v1.2.3-70-g09d2 From 1c216279539bd65c5a3d497e25d441dbddbcf1ec Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 17 Jun 2009 16:27:29 -0700 Subject: ptrace: tracehook_unsafe_exec(): remove the stale comment tracehook_unsafe_exec() doesn't need task_lock(), so remove the old comment. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index eb96603d92d..17ba82efa48 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -143,7 +143,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) * * Return %LSM_UNSAFE_* bits applied to an exec because of tracing. * - * Called with task_lock() held on @task. + * @task->cred_guard_mutex is held by the caller through the do_execve().
*/ static inline int tracehook_unsafe_exec(struct task_struct *task) { -- cgit v1.2.3-70-g09d2 From 8053bdd5ce15dcf043d41a4dd6cac4a5567effdc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 17 Jun 2009 16:27:34 -0700 Subject: ptrace_get_task_struct: s/tasklist/rcu/, make it static - Use rcu_read_lock() instead of tasklist_lock to find/get the task in ptrace_get_task_struct(). - Make it static, it has no callers outside of ptrace.c. - The comment doesn't match reality; this helper does not do any checks. Because it is really trivial and static, I removed the whole comment. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 1 - kernel/ptrace.c | 16 +++------------- 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 59e133d39d5..7456d7d87a1 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -81,7 +81,6 @@ extern long arch_ptrace(struct task_struct *child, long request, long addr, long data); -extern struct task_struct *ptrace_get_task_struct(pid_t pid); extern int ptrace_traceme(void); extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len); extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 38fdfea1a15..a64fe75a48b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -581,26 +581,16 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -/** - * ptrace_get_task_struct -- grab a task struct reference for ptrace - * @pid: process id to grab a task_struct reference of - * - * This function is a helper for ptrace implementations. It checks - * permissions and then grabs a task struct for use of the actual - * ptrace implementation. - * - * Returns the task_struct for @pid or an ERR_PTR() on failure. - */ -struct task_struct *ptrace_get_task_struct(pid_t pid) +static struct task_struct *ptrace_get_task_struct(pid_t pid) { struct task_struct *child; - read_lock(&tasklist_lock); + rcu_read_lock(); child = find_task_by_vpid(pid); if (child) get_task_struct(child); + rcu_read_unlock(); - read_unlock(&tasklist_lock); if (!child) return ERR_PTR(-ESRCH); return child; -- cgit v1.2.3-70-g09d2 From 17f98dcf6010a1cfd25d179fd0ce77d3dc2685c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Jun 2009 16:27:51 -0700 Subject: pids: clean up find_task_by_pid variants find_task_by_pid_type_ns is only used to implement find_task_by_vpid and find_task_by_pid_ns, but both of them pass PIDTYPE_PID as first argument. So just fold find_task_by_pid_type_ns into find_task_by_pid_ns and use find_task_by_pid_ns to implement find_task_by_vpid. While we're at it also remove the exports for find_task_by_pid_ns and find_task_by_vpid - we don't have any modular callers left as the only modular caller of the old pre-pid-namespace find_task_by_pid (gfs2) was switched to pid_task which operates on a struct pid pointer instead of a pid_t. Given the confusion about pid_t values vs namespace that's generally the better option anyway and I think we're better off restricting modules to do it that way. Signed-off-by: Christoph Hellwig Cc: Pavel Emelyanov Cc: "Eric W.
Biederman" Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 ------ kernel/pid.c | 17 +++-------------- 2 files changed, 3 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d0342101756..4d075426988 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1878,9 +1878,6 @@ extern struct pid_namespace init_pid_ns; /* * find a task by one of its numerical ids * - * find_task_by_pid_type_ns(): - * it is the most generic call - it finds a task by all id, - * type and namespace specified * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): @@ -1889,9 +1886,6 @@ extern struct pid_namespace init_pid_ns; * see also find_vpid() etc in include/linux/pid.h */ -extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, - struct pid_namespace *ns); - extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); diff --git a/kernel/pid.c b/kernel/pid.c index b2e5f78fd28..31310b5d3f5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -378,26 +378,15 @@ EXPORT_SYMBOL(pid_task); /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. */ -struct task_struct *find_task_by_pid_type_ns(int type, int nr, - struct pid_namespace *ns) +struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { - return pid_task(find_pid_ns(nr, ns), type); + return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } -EXPORT_SYMBOL(find_task_by_pid_type_ns); - struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, - current->nsproxy->pid_ns); -} -EXPORT_SYMBOL(find_task_by_vpid); - -struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) -{ - return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns); + return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); } -EXPORT_SYMBOL(find_task_by_pid_ns); struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { -- cgit v1.2.3-70-g09d2 From b4188def441197d38f20e0935372780ed7c0e19d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 17 Jun 2009 16:27:56 -0700 Subject: ipcns: make free_ipc_ns() static Signed-off-by: Alexey Dobriyan Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 1 - ipc/namespace.c | 48 +++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 3bf40e246a8..804e4e4a2b6 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -94,7 +94,6 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) -extern void free_ipc_ns(struct ipc_namespace *ns); extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct ipc_namespace *ns); extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, diff --git a/ipc/namespace.c b/ipc/namespace.c index 231ee5359ab..a1094ff0bef 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -83,6 +83,30 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, up_write(&ids->rw_mutex); } +static void free_ipc_ns(struct ipc_namespace *ns) +{ + /* + * Unregistering the hotplug notifier at the beginning guarantees + * that the ipc namespace won't be 
freed while we are inside the + * callback routine. Since the blocking_notifier_chain_XXX routines + * hold a rw lock on the notifier list, unregister_ipcns_notifier() + * won't take the rw lock before blocking_notifier_call_chain() has + * released the rd lock. + */ + unregister_ipcns_notifier(ns); + sem_exit_ns(ns); + msg_exit_ns(ns); + shm_exit_ns(ns); + kfree(ns); + atomic_dec(&nr_ipc_ns); + + /* + * Do the ipcns removal notification after decrementing nr_ipc_ns in + * order to have a correct value when recomputing msgmni. + */ + ipcns_notify(IPCNS_REMOVED); +} + /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put @@ -108,27 +132,3 @@ void put_ipc_ns(struct ipc_namespace *ns) free_ipc_ns(ns); } } - -void free_ipc_ns(struct ipc_namespace *ns) -{ - /* - * Unregistering the hotplug notifier at the beginning guarantees - * that the ipc namespace won't be freed while we are inside the - * callback routine. Since the blocking_notifier_chain_XXX routines - * hold a rw lock on the notifier list, unregister_ipcns_notifier() - * won't take the rw lock before blocking_notifier_call_chain() has - * released the rd lock. - */ - unregister_ipcns_notifier(ns); - sem_exit_ns(ns); - msg_exit_ns(ns); - shm_exit_ns(ns); - kfree(ns); - atomic_dec(&nr_ipc_ns); - - /* - * Do the ipcns removal notification after decrementing nr_ipc_ns in - * order to have a correct value when recomputing msgmni. - */ - ipcns_notify(IPCNS_REMOVED); -} -- cgit v1.2.3-70-g09d2 From 665c7741fb63c7ceeb515f1d1ed8b016efe65bf3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 17 Jun 2009 16:27:57 -0700 Subject: ipcns: move free_ipcs() proto The function is really private to ipc/, and moving the prototype avoids the struct kern_ipc_perm forward declaration. Signed-off-by: Alexey Dobriyan Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 4 ---- ipc/util.h | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 804e4e4a2b6..e408722a84c 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -96,10 +96,6 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #if defined(CONFIG_IPC_NS) extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct ipc_namespace *ns); -extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, - void (*free)(struct ipc_namespace *, - struct kern_ipc_perm *)); - static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) diff --git a/ipc/util.h b/ipc/util.h index f9fe90e4886..ab3ebf2621b 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -171,5 +171,6 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm) struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); - +void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, + void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)); #endif -- cgit v1.2.3-70-g09d2 From b99b87f70c7785ab1e253c6220f4b0b57ce3a7f7 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 17 Jun 2009 16:28:03 -0700 Subject: kernel: constructor support Call constructors (gcc-generated initcall-like functions) during kernel start and module load. Constructors are e.g. used for gcov data initialization. Disable constructor support for usermode Linux to prevent conflicts with host glibc.
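For readers unfamiliar with the mechanism: a constructor is simply a function whose address gcc places in the .ctors section, for example (a hypothetical function; gcov emits the equivalent per instrumented object file):

	static void __attribute__((constructor)) my_init(void)
	{
		/* runs from do_ctors() at boot, or do_mod_ctors() at module load */
	}

The linker gathers all such pointers between __ctors_start and __ctors_end, which the new do_ctors() loop walks.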
Signed-off-by: Peter Oberparleiter Acked-by: Rusty Russell Acked-by: WANG Cong Cc: Sam Ravnborg Cc: Jeff Dike Cc: Andi Kleen Cc: Huang Ying Cc: Li Wei Cc: Michael Ellerman Cc: Ingo Molnar Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/sections.h | 3 +++ include/asm-generic/vmlinux.lds.h | 9 +++++++++ include/linux/init.h | 3 +++ include/linux/module.h | 6 ++++++ init/Kconfig | 5 +++++ init/main.c | 12 ++++++++++++ kernel/module.c | 16 ++++++++++++++++ 7 files changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 4ce48e87853..d083561337f 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -14,6 +14,9 @@ extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __initdata_begin[], __initdata_end[]; extern char __start_rodata[], __end_rodata[]; +/* Start and end of .ctors section - used for constructor calls. */ +extern char __ctors_start[], __ctors_end[]; + /* function descriptor handling (if any). Override * in asm/sections.h */ #ifndef dereference_function_descriptor diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 6bdba10fef4..55413e568f0 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -440,12 +440,21 @@ INIT_TASK \ } +#ifdef CONFIG_CONSTRUCTORS +#define KERNEL_CTORS() VMLINUX_SYMBOL(__ctors_start) = .; \ + *(.ctors) \ + VMLINUX_SYMBOL(__ctors_end) = .; +#else +#define KERNEL_CTORS() +#endif + /* init and exit section handling */ #define INIT_DATA \ *(.init.data) \ DEV_DISCARD(init.data) \ CPU_DISCARD(init.data) \ MEM_DISCARD(init.data) \ + KERNEL_CTORS() \ *(.init.rodata) \ DEV_DISCARD(init.rodata) \ CPU_DISCARD(init.rodata) \ diff --git a/include/linux/init.h b/include/linux/init.h index 8c2c9989626..13b633ed695 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -134,6 +134,9 @@ typedef void (*exitcall_t)(void); extern initcall_t __con_initcall_start[], __con_initcall_end[]; extern initcall_t __security_initcall_start[], __security_initcall_end[]; +/* Used for contructor calls. */ +typedef void (*ctor_fn_t)(void); + /* Defined in init/main.c */ extern int do_one_initcall(initcall_t fn); extern char __initdata boot_command_line[]; diff --git a/include/linux/module.h b/include/linux/module.h index 505f20dcc1c..098bdb7bfac 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -363,6 +363,12 @@ struct module local_t ref; #endif #endif + +#ifdef CONFIG_CONSTRUCTORS + /* Constructor functions. */ + ctor_fn_t *ctors; + unsigned int num_ctors; +#endif }; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} diff --git a/init/Kconfig b/init/Kconfig index c4b3c6d51a7..1ce05a4cb5f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -16,6 +16,11 @@ config DEFCONFIG_LIST default "$ARCH_DEFCONFIG" default "arch/$ARCH/defconfig" +config CONSTRUCTORS + bool + depends on !UML + default y + menu "General setup" config EXPERIMENTAL diff --git a/init/main.c b/init/main.c index 0e7aedeaa05..1a65fdd0631 100644 --- a/init/main.c +++ b/init/main.c @@ -720,6 +720,17 @@ asmlinkage void __init start_kernel(void) rest_init(); } +/* Call all constructor functions linked into the kernel. 
*/ +static void __init do_ctors(void) +{ +#ifdef CONFIG_CONSTRUCTORS + ctor_fn_t *call = (ctor_fn_t *) __ctors_start; + + for (; call < (ctor_fn_t *) __ctors_end; call++) + (*call)(); +#endif +} + int initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); @@ -800,6 +811,7 @@ static void __init do_basic_setup(void) usermodehelper_init(); driver_init(); init_irq_proc(); + do_ctors(); do_initcalls(); } diff --git a/kernel/module.c b/kernel/module.c index 215aaab09e9..38928fcaff2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2216,6 +2216,10 @@ static noinline struct module *load_module(void __user *umod, mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); #endif +#ifdef CONFIG_CONSTRUCTORS + mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors", + sizeof(*mod->ctors), &mod->num_ctors); +#endif #ifdef CONFIG_MARKERS mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", @@ -2389,6 +2393,17 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } +/* Call module constructors. */ +static void do_mod_ctors(struct module *mod) +{ +#ifdef CONFIG_CONSTRUCTORS + unsigned long i; + + for (i = 0; i < mod->num_ctors; i++) + mod->ctors[i](); +#endif +} + /* This is where the real work happens */ SYSCALL_DEFINE3(init_module, void __user *, umod, unsigned long, len, const char __user *, uargs) @@ -2417,6 +2432,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); + do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) ret = do_one_initcall(mod->init); -- cgit v1.2.3-70-g09d2 From 0b923606e75f1ab672e25b14ac039a1cdcfa382f Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 17 Jun 2009 16:28:05 -0700 Subject: seq_file: add function to write binary data seq_write() can be used to construct seq_files containing arbitrary data. Required by the gcov-profiling interface to synthesize binary profiling data files. Signed-off-by: Peter Oberparleiter Cc: Andi Kleen Cc: Huang Ying Cc: Li Wei Cc: Michael Ellerman Cc: Ingo Molnar Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Rusty Russell Cc: WANG Cong Cc: Sam Ravnborg Cc: Jeff Dike Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 20 ++++++++++++++++++++ include/linux/seq_file.h | 1 + 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/fs/seq_file.c b/fs/seq_file.c index 7f40f30c55c..6c959275f2d 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -640,6 +640,26 @@ int seq_puts(struct seq_file *m, const char *s) } EXPORT_SYMBOL(seq_puts); +/** + * seq_write - write arbitrary data to buffer + * @seq: seq_file identifying the buffer to which data should be written + * @data: data address + * @len: number of bytes + * + * Return 0 on success, non-zero otherwise. 
+ */ +int seq_write(struct seq_file *seq, const void *data, size_t len) +{ + if (seq->count + len < seq->size) { + memcpy(seq->buf + seq->count, data, len); + seq->count += len; + return 0; + } + seq->count = seq->size; + return -1; +} +EXPORT_SYMBOL(seq_write); + struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 004f3b3342c..0c6a86b7959 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -43,6 +43,7 @@ int seq_release(struct inode *, struct file *); int seq_escape(struct seq_file *, const char *, const char *); int seq_putc(struct seq_file *m, char c); int seq_puts(struct seq_file *m, const char *s); +int seq_write(struct seq_file *seq, const void *data, size_t len); int seq_printf(struct seq_file *, const char *, ...) __attribute__ ((format (printf,2,3))); -- cgit v1.2.3-70-g09d2 From 2521f2c228ad750701ba4702484e31d876dbc386 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 17 Jun 2009 16:28:08 -0700 Subject: gcov: add gcov profiling infrastructure Enable the use of GCC's coverage testing tool gcov [1] with the Linux kernel. gcov may be useful for: * debugging (has this code been reached at all?) * test improvement (how do I change my test to cover these lines?) * minimizing kernel configurations (do I need this option if the associated code is never run?) The profiling patch incorporates the following changes: * change kbuild to include profiling flags * provide functions needed by profiling code * present profiling data as files in debugfs Note that on some architectures, enabling gcc's profiling option "-fprofile-arcs" for the entire kernel may trigger compile/link/ run-time problems, some of which are caused by toolchain bugs and others which require adjustment of architecture code. For this reason profiling the entire kernel is initially restricted to those architectures for which it is known to work without changes. This restriction can be lifted once an architecture has been tested and found compatible with gcc's profiling. Profiling of single files or directories is still available on all platforms (see config help text). 
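To make the control flow concrete: every object file compiled with -fprofile-arcs receives a gcc-generated constructor that hands its coverage record to this infrastructure via __gcov_init() (implemented in kernel/gcov/base.c below). Roughly, and with made-up names for the gcc-generated parts:

	/*
	 * Illustrative sketch only -- gcc generates the equivalent of this
	 * per instrumented object. my_object_info stands in for the
	 * compiler-emitted struct gcov_info; __gcov_init() links it into
	 * the global gcov_info_head list.
	 */
	static struct gcov_info my_object_info;	/* filled in by gcc */

	static void __attribute__((constructor)) my_object_ctor(void)
	{
		__gcov_init(&my_object_info);
	}

The constructor support added by the previous patch ensures these functions actually run during kernel start and module load.
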
[1] http://gcc.gnu.org/onlinedocs/gcc/Gcov.html Signed-off-by: Peter Oberparleiter Cc: Andi Kleen Cc: Huang Ying Cc: Li Wei Cc: Michael Ellerman Cc: Ingo Molnar Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Rusty Russell Cc: WANG Cong Cc: Sam Ravnborg Cc: Jeff Dike Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/gcov.txt | 246 +++++++++++++ Documentation/kernel-parameters.txt | 7 + Makefile | 11 +- arch/Kconfig | 2 + include/linux/compiler-gcc3.h | 6 + kernel/Makefile | 1 + kernel/gcov/Kconfig | 48 +++ kernel/gcov/Makefile | 3 + kernel/gcov/base.c | 148 ++++++++ kernel/gcov/fs.c | 673 ++++++++++++++++++++++++++++++++++++ kernel/gcov/gcc_3_4.c | 447 ++++++++++++++++++++++++ kernel/gcov/gcov.h | 128 +++++++ scripts/Makefile.lib | 11 + 13 files changed, 1726 insertions(+), 5 deletions(-) create mode 100644 Documentation/gcov.txt create mode 100644 kernel/gcov/Kconfig create mode 100644 kernel/gcov/Makefile create mode 100644 kernel/gcov/base.c create mode 100644 kernel/gcov/fs.c create mode 100644 kernel/gcov/gcc_3_4.c create mode 100644 kernel/gcov/gcov.h (limited to 'include/linux') diff --git a/Documentation/gcov.txt b/Documentation/gcov.txt new file mode 100644 index 00000000000..e716aadb3a3 --- /dev/null +++ b/Documentation/gcov.txt @@ -0,0 +1,246 @@ +Using gcov with the Linux kernel +================================ + +1. Introduction +2. Preparation +3. Customization +4. Files +5. Modules +6. Separated build and test machines +7. Troubleshooting +Appendix A: sample script: gather_on_build.sh +Appendix B: sample script: gather_on_test.sh + + +1. Introduction +=============== + +gcov profiling kernel support enables the use of GCC's coverage testing +tool gcov [1] with the Linux kernel. Coverage data of a running kernel +is exported in gcov-compatible format via the "gcov" debugfs directory. +To get coverage data for a specific file, change to the kernel build +directory and use gcov with the -o option as follows (requires root): + +# cd /tmp/linux-out +# gcov -o /sys/kernel/debug/gcov/tmp/linux-out/kernel spinlock.c + +This will create source code files annotated with execution counts +in the current directory. In addition, graphical gcov front-ends such +as lcov [2] can be used to automate the process of collecting data +for the entire kernel and provide coverage overviews in HTML format. + +Possible uses: + +* debugging (has this line been reached at all?) +* test improvement (how do I change my test to cover these lines?) +* minimizing kernel configurations (do I need this option if the + associated code is never run?) + +-- + +[1] http://gcc.gnu.org/onlinedocs/gcc/Gcov.html +[2] http://ltp.sourceforge.net/coverage/lcov.php + + +2. Preparation +============== + +Configure the kernel with: + + CONFIG_DEBUGFS=y + CONFIG_GCOV_KERNEL=y + +and to get coverage data for the entire kernel: + + CONFIG_GCOV_PROFILE_ALL=y + +Note that kernels compiled with profiling flags will be significantly +larger and run slower. Also CONFIG_GCOV_PROFILE_ALL may not be supported +on all architectures. + +Profiling data will only become accessible once debugfs has been +mounted: + + mount -t debugfs none /sys/kernel/debug + + +3. Customization +================ + +To enable profiling for specific files or directories, add a line +similar to the following to the respective kernel Makefile: + + For a single file (e.g. 
main.o): + GCOV_PROFILE_main.o := y + + For all files in one directory: + GCOV_PROFILE := y + +To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL +is specified, use: + + GCOV_PROFILE_main.o := n + and: + GCOV_PROFILE := n + +Only files which are linked to the main kernel image or are compiled as +kernel modules are supported by this mechanism. + + +4. Files +======== + +The gcov kernel support creates the following files in debugfs: + + /sys/kernel/debug/gcov + Parent directory for all gcov-related files. + + /sys/kernel/debug/gcov/reset + Global reset file: resets all coverage data to zero when + written to. + + /sys/kernel/debug/gcov/path/to/compile/dir/file.gcda + The actual gcov data file as understood by the gcov + tool. Resets file coverage data to zero when written to. + + /sys/kernel/debug/gcov/path/to/compile/dir/file.gcno + Symbolic link to a static data file required by the gcov + tool. This file is generated by gcc when compiling with + option -ftest-coverage. + + +5. Modules +========== + +Kernel modules may contain cleanup code which is only run at +module unload time. The gcov mechanism provides a means to collect +coverage data for such code by keeping a copy of the data associated +with the unloaded module. This data remains available through debugfs. +Once the module is loaded again, the associated coverage counters are +initialized with the data from its previous instantiation. + +This behavior can be deactivated by specifying the gcov_persist kernel +parameter: + + gcov_persist=0 + +At run-time, a user can also choose to discard data for an unloaded +module by writing to its data file or the global reset file. + + +6. Separated build and test machines +==================================== + +The gcov kernel profiling infrastructure is designed to work out of the +box for setups where kernels are built and run on the same machine. In +cases where the kernel runs on a separate machine, special preparations +must be made, depending on where the gcov tool is used: + +a) gcov is run on the TEST machine + +The gcov tool version on the test machine must be compatible with the +gcc version used for the kernel build. Also the following files need to be +copied from build to test machine: + +from the source tree: + - all C source files + headers + +from the build tree: + - all C source files + headers + - all .gcda and .gcno files + - all links to directories + +It is important to note that these files need to be placed into the +exact same file system location on the test machine as on the build +machine. If any of the path components is a symbolic link, the actual +directory needs to be used instead (due to make's CURDIR handling). + +b) gcov is run on the BUILD machine + +The following files need to be copied after each test case from test +to build machine: + +from the gcov directory in sysfs: + - all .gcda files + - all links to .gcno files + +These files can be copied to any location on the build machine. gcov +must then be called with the -o option pointing to that directory. + +Example directory setup on the build machine: + + /tmp/linux: kernel source tree + /tmp/out: kernel build directory as specified by make O= + /tmp/coverage: location of the files copied from the test machine + + [user@build] cd /tmp/out + [user@build] gcov -o /tmp/coverage/tmp/out/init main.c + + +7. Troubleshooting +================== + +Problem: Compilation aborts during linker step. 
+Cause: Profiling flags are specified for source files which are not + linked to the main kernel or which are linked by a custom + linker procedure. +Solution: Exclude affected source files from profiling by specifying + GCOV_PROFILE := n or GCOV_PROFILE_basename.o := n in the + corresponding Makefile. + + +Appendix A: gather_on_build.sh +============================== + +Sample script to gather coverage meta files on the build machine +(see 6a): + +#!/bin/bash + +KSRC=$1 +KOBJ=$2 +DEST=$3 + +if [ -z "$KSRC" ] || [ -z "$KOBJ" ] || [ -z "$DEST" ]; then + echo "Usage: $0 <ksrc directory> <kobj directory> <output file>" >&2 + exit 1 +fi + +KSRC=$(cd $KSRC; printf "all:\n\t@echo \${CURDIR}\n" | make -f -) +KOBJ=$(cd $KOBJ; printf "all:\n\t@echo \${CURDIR}\n" | make -f -) + +find $KSRC $KOBJ \( -name '*.gcno' -o -name '*.[ch]' -o -type l \) -a \ + -perm /u+r,g+r | tar cfz $DEST -P -T - + +if [ $? -eq 0 ] ; then + echo "$DEST successfully created, copy to test system and unpack with:" + echo " tar xfz $DEST -P" +else + echo "Could not create file $DEST" +fi + + +Appendix B: gather_on_test.sh +============================= + +Sample script to gather coverage data files on the test machine +(see 6b): + +#!/bin/bash + +DEST=$1 +GCDA=/sys/kernel/debug/gcov + +if [ -z "$DEST" ] ; then + echo "Usage: $0 <output file>" >&2 + exit 1 +fi + +find $GCDA -name '*.gcno' -o -name '*.gcda' | tar cfz $DEST -T - + +if [ $? -eq 0 ] ; then + echo "$DEST successfully created, copy to build system and unpack with:" + echo " tar xfz $DEST" +else + echo "Could not create file $DEST" +fi diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 5578248c18a..08def8deb5f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -48,6 +48,7 @@ parameter is applicable: EFI EFI Partitioning (GPT) is enabled EIDE EIDE/ATAPI support is enabled. FB The frame buffer device is enabled. + GCOV GCOV profiling is enabled. HW Appropriate hardware is enabled. IA-64 IA-64 architecture is enabled. IMA Integrity measurement architecture is enabled. @@ -796,6 +797,12 @@ and is between 256 and 4096 characters. It is defined in the file Format: off | on default: on + gcov_persist= [GCOV] When non-zero (default), profiling data for + kernel modules is saved and remains accessible via + debugfs, even when the module is unloaded/reloaded. + When zero, profiling data is discarded and associated + debugfs files are removed at module unload time. + gdth= [HW,SCSI] See header of drivers/scsi/gdth.c. diff --git a/Makefile b/Makefile index ea63667617f..46e1c9d03d5 100644 --- a/Makefile +++ b/Makefile @@ -330,6 +330,7 @@ AFLAGS_MODULE = $(MODFLAGS) LDFLAGS_MODULE = CFLAGS_KERNEL = AFLAGS_KERNEL = +CFLAGS_GCOV = -fprofile-arcs -ftest-coverage # Use LINUXINCLUDE when you must reference the include/ directory. 
@@ -356,7 +357,7 @@ export CPP AR NM STRIP OBJCOPY OBJDUMP MAKE AWK GENKSYMS PERL UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS -export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE # When compiling out-of-tree modules, put MODVERDIR in the module @@ -1216,8 +1217,8 @@ clean: archclean $(clean-dirs) \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.symtypes' -o -name 'modules.order' \ - -o -name 'Module.markers' -o -name '.tmp_*.o.*' \) \ - -type f -print | xargs rm -f + -o -name 'Module.markers' -o -name '.tmp_*.o.*' \ + -o -name '*.gcno' \) -type f -print | xargs rm -f # mrproper - Delete all generated files, including .config # @@ -1421,8 +1422,8 @@ clean: $(clean-dirs) $(call cmd,rmfiles) @find $(KBUILD_EXTMOD) $(RCS_FIND_IGNORE) \ \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \ - -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \) \ - -type f -print | xargs rm -f + -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ + -o -name '*.gcno' \) -type f -print | xargs rm -f help: @echo ' Building external modules.' diff --git a/arch/Kconfig b/arch/Kconfig index 78a35e9dc10..99193b16023 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -112,3 +112,5 @@ config HAVE_DMA_API_DEBUG config HAVE_DEFAULT_NO_SPIN_MUTEXES bool + +source "kernel/gcov/Kconfig" diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h index 8005effc04f..b721129e046 100644 --- a/include/linux/compiler-gcc3.h +++ b/include/linux/compiler-gcc3.h @@ -16,6 +16,12 @@ #define __must_check __attribute__((warn_unused_result)) #endif +#ifdef CONFIG_GCOV_KERNEL +# if __GNUC_MINOR__ < 4 +# error "GCOV profiling support for gcc versions below 3.4 not included" +# endif /* __GNUC_MINOR__ */ +#endif /* CONFIG_GCOV_KERNEL */ + /* * A trick to suppress uninitialized variable warning without generating any * code diff --git a/kernel/Makefile b/kernel/Makefile index 9df4501cb92..0a32cb21ec9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o +obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig new file mode 100644 index 00000000000..aab593abfba --- /dev/null +++ b/kernel/gcov/Kconfig @@ -0,0 +1,48 @@ +menu "GCOV-based kernel profiling" + +config GCOV_KERNEL + bool "Enable gcov-based kernel profiling" + depends on DEBUG_FS && CONSTRUCTORS + default n + ---help--- + This option enables gcov-based code profiling (e.g. for code coverage + measurements). + + If unsure, say N. + + Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data + for the entire kernel. To enable profiling for specific files or + directories, add a line similar to the following to the respective + Makefile: + + For a single file (e.g. 
main.o): + GCOV_PROFILE_main.o := y + + For all files in one directory: + GCOV_PROFILE := y + + To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL + is specified, use: + + GCOV_PROFILE_main.o := n + and: + GCOV_PROFILE := n + + Note that the debugfs filesystem has to be mounted to access + profiling data. + +config GCOV_PROFILE_ALL + bool "Profile entire Kernel" + depends on GCOV_KERNEL + depends on S390 || X86_32 + default n + ---help--- + This option activates profiling for the entire kernel. + + If unsure, say N. + + Note that a kernel compiled with profiling flags will be significantly + larger and run slower. Also be sure to exclude files from profiling + which are not linked to the kernel image to prevent linker errors. + +endmenu diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile new file mode 100644 index 00000000000..3f761001d51 --- /dev/null +++ b/kernel/gcov/Makefile @@ -0,0 +1,3 @@ +EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' + +obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c new file mode 100644 index 00000000000..9b22d03cc58 --- /dev/null +++ b/kernel/gcov/base.c @@ -0,0 +1,148 @@ +/* + * This code maintains a list of active profiling data structures. + * + * Copyright IBM Corp. 2009 + * Author(s): Peter Oberparleiter + * + * Uses gcc-internal data definitions. + * Based on the gcov-kernel patch by: + * Hubertus Franke + * Nigel Hinds + * Rajan Ravindran + * Peter Oberparleiter + * Paul Larson + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include +#include +#include +#include "gcov.h" + +static struct gcov_info *gcov_info_head; +static int gcov_events_enabled; +static DEFINE_MUTEX(gcov_lock); + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ + static unsigned int gcov_version; + + mutex_lock(&gcov_lock); + if (gcov_version == 0) { + gcov_version = info->version; + /* + * Printing gcc's version magic may prove useful for debugging + * incompatibility reports. + */ + pr_info("version magic: 0x%x\n", gcov_version); + } + /* + * Add new profiling data structure to list and inform event + * listener. + */ + info->next = gcov_info_head; + gcov_info_head = info; + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +/** + * gcov_enable_events - enable event reporting through gcov_event() + * + * Turn on reporting of profiling data load/unload-events through the + * gcov_event() callback. Also replay all previous events once. This function + * is needed because some events are potentially generated too early for the + * callback implementation to handle them initially. 
+ */ +void gcov_enable_events(void) +{ + struct gcov_info *info; + + mutex_lock(&gcov_lock); + gcov_events_enabled = 1; + /* Perform event callback for previously registered entries. */ + for (info = gcov_info_head; info; info = info->next) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} + +#ifdef CONFIG_MODULES +static inline int within(void *addr, void *start, unsigned long size) +{ + return ((addr >= start) && (addr < start + size)); +} + +/* Update list and generate events when modules are unloaded. */ +static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, + void *data) +{ + struct module *mod = data; + struct gcov_info *info; + struct gcov_info *prev; + + if (event != MODULE_STATE_GOING) + return NOTIFY_OK; + mutex_lock(&gcov_lock); + prev = NULL; + /* Remove entries located in module from linked list. */ + for (info = gcov_info_head; info; info = info->next) { + if (within(info, mod->module_core, mod->core_size)) { + if (prev) + prev->next = info->next; + else + gcov_info_head = info->next; + if (gcov_events_enabled) + gcov_event(GCOV_REMOVE, info); + } else + prev = info; + } + mutex_unlock(&gcov_lock); + + return NOTIFY_OK; +} + +static struct notifier_block gcov_nb = { + .notifier_call = gcov_module_notifier, +}; + +static int __init gcov_init(void) +{ + return register_module_notifier(&gcov_nb); +} +device_initcall(gcov_init); +#endif /* CONFIG_MODULES */ diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c new file mode 100644 index 00000000000..ef3c3f88a7a --- /dev/null +++ b/kernel/gcov/fs.c @@ -0,0 +1,673 @@ +/* + * This code exports profiling data as debugfs files to userspace. + * + * Copyright IBM Corp. 2009 + * Author(s): Peter Oberparleiter + * + * Uses gcc-internal data definitions. + * Based on the gcov-kernel patch by: + * Hubertus Franke + * Nigel Hinds + * Rajan Ravindran + * Peter Oberparleiter + * Paul Larson + * Yi CDL Yang + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gcov.h" + +/** + * struct gcov_node - represents a debugfs entry + * @list: list head for child node list + * @children: child nodes + * @all: list head for list of all nodes + * @parent: parent node + * @info: associated profiling data structure if not a directory + * @ghost: when an object file containing profiling data is unloaded we keep a + * copy of the profiling data here to allow collecting coverage data + * for cleanup code. Such a node is called a "ghost". + * @dentry: main debugfs entry, either a directory or data file + * @links: associated symbolic links + * @name: data file basename + * + * struct gcov_node represents an entity within the gcov/ subdirectory + * of debugfs. There are directory and data file nodes. The latter represent + * the actual synthesized data file plus any associated symbolic links which + * are needed by the gcov tool to work correctly. + */ +struct gcov_node { + struct list_head list; + struct list_head children; + struct list_head all; + struct gcov_node *parent; + struct gcov_info *info; + struct gcov_info *ghost; + struct dentry *dentry; + struct dentry **links; + char name[0]; +}; + +static const char objtree[] = OBJTREE; +static const char srctree[] = SRCTREE; +static struct gcov_node root_node; +static struct dentry *reset_dentry; +static LIST_HEAD(all_head); +static DEFINE_MUTEX(node_lock); + +/* If non-zero, keep copies of profiling data for unloaded modules. 
*/ +static int gcov_persist = 1; + +static int __init gcov_persist_setup(char *str) +{ + unsigned long val; + + if (strict_strtoul(str, 0, &val)) { + pr_warning("invalid gcov_persist parameter '%s'\n", str); + return 0; + } + gcov_persist = val; + pr_info("setting gcov_persist to %d\n", gcov_persist); + + return 1; +} +__setup("gcov_persist=", gcov_persist_setup); + +/* + * seq_file.start() implementation for gcov data files. Note that the + * gcov_iterator interface is designed to be more restrictive than seq_file + * (no start from arbitrary position, etc.), to simplify the iterator + * implementation. + */ +static void *gcov_seq_start(struct seq_file *seq, loff_t *pos) +{ + loff_t i; + + gcov_iter_start(seq->private); + for (i = 0; i < *pos; i++) { + if (gcov_iter_next(seq->private)) + return NULL; + } + return seq->private; +} + +/* seq_file.next() implementation for gcov data files. */ +static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos) +{ + struct gcov_iterator *iter = data; + + if (gcov_iter_next(iter)) + return NULL; + (*pos)++; + + return iter; +} + +/* seq_file.show() implementation for gcov data files. */ +static int gcov_seq_show(struct seq_file *seq, void *data) +{ + struct gcov_iterator *iter = data; + + if (gcov_iter_write(iter, seq)) + return -EINVAL; + return 0; +} + +static void gcov_seq_stop(struct seq_file *seq, void *data) +{ + /* Unused. */ +} + +static const struct seq_operations gcov_seq_ops = { + .start = gcov_seq_start, + .next = gcov_seq_next, + .show = gcov_seq_show, + .stop = gcov_seq_stop, +}; + +/* + * Return the profiling data set for a given node. This can either be the + * original profiling data structure or a duplicate (also called "ghost") + * in case the associated object file has been unloaded. + */ +static struct gcov_info *get_node_info(struct gcov_node *node) +{ + if (node->info) + return node->info; + + return node->ghost; +} + +/* + * open() implementation for gcov data files. Create a copy of the profiling + * data set and initialize the iterator and seq_file interface. + */ +static int gcov_seq_open(struct inode *inode, struct file *file) +{ + struct gcov_node *node = inode->i_private; + struct gcov_iterator *iter; + struct seq_file *seq; + struct gcov_info *info; + int rc = -ENOMEM; + + mutex_lock(&node_lock); + /* + * Read from a profiling data copy to minimize reference tracking + * complexity and concurrent access. + */ + info = gcov_info_dup(get_node_info(node)); + if (!info) + goto out_unlock; + iter = gcov_iter_new(info); + if (!iter) + goto err_free_info; + rc = seq_open(file, &gcov_seq_ops); + if (rc) + goto err_free_iter_info; + seq = file->private_data; + seq->private = iter; +out_unlock: + mutex_unlock(&node_lock); + return rc; + +err_free_iter_info: + gcov_iter_free(iter); +err_free_info: + gcov_info_free(info); + goto out_unlock; +} + +/* + * release() implementation for gcov data files. Release resources allocated + * by open(). + */ +static int gcov_seq_release(struct inode *inode, struct file *file) +{ + struct gcov_iterator *iter; + struct gcov_info *info; + struct seq_file *seq; + + seq = file->private_data; + iter = seq->private; + info = gcov_iter_get_info(iter); + gcov_iter_free(iter); + gcov_info_free(info); + seq_release(inode, file); + + return 0; +} + +/* + * Find a node by the associated data file name. Needs to be called with + * node_lock held. 
+ */ +static struct gcov_node *get_node_by_name(const char *name) +{ + struct gcov_node *node; + struct gcov_info *info; + + list_for_each_entry(node, &all_head, all) { + info = get_node_info(node); + if (info && (strcmp(info->filename, name) == 0)) + return node; + } + + return NULL; +} + +static void remove_node(struct gcov_node *node); + +/* + * write() implementation for gcov data files. Reset profiling data for the + * associated file. If the object file has been unloaded (i.e. this is + * a "ghost" node), remove the debug fs node as well. + */ +static ssize_t gcov_seq_write(struct file *file, const char __user *addr, + size_t len, loff_t *pos) +{ + struct seq_file *seq; + struct gcov_info *info; + struct gcov_node *node; + + seq = file->private_data; + info = gcov_iter_get_info(seq->private); + mutex_lock(&node_lock); + node = get_node_by_name(info->filename); + if (node) { + /* Reset counts or remove node for unloaded modules. */ + if (node->ghost) + remove_node(node); + else + gcov_info_reset(node->info); + } + /* Reset counts for open file. */ + gcov_info_reset(info); + mutex_unlock(&node_lock); + + return len; +} + +/* + * Given a string representing a file path of format: + * path/to/file.gcda + * construct and return a new string: + * <dir>/path/to/file.<ext> + */ +static char *link_target(const char *dir, const char *path, const char *ext) +{ + char *target; + char *old_ext; + char *copy; + + copy = kstrdup(path, GFP_KERNEL); + if (!copy) + return NULL; + old_ext = strrchr(copy, '.'); + if (old_ext) + *old_ext = '\0'; + if (dir) + target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext); + else + target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext); + kfree(copy); + + return target; +} + +/* + * Construct a string representing the symbolic link target for the given + * gcov data file name and link type. Depending on the link type and the + * location of the data file, the link target can point to a subdirectory + * of srctree, of objtree, or to an external location. + */ +static char *get_link_target(const char *filename, const struct gcov_link *ext) +{ + const char *rel; + char *result; + + if (strncmp(filename, objtree, strlen(objtree)) == 0) { + rel = filename + strlen(objtree) + 1; + if (ext->dir == SRC_TREE) + result = link_target(srctree, rel, ext->ext); + else + result = link_target(objtree, rel, ext->ext); + } else { + /* External compilation. */ + result = link_target(NULL, filename, ext->ext); + } + + return result; +} + +#define SKEW_PREFIX ".tmp_" + +/* + * For a filename .tmp_filename.ext return filename.ext. Needed to compensate + * for filename skewing caused by the mod-versioning mechanism. + */ +static const char *deskew(const char *basename) +{ + if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0) + return basename + sizeof(SKEW_PREFIX) - 1; + return basename; +} + +/* + * Create links to additional files (usually .c and .gcno files) which the + * gcov tool expects to find in the same directory as the gcov data file. + */ +static void add_links(struct gcov_node *node, struct dentry *parent) +{ + char *basename; + char *target; + int num; + int i; + + for (num = 0; gcov_link[num].ext; num++) + /* Nothing. 
*/; + node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL); + if (!node->links) + return; + for (i = 0; i < num; i++) { + target = get_link_target(get_node_info(node)->filename, + &gcov_link[i]); + if (!target) + goto out_err; + basename = strrchr(target, '/'); + if (!basename) + goto out_err; + basename++; + node->links[i] = debugfs_create_symlink(deskew(basename), + parent, target); + if (!node->links[i]) + goto out_err; + kfree(target); + } + + return; +out_err: + kfree(target); + while (i-- > 0) + debugfs_remove(node->links[i]); + kfree(node->links); + node->links = NULL; +} + +static const struct file_operations gcov_data_fops = { + .open = gcov_seq_open, + .release = gcov_seq_release, + .read = seq_read, + .llseek = seq_lseek, + .write = gcov_seq_write, +}; + +/* Basic initialization of a new node. */ +static void init_node(struct gcov_node *node, struct gcov_info *info, + const char *name, struct gcov_node *parent) +{ + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->children); + INIT_LIST_HEAD(&node->all); + node->info = info; + node->parent = parent; + if (name) + strcpy(node->name, name); +} + +/* + * Create a new node and associated debugfs entry. Needs to be called with + * node_lock held. + */ +static struct gcov_node *new_node(struct gcov_node *parent, + struct gcov_info *info, const char *name) +{ + struct gcov_node *node; + + node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); + if (!node) { + pr_warning("out of memory\n"); + return NULL; + } + init_node(node, info, name, parent); + /* Differentiate between gcov data file nodes and directory nodes. */ + if (info) { + node->dentry = debugfs_create_file(deskew(node->name), 0600, + parent->dentry, node, &gcov_data_fops); + } else + node->dentry = debugfs_create_dir(node->name, parent->dentry); + if (!node->dentry) { + pr_warning("could not create file\n"); + kfree(node); + return NULL; + } + if (info) + add_links(node, parent->dentry); + list_add(&node->list, &parent->children); + list_add(&node->all, &all_head); + + return node; +} + +/* Remove symbolic links associated with node. */ +static void remove_links(struct gcov_node *node) +{ + int i; + + if (!node->links) + return; + for (i = 0; gcov_link[i].ext; i++) + debugfs_remove(node->links[i]); + kfree(node->links); + node->links = NULL; +} + +/* + * Remove node from all lists and debugfs and release associated resources. + * Needs to be called with node_lock held. + */ +static void release_node(struct gcov_node *node) +{ + list_del(&node->list); + list_del(&node->all); + debugfs_remove(node->dentry); + remove_links(node); + if (node->ghost) + gcov_info_free(node->ghost); + kfree(node); +} + +/* Release node and empty parents. Needs to be called with node_lock held. */ +static void remove_node(struct gcov_node *node) +{ + struct gcov_node *parent; + + while ((node != &root_node) && list_empty(&node->children)) { + parent = node->parent; + release_node(node); + node = parent; + } +} + +/* + * Find child node with given basename. Needs to be called with node_lock + * held. + */ +static struct gcov_node *get_child_by_name(struct gcov_node *parent, + const char *name) +{ + struct gcov_node *node; + + list_for_each_entry(node, &parent->children, list) { + if (strcmp(node->name, name) == 0) + return node; + } + + return NULL; +} + +/* + * write() implementation for reset file. Reset all profiling data to zero + * and remove ghost nodes. 
+ */ +static ssize_t reset_write(struct file *file, const char __user *addr, + size_t len, loff_t *pos) +{ + struct gcov_node *node; + + mutex_lock(&node_lock); +restart: + list_for_each_entry(node, &all_head, all) { + if (node->info) + gcov_info_reset(node->info); + else if (list_empty(&node->children)) { + remove_node(node); + /* Several nodes may have gone - restart loop. */ + goto restart; + } + } + mutex_unlock(&node_lock); + + return len; +} + +/* read() implementation for reset file. Unused. */ +static ssize_t reset_read(struct file *file, char __user *addr, size_t len, + loff_t *pos) +{ + /* Allow read operation so that a recursive copy won't fail. */ + return 0; +} + +static const struct file_operations gcov_reset_fops = { + .write = reset_write, + .read = reset_read, +}; + +/* + * Create a node for a given profiling data set and add it to all lists and + * debugfs. Needs to be called with node_lock held. + */ +static void add_node(struct gcov_info *info) +{ + char *filename; + char *curr; + char *next; + struct gcov_node *parent; + struct gcov_node *node; + + filename = kstrdup(info->filename, GFP_KERNEL); + if (!filename) + return; + parent = &root_node; + /* Create directory nodes along the path. */ + for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) { + if (curr == next) + continue; + *next = 0; + if (strcmp(curr, ".") == 0) + continue; + if (strcmp(curr, "..") == 0) { + if (!parent->parent) + goto err_remove; + parent = parent->parent; + continue; + } + node = get_child_by_name(parent, curr); + if (!node) { + node = new_node(parent, NULL, curr); + if (!node) + goto err_remove; + } + parent = node; + } + /* Create file node. */ + node = new_node(parent, info, curr); + if (!node) + goto err_remove; +out: + kfree(filename); + return; + +err_remove: + remove_node(parent); + goto out; +} + +/* + * The profiling data set associated with this node is being unloaded. Store a + * copy of the profiling data and turn this node into a "ghost". + */ +static int ghost_node(struct gcov_node *node) +{ + node->ghost = gcov_info_dup(node->info); + if (!node->ghost) { + pr_warning("could not save data for '%s' (out of memory)\n", + node->info->filename); + return -ENOMEM; + } + node->info = NULL; + + return 0; +} + +/* + * Profiling data for this node has been loaded again. Add profiling data + * from previous instantiation and turn this node into a regular node. + */ +static void revive_node(struct gcov_node *node, struct gcov_info *info) +{ + if (gcov_info_is_compatible(node->ghost, info)) + gcov_info_add(info, node->ghost); + else { + pr_warning("discarding saved data for '%s' (version changed)\n", + info->filename); + } + gcov_info_free(node->ghost); + node->ghost = NULL; + node->info = info; +} + +/* + * Callback to create/remove profiling files when code compiled with + * -fprofile-arcs is loaded/unloaded. + */ +void gcov_event(enum gcov_action action, struct gcov_info *info) +{ + struct gcov_node *node; + + mutex_lock(&node_lock); + node = get_node_by_name(info->filename); + switch (action) { + case GCOV_ADD: + /* Add new node or revive ghost. */ + if (!node) { + add_node(info); + break; + } + if (gcov_persist) + revive_node(node, info); + else { + pr_warning("could not add '%s' (already exists)\n", + info->filename); + } + break; + case GCOV_REMOVE: + /* Remove node or turn into ghost. 
*/ + if (!node) { + pr_warning("could not remove '%s' (not found)\n", + info->filename); + break; + } + if (gcov_persist) { + if (!ghost_node(node)) + break; + } + remove_node(node); + break; + } + mutex_unlock(&node_lock); +} + +/* Create debugfs entries. */ +static __init int gcov_fs_init(void) +{ + int rc = -EIO; + + init_node(&root_node, NULL, NULL, NULL); + /* + * /sys/kernel/debug/gcov will be parent for the reset control file + * and all profiling files. + */ + root_node.dentry = debugfs_create_dir("gcov", NULL); + if (!root_node.dentry) + goto err_remove; + /* + * Create reset file which resets all profiling counts when written + * to. + */ + reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry, + NULL, &gcov_reset_fops); + if (!reset_dentry) + goto err_remove; + /* Replay previous events to get our fs hierarchy up-to-date. */ + gcov_enable_events(); + return 0; + +err_remove: + pr_err("init failed\n"); + if (root_node.dentry) + debugfs_remove(root_node.dentry); + + return rc; +} +device_initcall(gcov_fs_init); diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c new file mode 100644 index 00000000000..ae5bb426003 --- /dev/null +++ b/kernel/gcov/gcc_3_4.c @@ -0,0 +1,447 @@ +/* + * This code provides functions to handle gcc's profiling data format + * introduced with gcc 3.4. Future versions of gcc may change the gcov + * format (as happened before), so all format-specific information needs + * to be kept modular and easily exchangeable. + * + * This file is based on gcc-internal definitions. Functions and data + * structures are defined to be compatible with gcc counterparts. + * For a better understanding, refer to gcc source: gcc/gcov-io.h. + * + * Copyright IBM Corp. 2009 + * Author(s): Peter Oberparleiter + * + * Uses gcc-internal data definitions. + */ + +#include +#include +#include +#include +#include +#include "gcov.h" + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/* + * Determine whether a counter is active. Based on gcc magic. Doesn't change + * at run-time. + */ +static int counter_active(struct gcov_info *info, unsigned int type) +{ + return (1 << type) & info->ctr_mask; +} + +/* Determine number of active counters. Based on gcc magic. */ +static unsigned int num_counter_active(struct gcov_info *info) +{ + unsigned int i; + unsigned int result = 0; + + for (i = 0; i < GCOV_COUNTERS; i++) { + if (counter_active(info, i)) + result++; + } + return result; +} + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + unsigned int active = num_counter_active(info); + unsigned int i; + + for (i = 0; i < active; i++) { + memset(info->counts[i].values, 0, + info->counts[i].num * sizeof(gcov_type)); + } +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + return (info1->stamp == info2->stamp); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. 
+ */ +void gcov_info_add(struct gcov_info *dest, struct gcov_info *source) +{ + unsigned int i; + unsigned int j; + + for (i = 0; i < num_counter_active(dest); i++) { + for (j = 0; j < dest->counts[i].num; j++) { + dest->counts[i].values[j] += + source->counts[i].values[j]; + } + } +} + +/* Get size of function info entry. Based on gcc magic. */ +static size_t get_fn_size(struct gcov_info *info) +{ + size_t size; + + size = sizeof(struct gcov_fn_info) + num_counter_active(info) * + sizeof(unsigned int); + if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int)) + size = ALIGN(size, __alignof__(struct gcov_fn_info)); + return size; +} + +/* Get address of function info entry. Based on gcc magic. */ +static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn) +{ + return (struct gcov_fn_info *) + ((char *) info->functions + fn * get_fn_size(info)); +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. + */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + unsigned int i; + unsigned int active; + + /* Duplicate gcov_info. */ + active = num_counter_active(info); + dup = kzalloc(sizeof(struct gcov_info) + + sizeof(struct gcov_ctr_info) * active, GFP_KERNEL); + if (!dup) + return NULL; + dup->version = info->version; + dup->stamp = info->stamp; + dup->n_functions = info->n_functions; + dup->ctr_mask = info->ctr_mask; + /* Duplicate filename. */ + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err_free; + /* Duplicate table of functions. */ + dup->functions = kmemdup(info->functions, info->n_functions * + get_fn_size(info), GFP_KERNEL); + if (!dup->functions) + goto err_free; + /* Duplicate counter arrays. */ + for (i = 0; i < active ; i++) { + struct gcov_ctr_info *ctr = &info->counts[i]; + size_t size = ctr->num * sizeof(gcov_type); + + dup->counts[i].num = ctr->num; + dup->counts[i].merge = ctr->merge; + dup->counts[i].values = vmalloc(size); + if (!dup->counts[i].values) + goto err_free; + memcpy(dup->counts[i].values, ctr->values, size); + } + return dup; + +err_free: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + unsigned int active = num_counter_active(info); + unsigned int i; + + for (i = 0; i < active ; i++) + vfree(info->counts[i].values); + kfree(info->functions); + kfree(info->filename); + kfree(info); +} + +/** + * struct type_info - iterator helper array + * @ctr_type: counter type + * @offset: index of the first value of the current function for this type + * + * This array is needed to convert the in-memory data format into the in-file + * data format: + * + * In-memory: + * for each counter type + * for each function + * values + * + * In-file: + * for each function + * for each counter type + * values + * + * See gcc source gcc/gcov-io.h for more information on data organization. 
+ */ +struct type_info { + int ctr_type; + unsigned int offset; +}; + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @record: record type + * @function: function number + * @type: counter type + * @count: index into values array + * @num_types: number of counter types + * @type_info: helper array to get values-array offset for current function + */ +struct gcov_iterator { + struct gcov_info *info; + + int record; + unsigned int function; + unsigned int type; + unsigned int count; + + int num_types; + struct type_info type_info[0]; +}; + +static struct gcov_fn_info *get_func(struct gcov_iterator *iter) +{ + return get_fn_info(iter->info, iter->function); +} + +static struct type_info *get_type(struct gcov_iterator *iter) +{ + return &iter->type_info[iter->type]; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator) + + num_counter_active(info) * sizeof(struct type_info), + GFP_KERNEL); + if (iter) + iter->info = info; + + return iter; +} + +/** + * gcov_iter_free - release memory for iterator + * @iter: file iterator to free + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + int i; + + iter->record = 0; + iter->function = 0; + iter->type = 0; + iter->count = 0; + iter->num_types = 0; + for (i = 0; i < GCOV_COUNTERS; i++) { + if (counter_active(iter->info, i)) { + iter->type_info[iter->num_types].ctr_type = i; + iter->type_info[iter->num_types++].offset = 0; + } + } +} + +/* Mapping of logical record number to actual file content. */ +#define RECORD_FILE_MAGIC 0 +#define RECORD_GCOV_VERSION 1 +#define RECORD_TIME_STAMP 2 +#define RECORD_FUNCTION_TAG 3 +#define RECORD_FUNCTON_TAG_LEN 4 +#define RECORD_FUNCTION_IDENT 5 +#define RECORD_FUNCTION_CHECK 6 +#define RECORD_COUNT_TAG 7 +#define RECORD_COUNT_LEN 8 +#define RECORD_COUNT 9 + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. 
+ */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + switch (iter->record) { + case RECORD_FILE_MAGIC: + case RECORD_GCOV_VERSION: + case RECORD_FUNCTION_TAG: + case RECORD_FUNCTON_TAG_LEN: + case RECORD_FUNCTION_IDENT: + case RECORD_COUNT_TAG: + /* Advance to next record */ + iter->record++; + break; + case RECORD_COUNT: + /* Advance to next count */ + iter->count++; + /* fall through */ + case RECORD_COUNT_LEN: + if (iter->count < get_func(iter)->n_ctrs[iter->type]) { + iter->record = 9; + break; + } + /* Advance to next counter type */ + get_type(iter)->offset += iter->count; + iter->count = 0; + iter->type++; + /* fall through */ + case RECORD_FUNCTION_CHECK: + if (iter->type < iter->num_types) { + iter->record = 7; + break; + } + /* Advance to next function */ + iter->type = 0; + iter->function++; + /* fall through */ + case RECORD_TIME_STAMP: + if (iter->function < iter->info->n_functions) + iter->record = 3; + else + iter->record = -1; + break; + } + /* Check for EOF. */ + if (iter->record == -1) + return -EINVAL; + else + return 0; +} + +/** + * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file + * @seq: seq_file handle + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. + */ +static int seq_write_gcov_u32(struct seq_file *seq, u32 v) +{ + return seq_write(seq, &v, sizeof(v)); +} + +/** + * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file + * @seq: seq_file handle + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. + */ +static int seq_write_gcov_u64(struct seq_file *seq, u64 v) +{ + u32 data[2]; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + return seq_write(seq, data, sizeof(data)); +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + int rc = -EINVAL; + + switch (iter->record) { + case RECORD_FILE_MAGIC: + rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC); + break; + case RECORD_GCOV_VERSION: + rc = seq_write_gcov_u32(seq, iter->info->version); + break; + case RECORD_TIME_STAMP: + rc = seq_write_gcov_u32(seq, iter->info->stamp); + break; + case RECORD_FUNCTION_TAG: + rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); + break; + case RECORD_FUNCTON_TAG_LEN: + rc = seq_write_gcov_u32(seq, 2); + break; + case RECORD_FUNCTION_IDENT: + rc = seq_write_gcov_u32(seq, get_func(iter)->ident); + break; + case RECORD_FUNCTION_CHECK: + rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); + break; + case RECORD_COUNT_TAG: + rc = seq_write_gcov_u32(seq, + GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type)); + break; + case RECORD_COUNT_LEN: + rc = seq_write_gcov_u32(seq, + get_func(iter)->n_ctrs[iter->type] * 2); + break; + case RECORD_COUNT: + rc = seq_write_gcov_u64(seq, + iter->info->counts[iter->type]. + values[iter->count + get_type(iter)->offset]); + break; + } + return rc; +} diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h new file mode 100644 index 00000000000..060073ebf7a --- /dev/null +++ b/kernel/gcov/gcov.h @@ -0,0 +1,128 @@ +/* + * Profiling infrastructure declarations. 
+ * + * This file is based on gcc-internal definitions. Data structures are + * defined to be compatible with gcc counterparts. For a better + * understanding, refer to gcc source: gcc/gcov-io.h. + * + * Copyright IBM Corp. 2009 + * Author(s): Peter Oberparleiter + * + * Uses gcc-internal data definitions. + */ + +#ifndef GCOV_H +#define GCOV_H GCOV_H + +#include + +/* + * Profiling data types used for gcc 3.4 and above - these are defined by + * gcc and need to be kept as close to the original definition as possible to + * remain compatible. + */ +#define GCOV_COUNTERS 5 +#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) +#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) +#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) +#define GCOV_TAG_FOR_COUNTER(count) \ + (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17)) + +#if BITS_PER_LONG >= 64 +typedef long gcov_type; +#else +typedef long long gcov_type; +#endif + +/** + * struct gcov_fn_info - profiling meta data per function + * @ident: object file-unique function identifier + * @checksum: function checksum + * @n_ctrs: number of values per counter type belonging to this function + * + * This data is generated by gcc during compilation and doesn't change + * at run-time. + */ +struct gcov_fn_info { + unsigned int ident; + unsigned int checksum; + unsigned int n_ctrs[0]; +}; + +/** + * struct gcov_ctr_info - profiling data per counter type + * @num: number of counter values for this type + * @values: array of counter values for this type + * @merge: merge function for counter values of this type (unused) + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the values array. + */ +struct gcov_ctr_info { + unsigned int num; + gcov_type *values; + void (*merge)(gcov_type *, unsigned int); +}; + +/** + * struct gcov_info - profiling data per object file + * @version: gcov version magic indicating the gcc version used for compilation + * @next: list head for a singly-linked list + * @stamp: time stamp + * @filename: name of the associated gcov data file + * @n_functions: number of instrumented functions + * @functions: function data + * @ctr_mask: mask specifying which counter types are active + * @counts: counter data per counter type + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the next pointer. + */ +struct gcov_info { + unsigned int version; + struct gcov_info *next; + unsigned int stamp; + const char *filename; + unsigned int n_functions; + const struct gcov_fn_info *functions; + unsigned int ctr_mask; + struct gcov_ctr_info counts[0]; +}; + +/* Base interface. */ +enum gcov_action { + GCOV_ADD, + GCOV_REMOVE, +}; + +void gcov_event(enum gcov_action action, struct gcov_info *info); +void gcov_enable_events(void); + +/* Iterator control. */ +struct seq_file; +struct gcov_iterator; + +struct gcov_iterator *gcov_iter_new(struct gcov_info *info); +void gcov_iter_free(struct gcov_iterator *iter); +void gcov_iter_start(struct gcov_iterator *iter); +int gcov_iter_next(struct gcov_iterator *iter); +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq); +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter); + +/* gcov_info control. 
*/ +void gcov_info_reset(struct gcov_info *info); +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2); +void gcov_info_add(struct gcov_info *dest, struct gcov_info *source); +struct gcov_info *gcov_info_dup(struct gcov_info *info); +void gcov_info_free(struct gcov_info *info); + +struct gcov_link { + enum { + OBJ_TREE, + SRC_TREE, + } dir; + const char *ext; +}; +extern const struct gcov_link gcov_link[]; + +#endif /* GCOV_H */ diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 2b706617c89..7a7778746ea 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -116,6 +116,17 @@ _a_flags = $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(KBUILD_SUBDIR_ASFLAGS) \ $(asflags-y) $(AFLAGS_$(basetarget).o) _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F)) +# +# Enable gcov profiling flags for a file, directory or for all files depending +# on variables GCOV_PROFILE_obj.o, GCOV_PROFILE and CONFIG_GCOV_PROFILE_ALL +# (in this order) +# +ifeq ($(CONFIG_GCOV_KERNEL),y) +_c_flags += $(if $(patsubst n%,, \ + $(GCOV_PROFILE_$(basetarget).o)$(GCOV_PROFILE)$(CONFIG_GCOV_PROFILE_ALL)), \ + $(CFLAGS_GCOV)) +endif + # If building the kernel in a separate objtree expand all occurrences # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/'). -- cgit v1.2.3-70-g09d2 From c8a06c1ef0bc45915fc45a170c7c60426971304c Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 17 Jun 2009 16:28:15 -0700 Subject: w1-gpio: add external pull-up enable callback On embedded devices, sleep mode conditions can be tricky to handle, especially when processors tend to pull down the w1 bus during sleep. Bus slaves (such as the ds2760) may interpret this as a power-down condition and switch the device off entirely. This patch adds a callback function pointer to let users switch the external pull-up resistor on and off. This lets the outside world know whether the processor is currently actively driving the bus or not. When this callback is not provided, the code behaviour won't change. 
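For illustration, a board file could hook up the new callback roughly as follows; the GPIO numbers and names are invented for the example:

	/* Hypothetical board support sketch -- pin numbers are examples. */
	#include <linux/gpio.h>
	#include <linux/w1-gpio.h>

	#define MY_W1_PIN		37	/* made-up w1 bus GPIO */
	#define MY_W1_PULLUP_PIN	38	/* made-up pull-up enable GPIO */

	static void my_w1_enable_pullup(int enable)
	{
		gpio_set_value(MY_W1_PULLUP_PIN, enable);
	}

	static struct w1_gpio_platform_data my_w1_pdata = {
		.pin			= MY_W1_PIN,
		.is_open_drain		= 0,
		.enable_external_pullup	= my_w1_enable_pullup,
	};
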
Signed-off-by: Daniel Mack Acked-by: Ville Syrjala Acked-by: Evgeniy Polyakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/w1/masters/w1-gpio.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/w1-gpio.h | 1 + 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/drivers/w1/masters/w1-gpio.c b/drivers/w1/masters/w1-gpio.c index a411702413d..6f8866d6a90 100644 --- a/drivers/w1/masters/w1-gpio.c +++ b/drivers/w1/masters/w1-gpio.c @@ -74,6 +74,9 @@ static int __init w1_gpio_probe(struct platform_device *pdev) if (err) goto free_gpio; + if (pdata->enable_external_pullup) + pdata->enable_external_pullup(1); + platform_set_drvdata(pdev, master); return 0; @@ -91,6 +94,9 @@ static int __exit w1_gpio_remove(struct platform_device *pdev) struct w1_bus_master *master = platform_get_drvdata(pdev); struct w1_gpio_platform_data *pdata = pdev->dev.platform_data; + if (pdata->enable_external_pullup) + pdata->enable_external_pullup(0); + w1_remove_master_device(master); gpio_free(pdata->pin); kfree(master); @@ -98,12 +104,41 @@ static int __exit w1_gpio_remove(struct platform_device *pdev) return 0; } +#ifdef CONFIG_PM + +static int w1_gpio_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct w1_gpio_platform_data *pdata = pdev->dev.platform_data; + + if (pdata->enable_external_pullup) + pdata->enable_external_pullup(0); + + return 0; +} + +static int w1_gpio_resume(struct platform_device *pdev) +{ + struct w1_gpio_platform_data *pdata = pdev->dev.platform_data; + + if (pdata->enable_external_pullup) + pdata->enable_external_pullup(1); + + return 0; +} + +#else +#define w1_gpio_suspend NULL +#define w1_gpio_resume NULL +#endif + static struct platform_driver w1_gpio_driver = { .driver = { .name = "w1-gpio", .owner = THIS_MODULE, }, .remove = __exit_p(w1_gpio_remove), + .suspend = w1_gpio_suspend, + .resume = w1_gpio_resume, }; static int __init w1_gpio_init(void) diff --git a/include/linux/w1-gpio.h b/include/linux/w1-gpio.h index 9797fec7748..3adeff82212 100644 --- a/include/linux/w1-gpio.h +++ b/include/linux/w1-gpio.h @@ -18,6 +18,7 @@ struct w1_gpio_platform_data { unsigned int pin; unsigned int is_open_drain:1; + void (*enable_external_pullup)(int enable); }; #endif /* _LINUX_W1_GPIO_H */ -- cgit v1.2.3-70-g09d2 From eae9d2ba0cfc27a2ad9765f23efb98fb80d80234 Mon Sep 17 00:00:00 2001 From: Rodolfo Giometti Date: Wed, 17 Jun 2009 16:28:37 -0700 Subject: LinuxPPS: core support This patch adds the kernel side of the PPS support currently named "LinuxPPS". PPS means "pulse per second" and a PPS source is just a device which provides a high precision signal each second so that an application can use it to adjust system clock time. Common use is the combination of the NTPD as userland program with a GPS receiver as PPS source to obtain a wallclock-time with sub-millisecond synchronisation to UTC. To obtain this goal the userland programs should use the PPS API specification (RFC 2783 - Pulse-Per-Second API for UNIX-like Operating Systems, Version 1.0) which in part is implemented by this patch. It provides a set of char devices, one per PPS source, which can be used to get the time signal. The RFC's functions can be implemented by accessing these char devices. 
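As a minimal sketch of such access from userland (the PPS_GETCAP ioctl name is an assumption derived from the 'p' a1-a4 range this patch registers for linux/pps.h; check that header for the exact interface):

	/* Hedged userland sketch: query the capabilities of a PPS source. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/pps.h>

	int main(void)
	{
		int cap = 0;
		int fd = open("/dev/pps0", O_RDWR);

		if (fd < 0)
			return 1;
		if (ioctl(fd, PPS_GETCAP, &cap) == 0)	/* assumed ioctl name */
			printf("pps0 capabilities: 0x%x\n", cap);
		close(fd);
		return 0;
	}
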
Signed-off-by: Rodolfo Giometti Cc: David Woodhouse Cc: Greg KH Cc: Randy Dunlap Cc: Kay Sievers Acked-by: Alan Cox Cc: Michael Kerrisk Cc: Christoph Hellwig Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-pps | 73 ++++++++ Documentation/ioctl/ioctl-number.txt | 2 + Documentation/pps/pps.txt | 172 ++++++++++++++++++ MAINTAINERS | 7 + drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/pps/Kconfig | 33 ++++ drivers/pps/Makefile | 8 + drivers/pps/kapi.c | 329 +++++++++++++++++++++++++++++++++++ drivers/pps/pps.c | 312 +++++++++++++++++++++++++++++++++ drivers/pps/sysfs.c | 98 +++++++++++ include/linux/Kbuild | 1 + include/linux/pps.h | 122 +++++++++++++ include/linux/pps_kernel.h | 89 ++++++++++ 14 files changed, 1249 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-pps create mode 100644 Documentation/pps/pps.txt create mode 100644 drivers/pps/Kconfig create mode 100644 drivers/pps/Makefile create mode 100644 drivers/pps/kapi.c create mode 100644 drivers/pps/pps.c create mode 100644 drivers/pps/sysfs.c create mode 100644 include/linux/pps.h create mode 100644 include/linux/pps_kernel.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-pps b/Documentation/ABI/testing/sysfs-pps new file mode 100644 index 00000000000..25028c7bc37 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-pps @@ -0,0 +1,73 @@ +What: /sys/class/pps/ +Date: February 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/pps/ directory will contain files and + directories that will provide a unified interface to + the PPS sources. + +What: /sys/class/pps/ppsX/ +Date: February 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/pps/ppsX/ directory is related to the X-th + PPS source in the system. Each directory will + contain files to manage and control its PPS source. + +What: /sys/class/pps/ppsX/assert +Date: February 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/pps/ppsX/assert file reports the assert events + and the assert sequence number of the X-th source in the form: + + <secs>.<nsec>#<sequence> + + If the source has no assert events the content of this file + is empty. + +What: /sys/class/pps/ppsX/clear +Date: February 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/pps/ppsX/clear file reports the clear events + and the clear sequence number of the X-th source in the form: + + <secs>.<nsec>#<sequence> + + If the source has no clear events the content of this file + is empty. + +What: /sys/class/pps/ppsX/mode +Date: February 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/pps/ppsX/mode file reports the functioning + mode of the X-th source in hexadecimal encoding. + + Please refer to linux/include/linux/pps.h for further + info. + +What: /sys/class/pps/ppsX/echo +Date: February 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/pps/ppsX/echo file reports if the X-th source does + or does not support an "echo" function. + +What: /sys/class/pps/ppsX/name +Date: February 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/pps/ppsX/name file reports the name of the + X-th source. + +What: /sys/class/pps/ppsX/path +Date: February 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/pps/ppsX/path file reports the path name of + the device connected to the X-th source. + + If the source is not connected to any device the content + of this file is empty.
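As a worked example of the mode file's hexadecimal encoding: a hypothetical reading of 1133 would decode, using the bit definitions added by this patch in include/linux/pps.h, as PPS_CAPTUREASSERT | PPS_CAPTURECLEAR | PPS_OFFSETASSERT | PPS_OFFSETCLEAR | PPS_CANWAIT | PPS_TSFMT_TSPEC (0x01 | 0x02 | 0x10 | 0x20 | 0x100 | 0x1000).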
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 1f779a25c70..7bb0d934b6d 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -149,6 +149,8 @@ Code Seq# Include File Comments 'p' 40-7F linux/nvram.h 'p' 80-9F user-space parport +'p' a1-a4 linux/pps.h LinuxPPS + 'q' 00-1F linux/serio.h 'q' 80-FF Internet PhoneJACK, Internet LineJACK diff --git a/Documentation/pps/pps.txt b/Documentation/pps/pps.txt new file mode 100644 index 00000000000..125f4ab4899 --- /dev/null +++ b/Documentation/pps/pps.txt @@ -0,0 +1,172 @@ + + PPS - Pulse Per Second + ---------------------- + +(C) Copyright 2007 Rodolfo Giometti + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + + + +Overview +-------- + +LinuxPPS provides a programming interface (API) for defining several +PPS sources in the system. + +PPS means "pulse per second" and a PPS source is just a device which +provides a high precision signal each second so that an application +can use it to adjust system clock time. + +A PPS source can be connected to a serial port (usually to the Data +Carrier Detect pin) or to a parallel port (ACK-pin) or to special +CPU GPIOs (this is the common case in embedded systems), but in each +case when a new pulse arrives the system must apply a timestamp to it +and record it for userland. + +Common use is the combination of the NTPD as userland program, with a +GPS receiver as PPS source, to obtain a wallclock-time with +sub-millisecond synchronisation to UTC. + + +RFC considerations +------------------ + +While implementing a PPS API as RFC 2783 defines and using an embedded +CPU GPIO-Pin as the physical link to the signal, I encountered a deeper +problem: + + At startup it needs a file descriptor as argument for the function + time_pps_create(). + +This implies that the source has a /dev/... entry. This assumption is +OK for the serial and parallel ports, where you can do something +useful besides(!) the gathering of timestamps, which is the central +task of a PPS API. But this assumption does not work for a single- +purpose GPIO line. In this case even basic file-related functionality +(like read() and write()) makes no sense at all and should not be a +precondition for the use of a PPS API. + +The problem is simply solved if you consider that a PPS source is +not always connected to a GPS data source. + +So your programs should check if the GPS data source (the serial port +for instance) is a PPS source too, and if not, they should provide the +possibility to open another device as the PPS source. + +In LinuxPPS the PPS sources are simply char devices usually mapped +into files /dev/pps0, /dev/pps1, etc.
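A minimal sketch of that fallback in terms of the RFC 2783 userland API (time_pps_create() and pps_handle_t come from the RFC's timepps.h; the helper name and the device path are invented for the example):

    #include <fcntl.h>
    #include <timepps.h>

    /* Try the GPS serial port first; fall back to a dedicated PPS device. */
    static int open_pps_source(int gps_fd, pps_handle_t *handle)
    {
    	int fd;

    	if (time_pps_create(gps_fd, handle) >= 0)
    		return 0;	/* the serial port is a PPS source too */

    	fd = open("/dev/pps0", O_RDONLY);
    	if (fd < 0)
    		return -1;

    	return time_pps_create(fd, handle);
    }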
+ + +Coding example +-------------- + +To register a PPS source with the kernel you should define a struct +pps_source_info as follows: + + static struct pps_source_info pps_ktimer_info = { + .name = "ktimer", + .path = "", + .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | \ + PPS_ECHOASSERT | \ + PPS_CANWAIT | PPS_TSFMT_TSPEC, + .echo = pps_ktimer_echo, + .owner = THIS_MODULE, + }; + +and then call the function pps_register_source() in your +initialization routine as follows: + + source = pps_register_source(&pps_ktimer_info, + PPS_CAPTUREASSERT | PPS_OFFSETASSERT); + +The pps_register_source() prototype is: + + int pps_register_source(struct pps_source_info *info, int default_params) + +where "info" is a pointer to a structure that describes a particular +PPS source, "default_params" tells the system what the initial default +parameters for the device should be (obviously these parameters +must be a subset of the ones defined in struct +pps_source_info, which describes the capabilities of the driver). + +Once you have registered a new PPS source into the system you can +signal an assert event (for example in the interrupt handler routine) +just using: + + pps_event(source, &ts, PPS_CAPTUREASSERT, ptr) + +where "ts" is the event's timestamp. + +The same function may also run the defined echo function +(pps_ktimer_echo(), passing it the "ptr" pointer) if the user +asked for that. + +Please see the file drivers/pps/clients/ktimer.c for example code. + + +SYSFS support +------------- + +If the SYSFS filesystem is enabled in the kernel it provides a new class: + + $ ls /sys/class/pps/ + pps0/ pps1/ pps2/ + +Each directory is the ID of a PPS source defined in the system, and +inside it you find several files: + + $ ls /sys/class/pps/pps0/ + assert clear echo mode name path subsystem@ uevent + +Inside each "assert" and "clear" file you can find the timestamp and a +sequence number: + + $ cat /sys/class/pps/pps0/assert + 1170026870.983207967#8 + +The part before the "#" is the timestamp in seconds; the part after it +is the sequence number. Other files are: + +* echo: reports if the PPS source has an echo function or not; + +* mode: reports available PPS functioning modes; + +* name: reports the PPS source's name; + +* path: reports the PPS source's device path, that is the device the + PPS source is connected to (if it exists). + + +Testing the PPS support +----------------------- + +In order to test the PPS support even without specific hardware you can use +the ktimer driver (see the client subsection in the PPS configuration menu) +and the userland tools provided in the Documentation/pps/ directory. + +Once you have enabled the compilation of ktimer just modprobe it (if +not statically compiled): + + # modprobe ktimer + +and then run ppstest as follows: + + $ ./ppstest /dev/pps1 + trying PPS source "/dev/pps1" + found PPS source "/dev/pps1" + ok, found 1 source(s), now start fetching data... + source 0 - assert 1186592699.388832443, sequence: 364 - clear 0.000000000, sequence: 0 + source 0 - assert 1186592700.388931295, sequence: 365 - clear 0.000000000, sequence: 0 + source 0 - assert 1186592701.389032765, sequence: 366 - clear 0.000000000, sequence: 0 + +Please note that to compile userland programs you need the file timepps.h +(see Documentation/pps/).
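Tying the testing section back to the coding example, here is a hedged sketch of what a ktimer-like client's event path could look like (the function names and the timer plumbing are assumptions of the sketch; drivers/pps/clients/ktimer.c is the authoritative example):

    #include <linux/timer.h>
    #include <linux/time.h>
    #include <linux/pps_kernel.h>

    static int source;			/* ID returned by pps_register_source() */
    static struct timer_list ktimer;

    /* Hypothetical echo function: just log what we were asked to echo. */
    static void pps_ktimer_echo(int src, int event, void *data)
    {
    	pr_info("ktimer: echo %s for source %d\n",
    		event & PPS_CAPTUREASSERT ? "assert" : "clear", src);
    }

    /* Hypothetical timer handler: signal an assert event once per second. */
    static void pps_ktimer_event(unsigned long ptr)
    {
    	struct pps_ktime ts = { .flags = 0 };
    	struct timespec now;

    	getnstimeofday(&now);
    	ts.sec = now.tv_sec;
    	ts.nsec = now.tv_nsec;

    	pps_event(source, &ts, PPS_CAPTUREASSERT, NULL);

    	mod_timer(&ktimer, jiffies + HZ);
    }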
diff --git a/MAINTAINERS b/MAINTAINERS index 5b7a08d88e4..035df9d2660 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4577,6 +4577,13 @@ S: Maintained F: drivers/net/pppol2tp.c F: include/linux/if_pppol2tp.h +PPS SUPPORT +P: Rodolfo Giometti +M: giometti@enneenne.com +W: http://wiki.enneenne.com/index.php/LinuxPPS_support +L: linuxpps@ml.enneenne.com (subscribers-only) +S: Maintained + PREEMPTIBLE KERNEL P: Robert Love M: rml@tech9.net diff --git a/drivers/Kconfig b/drivers/Kconfig index a442c8f29fc..48bbdbe43e6 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -52,6 +52,8 @@ source "drivers/i2c/Kconfig" source "drivers/spi/Kconfig" +source "drivers/pps/Kconfig" + source "drivers/gpio/Kconfig" source "drivers/w1/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 00b44f4ccf0..bc4205d2fc3 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -72,6 +72,7 @@ obj-$(CONFIG_INPUT) += input/ obj-$(CONFIG_I2O) += message/ obj-$(CONFIG_RTC_LIB) += rtc/ obj-y += i2c/ media/ +obj-$(CONFIG_PPS) += pps/ obj-$(CONFIG_W1) += w1/ obj-$(CONFIG_POWER_SUPPLY) += power/ obj-$(CONFIG_HWMON) += hwmon/ diff --git a/drivers/pps/Kconfig b/drivers/pps/Kconfig new file mode 100644 index 00000000000..cc2eb8edb51 --- /dev/null +++ b/drivers/pps/Kconfig @@ -0,0 +1,33 @@ +# +# PPS support configuration +# + +menu "PPS support" + +config PPS + tristate "PPS support" + depends on EXPERIMENTAL + ---help--- + PPS (Pulse Per Second) is a special pulse provided by some GPS + antennae. Userland can use it to get a high-precision time + reference. + + Some antennae's PPS signals are connected with the CD (Carrier + Detect) pin of the serial line they use to communicate with the + host. In this case use the SERIAL_LINE client support. + + Some antennae's PPS signals are connected with some special host + inputs so you have to enable the corresponding client support. + + To compile this driver as a module, choose M here: the module + will be called pps_core.ko. + +config PPS_DEBUG + bool "PPS debugging messages" + depends on PPS + help + Say Y here if you want the PPS support to produce a bunch of debug + messages to the system log. Select this if you are having a + problem with PPS support and want to see more of what is going on. + +endmenu diff --git a/drivers/pps/Makefile b/drivers/pps/Makefile new file mode 100644 index 00000000000..19ea582f431 --- /dev/null +++ b/drivers/pps/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the PPS core. +# + +pps_core-y := pps.o kapi.o sysfs.o +obj-$(CONFIG_PPS) := pps_core.o + +ccflags-$(CONFIG_PPS_DEBUG) := -DDEBUG diff --git a/drivers/pps/kapi.c b/drivers/pps/kapi.c new file mode 100644 index 00000000000..35a0b192d76 --- /dev/null +++ b/drivers/pps/kapi.c @@ -0,0 +1,329 @@ +/* + * kernel API + * + * + * Copyright (C) 2005-2009 Rodolfo Giometti + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Global variables + */ + +DEFINE_SPINLOCK(pps_idr_lock); +DEFINE_IDR(pps_idr); + +/* + * Local functions + */ + +static void pps_add_offset(struct pps_ktime *ts, struct pps_ktime *offset) +{ + ts->nsec += offset->nsec; + while (ts->nsec >= NSEC_PER_SEC) { + ts->nsec -= NSEC_PER_SEC; + ts->sec++; + } + while (ts->nsec < 0) { + ts->nsec += NSEC_PER_SEC; + ts->sec--; + } + ts->sec += offset->sec; +} + +/* + * Exported functions + */ + +/* pps_get_source - find a PPS source + * @source: the PPS source ID. + * + * This function is used to find an already registered PPS source into the + * system. + * + * The function returns NULL if found nothing, otherwise it returns a pointer + * to the PPS source data struct (the refcounter is incremented by 1). + */ + +struct pps_device *pps_get_source(int source) +{ + struct pps_device *pps; + unsigned long flags; + + spin_lock_irqsave(&pps_idr_lock, flags); + + pps = idr_find(&pps_idr, source); + if (pps != NULL) + atomic_inc(&pps->usage); + + spin_unlock_irqrestore(&pps_idr_lock, flags); + + return pps; +} + +/* pps_put_source - free the PPS source data + * @pps: a pointer to the PPS source. + * + * This function is used to free a PPS data struct if its refcount is 0. + */ + +void pps_put_source(struct pps_device *pps) +{ + unsigned long flags; + + spin_lock_irqsave(&pps_idr_lock, flags); + BUG_ON(atomic_read(&pps->usage) == 0); + + if (!atomic_dec_and_test(&pps->usage)) { + pps = NULL; + goto exit; + } + + /* No more reference to the PPS source. We can safely remove the + * PPS data struct. + */ + idr_remove(&pps_idr, pps->id); + +exit: + spin_unlock_irqrestore(&pps_idr_lock, flags); + kfree(pps); +} + +/* pps_register_source - add a PPS source in the system + * @info: the PPS info struct + * @default_params: the default PPS parameters of the new source + * + * This function is used to add a new PPS source in the system. The new + * source is described by info's fields and it will have, as default PPS + * parameters, the ones specified into default_params. + * + * The function returns, in case of success, the PPS source ID. + */ + +int pps_register_source(struct pps_source_info *info, int default_params) +{ + struct pps_device *pps; + int id; + int err; + + /* Sanity checks */ + if ((info->mode & default_params) != default_params) { + printk(KERN_ERR "pps: %s: unsupported default parameters\n", + info->name); + err = -EINVAL; + goto pps_register_source_exit; + } + if ((info->mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) != 0 && + info->echo == NULL) { + printk(KERN_ERR "pps: %s: echo function is not defined\n", + info->name); + err = -EINVAL; + goto pps_register_source_exit; + } + if ((info->mode & (PPS_TSFMT_TSPEC | PPS_TSFMT_NTPFP)) == 0) { + printk(KERN_ERR "pps: %s: unspecified time format\n", + info->name); + err = -EINVAL; + goto pps_register_source_exit; + } + + /* Allocate memory for the new PPS source struct */ + pps = kzalloc(sizeof(struct pps_device), GFP_KERNEL); + if (pps == NULL) { + err = -ENOMEM; + goto pps_register_source_exit; + } + + /* These initializations must be done before calling idr_get_new() + * in order to avoid reces into pps_event(). 
+ */ + pps->params.api_version = PPS_API_VERS; + pps->params.mode = default_params; + pps->info = *info; + + init_waitqueue_head(&pps->queue); + spin_lock_init(&pps->lock); + atomic_set(&pps->usage, 1); + + /* Get new ID for the new PPS source */ + if (idr_pre_get(&pps_idr, GFP_KERNEL) == 0) { + err = -ENOMEM; + goto kfree_pps; + } + + spin_lock_irq(&pps_idr_lock); + + /* Now really allocate the PPS source. + * After idr_get_new() calling the new source will be freely available + * into the kernel. + */ + err = idr_get_new(&pps_idr, pps, &id); + if (err < 0) { + spin_unlock_irq(&pps_idr_lock); + goto kfree_pps; + } + + id = id & MAX_ID_MASK; + if (id >= PPS_MAX_SOURCES) { + spin_unlock_irq(&pps_idr_lock); + + printk(KERN_ERR "pps: %s: too many PPS sources in the system\n", + info->name); + err = -EBUSY; + goto free_idr; + } + pps->id = id; + + spin_unlock_irq(&pps_idr_lock); + + /* Create the char device */ + err = pps_register_cdev(pps); + if (err < 0) { + printk(KERN_ERR "pps: %s: unable to create char device\n", + info->name); + goto free_idr; + } + + pr_info("new PPS source %s at ID %d\n", info->name, id); + + return id; + +free_idr: + spin_lock_irq(&pps_idr_lock); + idr_remove(&pps_idr, id); + spin_unlock_irq(&pps_idr_lock); + +kfree_pps: + kfree(pps); + +pps_register_source_exit: + printk(KERN_ERR "pps: %s: unable to register source\n", info->name); + + return err; +} +EXPORT_SYMBOL(pps_register_source); + +/* pps_unregister_source - remove a PPS source from the system + * @source: the PPS source ID + * + * This function is used to remove a previously registered PPS source from + * the system. + */ + +void pps_unregister_source(int source) +{ + struct pps_device *pps; + + spin_lock_irq(&pps_idr_lock); + pps = idr_find(&pps_idr, source); + + if (!pps) { + BUG(); + spin_unlock_irq(&pps_idr_lock); + return; + } + spin_unlock_irq(&pps_idr_lock); + + pps_unregister_cdev(pps); + pps_put_source(pps); +} +EXPORT_SYMBOL(pps_unregister_source); + +/* pps_event - register a PPS event into the system + * @source: the PPS source ID + * @ts: the event timestamp + * @event: the event type + * @data: userdef pointer + * + * This function is used by each PPS client in order to register a new + * PPS event into the system (it's usually called inside an IRQ handler). + * + * If an echo function is associated with the PPS source it will be called + * as: + * pps->info.echo(source, event, data); + */ + +void pps_event(int source, struct pps_ktime *ts, int event, void *data) +{ + struct pps_device *pps; + unsigned long flags; + + if ((event & (PPS_CAPTUREASSERT | PPS_CAPTURECLEAR)) == 0) { + printk(KERN_ERR "pps: unknown event (%x) for source %d\n", + event, source); + return; + } + + pps = pps_get_source(source); + if (!pps) + return; + + pr_debug("PPS event on source %d at %llu.%06u\n", + pps->id, (unsigned long long) ts->sec, ts->nsec); + + spin_lock_irqsave(&pps->lock, flags); + + /* Must call the echo function? */ + if ((pps->params.mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR))) + pps->info.echo(source, event, data); + + /* Check the event */ + pps->current_mode = pps->params.mode; + if (event & PPS_CAPTUREASSERT) { + /* We have to add an offset? */ + if (pps->params.mode & PPS_OFFSETASSERT) + pps_add_offset(ts, &pps->params.assert_off_tu); + + /* Save the time stamp */ + pps->assert_tu = *ts; + pps->assert_sequence++; + pr_debug("capture assert seq #%u for source %d\n", + pps->assert_sequence, source); + } + if (event & PPS_CAPTURECLEAR) { + /* We have to add an offset? 
*/ + if (pps->params.mode & PPS_OFFSETCLEAR) + pps_add_offset(ts, &pps->params.clear_off_tu); + + /* Save the time stamp */ + pps->clear_tu = *ts; + pps->clear_sequence++; + pr_debug("capture clear seq #%u for source %d\n", + pps->clear_sequence, source); + } + + pps->go = ~0; + wake_up_interruptible(&pps->queue); + + kill_fasync(&pps->async_queue, SIGIO, POLL_IN); + + spin_unlock_irqrestore(&pps->lock, flags); + + /* Now we can release the PPS source for (possible) deregistration */ + pps_put_source(pps); +} +EXPORT_SYMBOL(pps_event); diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c new file mode 100644 index 00000000000..ac8cc8cea1e --- /dev/null +++ b/drivers/pps/pps.c @@ -0,0 +1,312 @@ +/* + * PPS core file + * + * + * Copyright (C) 2005-2009 Rodolfo Giometti + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Local variables + */ + +static dev_t pps_devt; +static struct class *pps_class; + +/* + * Char device methods + */ + +static unsigned int pps_cdev_poll(struct file *file, poll_table *wait) +{ + struct pps_device *pps = file->private_data; + + poll_wait(file, &pps->queue, wait); + + return POLLIN | POLLRDNORM; +} + +static int pps_cdev_fasync(int fd, struct file *file, int on) +{ + struct pps_device *pps = file->private_data; + return fasync_helper(fd, file, on, &pps->async_queue); +} + +static long pps_cdev_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct pps_device *pps = file->private_data; + struct pps_kparams params; + struct pps_fdata fdata; + unsigned long ticks; + void __user *uarg = (void __user *) arg; + int __user *iuarg = (int __user *) arg; + int err; + + switch (cmd) { + case PPS_GETPARAMS: + pr_debug("PPS_GETPARAMS: source %d\n", pps->id); + + /* Return current parameters */ + err = copy_to_user(uarg, &pps->params, + sizeof(struct pps_kparams)); + if (err) + return -EFAULT; + + break; + + case PPS_SETPARAMS: + pr_debug("PPS_SETPARAMS: source %d\n", pps->id); + + /* Check the capabilities */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + err = copy_from_user(¶ms, uarg, sizeof(struct pps_kparams)); + if (err) + return -EFAULT; + if (!(params.mode & (PPS_CAPTUREASSERT | PPS_CAPTURECLEAR))) { + pr_debug("capture mode unspecified (%x)\n", + params.mode); + return -EINVAL; + } + + /* Check for supported capabilities */ + if ((params.mode & ~pps->info.mode) != 0) { + pr_debug("unsupported capabilities (%x)\n", + params.mode); + return -EINVAL; + } + + spin_lock_irq(&pps->lock); + + /* Save the new parameters */ + pps->params = params; + + /* Restore the read only parameters */ + if ((params.mode & (PPS_TSFMT_TSPEC | PPS_TSFMT_NTPFP)) == 0) { + /* section 3.3 of RFC 2783 interpreted */ + pr_debug("time format unspecified (%x)\n", + params.mode); + pps->params.mode |= PPS_TSFMT_TSPEC; + 
} + if (pps->info.mode & PPS_CANWAIT) + pps->params.mode |= PPS_CANWAIT; + pps->params.api_version = PPS_API_VERS; + + spin_unlock_irq(&pps->lock); + + break; + + case PPS_GETCAP: + pr_debug("PPS_GETCAP: source %d\n", pps->id); + + err = put_user(pps->info.mode, iuarg); + if (err) + return -EFAULT; + + break; + + case PPS_FETCH: + pr_debug("PPS_FETCH: source %d\n", pps->id); + + err = copy_from_user(&fdata, uarg, sizeof(struct pps_fdata)); + if (err) + return -EFAULT; + + pps->go = 0; + + /* Manage the timeout */ + if (fdata.timeout.flags & PPS_TIME_INVALID) + err = wait_event_interruptible(pps->queue, pps->go); + else { + pr_debug("timeout %lld.%09d\n", + (long long) fdata.timeout.sec, + fdata.timeout.nsec); + ticks = fdata.timeout.sec * HZ; + ticks += fdata.timeout.nsec / (NSEC_PER_SEC / HZ); + + if (ticks != 0) { + err = wait_event_interruptible_timeout( + pps->queue, pps->go, ticks); + if (err == 0) + return -ETIMEDOUT; + } + } + + /* Check for pending signals */ + if (err == -ERESTARTSYS) { + pr_debug("pending signal caught\n"); + return -EINTR; + } + + /* Return the fetched timestamp */ + spin_lock_irq(&pps->lock); + + fdata.info.assert_sequence = pps->assert_sequence; + fdata.info.clear_sequence = pps->clear_sequence; + fdata.info.assert_tu = pps->assert_tu; + fdata.info.clear_tu = pps->clear_tu; + fdata.info.current_mode = pps->current_mode; + + spin_unlock_irq(&pps->lock); + + err = copy_to_user(uarg, &fdata, sizeof(struct pps_fdata)); + if (err) + return -EFAULT; + + break; + + default: + return -ENOTTY; + break; + } + + return 0; +} + +static int pps_cdev_open(struct inode *inode, struct file *file) +{ + struct pps_device *pps = container_of(inode->i_cdev, + struct pps_device, cdev); + int found; + + found = pps_get_source(pps->id) != 0; + if (!found) + return -ENODEV; + + file->private_data = pps; + + return 0; +} + +static int pps_cdev_release(struct inode *inode, struct file *file) +{ + struct pps_device *pps = file->private_data; + + /* Free the PPS source and wake up (possible) deregistration */ + pps_put_source(pps); + + return 0; +} + +/* + * Char device stuff + */ + +static const struct file_operations pps_cdev_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .poll = pps_cdev_poll, + .fasync = pps_cdev_fasync, + .unlocked_ioctl = pps_cdev_ioctl, + .open = pps_cdev_open, + .release = pps_cdev_release, +}; + +int pps_register_cdev(struct pps_device *pps) +{ + int err; + + pps->devno = MKDEV(MAJOR(pps_devt), pps->id); + cdev_init(&pps->cdev, &pps_cdev_fops); + pps->cdev.owner = pps->info.owner; + + err = cdev_add(&pps->cdev, pps->devno, 1); + if (err) { + printk(KERN_ERR "pps: %s: failed to add char device %d:%d\n", + pps->info.name, MAJOR(pps_devt), pps->id); + return err; + } + pps->dev = device_create(pps_class, pps->info.dev, pps->devno, NULL, + "pps%d", pps->id); + if (err) + goto del_cdev; + dev_set_drvdata(pps->dev, pps); + + pr_debug("source %s got cdev (%d:%d)\n", pps->info.name, + MAJOR(pps_devt), pps->id); + + return 0; + +del_cdev: + cdev_del(&pps->cdev); + + return err; +} + +void pps_unregister_cdev(struct pps_device *pps) +{ + device_destroy(pps_class, pps->devno); + cdev_del(&pps->cdev); +} + +/* + * Module stuff + */ + +static void __exit pps_exit(void) +{ + class_destroy(pps_class); + unregister_chrdev_region(pps_devt, PPS_MAX_SOURCES); +} + +static int __init pps_init(void) +{ + int err; + + pps_class = class_create(THIS_MODULE, "pps"); + if (!pps_class) { + printk(KERN_ERR "pps: failed to allocate class\n"); + return -ENOMEM; + } + 
pps_class->dev_attrs = pps_attrs; + + err = alloc_chrdev_region(&pps_devt, 0, PPS_MAX_SOURCES, "pps"); + if (err < 0) { + printk(KERN_ERR "pps: failed to allocate char device region\n"); + goto remove_class; + } + + pr_info("LinuxPPS API ver. %d registered\n", PPS_API_VERS); + pr_info("Software ver. %s - Copyright 2005-2007 Rodolfo Giometti " + "\n", PPS_VERSION); + + return 0; + +remove_class: + class_destroy(pps_class); + + return err; +} + +subsys_initcall(pps_init); +module_exit(pps_exit); + +MODULE_AUTHOR("Rodolfo Giometti "); +MODULE_DESCRIPTION("LinuxPPS support (RFC 2783) - ver. " PPS_VERSION); +MODULE_LICENSE("GPL"); diff --git a/drivers/pps/sysfs.c b/drivers/pps/sysfs.c new file mode 100644 index 00000000000..ef0978c71ee --- /dev/null +++ b/drivers/pps/sysfs.c @@ -0,0 +1,98 @@ +/* + * PPS sysfs support + * + * + * Copyright (C) 2007-2009 Rodolfo Giometti + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include +#include +#include +#include + +/* + * Attribute functions + */ + +static ssize_t pps_show_assert(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pps_device *pps = dev_get_drvdata(dev); + + if (!(pps->info.mode & PPS_CAPTUREASSERT)) + return 0; + + return sprintf(buf, "%lld.%09d#%d\n", + (long long) pps->assert_tu.sec, pps->assert_tu.nsec, + pps->assert_sequence); +} + +static ssize_t pps_show_clear(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pps_device *pps = dev_get_drvdata(dev); + + if (!(pps->info.mode & PPS_CAPTURECLEAR)) + return 0; + + return sprintf(buf, "%lld.%09d#%d\n", + (long long) pps->clear_tu.sec, pps->clear_tu.nsec, + pps->clear_sequence); +} + +static ssize_t pps_show_mode(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pps_device *pps = dev_get_drvdata(dev); + + return sprintf(buf, "%4x\n", pps->info.mode); +} + +static ssize_t pps_show_echo(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pps_device *pps = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", !!pps->info.echo); +} + +static ssize_t pps_show_name(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pps_device *pps = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", pps->info.name); +} + +static ssize_t pps_show_path(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pps_device *pps = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", pps->info.path); +} + +struct device_attribute pps_attrs[] = { + __ATTR(assert, S_IRUGO, pps_show_assert, NULL), + __ATTR(clear, S_IRUGO, pps_show_clear, NULL), + __ATTR(mode, S_IRUGO, pps_show_mode, NULL), + __ATTR(echo, S_IRUGO, pps_show_echo, NULL), + __ATTR(name, S_IRUGO, pps_show_name, NULL), + __ATTR(path, S_IRUGO, pps_show_path, NULL), + __ATTR_NULL, +}; diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 
a2df7030d49..03f22076381 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -308,6 +308,7 @@ unifdef-y += pmu.h unifdef-y += poll.h unifdef-y += ppp_defs.h unifdef-y += ppp-comp.h +unifdef-y += pps.h unifdef-y += ptrace.h unifdef-y += quota.h unifdef-y += random.h diff --git a/include/linux/pps.h b/include/linux/pps.h new file mode 100644 index 00000000000..cfe5c7214ec --- /dev/null +++ b/include/linux/pps.h @@ -0,0 +1,122 @@ +/* + * PPS API header + * + * Copyright (C) 2005-2009 Rodolfo Giometti + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#ifndef _PPS_H_ +#define _PPS_H_ + +#define PPS_VERSION "5.3.6" +#define PPS_MAX_SOURCES 16 /* should be enough... */ + +/* Implementation note: the logical states ``assert'' and ``clear'' + * are implemented in terms of the chip register, i.e. ``assert'' + * means the bit is set. */ + +/* + * 3.2 New data structures + */ + +#define PPS_API_VERS_1 1 +#define PPS_API_VERS PPS_API_VERS_1 /* we use API version 1 */ +#define PPS_MAX_NAME_LEN 32 + +/* 32-bit vs. 64-bit compatibility. + * + * 0n i386, the alignment of a uint64_t is only 4 bytes, while on most other + * architectures it's 8 bytes. On i386, there will be no padding between the + * two consecutive 'struct pps_ktime' members of struct pps_kinfo and struct + * pps_kparams. But on most platforms there will be padding to ensure correct + * alignment. + * + * The simple fix is probably to add an explicit padding. + * [David Woodhouse] + */ +struct pps_ktime { + __s64 sec; + __s32 nsec; + __u32 flags; +}; +#define PPS_TIME_INVALID (1<<0) /* used to specify timeout==NULL */ + +struct pps_kinfo { + __u32 assert_sequence; /* seq. num. of assert event */ + __u32 clear_sequence; /* seq. num. of clear event */ + struct pps_ktime assert_tu; /* time of assert event */ + struct pps_ktime clear_tu; /* time of clear event */ + int current_mode; /* current mode bits */ +}; + +struct pps_kparams { + int api_version; /* API version # */ + int mode; /* mode bits */ + struct pps_ktime assert_off_tu; /* offset compensation for assert */ + struct pps_ktime clear_off_tu; /* offset compensation for clear */ +}; + +/* + * 3.3 Mode bit definitions + */ + +/* Device/implementation parameters */ +#define PPS_CAPTUREASSERT 0x01 /* capture assert events */ +#define PPS_CAPTURECLEAR 0x02 /* capture clear events */ +#define PPS_CAPTUREBOTH 0x03 /* capture assert and clear events */ + +#define PPS_OFFSETASSERT 0x10 /* apply compensation for assert ev. */ +#define PPS_OFFSETCLEAR 0x20 /* apply compensation for clear ev. */ + +#define PPS_CANWAIT 0x100 /* can we wait for an event? 
*/ +#define PPS_CANPOLL 0x200 /* bit reserved for future use */ + +/* Kernel actions */ +#define PPS_ECHOASSERT 0x40 /* feed back assert event to output */ +#define PPS_ECHOCLEAR 0x80 /* feed back clear event to output */ + +/* Timestamp formats */ +#define PPS_TSFMT_TSPEC 0x1000 /* select timespec format */ +#define PPS_TSFMT_NTPFP 0x2000 /* select NTP format */ + +/* + * 3.4.4 New functions: disciplining the kernel timebase + */ + +/* Kernel consumers */ +#define PPS_KC_HARDPPS 0 /* hardpps() (or equivalent) */ +#define PPS_KC_HARDPPS_PLL 1 /* hardpps() constrained to + use a phase-locked loop */ +#define PPS_KC_HARDPPS_FLL 2 /* hardpps() constrained to + use a frequency-locked loop */ +/* + * Here begins the implementation-specific part! + */ + +struct pps_fdata { + struct pps_kinfo info; + struct pps_ktime timeout; +}; + +#include + +#define PPS_GETPARAMS _IOR('p', 0xa1, struct pps_kparams *) +#define PPS_SETPARAMS _IOW('p', 0xa2, struct pps_kparams *) +#define PPS_GETCAP _IOR('p', 0xa3, int *) +#define PPS_FETCH _IOWR('p', 0xa4, struct pps_fdata *) + +#endif /* _PPS_H_ */ diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h new file mode 100644 index 00000000000..e0a193f830e --- /dev/null +++ b/include/linux/pps_kernel.h @@ -0,0 +1,89 @@ +/* + * PPS API kernel header + * + * Copyright (C) 2009 Rodolfo Giometti + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +#include +#include +#include + +/* + * Global defines + */ + +/* The specific PPS source info */ +struct pps_source_info { + char name[PPS_MAX_NAME_LEN]; /* simbolic name */ + char path[PPS_MAX_NAME_LEN]; /* path of connected device */ + int mode; /* PPS's allowed mode */ + + void (*echo)(int source, int event, void *data); /* PPS echo function */ + + struct module *owner; + struct device *dev; +}; + +/* The main struct */ +struct pps_device { + struct pps_source_info info; /* PSS source info */ + + struct pps_kparams params; /* PPS's current params */ + + __u32 assert_sequence; /* PPS' assert event seq # */ + __u32 clear_sequence; /* PPS' clear event seq # */ + struct pps_ktime assert_tu; + struct pps_ktime clear_tu; + int current_mode; /* PPS mode at event time */ + + int go; /* PPS event is arrived? 
*/ + wait_queue_head_t queue; /* PPS event queue */ + + unsigned int id; /* PPS source unique ID */ + struct cdev cdev; + struct device *dev; + int devno; + struct fasync_struct *async_queue; /* fasync method */ + spinlock_t lock; + + atomic_t usage; /* usage count */ +}; + +/* + * Global variables + */ + +extern spinlock_t pps_idr_lock; +extern struct idr pps_idr; +extern struct timespec pps_irq_ts[]; + +extern struct device_attribute pps_attrs[]; + +/* + * Exported functions + */ + +struct pps_device *pps_get_source(int source); +extern void pps_put_source(struct pps_device *pps); +extern int pps_register_source(struct pps_source_info *info, + int default_params); +extern void pps_unregister_source(int source); +extern int pps_register_cdev(struct pps_device *pps); +extern void pps_unregister_cdev(struct pps_device *pps); +extern void pps_event(int source, struct pps_ktime *ts, int event, void *data); -- cgit v1.2.3-70-g09d2 From d2829224619866daf336141b71550e223a198838 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 17 Jun 2009 16:28:38 -0700 Subject: lib: add lib/gcd.c This patch adds lib/gcd.c which contains a greatest common divider implementation taken from sound/core/pcm_timer.c Several usages of this new library function will be sent to subsystem maintainers. [akpm@linux-foundation.org: use swap() (pointed out by Joe)] [akpm@linux-foundation.org: just add gcd.o to obj-y, remove Kconfig changes] Signed-off-by: Florian Fainelli Cc: Sergei Shtylyov Cc: Takashi Iwai Cc: Simon Horman Cc: Julius Volz Cc: David S. Miller Cc: Patrick McHardy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gcd.h | 8 ++++++++ lib/Makefile | 2 +- lib/gcd.c | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 include/linux/gcd.h create mode 100644 lib/gcd.c (limited to 'include/linux') diff --git a/include/linux/gcd.h b/include/linux/gcd.h new file mode 100644 index 00000000000..69f5e8a01ba --- /dev/null +++ b/include/linux/gcd.h @@ -0,0 +1,8 @@ +#ifndef _GCD_H +#define _GCD_H + +#include + +unsigned long gcd(unsigned long a, unsigned long b) __attribute_const__; + +#endif /* _GCD_H */ diff --git a/lib/Makefile b/lib/Makefile index 8e9bcf9d326..b6d1857bbf0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -21,7 +21,7 @@ lib-y += kobject.o kref.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o + string_helpers.o gcd.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/gcd.c b/lib/gcd.c new file mode 100644 index 00000000000..f879033d982 --- /dev/null +++ b/lib/gcd.c @@ -0,0 +1,18 @@ +#include +#include +#include + +/* Greatest common divisor */ +unsigned long gcd(unsigned long a, unsigned long b) +{ + unsigned long r; + + if (a < b) + swap(a, b); + while ((r = a % b) != 0) { + a = b; + b = r; + } + return b; +} +EXPORT_SYMBOL_GPL(gcd); -- cgit v1.2.3-70-g09d2 From dcce284a259373f9e5570f2e33f79eca84fcf565 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 18 Jun 2009 13:24:12 +1000 Subject: mm: Extend gfp masking to the page allocator The page allocator also needs the masking of gfp flags during boot, so this moves it out of slab/slub and uses it with the page allocator as well. 
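Concretely (a worked expansion from include/linux/gfp.h in this tree, not code from the patch): GFP_BOOT_MASK is __GFP_BITS_MASK & ~(__GFP_WAIT | __GFP_IO | __GFP_FS), and GFP_KERNEL is exactly __GFP_WAIT | __GFP_IO | __GFP_FS, so while gfp_allowed_mask == GFP_BOOT_MASK the new "gfp_mask &= gfp_allowed_mask;" in __alloc_pages_nodemask() turns a GFP_KERNEL request into an atomic, no-I/O allocation; once start_kernel() calls set_gfp_allowed_mask(__GFP_BITS_MASK) after enabling interrupts, the masking becomes a no-op.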
Signed-off-by: Benjamin Herrenschmidt Acked-by: Pekka Enberg Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 ++++++++- init/main.c | 4 ++++ mm/page_alloc.c | 3 +++ mm/slab.c | 15 ++------------- mm/slub.c | 12 +----------- 5 files changed, 18 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index cfdb35d71bc..7c777a0da17 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -99,7 +99,7 @@ struct vm_area_struct; __GFP_NORETRY|__GFP_NOMEMALLOC) /* Control slab gfp mask during early boot */ -#define SLAB_GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS) +#define GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS) /* Control allocation constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) @@ -348,4 +348,11 @@ static inline void oom_killer_enable(void) oom_killer_disabled = false; } +extern gfp_t gfp_allowed_mask; + +static inline void set_gfp_allowed_mask(gfp_t mask) +{ + gfp_allowed_mask = mask; +} + #endif /* __LINUX_GFP_H */ diff --git a/init/main.c b/init/main.c index 1a65fdd0631..09131ec090c 100644 --- a/init/main.c +++ b/init/main.c @@ -642,6 +642,10 @@ asmlinkage void __init start_kernel(void) "enabled early\n"); early_boot_irqs_on(); local_irq_enable(); + + /* Interrupts are enabled now so all GFP allocations are safe. */ + set_gfp_allowed_mask(__GFP_BITS_MASK); + kmem_cache_init_late(); /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a5f3c278c57..6f0753fe694 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -73,6 +73,7 @@ unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; +gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; @@ -1863,6 +1864,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page; int migratetype = allocflags_to_migratetype(gfp_mask); + gfp_mask &= gfp_allowed_mask; + lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_WAIT); diff --git a/mm/slab.c b/mm/slab.c index d08692303f6..e74a16e4ced 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -304,12 +304,6 @@ struct kmem_list3 { int free_touched; /* updated without locking */ }; -/* - * The slab allocator is initialized with interrupts disabled. Therefore, make - * sure early boot allocations don't accidentally enable interrupts. - */ -static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; - /* * Need this for bootstrapping a per node allocator. */ @@ -1559,11 +1553,6 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; - /* - * Interrupts are enabled now so all GFP allocations are safe. 
- */ - slab_gfp_mask = __GFP_BITS_MASK; - /* 6) resize the head arrays to their final sizes */ mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) @@ -3307,7 +3296,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; - flags &= slab_gfp_mask; + flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); @@ -3392,7 +3381,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; - flags &= slab_gfp_mask; + flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); diff --git a/mm/slub.c b/mm/slub.c index 4c6449310a0..ce62b770e2f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -179,12 +179,6 @@ static enum { SYSFS /* Sysfs up */ } slab_state = DOWN; -/* - * The slab allocator is initialized with interrupts disabled. Therefore, make - * sure early boot allocations don't accidentally enable interrupts. - */ -static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; - /* A list of all slab caches on the system */ static DECLARE_RWSEM(slub_lock); static LIST_HEAD(slab_caches); @@ -1692,7 +1686,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; - gfpflags &= slab_gfp_mask; + gfpflags &= gfp_allowed_mask; lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); @@ -3220,10 +3214,6 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { - /* - * Interrupts are enabled now so all GFP allocations are safe. - */ - slab_gfp_mask = __GFP_BITS_MASK; } /* -- cgit v1.2.3-70-g09d2 From 71e308a239c098673570d0b417d42262bb535909 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 18 Jun 2009 12:45:08 -0400 Subject: function-graph: add stack frame test In case gcc does something funny with the stack frames, or the return from function code, we would like to detect that. An arch may implement the passing of a variable that is unique to the function, saved on entering the function, and tested when exiting it. Usually the frame pointer can be used for this purpose. This patch also implements this for x86, where it passes in the stack frame of the parent function and tests that frame on exit. There was a case in x86_32 with optimize for size (-Os) where, for a few functions, gcc would align the stack frame and place a copy of the return address into it. The function graph tracer modified the copy and not the actual return address. On return from the function, it did not go to the tracer hook, but returned to the parent. This broke the function graph tracer, because the return of the parent (where gcc did not do this funky manipulation) returned to the location that the child function was supposed to. This caused strange kernel crashes. This test detected the problem and pointed out where the issue was. This modifies the parameters of one of the functions that the arch specific code calls, so it includes changes to arch code to accommodate the new prototype. Note, I notice that the parisc arch implements its own push_return_trace. This is now a generic function and ftrace_push_return_trace() should be used instead. This patch does not touch that code.
Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Frederic Weisbecker Cc: Helge Deller Cc: Kyle McMartin Signed-off-by: Steven Rostedt --- arch/powerpc/kernel/ftrace.c | 2 +- arch/s390/kernel/ftrace.c | 2 +- arch/x86/Kconfig | 1 + arch/x86/kernel/entry_32.S | 2 ++ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/ftrace.c | 6 ++++-- include/linux/ftrace.h | 4 +++- kernel/trace/Kconfig | 7 +++++++ kernel/trace/trace_functions_graph.c | 36 ++++++++++++++++++++++++++++++++---- 9 files changed, 53 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 2d182f119d1..58d6a610904 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -605,7 +605,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth, 0) == -EBUSY) { *parent = old; return; } diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 82ddfd3a75a..3e298e64f0d 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -190,7 +190,7 @@ unsigned long prepare_ftrace_return(unsigned long ip, unsigned long parent) goto out; if (unlikely(atomic_read(¤t->tracing_graph_pause))) goto out; - if (ftrace_push_return_trace(parent, ip, &trace.depth) == -EBUSY) + if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY) goto out; trace.func = ftrace_mcount_call_adjust(ip) & PSW_ADDR_INSN; /* Only trace if the calling function expects to. */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 356d2ec8e2f..fcf12af0742 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,6 +33,7 @@ config X86 select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_FTRACE_SYSCALLS diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c929add475c..0d4b28564c1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1154,6 +1154,7 @@ ENTRY(ftrace_graph_caller) pushl %edx movl 0xc(%esp), %edx lea 0x4(%ebp), %eax + movl (%ebp), %ecx subl $MCOUNT_INSN_SIZE, %edx call prepare_ftrace_return popl %edx @@ -1168,6 +1169,7 @@ return_to_handler: pushl %eax pushl %ecx pushl %edx + movl %ebp, %eax call ftrace_return_to_handler movl %eax, 0xc(%esp) popl %edx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index de74f0a3e0e..c251be74510 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller) leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi + movq (%rbp), %rdx subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return @@ -150,6 +151,7 @@ GLOBAL(return_to_handler) /* Save the return values */ movq %rax, (%rsp) movq %rdx, 8(%rsp) + movq %rbp, %rdi call ftrace_return_to_handler diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b79c5533c42..d94e1ea3b9f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. 
*/ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, + unsigned long frame_pointer) { unsigned long old; int faulted; @@ -453,7 +454,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth, + frame_pointer) == -EBUSY) { *parent = old; return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 39b95c56587..dc3b1328aae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -362,6 +362,7 @@ struct ftrace_ret_stack { unsigned long func; unsigned long long calltime; unsigned long long subtime; + unsigned long fp; }; /* @@ -372,7 +373,8 @@ struct ftrace_ret_stack { extern void return_to_handler(void); extern int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, + unsigned long frame_pointer); /* * Sometimes we don't want to trace a function with the function diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1eac85253ce..b17ed8787de 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER config HAVE_FUNCTION_GRAPH_TRACER bool +config HAVE_FUNCTION_GRAPH_FP_TEST + bool + help + An arch may pass in a unique value (frame pointer) to both the + entering and exiting of a function. On exit, the value is compared + and if it does not match, then it will panic the kernel. + config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8b592418d8b..d2249abafb5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, + unsigned long frame_pointer) { unsigned long long calltime; int index; @@ -85,6 +86,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; + current->ret_stack[index].fp = frame_pointer; *depth = index; return 0; @@ -92,7 +94,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) /* Retrieve a function return address to the trace stack on thread info.*/ static void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, + unsigned long frame_pointer) { int index; @@ -106,6 +109,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) return; } +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST + /* + * The arch may choose to record the frame pointer used + * and check it here to make sure that it is what we expect it + * to be. If gcc does not set the place holder of the return + * address in the frame pointer, and does a copy instead, then + * the function graph trace will fail. This test detects this + * case. + * + * Currently, x86_32 with optimize for size (-Os) makes the latest + * gcc do the above. 
+ */ + if (unlikely(current->ret_stack[index].fp != frame_pointer)) { + ftrace_graph_stop(); + WARN(1, "Bad frame pointer: expected %lx, received %lx\n" + " from func %pF return to %lx\n", + current->ret_stack[index].fp, + frame_pointer, + (void *)current->ret_stack[index].func, + current->ret_stack[index].ret); + *ret = (unsigned long)panic; + return; + } +#endif + *ret = current->ret_stack[index].ret; trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; @@ -117,12 +145,12 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) * Send the trace to the ring-buffer. * @return the original return address. */ -unsigned long ftrace_return_to_handler(void) +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) { struct ftrace_graph_ret trace; unsigned long ret; - ftrace_pop_return_trace(&trace, &ret); + ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); barrier(); -- cgit v1.2.3-70-g09d2 From 07613ba2f464f59949266f4337b75b91eb610795 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 12 Jun 2009 14:11:41 +1000 Subject: agp: switch AGP to use page array instead of unsigned long array This switches AGP to use an array of pages for tracking the pages allocated to the GART. This should enable GEM on PAE to work a lot better as we can pass highmem pages to the PAT code and it will do the right thing with them. Signed-off-by: Dave Airlie --- drivers/char/agp/agp.h | 12 +++--- drivers/char/agp/ali-agp.c | 26 ++++++------- drivers/char/agp/amd-k7-agp.c | 2 +- drivers/char/agp/amd64-agp.c | 2 +- drivers/char/agp/ati-agp.c | 5 ++- drivers/char/agp/backend.c | 8 ++-- drivers/char/agp/efficeon-agp.c | 5 ++- drivers/char/agp/generic.c | 69 ++++++++++++++--------------------- drivers/char/agp/hp-agp.c | 9 ++--- drivers/char/agp/i460-agp.c | 47 ++++++++++++++---------- drivers/char/agp/intel-agp.c | 49 ++++++++++++------------- drivers/char/agp/nvidia-agp.c | 2 +- drivers/char/agp/parisc-agp.c | 20 ++++++++-- drivers/char/agp/sgi-agp.c | 9 +++-- drivers/char/agp/sworks-agp.c | 2 +- drivers/char/agp/uninorth-agp.c | 12 +++--- drivers/gpu/drm/drm_agpsupport.c | 2 +- drivers/gpu/drm/drm_memory.c | 8 ++-- drivers/gpu/drm/drm_vm.c | 4 +- drivers/gpu/drm/ttm/ttm_agp_backend.c | 3 +- include/linux/agp_backend.h | 2 +- 21 files changed, 152 insertions(+), 146 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 46f50753117..178e2e9e9f0 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -107,7 +107,7 @@ struct agp_bridge_driver { void (*agp_enable)(struct agp_bridge_data *, u32); void (*cleanup)(void); void (*tlb_flush)(struct agp_memory *); - unsigned long (*mask_memory)(struct agp_bridge_data *, unsigned long, int); + unsigned long (*mask_memory)(struct agp_bridge_data *, struct page *, int); void (*cache_flush)(void); int (*create_gatt_table)(struct agp_bridge_data *); int (*free_gatt_table)(struct agp_bridge_data *); @@ -115,9 +115,9 @@ struct agp_bridge_driver { int (*remove_memory)(struct agp_memory *, off_t, int); struct agp_memory *(*alloc_by_type) (size_t, int); void (*free_by_type)(struct agp_memory *); - void *(*agp_alloc_page)(struct agp_bridge_data *); + struct page *(*agp_alloc_page)(struct agp_bridge_data *); int (*agp_alloc_pages)(struct agp_bridge_data *, struct agp_memory *, size_t); - void (*agp_destroy_page)(void *, int flags); + void (*agp_destroy_page)(struct page *, int 
flags); void (*agp_destroy_pages)(struct agp_memory *); int (*agp_type_to_mask_type) (struct agp_bridge_data *, int); void (*chipset_flush)(struct agp_bridge_data *); @@ -278,10 +278,10 @@ int agp_generic_insert_memory(struct agp_memory *mem, off_t pg_start, int type); int agp_generic_remove_memory(struct agp_memory *mem, off_t pg_start, int type); struct agp_memory *agp_generic_alloc_by_type(size_t page_count, int type); void agp_generic_free_by_type(struct agp_memory *curr); -void *agp_generic_alloc_page(struct agp_bridge_data *bridge); +struct page *agp_generic_alloc_page(struct agp_bridge_data *bridge); int agp_generic_alloc_pages(struct agp_bridge_data *agp_bridge, struct agp_memory *memory, size_t page_count); -void agp_generic_destroy_page(void *addr, int flags); +void agp_generic_destroy_page(struct page *page, int flags); void agp_generic_destroy_pages(struct agp_memory *memory); void agp_free_key(int key); int agp_num_entries(void); @@ -291,7 +291,7 @@ int agp_3_5_enable(struct agp_bridge_data *bridge); void global_cache_flush(void); void get_agp_version(struct agp_bridge_data *bridge); unsigned long agp_generic_mask_memory(struct agp_bridge_data *bridge, - unsigned long addr, int type); + struct page *page, int type); int agp_generic_type_to_mask_type(struct agp_bridge_data *bridge, int type); struct agp_bridge_data *agp_generic_find_bridge(struct pci_dev *pdev); diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 58f26ed2a96..201ef3ffd48 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -141,37 +141,37 @@ static void m1541_cache_flush(void) } } -static void *m1541_alloc_page(struct agp_bridge_data *bridge) +static struct page *m1541_alloc_page(struct agp_bridge_data *bridge) { - void *addr = agp_generic_alloc_page(agp_bridge); + struct page *page = agp_generic_alloc_page(agp_bridge); u32 temp; - if (!addr) + if (!page) return NULL; pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - virt_to_gart(addr)) | ALI_CACHE_FLUSH_EN )); - return addr; + phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN )); + return page; } -static void ali_destroy_page(void * addr, int flags) +static void ali_destroy_page(struct page *page, int flags) { - if (addr) { + if (page) { if (flags & AGP_PAGE_DESTROY_UNMAP) { global_cache_flush(); /* is this really needed? 
--hch */ - agp_generic_destroy_page(addr, flags); + agp_generic_destroy_page(page, flags); } else - agp_generic_destroy_page(addr, flags); + agp_generic_destroy_page(page, flags); } } -static void m1541_destroy_page(void * addr, int flags) +static void m1541_destroy_page(struct page *page, int flags) { u32 temp; - if (addr == NULL) + if (page == NULL) return; if (flags & AGP_PAGE_DESTROY_UNMAP) { @@ -180,9 +180,9 @@ static void m1541_destroy_page(void * addr, int flags) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - virt_to_gart(addr)) | ALI_CACHE_FLUSH_EN)); + phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN)); } - agp_generic_destroy_page(addr, flags); + agp_generic_destroy_page(page, flags); } diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c index 3f98254b911..ba9bde71eaa 100644 --- a/drivers/char/agp/amd-k7-agp.c +++ b/drivers/char/agp/amd-k7-agp.c @@ -325,7 +325,7 @@ static int amd_insert_memory(struct agp_memory *mem, off_t pg_start, int type) addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_generic_mask_memory(agp_bridge, - mem->memory[i], mem->type), cur_gatt+GET_GATT_OFF(addr)); + mem->pages[i], mem->type), cur_gatt+GET_GATT_OFF(addr)); readl(cur_gatt+GET_GATT_OFF(addr)); /* PCI Posting. */ } amd_irongate_tlbflush(mem); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index d765afda9c2..3bf5dda90f4 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -79,7 +79,7 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { tmp = agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], mask_type); + mem->pages[i], mask_type); BUG_ON(tmp & 0xffffff0000000ffcULL); pte = (tmp & 0x000000ff00000000ULL) >> 28; diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index f1537eece07..4d38baa209f 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -296,8 +296,9 @@ static int ati_insert_memory(struct agp_memory * mem, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); - writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], mem->type), cur_gatt+GET_GATT_OFF(addr)); + writel(agp_bridge->driver->mask_memory(agp_bridge, + mem->pages[i], mem->type), + cur_gatt+GET_GATT_OFF(addr)); readl(cur_gatt+GET_GATT_OFF(addr)); /* PCI Posting. 
*/ } agp_bridge->driver->tlb_flush(mem); diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 8c617ad7497..cfa5a649dfe 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -141,17 +141,17 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) bridge->version = &agp_current_version; if (bridge->driver->needs_scratch_page) { - void *addr = bridge->driver->agp_alloc_page(bridge); + struct page *page = bridge->driver->agp_alloc_page(bridge); - if (!addr) { + if (!page) { dev_err(&bridge->dev->dev, "can't get memory for scratch page\n"); return -ENOMEM; } - bridge->scratch_page_real = virt_to_gart(addr); + bridge->scratch_page_real = phys_to_gart(page_to_phys(page)); bridge->scratch_page = - bridge->driver->mask_memory(bridge, bridge->scratch_page_real, 0); + bridge->driver->mask_memory(bridge, page, 0); } size_value = bridge->driver->fetch_size(); diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index 453543a1f29..35d50f2861b 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c @@ -65,8 +65,9 @@ static const struct gatt_mask efficeon_generic_masks[] = }; /* This function does the same thing as mask_memory() for this chipset... */ -static inline unsigned long efficeon_mask_memory(unsigned long addr) +static inline unsigned long efficeon_mask_memory(struct page *page) { + unsigned long addr = phys_to_gart(page_to_phys(page)); return addr | 0x00000001; } @@ -257,7 +258,7 @@ static int efficeon_insert_memory(struct agp_memory * mem, off_t pg_start, int t last_page = NULL; for (i = 0; i < count; i++) { int index = pg_start + i; - unsigned long insert = efficeon_mask_memory(mem->memory[i]); + unsigned long insert = efficeon_mask_memory(mem->pages[i]); page = (unsigned int *) efficeon_private.l1_table[index >> 10]; diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 2224b762b7f..1e8b461b91f 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -95,13 +95,13 @@ EXPORT_SYMBOL(agp_flush_chipset); void agp_alloc_page_array(size_t size, struct agp_memory *mem) { - mem->memory = NULL; + mem->pages = NULL; mem->vmalloc_flag = false; if (size <= 2*PAGE_SIZE) - mem->memory = kmalloc(size, GFP_KERNEL | __GFP_NORETRY); - if (mem->memory == NULL) { - mem->memory = vmalloc(size); + mem->pages = kmalloc(size, GFP_KERNEL | __GFP_NORETRY); + if (mem->pages == NULL) { + mem->pages = vmalloc(size); mem->vmalloc_flag = true; } } @@ -110,9 +110,9 @@ EXPORT_SYMBOL(agp_alloc_page_array); void agp_free_page_array(struct agp_memory *mem) { if (mem->vmalloc_flag) { - vfree(mem->memory); + vfree(mem->pages); } else { - kfree(mem->memory); + kfree(mem->pages); } } EXPORT_SYMBOL(agp_free_page_array); @@ -136,7 +136,7 @@ static struct agp_memory *agp_create_user_memory(unsigned long num_agp_pages) agp_alloc_page_array(alloc_size, new); - if (new->memory == NULL) { + if (new->pages == NULL) { agp_free_key(new->key); kfree(new); return NULL; @@ -162,7 +162,7 @@ struct agp_memory *agp_create_memory(int scratch_pages) agp_alloc_page_array(PAGE_SIZE * scratch_pages, new); - if (new->memory == NULL) { + if (new->pages == NULL) { agp_free_key(new->key); kfree(new); return NULL; @@ -206,15 +206,13 @@ void agp_free_memory(struct agp_memory *curr) } else { for (i = 0; i < curr->page_count; i++) { - curr->memory[i] = (unsigned long)gart_to_virt( - curr->memory[i]); curr->bridge->driver->agp_destroy_page( - (void *)curr->memory[i], + curr->pages[i], AGP_PAGE_DESTROY_UNMAP); } for 
(i = 0; i < curr->page_count; i++) { curr->bridge->driver->agp_destroy_page( - (void *)curr->memory[i], + curr->pages[i], AGP_PAGE_DESTROY_FREE); } } @@ -282,13 +280,13 @@ struct agp_memory *agp_allocate_memory(struct agp_bridge_data *bridge, } for (i = 0; i < page_count; i++) { - void *addr = bridge->driver->agp_alloc_page(bridge); + struct page *page = bridge->driver->agp_alloc_page(bridge); - if (addr == NULL) { + if (page == NULL) { agp_free_memory(new); return NULL; } - new->memory[i] = virt_to_gart(addr); + new->pages[i] = page; new->page_count++; } new->bridge = bridge; @@ -1134,7 +1132,7 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { - writel(bridge->driver->mask_memory(bridge, mem->memory[i], mask_type), + writel(bridge->driver->mask_memory(bridge, mem->pages[i], mask_type), bridge->gatt_table+j); } readl(bridge->gatt_table+j-1); /* PCI Posting. */ @@ -1204,7 +1202,7 @@ struct agp_memory *agp_generic_alloc_user(size_t page_count, int type) return NULL; for (i = 0; i < page_count; i++) - new->memory[i] = 0; + new->pages[i] = 0; new->page_count = 0; new->type = type; new->num_scratch_pages = pages; @@ -1237,23 +1235,20 @@ int agp_generic_alloc_pages(struct agp_bridge_data *bridge, struct agp_memory *m get_page(page); atomic_inc(&agp_bridge->current_memory_agp); - /* set_memory_array_uc() needs virtual address */ - mem->memory[i] = (unsigned long)page_address(page); + mem->pages[i] = page; mem->page_count++; } #ifdef CONFIG_X86 - set_memory_array_uc(mem->memory, num_pages); + set_pages_array_uc(mem->pages, num_pages); #endif ret = 0; out: - for (i = 0; i < mem->page_count; i++) - mem->memory[i] = virt_to_gart((void *)mem->memory[i]); return ret; } EXPORT_SYMBOL(agp_generic_alloc_pages); -void *agp_generic_alloc_page(struct agp_bridge_data *bridge) +struct page *agp_generic_alloc_page(struct agp_bridge_data *bridge) { struct page * page; @@ -1265,56 +1260,47 @@ void *agp_generic_alloc_page(struct agp_bridge_data *bridge) get_page(page); atomic_inc(&agp_bridge->current_memory_agp); - return page_address(page); + return page; } EXPORT_SYMBOL(agp_generic_alloc_page); void agp_generic_destroy_pages(struct agp_memory *mem) { int i; - void *addr; struct page *page; if (!mem) return; - for (i = 0; i < mem->page_count; i++) - mem->memory[i] = (unsigned long)gart_to_virt(mem->memory[i]); - #ifdef CONFIG_X86 - set_memory_array_wb(mem->memory, mem->page_count); + set_pages_array_wb(mem->pages, mem->page_count); #endif for (i = 0; i < mem->page_count; i++) { - addr = (void *)mem->memory[i]; - page = virt_to_page(addr); + page = mem->pages[i]; #ifndef CONFIG_X86 unmap_page_from_agp(page); #endif - put_page(page); - free_page((unsigned long)addr); + __free_page(page); atomic_dec(&agp_bridge->current_memory_agp); - mem->memory[i] = 0; + mem->pages[i] = NULL; } } EXPORT_SYMBOL(agp_generic_destroy_pages); -void agp_generic_destroy_page(void *addr, int flags) +void agp_generic_destroy_page(struct page *page, int flags) { - struct page *page; - - if (addr == NULL) + if (page == NULL) return; - page = virt_to_page(addr); if (flags & AGP_PAGE_DESTROY_UNMAP) unmap_page_from_agp(page); if (flags & AGP_PAGE_DESTROY_FREE) { put_page(page); - free_page((unsigned long)addr); + __free_page(page); atomic_dec(&agp_bridge->current_memory_agp); } } @@ -1361,8 +1347,9 @@ void global_cache_flush(void) EXPORT_SYMBOL(global_cache_flush); unsigned long agp_generic_mask_memory(struct agp_bridge_data *bridge, - unsigned long 
addr, int type) + struct page *page, int type) { + unsigned long addr = phys_to_gart(page_to_phys(page)); /* memory type is ignored in the generic routine */ if (bridge->driver->masks) return addr | bridge->driver->masks[0].mask; diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c index 183ac3fe44f..abea273dcc2 100644 --- a/drivers/char/agp/hp-agp.c +++ b/drivers/char/agp/hp-agp.c @@ -361,13 +361,11 @@ hp_zx1_insert_memory (struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = io_pg_start; i < mem->page_count; i++) { unsigned long paddr; - paddr = mem->memory[i]; + paddr = page_to_phys(mem->pages[i]); for (k = 0; k < hp->io_pages_per_kpage; k++, j++, paddr += hp->io_page_size) { - hp->gatt[j] = - agp_bridge->driver->mask_memory(agp_bridge, - paddr, type); + hp->gatt[j] = HP_ZX1_PDIR_VALID_BIT | paddr; } } @@ -397,8 +395,9 @@ hp_zx1_remove_memory (struct agp_memory *mem, off_t pg_start, int type) static unsigned long hp_zx1_mask_memory (struct agp_bridge_data *bridge, - unsigned long addr, int type) + struct page *page, int type) { + unsigned long addr = phys_to_gart(page_to_phys(page)); return HP_ZX1_PDIR_VALID_BIT | addr; } diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c index 10da687d131..60cc35bb5db 100644 --- a/drivers/char/agp/i460-agp.c +++ b/drivers/char/agp/i460-agp.c @@ -60,6 +60,9 @@ */ #define WR_FLUSH_GATT(index) RD_GATT(index) +static unsigned long i460_mask_memory (struct agp_bridge_data *bridge, + unsigned long addr, int type); + static struct { void *gatt; /* ioremap'd GATT area */ @@ -74,6 +77,7 @@ static struct { unsigned long *alloced_map; /* bitmap of kernel-pages in use */ int refcount; /* number of kernel pages using the large page */ u64 paddr; /* physical address of large page */ + struct page *page; /* page pointer */ } *lp_desc; } i460; @@ -294,7 +298,7 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem, void *temp; pr_debug("i460_insert_memory_small_io_page(mem=%p, pg_start=%ld, type=%d, paddr0=0x%lx)\n", - mem, pg_start, type, mem->memory[0]); + mem, pg_start, type, page_to_phys(mem->pages[0])); if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES) return -EINVAL; @@ -321,10 +325,9 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem, io_page_size = 1UL << I460_IO_PAGE_SHIFT; for (i = 0, j = io_pg_start; i < mem->page_count; i++) { - paddr = mem->memory[i]; + paddr = phys_to_gart(page_to_phys(mem->pages[i])); for (k = 0; k < I460_IOPAGES_PER_KPAGE; k++, j++, paddr += io_page_size) - WR_GATT(j, agp_bridge->driver->mask_memory(agp_bridge, - paddr, mem->type)); + WR_GATT(j, i460_mask_memory(agp_bridge, paddr, mem->type)); } WR_FLUSH_GATT(j - 1); return 0; @@ -364,10 +367,9 @@ static int i460_alloc_large_page (struct lp_desc *lp) { unsigned long order = I460_IO_PAGE_SHIFT - PAGE_SHIFT; size_t map_size; - void *lpage; - lpage = (void *) __get_free_pages(GFP_KERNEL, order); - if (!lpage) { + lp->page = alloc_pages(GFP_KERNEL, order); + if (!lp->page) { printk(KERN_ERR PFX "Couldn't alloc 4M GART page...\n"); return -ENOMEM; } @@ -375,12 +377,12 @@ static int i460_alloc_large_page (struct lp_desc *lp) map_size = ((I460_KPAGES_PER_IOPAGE + BITS_PER_LONG - 1) & -BITS_PER_LONG)/8; lp->alloced_map = kzalloc(map_size, GFP_KERNEL); if (!lp->alloced_map) { - free_pages((unsigned long) lpage, order); + __free_pages(lp->page, order); printk(KERN_ERR PFX "Out of memory, we're in trouble...\n"); return -ENOMEM; } - lp->paddr = virt_to_gart(lpage); + lp->paddr = 
phys_to_gart(page_to_phys(lp->page)); lp->refcount = 0; atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); return 0; @@ -391,7 +393,7 @@ static void i460_free_large_page (struct lp_desc *lp) kfree(lp->alloced_map); lp->alloced_map = NULL; - free_pages((unsigned long) gart_to_virt(lp->paddr), I460_IO_PAGE_SHIFT - PAGE_SHIFT); + __free_pages(lp->page, I460_IO_PAGE_SHIFT - PAGE_SHIFT); atomic_sub(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); } @@ -439,8 +441,8 @@ static int i460_insert_memory_large_io_page (struct agp_memory *mem, if (i460_alloc_large_page(lp) < 0) return -ENOMEM; pg = lp - i460.lp_desc; - WR_GATT(pg, agp_bridge->driver->mask_memory(agp_bridge, - lp->paddr, 0)); + WR_GATT(pg, i460_mask_memory(agp_bridge, + lp->paddr, 0)); WR_FLUSH_GATT(pg); } @@ -448,7 +450,7 @@ static int i460_insert_memory_large_io_page (struct agp_memory *mem, idx < ((lp == end) ? (end_offset + 1) : I460_KPAGES_PER_IOPAGE); idx++, i++) { - mem->memory[i] = lp->paddr + idx*PAGE_SIZE; + mem->pages[i] = lp->page; __set_bit(idx, lp->alloced_map); ++lp->refcount; } @@ -463,7 +465,7 @@ static int i460_remove_memory_large_io_page (struct agp_memory *mem, struct lp_desc *start, *end, *lp; void *temp; - temp = agp_bridge->driver->current_size; + temp = agp_bridge->current_size; num_entries = A_SIZE_8(temp)->num_entries; /* Figure out what pg_start means in terms of our large GART pages */ @@ -477,7 +479,7 @@ static int i460_remove_memory_large_io_page (struct agp_memory *mem, idx < ((lp == end) ? (end_offset + 1) : I460_KPAGES_PER_IOPAGE); idx++, i++) { - mem->memory[i] = 0; + mem->pages[i] = NULL; __clear_bit(idx, lp->alloced_map); --lp->refcount; } @@ -521,7 +523,7 @@ static int i460_remove_memory (struct agp_memory *mem, * Let's just hope nobody counts on the allocated AGP memory being there before bind time * (I don't think current drivers do)... 
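 *
 * [Editorial aside, not part of this patch: the conversion pattern is
 *  the same across every driver in this series. Where the old code
 *  kept a GART/bus address in mem->memory[i], the new code keeps the
 *  struct page itself and derives the address only where needed:
 *
 *	struct page *page = mem->pages[i];
 *	unsigned long addr = phys_to_gart(page_to_phys(page));
 *
 *  phys_to_gart() and page_to_phys() are the helpers already used in
 *  the hunks above; holding struct page pointers is what lets highmem
 *  pages reach the PAT code, per the commit message.]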
*/ -static void *i460_alloc_page (struct agp_bridge_data *bridge) +static struct page *i460_alloc_page (struct agp_bridge_data *bridge) { void *page; @@ -534,7 +536,7 @@ static void *i460_alloc_page (struct agp_bridge_data *bridge) return page; } -static void i460_destroy_page (void *page, int flags) +static void i460_destroy_page (struct page *page, int flags) { if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) { agp_generic_destroy_page(page, flags); @@ -544,13 +546,20 @@ static void i460_destroy_page (void *page, int flags) #endif /* I460_LARGE_IO_PAGES */ static unsigned long i460_mask_memory (struct agp_bridge_data *bridge, - unsigned long addr, int type) + unsigned long addr, int type) { /* Make sure the returned address is a valid GATT entry */ return bridge->driver->masks[0].mask | (((addr & ~((1 << I460_IO_PAGE_SHIFT) - 1)) & 0xfffff000) >> 12); } +static unsigned long i460_page_mask_memory(struct agp_bridge_data *bridge, + struct page *page, int type) +{ + unsigned long addr = phys_to_gart(page_to_phys(page)); + return i460_mask_memory(bridge, addr, type); +} + const struct agp_bridge_driver intel_i460_driver = { .owner = THIS_MODULE, .aperture_sizes = i460_sizes, @@ -560,7 +569,7 @@ const struct agp_bridge_driver intel_i460_driver = { .fetch_size = i460_fetch_size, .cleanup = i460_cleanup, .tlb_flush = i460_tlb_flush, - .mask_memory = i460_mask_memory, + .mask_memory = i460_page_mask_memory, .masks = i460_masks, .agp_enable = agp_generic_enable, .cache_flush = global_cache_flush, diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 7a748fa0dfc..35977bfb699 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -257,7 +257,7 @@ static void intel_i810_agp_enable(struct agp_bridge_data *bridge, u32 mode) } /* Exists to support ARGB cursors */ -static void *i8xx_alloc_pages(void) +static struct page *i8xx_alloc_pages(void) { struct page *page; @@ -272,17 +272,14 @@ static void *i8xx_alloc_pages(void) } get_page(page); atomic_inc(&agp_bridge->current_memory_agp); - return page_address(page); + return page; } -static void i8xx_destroy_pages(void *addr) +static void i8xx_destroy_pages(struct page *page) { - struct page *page; - - if (addr == NULL) + if (page == NULL) return; - page = virt_to_page(addr); set_pages_wb(page, 4); put_page(page); __free_pages(page, 2); @@ -346,7 +343,7 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, global_cache_flush(); for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], + mem->pages[i], mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } @@ -389,37 +386,37 @@ static int intel_i810_remove_entries(struct agp_memory *mem, off_t pg_start, static struct agp_memory *alloc_agpphysmem_i8xx(size_t pg_count, int type) { struct agp_memory *new; - void *addr; + struct page *page; switch (pg_count) { - case 1: addr = agp_bridge->driver->agp_alloc_page(agp_bridge); + case 1: page = agp_bridge->driver->agp_alloc_page(agp_bridge); break; case 4: /* kludge to get 4 physical pages for ARGB cursor */ - addr = i8xx_alloc_pages(); + page = i8xx_alloc_pages(); break; default: return NULL; } - if (addr == NULL) + if (page == NULL) return NULL; new = agp_create_memory(pg_count); if (new == NULL) return NULL; - new->memory[0] = virt_to_gart(addr); + new->pages[0] = page; if (pg_count == 4) { /* kludge to get 4 physical pages for ARGB cursor */ - new->memory[1] = new->memory[0] + PAGE_SIZE; - new->memory[2] = new->memory[1] + 
PAGE_SIZE; - new->memory[3] = new->memory[2] + PAGE_SIZE; + new->pages[1] = new->pages[0] + 1; + new->pages[2] = new->pages[1] + 1; + new->pages[3] = new->pages[2] + 1; } new->page_count = pg_count; new->num_scratch_pages = pg_count; new->type = AGP_PHYS_MEMORY; - new->physical = new->memory[0]; + new->physical = page_to_phys(new->pages[0]); return new; } @@ -451,13 +448,11 @@ static void intel_i810_free_by_type(struct agp_memory *curr) agp_free_key(curr->key); if (curr->type == AGP_PHYS_MEMORY) { if (curr->page_count == 4) - i8xx_destroy_pages(gart_to_virt(curr->memory[0])); + i8xx_destroy_pages(curr->pages[0]); else { - void *va = gart_to_virt(curr->memory[0]); - - agp_bridge->driver->agp_destroy_page(va, + agp_bridge->driver->agp_destroy_page(curr->pages[0], AGP_PAGE_DESTROY_UNMAP); - agp_bridge->driver->agp_destroy_page(va, + agp_bridge->driver->agp_destroy_page(curr->pages[0], AGP_PAGE_DESTROY_FREE); } agp_free_page_array(curr); @@ -466,8 +461,9 @@ static void intel_i810_free_by_type(struct agp_memory *curr) } static unsigned long intel_i810_mask_memory(struct agp_bridge_data *bridge, - unsigned long addr, int type) + struct page *page, int type) { + unsigned long addr = phys_to_gart(page_to_phys(page)); /* Type checking must be done elsewhere */ return addr | bridge->driver->masks[type].mask; } @@ -855,7 +851,7 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], mask_type), + mem->pages[i], mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); @@ -1085,7 +1081,7 @@ static int intel_i915_insert_entries(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], mask_type), intel_private.gtt+j); + mem->pages[i], mask_type), intel_private.gtt+j); } readl(intel_private.gtt+j-1); @@ -1200,8 +1196,9 @@ static int intel_i915_create_gatt_table(struct agp_bridge_data *bridge) * this conditional. 
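 *
 * [Editorial aside, not part of this patch: the shift in the body
 *  below, addr |= (addr >> 28) & 0xf0, folds physical address bits
 *  35:32 down into GTT entry bits 7:4, which is how i965-class
 *  hardware encodes pages above 4GB in a 32-bit PTE; the patch merely
 *  makes the function take a struct page and compute addr itself.]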
*/ static unsigned long intel_i965_mask_memory(struct agp_bridge_data *bridge, - unsigned long addr, int type) + struct page *page, int type) { + unsigned long addr = phys_to_gart(page_to_phys(page)); /* Shift high bits down */ addr |= (addr >> 28) & 0xf0; diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index 16acee2de11..263d71dd441 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -225,7 +225,7 @@ static int nvidia_insert_memory(struct agp_memory *mem, off_t pg_start, int type } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], mask_type), + mem->pages[i], mask_type), agp_bridge->gatt_table+nvidia_private.pg_offset+j); } diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c index 699e3422ad9..f4bb43fb801 100644 --- a/drivers/char/agp/parisc-agp.c +++ b/drivers/char/agp/parisc-agp.c @@ -31,6 +31,10 @@ #define AGP8X_MODE_BIT 3 #define AGP8X_MODE (1 << AGP8X_MODE_BIT) +static unsigned long +parisc_agp_mask_memory(struct agp_bridge_data *bridge, unsigned long addr, + int type); + static struct _parisc_agp_info { void __iomem *ioc_regs; void __iomem *lba_regs; @@ -149,12 +153,12 @@ parisc_agp_insert_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = io_pg_start; i < mem->page_count; i++) { unsigned long paddr; - paddr = mem->memory[i]; + paddr = page_to_phys(mem->pages[i]); for (k = 0; k < info->io_pages_per_kpage; k++, j++, paddr += info->io_page_size) { info->gatt[j] = - agp_bridge->driver->mask_memory(agp_bridge, + parisc_agp_mask_memory(agp_bridge, paddr, type); } } @@ -185,9 +189,17 @@ parisc_agp_remove_memory(struct agp_memory *mem, off_t pg_start, int type) } static unsigned long -parisc_agp_mask_memory(struct agp_bridge_data *bridge, - unsigned long addr, int type) +parisc_agp_mask_memory(struct agp_bridge_data *bridge, unsigned long addr, + int type) +{ + return SBA_PDIR_VALID_BIT | addr; +} + +static unsigned long +parisc_agp_page_mask_memory(struct agp_bridge_data *bridge, struct page *page, + int type) { + unsigned long addr = phys_to_gart(page_to_phys(page)); return SBA_PDIR_VALID_BIT | addr; } diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c index b972d83bb1b..d3ea2e4226b 100644 --- a/drivers/char/agp/sgi-agp.c +++ b/drivers/char/agp/sgi-agp.c @@ -38,7 +38,7 @@ static struct aper_size_info_fixed sgi_tioca_sizes[] = { {0, 0, 0}, }; -static void *sgi_tioca_alloc_page(struct agp_bridge_data *bridge) +static struct page *sgi_tioca_alloc_page(struct agp_bridge_data *bridge) { struct page *page; int nid; @@ -52,7 +52,7 @@ static void *sgi_tioca_alloc_page(struct agp_bridge_data *bridge) get_page(page); atomic_inc(&agp_bridge->current_memory_agp); - return page_address(page); + return page; } /* @@ -71,8 +71,9 @@ static void sgi_tioca_tlbflush(struct agp_memory *mem) */ static unsigned long sgi_tioca_mask_memory(struct agp_bridge_data *bridge, - unsigned long addr, int type) + struct page *page, int type) { + unsigned long addr = phys_to_gart(page_to_phys(page)); return tioca_physpage_to_gart(addr); } @@ -189,7 +190,7 @@ static int sgi_tioca_insert_memory(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { table[j] = - bridge->driver->mask_memory(bridge, mem->memory[i], + bridge->driver->mask_memory(bridge, mem->pages[i], mem->type); } diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c index 6224df8b7f0..b964a219932 
100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -349,7 +349,7 @@ static int serverworks_insert_memory(struct agp_memory *mem, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = SVRWRKS_GET_GATT(addr); - writel(agp_bridge->driver->mask_memory(agp_bridge, mem->memory[i], mem->type), cur_gatt+GET_GATT_OFF(addr)); + writel(agp_bridge->driver->mask_memory(agp_bridge, mem->pages[i], mem->type), cur_gatt+GET_GATT_OFF(addr)); } serverworks_tlbflush(mem); return 0; diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index 880d3f6d5b9..f192c3b9ad4 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -173,9 +173,9 @@ static int uninorth_insert_memory(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { agp_bridge->gatt_table[j] = - cpu_to_le32((mem->memory[i] & 0xFFFFF000UL) | 0x1UL); - flush_dcache_range((unsigned long)__va(mem->memory[i]), - (unsigned long)__va(mem->memory[i])+0x1000); + cpu_to_le32((page_to_phys(mem->pages[i]) & 0xFFFFF000UL) | 0x1UL); + flush_dcache_range((unsigned long)__va(page_to_phys(mem->pages[i])), + (unsigned long)__va(page_to_phys(mem->pages[i]))+0x1000); } (void)in_le32((volatile u32*)&agp_bridge->gatt_table[pg_start]); mb(); @@ -219,9 +219,9 @@ static int u3_insert_memory(struct agp_memory *mem, off_t pg_start, int type) } for (i = 0; i < mem->page_count; i++) { - gp[i] = (mem->memory[i] >> PAGE_SHIFT) | 0x80000000UL; - flush_dcache_range((unsigned long)__va(mem->memory[i]), - (unsigned long)__va(mem->memory[i])+0x1000); + gp[i] = (page_to_phys(mem->pages[i]) >> PAGE_SHIFT) | 0x80000000UL; + flush_dcache_range((unsigned long)__va(page_to_phys(mem->pages[i])), + (unsigned long)__va(page_to_phys(mem->pages[i]))+0x1000); } mb(); flush_dcache_range((unsigned long)gp, (unsigned long) &gp[i]); diff --git a/drivers/gpu/drm/drm_agpsupport.c b/drivers/gpu/drm/drm_agpsupport.c index 7a0d042c8d6..d68888fe3df 100644 --- a/drivers/gpu/drm/drm_agpsupport.c +++ b/drivers/gpu/drm/drm_agpsupport.c @@ -482,7 +482,7 @@ drm_agp_bind_pages(struct drm_device *dev, } for (i = 0; i < num_pages; i++) - mem->memory[i] = phys_to_gart(page_to_phys(pages[i])); + mem->pages[i] = pages[i]; mem->page_count = num_pages; mem->is_flushed = true; diff --git a/drivers/gpu/drm/drm_memory.c b/drivers/gpu/drm/drm_memory.c index 0a436184dd8..e4865f99989 100644 --- a/drivers/gpu/drm/drm_memory.c +++ b/drivers/gpu/drm/drm_memory.c @@ -59,10 +59,11 @@ int drm_mem_info(char *buf, char **start, off_t offset, static void *agp_remap(unsigned long offset, unsigned long size, struct drm_device * dev) { - unsigned long *phys_addr_map, i, num_pages = + unsigned long i, num_pages = PAGE_ALIGN(size) / PAGE_SIZE; struct drm_agp_mem *agpmem; struct page **page_map; + struct page **phys_page_map; void *addr; size = PAGE_ALIGN(size); @@ -89,10 +90,9 @@ static void *agp_remap(unsigned long offset, unsigned long size, if (!page_map) return NULL; - phys_addr_map = - agpmem->memory->memory + (offset - agpmem->bound) / PAGE_SIZE; + phys_page_map = (agpmem->memory->pages + (offset - agpmem->bound) / PAGE_SIZE); for (i = 0; i < num_pages; ++i) - page_map[i] = pfn_to_page(phys_addr_map[i] >> PAGE_SHIFT); + page_map[i] = phys_page_map[i]; addr = vmap(page_map, num_pages, VM_IOREMAP, PAGE_AGP); vfree(page_map); diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index f95d03ac978..7e1fbe5d477 100644 --- 
a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -144,14 +144,14 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Get the page, inc the use count, and return it */ offset = (baddr - agpmem->bound) >> PAGE_SHIFT; - page = virt_to_page(__va(agpmem->memory->memory[offset])); + page = agpmem->memory->pages[offset]; get_page(page); vmf->page = page; DRM_DEBUG ("baddr = 0x%llx page = 0x%p, offset = 0x%llx, count=%d\n", (unsigned long long)baddr, - __va(agpmem->memory->memory[offset]), + agpmem->memory->pages[offset], (unsigned long long)offset, page_count(page)); return 0; diff --git a/drivers/gpu/drm/ttm/ttm_agp_backend.c b/drivers/gpu/drm/ttm/ttm_agp_backend.c index e8f6d2229d8..4648ed2f014 100644 --- a/drivers/gpu/drm/ttm/ttm_agp_backend.c +++ b/drivers/gpu/drm/ttm/ttm_agp_backend.c @@ -63,8 +63,7 @@ static int ttm_agp_populate(struct ttm_backend *backend, if (!page) page = dummy_read_page; - mem->memory[mem->page_count++] = - phys_to_gart(page_to_phys(page)); + mem->pages[mem->page_count++] = page; } agp_be->mem = mem; return 0; diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h index 2b8df8b420f..76fa794fdac 100644 --- a/include/linux/agp_backend.h +++ b/include/linux/agp_backend.h @@ -70,7 +70,7 @@ struct agp_memory { struct agp_memory *next; struct agp_memory *prev; struct agp_bridge_data *bridge; - unsigned long *memory; + struct page **pages; size_t page_count; int key; int num_scratch_pages; -- cgit v1.2.3-70-g09d2 From ab52ae6db035fa425f90146327ab7d2c5d3e5654 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 16 Jun 2009 04:20:53 +0300 Subject: nfsd41: Backchannel: minorversion support for the back channel Prepare to share backchannel code with NFSv4.1. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Ricardo Labiaga [nfsd41: use nfsd4_cb_sequence for callback minorversion] Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 3 ++- fs/nfsd/nfs4state.c | 1 + include/linux/nfsd/state.h | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 353eb4a0b84..3fd23f7acec 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -141,6 +141,7 @@ struct nfs4_cb_compound_hdr { u32 ident; u32 nops; __be32 *nops_p; + u32 minorversion; u32 taglen; char *tag; }; @@ -209,7 +210,7 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) RESERVE_SPACE(16); WRITE32(0); /* tag length is always 0 */ - WRITE32(NFS4_MINOR_VERSION); + WRITE32(hdr->minorversion); WRITE32(hdr->ident); hdr->nops_p = p; WRITE32(hdr->nops); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ef6944b19f0..980a216a48c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -984,6 +984,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, &cb->cb_addr, &cb->cb_port))) goto out_err; + cb->cb_minorversion = 0; cb->cb_prog = se->se_callback_prog; cb->cb_ident = se->se_callback_ident; return; diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 105cc100de0..f5a95fd3431 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -85,7 +85,8 @@ struct nfs4_cb_conn { u32 cb_addr; unsigned short cb_port; u32 cb_prog; - u32 cb_ident; + u32 cb_minorversion; + u32 cb_ident; /* minorversion 0 only */ /* RPC client info */ atomic_t cb_set; /* successful CB_NULL call */ struct rpc_clnt * cb_client; -- cgit v1.2.3-70-g09d2 From 90c699a9ee4be165966d40f1837909ccb8890a68 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 19 Jun 2009 08:08:50 +0200 Subject: block: rename CONFIG_LBD to CONFIG_LBDAF Follow-up to "block: enable by default support for large devices and files on 32-bit archs". Rename CONFIG_LBD to CONFIG_LBDAF to: - allow update of existing [def]configs for "default y" change - reflect that nowadays it is also used for large file support Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Jens Axboe --- Documentation/SubmitChecklist | 2 +- Documentation/ja_JP/SubmitChecklist | 2 +- block/Kconfig | 4 ++-- drivers/md/dm-exception-store.h | 2 +- fs/ext3/resize.c | 2 +- fs/ext3/super.c | 2 +- fs/ext4/resize.c | 2 +- fs/ext4/super.c | 10 +++++----- fs/gfs2/Kconfig | 2 +- fs/ocfs2/super.c | 2 +- fs/xfs/linux-2.6/xfs_linux.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 2 +- include/linux/kernel.h | 2 +- include/linux/types.h | 2 +- 14 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index ac5e0b2f109..78a9168ff37 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist @@ -54,7 +54,7 @@ kernel patches. CONFIG_PREEMPT. 14: If the patch affects IO/Disk, etc: has been tested with and without - CONFIG_LBD. + CONFIG_LBDAF. 15: All codepaths have been exercised with all lockdep features enabled.
diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist index 6c42e071d72..2df4576f117 100644 --- a/Documentation/ja_JP/SubmitChecklist +++ b/Documentation/ja_JP/SubmitChecklist @@ -75,7 +75,7 @@ Linux カーネルパッチ投稿者向けチェックリスト ビルドした上、動作確認を行ってください。 14: もしパッチがディスクのI/O性能などに影響を与えるようであれば、 - 'CONFIG_LBD'オプションを有効にした場合と無効にした場合の両方で + 'CONFIG_LBDAF'オプションを有効にした場合と無効にした場合の両方で テストを実施してみてください。 15: lockdepの機能を全て有効にした上で、全てのコードパスを評価してください。 diff --git a/block/Kconfig b/block/Kconfig index 2c39527aa7d..95a86adc33a 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -23,8 +23,8 @@ menuconfig BLOCK if BLOCK -config LBD - bool "Support for large block devices and files" +config LBDAF + bool "Support for large (2TB+) block devices and files" depends on !64BIT default y help diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 0a2e6e7f67b..c92701dc500 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -111,7 +111,7 @@ struct dm_exception_store { /* * Funtions to manipulate consecutive chunks */ -# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) # define DM_CHUNK_CONSECUTIVE_BITS 8 # define DM_CHUNK_NUMBER_BITS 56 diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 8a0b26340b5..8359e7b3dc8 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -990,7 +990,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) ext3_warning(sb, __func__, - "CONFIG_LBD not enabled\n"); + "CONFIG_LBDAF not enabled\n"); return -EINVAL; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 26aa64dee6a..601e881e610 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1812,7 +1812,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) printk(KERN_ERR "EXT3-fs: filesystem on %s:" " too large to mount safely\n", sb->s_id); if (sizeof(sector_t) < 8) - printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not " + printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not " "enabled\n"); goto failed_mount; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 27eb289eea3..68b0351fc64 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1002,7 +1002,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __func__, "CONFIG_LBD not enabled"); + ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 012c4251397..6e0c2d77c87 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1957,7 +1957,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) /* small i_blocks in vfs inode? 
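 *
 * [Worked example, editorial addition: with a 32-bit i_blocks counting
 *  512-byte sectors, the cap is (2^32 - 1) * 512 bytes, i.e. just under
 *  2 TiB; that is the bound the CONFIG_LBDAF comments below describe,
 *  and why huge-file support also needs a 64-bit blkcnt_t.]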
*/ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LBD is not enabled implies the inode + * CONFIG_LBDAF is not enabled implies the inode * i_block represent total blocks in 512 bytes * 32 == size of vfs inode i_blocks * 8 */ @@ -2000,7 +2000,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * !has_huge_files or CONFIG_LBD not enabled implies that + * !has_huge_files or CONFIG_LBDAF not enabled implies that * the inode i_block field represents total file blocks in * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 */ @@ -2436,13 +2436,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (has_huge_files) { /* * Large file size enabled file system can only be - * mount if kernel is build with CONFIG_LBD + * mount if kernel is build with CONFIG_LBDAF */ if (sizeof(root->i_blocks) < sizeof(u64) && !(sb->s_flags & MS_RDONLY)) { ext4_msg(sb, KERN_ERR, "Filesystem with huge " "files cannot be mounted read-write " - "without CONFIG_LBD"); + "without CONFIG_LBDAF"); goto failed_mount; } } @@ -2566,7 +2566,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely"); if (sizeof(sector_t) < 8) - ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled"); + ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); goto failed_mount; } diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index cad957cdb1e..5971359d209 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,6 @@ config GFS2_FS tristate "GFS2 file system support" - depends on EXPERIMENTAL && (64BIT || LBD) + depends on EXPERIMENTAL && (64BIT || LBDAF) select DLM if GFS2_FS_LOCKING_DLM select CONFIGFS_FS if GFS2_FS_LOCKING_DLM select SYSFS if GFS2_FS_LOCKING_DLM diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d33767f17ba..0d3ed7407a0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -552,7 +552,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, */ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBD) +# if defined(CONFIG_LBDAF) BUILD_BUG_ON(sizeof(sector_t) != 8); /* * We might be limited by page cache size. diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index f65a53f8752..6127e24062d 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -24,7 +24,7 @@ * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set. 
*/ -#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) # define XFS_BIG_BLKNOS 1 # define XFS_BIG_INUMS 1 #else diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 2e09efbca8d..a220d36f789 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -616,7 +616,7 @@ xfs_max_file_offset( */ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBD) +# if defined(CONFIG_LBDAF) ASSERT(sizeof(sector_t) == 8); pagefactor = PAGE_CACHE_SIZE; bitshift = BITS_PER_LONG; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index c5a71c38a95..fac104e7186 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -58,7 +58,7 @@ extern const char linux_proc_banner[]; #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) -#ifdef CONFIG_LBD +#ifdef CONFIG_LBDAF # include # define sector_div(a, b) do_div(a, b) #else diff --git a/include/linux/types.h b/include/linux/types.h index 5abe354020f..c42724f8c80 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -131,7 +131,7 @@ typedef __s64 int64_t; * * blkcnt_t is the type of the inode's block count. */ -#ifdef CONFIG_LBD +#ifdef CONFIG_LBDAF typedef u64 sector_t; typedef u64 blkcnt_t; #else -- cgit v1.2.3-70-g09d2 From f9188e023c248d73f5b4a589b480e065c1864068 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jun 2009 22:20:52 +0200 Subject: perf_counter: Make callchain samples extensible Before exposing upstream tools to a callchain-samples ABI, tidy it up to make it more extensible in the future: Use markers in the IP chain to denote context, use (u64)-1..-4095 range for these context markers because we use them for ERR_PTR(), so these addresses are unlikely to be mapped. 
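A minimal user-space sketch (editorial, not part of this patch) of how a consumer might walk the new self-describing format; it assumes nothing beyond the layout introduced here, namely that u64 values in the (u64)-1..-4095 range are context markers rather than instruction pointers:

	#include <stdio.h>
	#include <stdint.h>

	#define PERF_CONTEXT_KERNEL	((uint64_t)-128)
	#define PERF_CONTEXT_USER	((uint64_t)-512)
	#define PERF_CONTEXT_MAX	((uint64_t)-4095)

	static void walk_callchain(const uint64_t *ip, uint64_t nr)
	{
		const char *ctx = "unknown";
		uint64_t i;

		for (i = 0; i < nr; i++) {
			if (ip[i] >= PERF_CONTEXT_MAX) {
				/* context marker, not a real address */
				if (ip[i] == PERF_CONTEXT_KERNEL)
					ctx = "kernel";
				else if (ip[i] == PERF_CONTEXT_USER)
					ctx = "user";
				continue;
			}
			printf("%s ip: %#llx\n", ctx,
			       (unsigned long long)ip[i]);
		}
	}

Because the markers are numerically larger than any mappable address, the single >= PERF_CONTEXT_MAX comparison separates them from real samples.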
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++----------------------- include/linux/perf_counter.h | 28 +++++++++++++++++----------- 2 files changed, 23 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ce1ae3f1f86..76dfef23f78 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1555,9 +1555,9 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) */ static inline -void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +void callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < MAX_STACK_DEPTH) + if (entry->nr < PERF_MAX_STACK_DEPTH) entry->ip[entry->nr++] = ip; } @@ -1602,22 +1602,10 @@ static const struct stacktrace_ops backtrace_ops = { static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bp; - char *stack; - int nr = entry->nr; - + callchain_store(entry, PERF_CONTEXT_KERNEL); callchain_store(entry, regs->ip); - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - get_bp(bp); -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)&stack, bp, &backtrace_ops, entry); - - entry->kernel = entry->nr - nr; + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } /* @@ -1669,16 +1657,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; - int nr = entry->nr; if (!user_mode(regs)) regs = task_pt_regs(current); fp = (void __user *)regs->bp; + callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); - while (entry->nr < MAX_STACK_DEPTH) { + while (entry->nr < PERF_MAX_STACK_DEPTH) { frame.next_frame = NULL; frame.return_address = 0; @@ -1691,8 +1679,6 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_frame; } - - entry->user = entry->nr - nr; } static void @@ -1728,9 +1714,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; - entry->hv = 0; - entry->kernel = 0; - entry->user = 0; perf_do_callchain(regs, entry); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0765e8e6984..e7e7e024276 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -343,23 +343,22 @@ enum perf_event_type { * { u64 nr; * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP * - * { u16 nr, - * hv, - * kernel, - * user; + * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN * }; */ }; -#define MAX_STACK_DEPTH 255 +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, -struct perf_callchain_entry { - __u16 nr; - __u16 hv; - __u16 kernel; - __u16 user; - __u64 ip[MAX_STACK_DEPTH]; + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, }; #ifdef __KERNEL__ @@ -381,6 +380,13 @@ struct perf_callchain_entry { #include #include +#define PERF_MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + __u64 nr; + __u64 ip[PERF_MAX_STACK_DEPTH]; +}; + struct task_struct; /** -- cgit v1.2.3-70-g09d2 From 
e5289d4a181fb6c0b7a7607649af2ffdc491335c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Jun 2009 13:22:51 +0200 Subject: perf_counter: Simplify and fix task migration counting The task migrations counter was causing rare and hard to decipher memory corruptions under load. After a day of debugging and bisection we found that the problem was introduced with: 3f731ca: perf_counter: Fix cpu migration counter Turning it off fixes the crashes. Incidentally, the whole perf_counter_task_migration() logic can be done more simply as well, by injecting a proper sw-counter event. This cleanup also fixed the crashes. The precise failure mode is not completely clear yet, but we are clearly not unhappy about having a fix ;-) Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ---- kernel/perf_counter.c | 23 +---------------------- kernel/sched.c | 3 ++- 3 files changed, 3 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e7e7e024276..89698d8aba5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -682,8 +682,6 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma) extern void perf_counter_comm(struct task_struct *tsk); extern void perf_counter_fork(struct task_struct *tsk); -extern void perf_counter_task_migration(struct task_struct *task, int cpu); - extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_paranoid; @@ -724,8 +722,6 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma) { } static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } -static inline void perf_counter_task_migration(struct task_struct *task, - int cpu) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7e9108efd30..8d4f0dd41c2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -124,7 +124,7 @@ void perf_enable(void) static void get_ctx(struct perf_counter_context *ctx) { - atomic_inc(&ctx->refcount); + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } static void free_ctx(struct rcu_head *head) @@ -3467,27 +3467,6 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: cpu migrations - */ -void perf_counter_task_migration(struct task_struct *task, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx; - - perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - - ctx = perf_pin_task_context(task); - if (ctx) { - perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - perf_unpin_context(ctx); - } -} - #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { diff --git a/kernel/sched.c b/kernel/sched.c index 8fb88a906aa..f46540b359c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_counter_task_migration(p, new_cpu); + perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1,
NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; -- cgit v1.2.3-70-g09d2 From 352da9820e5506e3b8496e6052a2ad9c488efae8 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 19 Jun 2009 16:58:17 +0200 Subject: i2c: Kill client_register and client_unregister methods These methods were useful in the legacy binding model but no longer in the new (standard) binding model. There are no users left so we can drop them. Signed-off-by: Jean Delvare Cc: David Brownell --- Documentation/feature-removal-schedule.txt | 3 +-- drivers/i2c/i2c-core.c | 30 +----------------------------- include/linux/i2c.h | 4 ---- 3 files changed, 2 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 8d07ed31207..9163dade070 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -368,8 +368,7 @@ Who: Krzysztof Piotr Oledzki --------------------------- -What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client(), - i2c_adapter->client_register(), i2c_adapter->client_unregister +What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client() When: 2.6.30 Check: i2c_attach_client i2c_detach_client Why: Deprecated by the new (standard) device driver binding model. Use diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 5ed622ee65c..fc18fdbffd3 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -308,14 +308,6 @@ void i2c_unregister_device(struct i2c_client *client) return; } - if (adapter->client_unregister) { - if (adapter->client_unregister(client)) { - dev_warn(&client->dev, - "client_unregister [%s] failed\n", - client->name); - } - } - mutex_lock(&adapter->clist_lock); list_del(&client->list); mutex_unlock(&adapter->clist_lock); @@ -855,14 +847,6 @@ int i2c_attach_client(struct i2c_client *client) dev_dbg(&adapter->dev, "client [%s] registered with bus id %s\n", client->name, dev_name(&client->dev)); - if (adapter->client_register) { - if (adapter->client_register(client)) { - dev_dbg(&adapter->dev, "client_register " - "failed for client [%s] at 0x%02x\n", - client->name, client->addr); - } - } - return 0; out_err: @@ -875,17 +859,6 @@ EXPORT_SYMBOL(i2c_attach_client); int i2c_detach_client(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; - int res = 0; - - if (adapter->client_unregister) { - res = adapter->client_unregister(client); - if (res) { - dev_err(&client->dev, - "client_unregister [%s] failed, " - "client not detached\n", client->name); - goto out; - } - } mutex_lock(&adapter->clist_lock); list_del(&client->list); @@ -895,8 +868,7 @@ int i2c_detach_client(struct i2c_client *client) device_unregister(&client->dev); wait_for_completion(&client->released); - out: - return res; + return 0; } EXPORT_SYMBOL(i2c_detach_client); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index ad258059603..b3f4606afa0 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -352,10 +352,6 @@ struct i2c_adapter { const struct i2c_algorithm *algo; /* the algorithm to access the bus */ void *algo_data; - /* --- administration stuff. 
*/ - int (*client_register)(struct i2c_client *) __deprecated; - int (*client_unregister)(struct i2c_client *) __deprecated; - /* data fields that are valid for all devices */ u8 level; /* nesting level for lockdep */ struct mutex bus_lock; -- cgit v1.2.3-70-g09d2 From 729d6dd571464954f625e6b80950d9e4e3bd94f7 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 19 Jun 2009 16:58:18 +0200 Subject: i2c: Get rid of the legacy binding model We converted all the legacy i2c drivers so we can finally get rid of the legacy binding model. Hooray! Signed-off-by: Jean Delvare Cc: David Brownell --- Documentation/feature-removal-schedule.txt | 9 ---- Documentation/i2c/writing-clients | 16 ++---- drivers/i2c/i2c-core.c | 83 +++--------------------------- include/linux/i2c.h | 39 ++++---------- 4 files changed, 20 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 9163dade070..f8cd450be9a 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -368,15 +368,6 @@ Who: Krzysztof Piotr Oledzki --------------------------- -What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client() -When: 2.6.30 -Check: i2c_attach_client i2c_detach_client -Why: Deprecated by the new (standard) device driver binding model. Use - i2c_driver->probe() and ->remove() instead. -Who: Jean Delvare - ---------------------------- - What: fscher and fscpos drivers When: June 2009 Why: Deprecated by the new fschmd driver. diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index c1a06f989cf..7860aafb483 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients @@ -126,19 +126,9 @@ different) configuration information, as do drivers handling chip variants that can't be distinguished by protocol probing, or which need some board specific information to operate correctly. -Accordingly, the I2C stack now has two models for associating I2C devices -with their drivers: the original "legacy" model, and a newer one that's -fully compatible with the Linux 2.6 driver model. These models do not mix, -since the "legacy" model requires drivers to create "i2c_client" device -objects after SMBus style probing, while the Linux driver model expects -drivers to be given such device objects in their probe() routines. -The legacy model is deprecated now and will soon be removed, so we no -longer document it here. - - -Standard Driver Model Binding ("New Style") -------------------------------------------- +Device/Driver Binding +--------------------- System infrastructure, typically board-specific initialization code or boot firmware, reports what I2C devices exist. For example, there may be @@ -201,7 +191,7 @@ a given I2C bus. This is for example the case of hardware monitoring devices on a PC's SMBus. In that case, you may want to let your driver detect supported devices automatically. This is how the legacy model was working, and is now available as an extension to the standard -driver model (so that we can finally get rid of the legacy model.) +driver model. 
You simply have to define a detect callback which will attempt to identify supported devices (returning 0 for supported ones and -ENODEV diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index fc18fdbffd3..dc8bc913144 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -43,6 +43,7 @@ static DEFINE_IDR(i2c_adapter_idr); #define is_newstyle_driver(d) ((d)->probe || (d)->remove || (d)->detect) +static int i2c_attach_client(struct i2c_client *client); static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); /* ------------------------------------------------------------------------- */ @@ -83,10 +84,6 @@ static int i2c_device_uevent(struct device *dev, struct kobj_uevent_env *env) { struct i2c_client *client = to_i2c_client(dev); - /* by definition, legacy drivers can't hotplug */ - if (dev->driver) - return 0; - if (add_uevent_var(env, "MODALIAS=%s%s", I2C_MODULE_PREFIX, client->name)) return -ENOMEM; @@ -455,7 +452,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap) dev_dbg(&adap->dev, "adapter [%s] registered\n", adap->name); - /* create pre-declared device nodes for new-style drivers */ + /* create pre-declared device nodes */ if (adap->nr < __i2c_first_dynamic_bus_num) i2c_scan_static_board_info(adap); @@ -617,26 +614,9 @@ int i2c_del_adapter(struct i2c_adapter *adap) if (res) goto out_unlock; - /* detach any active clients. This must be done first, because - * it can fail; in which case we give up. */ + /* Detach any active clients */ list_for_each_entry_safe_reverse(client, _n, &adap->clients, list) { - struct i2c_driver *driver; - - driver = client->driver; - - /* new style, follow standard driver model */ - if (!driver || is_newstyle_driver(driver)) { - i2c_unregister_device(client); - continue; - } - - /* legacy drivers create and remove clients themselves */ - if ((res = driver->detach_client(client))) { - dev_err(&adap->dev, "detach_client failed for client " - "[%s] at address 0x%02x\n", client->name, - client->addr); - goto out_unlock; - } + i2c_unregister_device(client); } /* clean up the sysfs representation */ @@ -680,11 +660,7 @@ static int __attach_adapter(struct device *dev, void *data) /* * An i2c_driver is used with one or more i2c_client (device) nodes to access - * i2c slave chips, on a bus instance associated with some i2c_adapter. There - * are two models for binding the driver to its device: "new style" drivers - * follow the standard Linux driver model and just respond to probe() calls - * issued if the driver core sees they match(); "legacy" drivers create device - * nodes themselves. + * i2c slave chips, on a bus instance associated with some i2c_adapter. */ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) @@ -695,21 +671,11 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) if (unlikely(WARN_ON(!i2c_bus_type.p))) return -EAGAIN; - /* new style driver methods can't mix with legacy ones */ - if (is_newstyle_driver(driver)) { - if (driver->detach_adapter || driver->detach_client) { - printk(KERN_WARNING - "i2c-core: driver [%s] is confused\n", - driver->driver.name); - return -EINVAL; - } - } - /* add the driver to the list of i2c drivers in the driver core */ driver->driver.owner = owner; driver->driver.bus = &i2c_bus_type; - /* for new style drivers, when registration returns the driver core + /* When registration returns, the driver core * will have called probe() for all matching-but-unbound devices. 
*/ res = driver_register(&driver->driver); @@ -748,29 +714,11 @@ static int __detach_adapter(struct device *dev, void *data) if (is_newstyle_driver(driver)) return 0; - /* Have a look at each adapter, if clients of this driver are still - * attached. If so, detach them to be able to kill the driver - * afterwards. - */ if (driver->detach_adapter) { if (driver->detach_adapter(adapter)) dev_err(&adapter->dev, "detach_adapter failed for driver [%s]\n", driver->driver.name); - } else { - struct i2c_client *client, *_n; - - list_for_each_entry_safe(client, _n, &adapter->clients, list) { - if (client->driver != driver) - continue; - dev_dbg(&adapter->dev, - "detaching client [%s] at 0x%02x\n", - client->name, client->addr); - if (driver->detach_client(client)) - dev_err(&adapter->dev, "detach_client " - "failed for client [%s] at 0x%02x\n", - client->name, client->addr); - } } return 0; @@ -812,7 +760,7 @@ static int i2c_check_addr(struct i2c_adapter *adapter, int addr) return device_for_each_child(&adapter->dev, &addr, __i2c_check_addr); } -int i2c_attach_client(struct i2c_client *client) +static int i2c_attach_client(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; int res; @@ -854,23 +802,6 @@ out_err: "(%d)\n", client->name, client->addr, res); return res; } -EXPORT_SYMBOL(i2c_attach_client); - -int i2c_detach_client(struct i2c_client *client) -{ - struct i2c_adapter *adapter = client->adapter; - - mutex_lock(&adapter->clist_lock); - list_del(&client->list); - mutex_unlock(&adapter->clist_lock); - - init_completion(&client->released); - device_unregister(&client->dev); - wait_for_completion(&client->released); - - return 0; -} -EXPORT_SYMBOL(i2c_detach_client); /** * i2c_use_client - increments the reference count of the i2c client structure diff --git a/include/linux/i2c.h b/include/linux/i2c.h index b3f4606afa0..43a3545670b 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -100,9 +100,8 @@ extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client, * @class: What kind of i2c device we instantiate (for detect) * @attach_adapter: Callback for bus addition (for legacy drivers) * @detach_adapter: Callback for bus removal (for legacy drivers) - * @detach_client: Callback for device removal (for legacy drivers) - * @probe: Callback for device binding (new-style drivers) - * @remove: Callback for device unbinding (new-style drivers) + * @probe: Callback for device binding + * @remove: Callback for device unbinding * @shutdown: Callback for device shutdown * @suspend: Callback for device suspend * @resume: Callback for device resume @@ -137,26 +136,14 @@ struct i2c_driver { int id; unsigned int class; - /* Notifies the driver that a new bus has appeared. This routine - * can be used by the driver to test if the bus meets its conditions - * & seek for the presence of the chip(s) it supports. If found, it - * registers the client(s) that are on the bus to the i2c admin. via - * i2c_attach_client. (LEGACY I2C DRIVERS ONLY) + /* Notifies the driver that a new bus has appeared or is about to be + * removed. You should avoid using this if you can; it will probably + * be removed in the near future. */ int (*attach_adapter)(struct i2c_adapter *); int (*detach_adapter)(struct i2c_adapter *); - /* tells the driver that a client is about to be deleted & gives it - * the chance to remove its private data. Also, if the client struct - * has been dynamically allocated by the driver in the function above, - * it must be freed here.
(LEGACY I2C DRIVERS ONLY) - */ - int (*detach_client)(struct i2c_client *) __deprecated; - - /* Standard driver model interfaces, for "new style" i2c drivers. - * With the driver model, device enumeration is NEVER done by drivers; - * it's done by infrastructure. (NEW STYLE DRIVERS ONLY) - */ + /* Standard driver model interfaces */ int (*probe)(struct i2c_client *, const struct i2c_device_id *); int (*remove)(struct i2c_client *); @@ -248,11 +235,10 @@ static inline void i2c_set_clientdata(struct i2c_client *dev, void *data) * that, such as chip type, configuration, associated IRQ, and so on. * * i2c_board_info is used to build tables of information listing I2C devices - * that are present. This information is used to grow the driver model tree - * for "new style" I2C drivers. For mainboards this is done statically using - * i2c_register_board_info(); bus numbers identify adapters that aren't - * yet available. For add-on boards, i2c_new_device() does this dynamically - * with the adapter already known. + * that are present. This information is used to grow the driver model tree. + * For mainboards this is done statically using i2c_register_board_info(); + * bus numbers identify adapters that aren't yet available. For add-on boards, + * i2c_new_device() does this dynamically with the adapter already known. */ struct i2c_board_info { char type[I2C_NAME_SIZE]; @@ -425,11 +411,6 @@ static inline int i2c_add_driver(struct i2c_driver *driver) return i2c_register_driver(THIS_MODULE, driver); } -/* These are deprecated, your driver should use the standard .probe() - * and .remove() methods instead. */ -extern int __deprecated i2c_attach_client(struct i2c_client *); -extern int __deprecated i2c_detach_client(struct i2c_client *); - extern struct i2c_client *i2c_use_client(struct i2c_client *client); extern void i2c_release_client(struct i2c_client *client); -- cgit v1.2.3-70-g09d2 From 36789b5ea52bba961122b45f4383f553ec3b5a6c Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 19 Jun 2009 16:58:18 +0200 Subject: i2c: Drop i2c_probe function The legacy i2c_probe() function has no users left, get rid of it. Signed-off-by: Jean Delvare Cc: David Brownell --- drivers/i2c/i2c-core.c | 137 ------------------------------------------------- include/linux/i2c.h | 8 --- 2 files changed, 145 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index dc8bc913144..d48438908e1 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -1032,144 +1032,7 @@ EXPORT_SYMBOL(i2c_master_recv); * Will not work for 10-bit addresses! 
* ---------------------------------------------------- */ -static int i2c_probe_address(struct i2c_adapter *adapter, int addr, int kind, - int (*found_proc) (struct i2c_adapter *, int, int)) -{ - int err; - - /* Make sure the address is valid */ - if (addr < 0x03 || addr > 0x77) { - dev_warn(&adapter->dev, "Invalid probe address 0x%02x\n", - addr); - return -EINVAL; - } - - /* Skip if already in use */ - if (i2c_check_addr(adapter, addr)) - return 0; - - /* Make sure there is something at this address, unless forced */ - if (kind < 0) { - if (i2c_smbus_xfer(adapter, addr, 0, 0, 0, - I2C_SMBUS_QUICK, NULL) < 0) - return 0; - - /* prevent 24RF08 corruption */ - if ((addr & ~0x0f) == 0x50) - i2c_smbus_xfer(adapter, addr, 0, 0, 0, - I2C_SMBUS_QUICK, NULL); - } - - /* Finally call the custom detection function */ - err = found_proc(adapter, addr, kind); - /* -ENODEV can be returned if there is a chip at the given address - but it isn't supported by this chip driver. We catch it here as - this isn't an error. */ - if (err == -ENODEV) - err = 0; - - if (err) - dev_warn(&adapter->dev, "Client creation failed at 0x%x (%d)\n", - addr, err); - return err; -} - -int i2c_probe(struct i2c_adapter *adapter, - const struct i2c_client_address_data *address_data, - int (*found_proc) (struct i2c_adapter *, int, int)) -{ - int i, err; - int adap_id = i2c_adapter_id(adapter); - - /* Force entries are done first, and are not affected by ignore - entries */ - if (address_data->forces) { - const unsigned short * const *forces = address_data->forces; - int kind; - - for (kind = 0; forces[kind]; kind++) { - for (i = 0; forces[kind][i] != I2C_CLIENT_END; - i += 2) { - if (forces[kind][i] == adap_id - || forces[kind][i] == ANY_I2C_BUS) { - dev_dbg(&adapter->dev, "found force " - "parameter for adapter %d, " - "addr 0x%02x, kind %d\n", - adap_id, forces[kind][i + 1], - kind); - err = i2c_probe_address(adapter, - forces[kind][i + 1], - kind, found_proc); - if (err) - return err; - } - } - } - } - - /* Stop here if we can't use SMBUS_QUICK */ - if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_QUICK)) { - if (address_data->probe[0] == I2C_CLIENT_END - && address_data->normal_i2c[0] == I2C_CLIENT_END) - return 0; - - dev_dbg(&adapter->dev, "SMBus Quick command not supported, " - "can't probe for chips\n"); - return -EOPNOTSUPP; - } - - /* Probe entries are done second, and are not affected by ignore - entries either */ - for (i = 0; address_data->probe[i] != I2C_CLIENT_END; i += 2) { - if (address_data->probe[i] == adap_id - || address_data->probe[i] == ANY_I2C_BUS) { - dev_dbg(&adapter->dev, "found probe parameter for " - "adapter %d, addr 0x%02x\n", adap_id, - address_data->probe[i + 1]); - err = i2c_probe_address(adapter, - address_data->probe[i + 1], - -1, found_proc); - if (err) - return err; - } - } - - /* Normal entries are done last, unless shadowed by an ignore entry */ - for (i = 0; address_data->normal_i2c[i] != I2C_CLIENT_END; i += 1) { - int j, ignore; - - ignore = 0; - for (j = 0; address_data->ignore[j] != I2C_CLIENT_END; - j += 2) { - if ((address_data->ignore[j] == adap_id || - address_data->ignore[j] == ANY_I2C_BUS) - && address_data->ignore[j + 1] - == address_data->normal_i2c[i]) { - dev_dbg(&adapter->dev, "found ignore " - "parameter for adapter %d, " - "addr 0x%02x\n", adap_id, - address_data->ignore[j + 1]); - ignore = 1; - break; - } - } - if (ignore) - continue; - - dev_dbg(&adapter->dev, "found normal entry for adapter %d, " - "addr 0x%02x\n", adap_id, - address_data->normal_i2c[i]); - err = 
i2c_probe_address(adapter, address_data->normal_i2c[i], - -1, found_proc); - if (err) - return err; - } - - return 0; -} -EXPORT_SYMBOL(i2c_probe); -/* Separate detection function for new-style drivers */ static int i2c_detect_address(struct i2c_client *temp_client, int kind, struct i2c_driver *driver) { diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 43a3545670b..db25a870843 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -419,14 +419,6 @@ extern void i2c_release_client(struct i2c_client *client); extern void i2c_clients_command(struct i2c_adapter *adap, unsigned int cmd, void *arg); -/* Detect function. It iterates over all possible addresses itself. - * It will only call found_proc if some client is connected at the - * specific address (unless a 'force' matched); - */ -extern int i2c_probe(struct i2c_adapter *adapter, - const struct i2c_client_address_data *address_data, - int (*found_proc) (struct i2c_adapter *, int, int)); - extern struct i2c_adapter *i2c_get_adapter(int id); extern void i2c_put_adapter(struct i2c_adapter *adap); -- cgit v1.2.3-70-g09d2 From 1e40ac12dab22d98d0178e87364cf4e36862809c Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 19 Jun 2009 16:58:19 +0200 Subject: i2c: Kill is_newstyle_driver Legacy i2c drivers are gone and all drivers are new-style now, so there is no point in checking. Signed-off-by: Jean Delvare Cc: David Brownell --- drivers/i2c/i2c-core.c | 32 +------------------------------- include/linux/i2c.h | 2 -- 2 files changed, 1 insertion(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 06c428b5822..95fb997b41e 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -41,8 +41,6 @@ static DEFINE_MUTEX(core_lock); static DEFINE_IDR(i2c_adapter_idr); -#define is_newstyle_driver(d) ((d)->probe || (d)->remove || (d)->detect) - static int i2c_check_addr(struct i2c_adapter *adapter, int addr); static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); @@ -64,12 +62,6 @@ static int i2c_device_match(struct device *dev, struct device_driver *drv) struct i2c_client *client = to_i2c_client(dev); struct i2c_driver *driver = to_i2c_driver(drv); - /* make legacy i2c drivers bypass driver model probing entirely; * such drivers scan each i2c adapter/bus themselves.
- */ - if (!is_newstyle_driver(driver)) - return 0; - /* match on an id table if there is one */ if (driver->id_table) return i2c_match_id(driver->id_table, client) != NULL; @@ -172,12 +164,6 @@ static int i2c_device_resume(struct device *dev) return driver->resume(to_i2c_client(dev)); } -static void i2c_client_release(struct device *dev) -{ - struct i2c_client *client = to_i2c_client(dev); - complete(&client->released); -} - static void i2c_client_dev_release(struct device *dev) { kfree(to_i2c_client(dev)); @@ -282,12 +268,7 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info) client->dev.parent = &client->adapter->dev; client->dev.bus = &i2c_bus_type; - - if (client->driver && !is_newstyle_driver(client->driver)) { - client->dev.release = i2c_client_release; - dev_set_uevent_suppress(&client->dev, 1); - } else - client->dev.release = i2c_client_dev_release; + client->dev.release = i2c_client_dev_release; dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap), client->addr); @@ -321,14 +302,6 @@ EXPORT_SYMBOL_GPL(i2c_new_device); void i2c_unregister_device(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; - struct i2c_driver *driver = client->driver; - - if (driver && !is_newstyle_driver(driver)) { - dev_err(&client->dev, "can't unregister devices " - "with legacy drivers\n"); - WARN_ON(1); - return; - } mutex_lock(&adapter->clist_lock); list_del(&client->list); @@ -736,9 +709,6 @@ static int __detach_adapter(struct device *dev, void *data) i2c_unregister_device(client); } - if (is_newstyle_driver(driver)) - return 0; - if (driver->detach_adapter) { if (driver->detach_adapter(adapter)) dev_err(&adapter->dev, diff --git a/include/linux/i2c.h b/include/linux/i2c.h index db25a870843..28b27282496 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -180,7 +180,6 @@ struct i2c_driver { * @irq: indicates the IRQ generated by this device (if any) * @list: list of active/busy clients (DEPRECATED) * @detected: member of an i2c_driver.clients list - * @released: used to synchronize client releases & detaches and references * * An i2c_client identifies a single device (i.e. chip) connected to an * i2c bus. The behaviour exposed to Linux is defined by the driver @@ -198,7 +197,6 @@ struct i2c_client { int irq; /* irq issued by device */ struct list_head list; /* DEPRECATED */ struct list_head detected; - struct completion released; }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) -- cgit v1.2.3-70-g09d2 From e549c2b54dd90a056d6824b885d438b7437874f0 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 19 Jun 2009 16:58:19 +0200 Subject: i2c: Kill the redundant client list We used to maintain our own per-adapter list of i2c clients, but this is redundant with what the driver core does, and no longer needed. Just drop the redundant list. 
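For reference, the driver core pattern that replaces the private list is device_for_each_child() on the adapter device, filtering with i2c_verify_client(); a minimal sketch (the callback name and message are made up):

/* Visit every i2c client hanging off an adapter, without a private list */
static int show_client(struct device *dev, void *unused)
{
	struct i2c_client *client = i2c_verify_client(dev);

	if (client)	/* skip children that aren't i2c clients */
		dev_info(dev, "client [%s] at 0x%02x\n",
			 client->name, client->addr);
	return 0;	/* continue iterating */
}

	device_for_each_child(&adap->dev, NULL, show_client);

This is exactly the shape of the __unregister_client() helper added below.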
Signed-off-by: Jean Delvare Cc: David Brownell --- drivers/i2c/i2c-core.c | 28 +++++++++++----------------- include/linux/i2c.h | 4 ---- 2 files changed, 11 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 95fb997b41e..b1fd00d9a5c 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -276,10 +276,6 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info) if (status) goto out_err; - mutex_lock(&adap->clist_lock); - list_add_tail(&client->list, &adap->clients); - mutex_unlock(&adap->clist_lock); - dev_dbg(&adap->dev, "client [%s] registered with bus id %s\n", client->name, dev_name(&client->dev)); @@ -301,12 +297,6 @@ EXPORT_SYMBOL_GPL(i2c_new_device); */ void i2c_unregister_device(struct i2c_client *client) { - struct i2c_adapter *adapter = client->adapter; - - mutex_lock(&adapter->clist_lock); - list_del(&client->list); - mutex_unlock(&adapter->clist_lock); - device_unregister(&client->dev); } EXPORT_SYMBOL_GPL(i2c_unregister_device); @@ -432,8 +422,6 @@ static int i2c_register_adapter(struct i2c_adapter *adap) return -EAGAIN; mutex_init(&adap->bus_lock); - mutex_init(&adap->clist_lock); - INIT_LIST_HEAD(&adap->clients); mutex_lock(&core_lock); @@ -583,6 +571,14 @@ static int i2c_do_del_adapter(struct device_driver *d, void *data) return res; } +static int __unregister_client(struct device *dev, void *dummy) +{ + struct i2c_client *client = i2c_verify_client(dev); + if (client) + i2c_unregister_device(client); + return 0; +} + /** * i2c_del_adapter - unregister I2C adapter * @adap: the adapter being unregistered @@ -593,7 +589,6 @@ static int i2c_do_del_adapter(struct device_driver *d, void *data) */ int i2c_del_adapter(struct i2c_adapter *adap) { - struct i2c_client *client, *_n; int res = 0; mutex_lock(&core_lock); @@ -612,10 +607,9 @@ int i2c_del_adapter(struct i2c_adapter *adap) if (res) goto out_unlock; - /* Detach any active clients */ - list_for_each_entry_safe_reverse(client, _n, &adap->clients, list) { - i2c_unregister_device(client); - } + /* Detach any active clients. This can't fail, so we do not + check the returned value. */ + res = device_for_each_child(&adap->dev, NULL, __unregister_client); /* clean up the sysfs representation */ init_completion(&adap->dev_released); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 28b27282496..5f8157610c6 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -178,7 +178,6 @@ struct i2c_driver { * @driver: device's driver, hence pointer to access routines * @dev: Driver model device node for the slave. * @irq: indicates the IRQ generated by this device (if any) - * @list: list of active/busy clients (DEPRECATED) * @detected: member of an i2c_driver.clients list * * An i2c_client identifies a single device (i.e.
chip) connected to an @@ -195,7 +194,6 @@ struct i2c_client { struct i2c_driver *driver; /* and our access routines */ struct device dev; /* the device structure */ int irq; /* irq issued by device */ - struct list_head list; /* DEPRECATED */ struct list_head detected; }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) @@ -339,14 +337,12 @@ struct i2c_adapter { /* data fields that are valid for all devices */ u8 level; /* nesting level for lockdep */ struct mutex bus_lock; - struct mutex clist_lock; int timeout; /* in jiffies */ int retries; struct device dev; /* the adapter device */ int nr; - struct list_head clients; /* DEPRECATED */ char name[48]; struct completion dev_released; }; -- cgit v1.2.3-70-g09d2 From 99cd8e25875a109455b709b5a41d4891b8d8e58e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 19 Jun 2009 16:58:20 +0200 Subject: i2c: Add a sysfs interface to instantiate devices Add a sysfs interface to instantiate and delete I2C devices. This is primarily a replacement of the force_* module parameters implemented by some i2c drivers. These module parameters were implemented internally by the I2C_CLIENT_INSMOD* macros, which don't scale well. This can also be used when developing a driver on a self-soldered board which doesn't yet have proper I2C device declaration at the platform level, and presumably for various debugging situations. Signed-off-by: Jean Delvare Cc: David Brownell --- Documentation/i2c/instantiating-devices | 44 ++++++++++++ drivers/i2c/i2c-core.c | 123 +++++++++++++++++++++++++++++++- include/linux/i2c.h | 3 +- 3 files changed, 168 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices index b55ce57a84d..c740b7b4108 100644 --- a/Documentation/i2c/instantiating-devices +++ b/Documentation/i2c/instantiating-devices @@ -165,3 +165,47 @@ was done there. Two significant differences are: Once again, method 3 should be avoided wherever possible. Explicit device instantiation (methods 1 and 2) is much preferred for it is safer and faster. + + +Method 4: Instantiate from user-space ------------------------------------- + +In general, the kernel should know which I2C devices are connected and +what addresses they live at. However, in certain cases, it does not, so a +sysfs interface was added to let the user provide the information. This +interface is made of two attribute files which are created in every I2C bus +directory: new_device and delete_device. Both files are write only and you +must write the right parameters to them in order to properly instantiate +or delete, respectively, an I2C device. + +File new_device takes two parameters: the name of the I2C device (a string) +and the address of the I2C device (a number, typically expressed in +hexadecimal starting with 0x, though it can also be expressed in decimal). + +File delete_device takes a single parameter: the address of the I2C +device. As no two devices can live at the same address on a given I2C +segment, the address is sufficient to uniquely identify the device to be +deleted. + +Example: +# echo eeprom 0x50 > /sys/class/i2c-adapter/i2c-3/new_device + +While this interface should only be used when in-kernel device declaration +can't be done, there is a variety of cases where it can be helpful: +* The I2C driver usually detects devices (method 3 above) but the bus + segment your device lives on doesn't have the proper class bit set and + thus detection doesn't trigger.
+* The I2C driver usually detects devices, but your device lives at an + unexpected address. +* The I2C driver usually detects devices, but your device is not detected, + either because the detection routine is too strict, or because your + device is not officially supported yet but you know it is compatible. +* You are developing a driver on a test board, where you soldered the I2C + device yourself. + +This interface is a replacement for the force_* module parameters some I2C +drivers implement. Being implemented in i2c-core rather than in each +device driver individually, it is much more efficient, and also has the +advantage that you do not have to reload the driver to change a setting. +You can also instantiate the device before the driver is loaded or even +available, and you don't need to know what driver the device needs. diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index a2f1cd3766f..eb084fa0df8 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -38,11 +38,12 @@ #include "i2c-core.h" -/* core_lock protects i2c_adapter_idr, and guarantees +/* core_lock protects i2c_adapter_idr, userspace_devices, and guarantees that device detection, deletion of detected devices, and attach_adapter and detach_adapter calls are serialized */ static DEFINE_MUTEX(core_lock); static DEFINE_IDR(i2c_adapter_idr); +static LIST_HEAD(userspace_devices); static int i2c_check_addr(struct i2c_adapter *adapter, int addr); static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); @@ -373,8 +374,128 @@ show_adapter_name(struct device *dev, struct device_attribute *attr, char *buf) return sprintf(buf, "%s\n", adap->name); } +/* + * Let users instantiate I2C devices through sysfs. This can be used when + * platform initialization code doesn't contain the proper data for + * whatever reason. Also useful for drivers that do device detection and + * detection fails, either because the device uses an unexpected address, + * or this is a compatible device with different ID register values. + * + * Parameter checking may look overzealous, but we really don't want + * the user to provide incorrect parameters. 
+ */ +static ssize_t +i2c_sysfs_new_device(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2c_adapter *adap = to_i2c_adapter(dev); + struct i2c_board_info info; + struct i2c_client *client; + char *blank, end; + int res; + + dev_warn(dev, "The new_device interface is still experimental " + "and may change in the near future\n"); + memset(&info, 0, sizeof(struct i2c_board_info)); + + blank = strchr(buf, ' '); + if (!blank) { + dev_err(dev, "%s: Missing parameters\n", "new_device"); + return -EINVAL; + } + if (blank - buf > I2C_NAME_SIZE - 1) { + dev_err(dev, "%s: Invalid device name\n", "new_device"); + return -EINVAL; + } + memcpy(info.type, buf, blank - buf); + + /* Parse remaining parameters, reject extra parameters */ + res = sscanf(++blank, "%hi%c", &info.addr, &end); + if (res < 1) { + dev_err(dev, "%s: Can't parse I2C address\n", "new_device"); + return -EINVAL; + } + if (res > 1 && end != '\n') { + dev_err(dev, "%s: Extra parameters\n", "new_device"); + return -EINVAL; + } + + if (info.addr < 0x03 || info.addr > 0x77) { + dev_err(dev, "%s: Invalid I2C address 0x%hx\n", "new_device", + info.addr); + return -EINVAL; + } + + client = i2c_new_device(adap, &info); + if (!client) + return -EEXIST; + + /* Keep track of the added device */ + mutex_lock(&core_lock); + list_add_tail(&client->detected, &userspace_devices); + mutex_unlock(&core_lock); + dev_info(dev, "%s: Instantiated device %s at 0x%02hx\n", "new_device", + info.type, info.addr); + + return count; +} + +/* + * And of course let the users delete the devices they instantiated, if + * they got it wrong. This interface can only be used to delete devices + * instantiated by i2c_sysfs_new_device above. This guarantees that we + * don't delete devices to which some kernel code still has references. + * + * Parameter checking may look overzealous, but we really don't want + * the user to delete the wrong device.
+ */ +static ssize_t +i2c_sysfs_delete_device(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2c_adapter *adap = to_i2c_adapter(dev); + struct i2c_client *client, *next; + unsigned short addr; + char end; + int res; + + /* Parse parameters, reject extra parameters */ + res = sscanf(buf, "%hi%c", &addr, &end); + if (res < 1) { + dev_err(dev, "%s: Can't parse I2C address\n", "delete_device"); + return -EINVAL; + } + if (res > 1 && end != '\n') { + dev_err(dev, "%s: Extra parameters\n", "delete_device"); + return -EINVAL; + } + + /* Make sure the device was added through sysfs */ + res = -ENOENT; + mutex_lock(&core_lock); + list_for_each_entry_safe(client, next, &userspace_devices, detected) { + if (client->addr == addr && client->adapter == adap) { + dev_info(dev, "%s: Deleting device %s at 0x%02hx\n", + "delete_device", client->name, client->addr); + + list_del(&client->detected); + i2c_unregister_device(client); + res = count; + break; + } + } + mutex_unlock(&core_lock); + + if (res < 0) + dev_err(dev, "%s: Can't find device in list\n", + "delete_device"); + return res; +} + static struct device_attribute i2c_adapter_attrs[] = { __ATTR(name, S_IRUGO, show_adapter_name, NULL), + __ATTR(new_device, S_IWUSR, NULL, i2c_sysfs_new_device), + __ATTR(delete_device, S_IWUSR, NULL, i2c_sysfs_delete_device), { }, }; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 5f8157610c6..844d2662a87 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -178,7 +178,8 @@ struct i2c_driver { * @driver: device's driver, hence pointer to access routines * @dev: Driver model device node for the slave. * @irq: indicates the IRQ generated by this device (if any) - * @detected: member of an i2c_driver.clients list + * @detected: member of an i2c_driver.clients list or i2c-core's + * userspace_devices list * * An i2c_client identifies a single device (i.e. chip) connected to an * i2c bus. The behaviour exposed to Linux is defined by the driver -- cgit v1.2.3-70-g09d2 From 23af8400571ac4431fac0caefeac18f8aef490df Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 19 Jun 2009 16:58:20 +0200 Subject: i2c: Don't advertise i2c functions when not available Surround i2c function declarations with ifdefs, so that they aren't advertised when i2c-core isn't actually built. That way, drivers using these functions unconditionally will result in an immediate build failure, rather than a late linking failure which is harder to figure out. Signed-off-by: Jean Delvare Cc: Mark Brown Cc: Paul Mundt --- include/linux/i2c.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 844d2662a87..bc17f7fa7d3 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -47,6 +47,7 @@ struct i2c_driver; union i2c_smbus_data; struct i2c_board_info; +#if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE) /* * The master routines are the ones normally used to transmit data to devices * on a bus (or read from them). 
Apart from two basic transfer functions to @@ -93,6 +94,7 @@ extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client, extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client, u8 command, u8 length, const u8 *values); +#endif /* I2C */ /** * struct i2c_driver - represent an I2C device driver @@ -260,6 +262,7 @@ struct i2c_board_info { .type = dev_type, .addr = (dev_addr) +#if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE) /* Add-on boards should register/unregister their devices; e.g. a board * with integrated I2C, a config eeprom, sensors, and a codec that's * used in conjunction with the primary hardware. @@ -283,6 +286,7 @@ extern struct i2c_client * i2c_new_dummy(struct i2c_adapter *adap, u16 address); extern void i2c_unregister_device(struct i2c_client *); +#endif /* I2C */ /* Mainboard arch_initcall() code should register all its I2C devices. * This is done at arch_initcall time, before declaring any i2c adapters. @@ -299,7 +303,7 @@ i2c_register_board_info(int busnum, struct i2c_board_info const *info, { return 0; } -#endif +#endif /* I2C_BOARDINFO */ /* * The following structs are for those who like to implement new bus drivers: @@ -394,6 +398,7 @@ struct i2c_client_address_data { /* administration... */ +#if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE) extern int i2c_add_adapter(struct i2c_adapter *); extern int i2c_del_adapter(struct i2c_adapter *); extern int i2c_add_numbered_adapter(struct i2c_adapter *); @@ -435,6 +440,7 @@ static inline int i2c_adapter_id(struct i2c_adapter *adap) { return adap->nr; } +#endif /* I2C */ #endif /* __KERNEL__ */ /** -- cgit v1.2.3-70-g09d2 From c70366732f67dbdb32f7fe9c6aa59299b76feca6 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 19 Jun 2009 16:58:21 +0200 Subject: i2c: New macro to initialize i2c address lists on the fly For video4linux we sometimes need to probe for a single i2c address. Normally you would do it like this: static const unsigned short addrs[] = { addr, I2C_CLIENT_END }; client = i2c_new_probed_device(adapter, &info, addrs); This is a bit awkward and I came up with this macro: #define V4L2_I2C_ADDRS(addr, addrs...) \ ((const unsigned short []){ addr, ## addrs, I2C_CLIENT_END }) This can construct a list of one or more i2c addresses on the fly. But this is something that really belongs in i2c.h, renamed to I2C_ADDRS. With this macro we can just do: client = i2c_new_probed_device(adapter, &info, I2C_ADDRS(addr)); Note that this can also be used to initialize an array: static const unsigned short addrs[] = I2C_ADDRS(0x2a, 0x2c); Whether you want to is another matter, but it works. This functionality is also available in the oldest supported gcc (3.2). Signed-off-by: Hans Verkuil Signed-off-by: Jean Delvare --- include/linux/i2c.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index bc17f7fa7d3..f4784c0fe97 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -393,6 +393,10 @@ struct i2c_client_address_data { /* The numbers to use to set I2C bus address */ #define ANY_I2C_BUS 0xffff +/* Construct an I2C_CLIENT_END-terminated array of i2c addresses */ +#define I2C_ADDRS(addr, addrs...) 
\ + ((const unsigned short []){ addr, ## addrs, I2C_CLIENT_END }) + /* ----- functions exported by i2c.o */ -- cgit v1.2.3-70-g09d2 From 06d5caf47ef4fbd9efdceae33293c42778cb7b0c Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Tue, 16 Jun 2009 15:39:51 +0100 Subject: rfkill: don't restore software blocked state on persistent devices The setting of the "persistent" flag is also made more explicit using a new rfkill_init_sw_state() function, instead of special-casing rfkill_set_sw_state() when it is called before registration. Suspend is a bit of a corner case so we try to get away without adding another hack to rfkill-input - it's going to be removed soon. If the state does change over suspend, users will simply have to prod rfkill-input twice in order to toggle the state. Userspace policy agents will be able to implement a more consistent user experience. For example, they can avoid the above problem if they toggle devices individually. Then there would be no "global state" to get out of sync. Currently there are only two rfkill drivers with persistent soft-blocked state. thinkpad-acpi already checks the software state on resume. eeepc-laptop will require modification. Signed-off-by: Alan Jenkins CC: Marcel Holtmann Acked-by: Henrique de Moraes Holschuh Signed-off-by: John W. Linville --- drivers/platform/x86/eeepc-laptop.c | 8 ++++---- drivers/platform/x86/thinkpad_acpi.c | 14 ++++++------- include/linux/rfkill.h | 32 ++++++++++++++++++++++++----- net/rfkill/core.c | 40 ++++++++++++++++++++++-------------- 4 files changed, 63 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c index 03bf522bd7a..01682eca436 100644 --- a/drivers/platform/x86/eeepc-laptop.c +++ b/drivers/platform/x86/eeepc-laptop.c @@ -675,8 +675,8 @@ static int eeepc_hotk_add(struct acpi_device *device) if (!ehotk->eeepc_wlan_rfkill) goto wlan_fail; - rfkill_set_sw_state(ehotk->eeepc_wlan_rfkill, - get_acpi(CM_ASL_WLAN) != 1); + rfkill_init_sw_state(ehotk->eeepc_wlan_rfkill, + get_acpi(CM_ASL_WLAN) != 1); result = rfkill_register(ehotk->eeepc_wlan_rfkill); if (result) goto wlan_fail; @@ -693,8 +693,8 @@ static int eeepc_hotk_add(struct acpi_device *device) if (!ehotk->eeepc_bluetooth_rfkill) goto bluetooth_fail; - rfkill_set_sw_state(ehotk->eeepc_bluetooth_rfkill, - get_acpi(CM_ASL_BLUETOOTH) != 1); + rfkill_init_sw_state(ehotk->eeepc_bluetooth_rfkill, + get_acpi(CM_ASL_BLUETOOTH) != 1); result = rfkill_register(ehotk->eeepc_bluetooth_rfkill); if (result) goto bluetooth_fail; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 86e958539f4..40d64c03278 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -1163,8 +1163,8 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id, { struct tpacpi_rfk *atp_rfk; int res; - bool initial_sw_state = false; - int initial_sw_status; + bool sw_state = false; + int sw_status; BUG_ON(id >= TPACPI_RFK_SW_MAX || tpacpi_rfkill_switches[id]); @@ -1185,17 +1185,17 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id, atp_rfk->id = id; atp_rfk->ops = tp_rfkops; - initial_sw_status = (tp_rfkops->get_status)(); - if (initial_sw_status < 0) { + sw_status = (tp_rfkops->get_status)(); + if (sw_status < 0) { printk(TPACPI_ERR "failed to read initial state for %s, error %d\n", - name, initial_sw_status); + name, sw_status); } else { - initial_sw_state = (initial_sw_status == TPACPI_RFK_RADIO_OFF); 
+ sw_state = (sw_status == TPACPI_RFK_RADIO_OFF); if (set_default) { /* try to keep the initial state, since we ask the * firmware to preserve it across S5 in NVRAM */ - rfkill_set_sw_state(atp_rfk->rfkill, initial_sw_state); + rfkill_init_sw_state(atp_rfk->rfkill, sw_state); } } rfkill_set_hw_state(atp_rfk->rfkill, tpacpi_rfk_check_hwblock_state()); diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 16e39c7a67f..dcac724340d 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -160,8 +160,9 @@ struct rfkill * __must_check rfkill_alloc(const char *name, * the rfkill structure. Before calling this function the driver needs * to be ready to service method calls from rfkill. * - * If the software blocked state is not set before registration, - * set_block will be called to initialize it to a default value. + * If rfkill_init_sw_state() is not called before registration, + * set_block() will be called to initialize the software blocked state + * to a default value. * * If the hardware blocked state is not set before registration, * it is assumed to be unblocked. @@ -234,9 +235,11 @@ bool __must_check rfkill_set_hw_state(struct rfkill *rfkill, bool blocked); * rfkill drivers that get events when the soft-blocked state changes * (yes, some platforms directly act on input but allow changing again) * use this function to notify the rfkill core (and through that also - * userspace) of the current state. It is not necessary to notify on - * resume; since hibernation can always change the soft-blocked state, - * the rfkill core will unconditionally restore the previous state. + * userspace) of the current state. + * + * Drivers should also call this function after resume if the state has + * been changed by the user. This only makes sense for "persistent" + * devices (see rfkill_init_sw_state()). * * This function can be called in any context, even from within rfkill * callbacks. @@ -246,6 +249,21 @@ bool __must_check rfkill_set_hw_state(struct rfkill *rfkill, bool blocked); */ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked); +/** + * rfkill_init_sw_state - Initialize persistent software block state + * @rfkill: pointer to the rfkill class to modify. + * @blocked: the current software block state to set + * + * rfkill drivers that preserve their software block state over power off + * use this function to notify the rfkill core (and through that also + * userspace) of their initial state. It should only be used before + * registration. + * + * In addition, it marks the device as "persistent". Persistent devices + * are expected to preserve their own state when suspended. + */ +void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked); + /** + * rfkill_set_states - Set the internal rfkill block states + * @rfkill: pointer to the rfkill class to modify.
@@ -307,6 +325,10 @@ static inline bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) return blocked; } +static inline void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked) +{ +} + static inline void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) { } diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 868d79f8ac1..dcf8df7c573 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -56,7 +56,6 @@ struct rfkill { u32 idx; bool registered; - bool suspended; bool persistent; const struct rfkill_ops *ops; @@ -224,7 +223,7 @@ static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op) static void rfkill_event(struct rfkill *rfkill) { - if (!rfkill->registered || rfkill->suspended) + if (!rfkill->registered) return; kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE); @@ -508,19 +507,32 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) blocked = blocked || hwblock; spin_unlock_irqrestore(&rfkill->lock, flags); - if (!rfkill->registered) { - rfkill->persistent = true; - } else { - if (prev != blocked && !hwblock) - schedule_work(&rfkill->uevent_work); + if (!rfkill->registered) + return blocked; - rfkill_led_trigger_event(rfkill); - } + if (prev != blocked && !hwblock) + schedule_work(&rfkill->uevent_work); + + rfkill_led_trigger_event(rfkill); return blocked; } EXPORT_SYMBOL(rfkill_set_sw_state); +void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked) +{ + unsigned long flags; + + BUG_ON(!rfkill); + BUG_ON(rfkill->registered); + + spin_lock_irqsave(&rfkill->lock, flags); + __rfkill_set_sw_state(rfkill, blocked); + rfkill->persistent = true; + spin_unlock_irqrestore(&rfkill->lock, flags); +} +EXPORT_SYMBOL(rfkill_init_sw_state); + void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) { unsigned long flags; @@ -718,8 +730,6 @@ static int rfkill_suspend(struct device *dev, pm_message_t state) rfkill_pause_polling(rfkill); - rfkill->suspended = true; - return 0; } @@ -728,10 +738,10 @@ static int rfkill_resume(struct device *dev) struct rfkill *rfkill = to_rfkill(dev); bool cur; - cur = !!(rfkill->state & RFKILL_BLOCK_SW); - rfkill_set_block(rfkill, cur); - - rfkill->suspended = false; + if (!rfkill->persistent) { + cur = !!(rfkill->state & RFKILL_BLOCK_SW); + rfkill_set_block(rfkill, cur); + } rfkill_resume_polling(rfkill); -- cgit v1.2.3-70-g09d2 From 464902e812025792c9e33e19e1555c343672d5cf Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Tue, 16 Jun 2009 14:54:04 +0100 Subject: rfkill: export persistent attribute in sysfs This information allows userspace to implement a hybrid policy where it can store the rfkill soft-blocked state in platform non-volatile storage if available, and if not then file-based storage can be used. Some users prefer platform non-volatile storage because of the behaviour when dual-booting multiple versions of Linux, or if the rfkill setting is changed in the BIOS setting screens, or if the BIOS responds to wireless-toggle hotkeys itself before the relevant platform driver has been loaded. Signed-off-by: Alan Jenkins Acked-by: Henrique de Moraes Holschuh Signed-off-by: John W. 
Linville --- Documentation/rfkill.txt | 2 ++ include/linux/rfkill.h | 5 +++-- net/rfkill/core.c | 10 ++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt index c8acd8659e9..b4860509c31 100644 --- a/Documentation/rfkill.txt +++ b/Documentation/rfkill.txt @@ -111,6 +111,8 @@ following attributes: name: Name assigned by driver to this key (interface or driver name). type: Driver type string ("wlan", "bluetooth", etc). + persistent: Whether the soft blocked state is initialised from + non-volatile storage at startup. state: Current state of the transmitter 0: RFKILL_STATE_SOFT_BLOCKED transmitter is turned off by software diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index dcac724340d..e73e2429a1b 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -259,8 +259,9 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked); * userspace) of their initial state. It should only be used before * registration. * - * In addition, it marks the device as "persistent". Persistent devices - * are expected to preserve their own state when suspended. + * In addition, it marks the device as "persistent", an attribute which + * can be read by userspace. Persistent devices are expected to preserve + * their own state when suspended. */ void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked); diff --git a/net/rfkill/core.c b/net/rfkill/core.c index dcf8df7c573..79693fe2001 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -610,6 +610,15 @@ static ssize_t rfkill_idx_show(struct device *dev, return sprintf(buf, "%d\n", rfkill->idx); } +static ssize_t rfkill_persistent_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%d\n", rfkill->persistent); +} + static u8 user_state_from_blocked(unsigned long state) { if (state & RFKILL_BLOCK_HW) @@ -668,6 +677,7 @@ static struct device_attribute rfkill_dev_attrs[] = { __ATTR(name, S_IRUGO, rfkill_name_show, NULL), __ATTR(type, S_IRUGO, rfkill_type_show, NULL), __ATTR(index, S_IRUGO, rfkill_idx_show, NULL), + __ATTR(persistent, S_IRUGO, rfkill_persistent_show, NULL), __ATTR(state, S_IRUGO|S_IWUSR, rfkill_state_show, rfkill_state_store), __ATTR(claim, S_IRUGO|S_IWUSR, rfkill_claim_show, rfkill_claim_store), __ATTR_NULL -- cgit v1.2.3-70-g09d2 From 1e9c28599879040039f610c5b177e61ef65ff100 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 18 Jun 2009 16:48:58 -0700 Subject: gpio: driver for PrimeCell PL061 GPIO controller This is a driver for the ARM PrimeCell PL061 GPIO AMBA peripheral. The driver is implemented using the gpiolib framework. This driver also includes support for the use of the PL061 as an interrupt controller (secondary).
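To make the new platform data concrete, board setup code might describe a PL061 instance roughly as follows; all values are invented for illustration, and the structure is passed through the amba_device's dev.platform_data pointer:

#include <linux/amba/pl061.h>

static struct pl061_platform_data board_pl061_data = {
	.gpio_base  = 32,		/* this block provides GPIOs 32..39 */
	.irq_base   = (unsigned) -1,	/* GPIO interrupts not wired up */
	.directions = 0x0f,		/* GPIOs 0-3 outputs, GPIOs 4-7 inputs */
	.values     = 0x01,		/* drive GPIO 0 high at startup */
};

The amba_device declaration itself is board specific and omitted here.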
Signed-off-by: Baruch Siach Cc: David Brownell Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpio/Kconfig | 6 + drivers/gpio/Makefile | 1 + drivers/gpio/pl061.c | 341 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/amba/pl061.h | 15 ++ 4 files changed, 363 insertions(+) create mode 100644 drivers/gpio/pl061.c create mode 100644 include/linux/amba/pl061.h (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 11f373971fa..3582c39f972 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -67,6 +67,12 @@ config GPIO_SYSFS comment "Memory mapped GPIO expanders:" +config GPIO_PL061 + bool "PrimeCell PL061 GPIO support" + depends on ARM_AMBA + help + Say yes here to support the PrimeCell PL061 GPIO device + config GPIO_XILINX bool "Xilinx GPIO support" depends on PPC_OF || MICROBLAZE diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 49ac64e515e..ef90203e8f3 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_GPIO_MAX732X) += max732x.o obj-$(CONFIG_GPIO_MCP23S08) += mcp23s08.o obj-$(CONFIG_GPIO_PCA953X) += pca953x.o obj-$(CONFIG_GPIO_PCF857X) += pcf857x.o +obj-$(CONFIG_GPIO_PL061) += pl061.o obj-$(CONFIG_GPIO_TWL4030) += twl4030-gpio.o obj-$(CONFIG_GPIO_XILINX) += xilinx_gpio.o obj-$(CONFIG_GPIO_BT8XX) += bt8xxgpio.o diff --git a/drivers/gpio/pl061.c b/drivers/gpio/pl061.c new file mode 100644 index 00000000000..aa8e7cb020d --- /dev/null +++ b/drivers/gpio/pl061.c @@ -0,0 +1,341 @@ +/* + * linux/drivers/gpio/pl061.c + * + * Copyright (C) 2008, 2009 Provigent Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Driver for the ARM PrimeCell(tm) General Purpose Input/Output (PL061) + * + * Data sheet: ARM DDI 0190B, September 2000 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GPIODIR 0x400 +#define GPIOIS 0x404 +#define GPIOIBE 0x408 +#define GPIOIEV 0x40C +#define GPIOIE 0x410 +#define GPIORIS 0x414 +#define GPIOMIS 0x418 +#define GPIOIC 0x41C + +#define PL061_GPIO_NR 8 + +struct pl061_gpio { + /* We use a list of pl061_gpio structs for each trigger IRQ in the main + * interrupt controller of the system. We need this to support systems + * in which more than one PL061 is connected to the same IRQ. The ISR + * iterates through this list to find the source of the interrupt. + */ + struct list_head list; + + /* Each of the two spinlocks protects a different set of hardware + * registers and data structures. This decouples the code of the IRQ from + * the GPIO code. This also makes the case of a GPIO routine call from + * the IRQ code simpler.
+ */ + spinlock_t lock; /* GPIO registers */ + spinlock_t irq_lock; /* IRQ registers */ + + void __iomem *base; + unsigned irq_base; + struct gpio_chip gc; +}; + +static int pl061_direction_input(struct gpio_chip *gc, unsigned offset) +{ + struct pl061_gpio *chip = container_of(gc, struct pl061_gpio, gc); + unsigned long flags; + unsigned char gpiodir; + + if (offset >= gc->ngpio) + return -EINVAL; + + spin_lock_irqsave(&chip->lock, flags); + gpiodir = readb(chip->base + GPIODIR); + gpiodir &= ~(1 << offset); + writeb(gpiodir, chip->base + GPIODIR); + spin_unlock_irqrestore(&chip->lock, flags); + + return 0; +} + +static int pl061_direction_output(struct gpio_chip *gc, unsigned offset, + int value) +{ + struct pl061_gpio *chip = container_of(gc, struct pl061_gpio, gc); + unsigned long flags; + unsigned char gpiodir; + + if (offset >= gc->ngpio) + return -EINVAL; + + spin_lock_irqsave(&chip->lock, flags); + writeb(!!value << offset, chip->base + (1 << (offset + 2))); + gpiodir = readb(chip->base + GPIODIR); + gpiodir |= 1 << offset; + writeb(gpiodir, chip->base + GPIODIR); + spin_unlock_irqrestore(&chip->lock, flags); + + return 0; +} + +static int pl061_get_value(struct gpio_chip *gc, unsigned offset) +{ + struct pl061_gpio *chip = container_of(gc, struct pl061_gpio, gc); + + return !!readb(chip->base + (1 << (offset + 2))); +} + +static void pl061_set_value(struct gpio_chip *gc, unsigned offset, int value) +{ + struct pl061_gpio *chip = container_of(gc, struct pl061_gpio, gc); + + writeb(!!value << offset, chip->base + (1 << (offset + 2))); +} + +/* + * PL061 GPIO IRQ + */ +static void pl061_irq_disable(unsigned irq) +{ + struct pl061_gpio *chip = get_irq_chip_data(irq); + int offset = irq - chip->irq_base; + unsigned long flags; + u8 gpioie; + + spin_lock_irqsave(&chip->irq_lock, flags); + gpioie = readb(chip->base + GPIOIE); + gpioie &= ~(1 << offset); + writeb(gpioie, chip->base + GPIOIE); + spin_unlock_irqrestore(&chip->irq_lock, flags); +} + +static void pl061_irq_enable(unsigned irq) +{ + struct pl061_gpio *chip = get_irq_chip_data(irq); + int offset = irq - chip->irq_base; + unsigned long flags; + u8 gpioie; + + spin_lock_irqsave(&chip->irq_lock, flags); + gpioie = readb(chip->base + GPIOIE); + gpioie |= 1 << offset; + writeb(gpioie, chip->base + GPIOIE); + spin_unlock_irqrestore(&chip->irq_lock, flags); +} + +static int pl061_irq_type(unsigned irq, unsigned trigger) +{ + struct pl061_gpio *chip = get_irq_chip_data(irq); + int offset = irq - chip->irq_base; + unsigned long flags; + u8 gpiois, gpioibe, gpioiev; + + if (offset < 0 || offset > PL061_GPIO_NR) + return -EINVAL; + + spin_lock_irqsave(&chip->irq_lock, flags); + + gpioiev = readb(chip->base + GPIOIEV); + + gpiois = readb(chip->base + GPIOIS); + if (trigger & (IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW)) { + gpiois |= 1 << offset; + if (trigger & IRQ_TYPE_LEVEL_HIGH) + gpioiev |= 1 << offset; + else + gpioiev &= ~(1 << offset); + } else + gpiois &= ~(1 << offset); + writeb(gpiois, chip->base + GPIOIS); + + gpioibe = readb(chip->base + GPIOIBE); + if ((trigger & IRQ_TYPE_EDGE_BOTH) == IRQ_TYPE_EDGE_BOTH) + gpioibe |= 1 << offset; + else { + gpioibe &= ~(1 << offset); + if (trigger & IRQ_TYPE_EDGE_RISING) + gpioiev |= 1 << offset; + else + gpioiev &= ~(1 << offset); + } + writeb(gpioibe, chip->base + GPIOIBE); + + writeb(gpioiev, chip->base + GPIOIEV); + + spin_unlock_irqrestore(&chip->irq_lock, flags); + + return 0; +} + +static struct irq_chip pl061_irqchip = { + .name = "GPIO", + .enable = pl061_irq_enable, + .disable = 
pl061_irq_disable, + .set_type = pl061_irq_type, +}; + +static void pl061_irq_handler(unsigned irq, struct irq_desc *desc) +{ + struct list_head *chip_list = get_irq_chip_data(irq); + struct list_head *ptr; + struct pl061_gpio *chip; + + desc->chip->ack(irq); + list_for_each(ptr, chip_list) { + unsigned long pending; + int gpio; + + chip = list_entry(ptr, struct pl061_gpio, list); + pending = readb(chip->base + GPIOMIS); + writeb(pending, chip->base + GPIOIC); + + if (pending == 0) + continue; + + for_each_bit(gpio, &pending, PL061_GPIO_NR) + generic_handle_irq(gpio_to_irq(gpio)); + } + desc->chip->unmask(irq); +} + +static int __init pl061_probe(struct amba_device *dev, struct amba_id *id) +{ + struct pl061_platform_data *pdata; + struct pl061_gpio *chip; + struct list_head *chip_list; + int ret, irq, i; + static unsigned long init_irq[BITS_TO_LONGS(NR_IRQS)]; + + pdata = dev->dev.platform_data; + if (pdata == NULL) + return -ENODEV; + + chip = kzalloc(sizeof(*chip), GFP_KERNEL); + if (chip == NULL) + return -ENOMEM; + + if (!request_mem_region(dev->res.start, + resource_size(&dev->res), "pl061")) { + ret = -EBUSY; + goto free_mem; + } + + chip->base = ioremap(dev->res.start, resource_size(&dev->res)); + if (chip->base == NULL) { + ret = -ENOMEM; + goto release_region; + } + + spin_lock_init(&chip->lock); + spin_lock_init(&chip->irq_lock); + INIT_LIST_HEAD(&chip->list); + + chip->gc.direction_input = pl061_direction_input; + chip->gc.direction_output = pl061_direction_output; + chip->gc.get = pl061_get_value; + chip->gc.set = pl061_set_value; + chip->gc.base = pdata->gpio_base; + chip->gc.ngpio = PL061_GPIO_NR; + chip->gc.label = dev_name(&dev->dev); + chip->gc.dev = &dev->dev; + chip->gc.owner = THIS_MODULE; + + chip->irq_base = pdata->irq_base; + + ret = gpiochip_add(&chip->gc); + if (ret) + goto iounmap; + + /* + * irq_chip support + */ + + if (chip->irq_base == (unsigned) -1) + return 0; + + writeb(0, chip->base + GPIOIE); /* disable irqs */ + irq = dev->irq[0]; + if (irq < 0) { + ret = -ENODEV; + goto iounmap; + } + set_irq_chained_handler(irq, pl061_irq_handler); + if (!test_and_set_bit(irq, init_irq)) { /* list initialized? 
*/ + chip_list = kmalloc(sizeof(*chip_list), GFP_KERNEL); + if (chip_list == NULL) { + ret = -ENOMEM; + goto iounmap; + } + INIT_LIST_HEAD(chip_list); + set_irq_chip_data(irq, chip_list); + } else + chip_list = get_irq_chip_data(irq); + list_add(&chip->list, chip_list); + + for (i = 0; i < PL061_GPIO_NR; i++) { + if (pdata->directions & (1 << i)) + pl061_direction_output(&chip->gc, i, + pdata->values & (1 << i)); + else + pl061_direction_input(&chip->gc, i); + + set_irq_chip(i+chip->irq_base, &pl061_irqchip); + set_irq_handler(i+chip->irq_base, handle_simple_irq); + set_irq_flags(i+chip->irq_base, IRQF_VALID); + set_irq_chip_data(i+chip->irq_base, chip); + } + + return 0; + +iounmap: + iounmap(chip->base); +release_region: + release_mem_region(dev->res.start, resource_size(&dev->res)); +free_mem: + kfree(chip); + + return ret; +} + +static struct amba_id pl061_ids[] __initdata = { + { + .id = 0x00041061, + .mask = 0x000fffff, + }, + { 0, 0 }, +}; + +static struct amba_driver pl061_gpio_driver = { + .drv = { + .name = "pl061_gpio", + }, + .id_table = pl061_ids, + .probe = pl061_probe, +}; + +static int __init pl061_gpio_init(void) +{ + return amba_driver_register(&pl061_gpio_driver); +} +subsys_initcall(pl061_gpio_init); + +MODULE_AUTHOR("Baruch Siach "); +MODULE_DESCRIPTION("PL061 GPIO driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/amba/pl061.h b/include/linux/amba/pl061.h new file mode 100644 index 00000000000..b4fbd986260 --- /dev/null +++ b/include/linux/amba/pl061.h @@ -0,0 +1,15 @@ +/* platform data for the PL061 GPIO driver */ + +struct pl061_platform_data { + /* number of the first GPIO */ + unsigned gpio_base; + + /* number of the first IRQ. + * If the IRQ functionality is not desired this must be set to + * (unsigned) -1. + */ + unsigned irq_base; + + u8 directions; /* startup directions, 1: out, 0: in */ + u8 values; /* startup values */ +}; -- cgit v1.2.3-70-g09d2 From d68412b6d1e8e5ed42d568330e598bd81914ccbd Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 18 Jun 2009 16:49:18 -0700 Subject: dma-mapping: mark DMA_nBITS_MASK as deprecated Mark them deprecated so that out-of-tree developers get notified about this before their modules break when these macros are removed. Signed-off-by: Jiri Slaby Cc: Yang Hongyang Acked-by: FUJITA Tomonori Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 8083b6a36a3..38f5608d246 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -63,24 +63,26 @@ struct dma_map_ops { #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) +typedef u64 DMA_nnBIT_MASK __deprecated; + /* * NOTE: do not use the below macros in new code and do not add new definitions * here.
* * Instead, just open-code DMA_BIT_MASK(n) within your driver */ -#define DMA_64BIT_MASK DMA_BIT_MASK(64) -#define DMA_48BIT_MASK DMA_BIT_MASK(48) -#define DMA_47BIT_MASK DMA_BIT_MASK(47) -#define DMA_40BIT_MASK DMA_BIT_MASK(40) -#define DMA_39BIT_MASK DMA_BIT_MASK(39) -#define DMA_35BIT_MASK DMA_BIT_MASK(35) -#define DMA_32BIT_MASK DMA_BIT_MASK(32) -#define DMA_31BIT_MASK DMA_BIT_MASK(31) -#define DMA_30BIT_MASK DMA_BIT_MASK(30) -#define DMA_29BIT_MASK DMA_BIT_MASK(29) -#define DMA_28BIT_MASK DMA_BIT_MASK(28) -#define DMA_24BIT_MASK DMA_BIT_MASK(24) +#define DMA_64BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(64) +#define DMA_48BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(48) +#define DMA_47BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(47) +#define DMA_40BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(40) +#define DMA_39BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(39) +#define DMA_35BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(35) +#define DMA_32BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(32) +#define DMA_31BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(31) +#define DMA_30BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(30) +#define DMA_29BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(29) +#define DMA_28BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(28) +#define DMA_24BIT_MASK (DMA_nnBIT_MASK)DMA_BIT_MASK(24) #define DMA_MASK_NONE 0x0ULL -- cgit v1.2.3-70-g09d2 From dbe6f1869188b6e04e38aa861dd198befb08bcd7 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 18 Jun 2009 16:49:19 -0700 Subject: dma-mapping: mark dma_sync_single and dma_sync_sg as deprecated dma_sync_single() and dma_sync_sg() have been described as "Backwards compat, remove in 2.7.x" for a long time (since 2.6.5). This marks dma_sync_single() and dma_sync_sg() as deprecated so the users get notified before removing them. Signed-off-by: FUJITA Tomonori Cc: Ingo Molnar Cc: James Bottomley Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 38f5608d246..07dfd460d28 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -109,9 +109,20 @@ static inline int is_buffer_dma_capable(u64 mask, dma_addr_t addr, size_t size) #include #endif -/* Backwards compat, remove in 2.7.x */ -#define dma_sync_single dma_sync_single_for_cpu -#define dma_sync_sg dma_sync_sg_for_cpu +/* for backwards compatibility, removed soon */ +static inline void __deprecated dma_sync_single(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_cpu(dev, addr, size, dir); +} + +static inline void __deprecated dma_sync_sg(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} static inline u64 dma_get_mask(struct device *dev) { -- cgit v1.2.3-70-g09d2 From ec20a022aa24fc63d3ab59584cb1e5aa9a21d46c Mon Sep 17 00:00:00 2001 From: Tero Saarni Date: Wed, 10 Jun 2009 23:27:24 -0700 Subject: Input: synaptics - add support for reporting x/y resolution Synaptics uses an anisotropic coordinate system. On some wide touchpads the vertical resolution can be twice as high as the horizontal one, which causes unequal sensitivity in the x/y directions. Add support for reading the resolution with the EVIOCGABS ioctl.
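For completeness, user space can read the new field through the size-extended EVIOCGABS ioctl along these lines; the event node path is made up for this sketch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/input.h>

int main(void)
{
	struct input_absinfo abs;
	int fd = open("/dev/input/event2", O_RDONLY);	/* hypothetical node */

	if (fd < 0)
		return 1;
	if (ioctl(fd, EVIOCGABS(ABS_X), &abs) == 0)
		printf("X axis: %d..%d, %d units/mm\n",
		       abs.minimum, abs.maximum, abs.resolution);
	return 0;
}

A resolution of 0 means the touchpad did not report one.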
Signed-off-by: Tero Saarni Signed-off-by: Dmitry Torokhov --- drivers/input/evdev.c | 12 +++++++++--- drivers/input/mouse/synaptics.c | 28 ++++++++++++++++++++++++++++ drivers/input/mouse/synaptics.h | 2 ++ include/linux/input.h | 2 ++ 4 files changed, 41 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index c238116400b..114efd8dc8f 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -626,8 +626,11 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, abs.maximum = dev->absmax[t]; abs.fuzz = dev->absfuzz[t]; abs.flat = dev->absflat[t]; + abs.resolution = dev->absres[t]; - if (copy_to_user(p, &abs, sizeof(struct input_absinfo))) + if (copy_to_user(p, &abs, min_t(size_t, + _IOC_SIZE(cmd), + sizeof(struct input_absinfo)))) return -EFAULT; return 0; @@ -654,8 +657,9 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, t = _IOC_NR(cmd) & ABS_MAX; - if (copy_from_user(&abs, p, - sizeof(struct input_absinfo))) + if (copy_from_user(&abs, p, min_t(size_t, + _IOC_SIZE(cmd), + sizeof(struct input_absinfo)))) return -EFAULT; /* @@ -670,6 +674,8 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, dev->absmax[t] = abs.maximum; dev->absfuzz[t] = abs.fuzz; dev->absflat[t] = abs.flat; + dev->absres[t] = _IOC_SIZE(cmd) < sizeof(struct input_absinfo) ? + 0 : abs.resolution; spin_unlock_irq(&dev->event_lock); diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index f3e4f7b0240..19984bf06ca 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -180,6 +180,29 @@ static int synaptics_identify(struct psmouse *psmouse) return -1; } +/* + * Read touchpad resolution + * Resolution is left zero if touchpad does not support the query + */ +static int synaptics_resolution(struct psmouse *psmouse) +{ + struct synaptics_data *priv = psmouse->private; + unsigned char res[3]; + + if (SYN_ID_MAJOR(priv->identity) < 4) + return 0; + + if (synaptics_send_cmd(psmouse, SYN_QUE_RESOLUTION, res)) + return 0; + + if ((res[0] != 0) && (res[1] & 0x80) && (res[2] != 0)) { + priv->x_res = res[0]; /* x resolution in units/mm */ + priv->y_res = res[2]; /* y resolution in units/mm */ + } + + return 0; +} + static int synaptics_query_hardware(struct psmouse *psmouse) { if (synaptics_identify(psmouse)) @@ -188,6 +211,8 @@ static int synaptics_query_hardware(struct psmouse *psmouse) return -1; if (synaptics_capability(psmouse)) return -1; + if (synaptics_resolution(psmouse)) + return -1; return 0; } @@ -563,6 +588,9 @@ static void set_input_params(struct input_dev *dev, struct synaptics_data *priv) clear_bit(EV_REL, dev->evbit); clear_bit(REL_X, dev->relbit); clear_bit(REL_Y, dev->relbit); + + dev->absres[ABS_X] = priv->x_res; + dev->absres[ABS_Y] = priv->y_res; } static void synaptics_disconnect(struct psmouse *psmouse) diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h index 02aa4cf7bc7..30238215175 100644 --- a/drivers/input/mouse/synaptics.h +++ b/drivers/input/mouse/synaptics.h @@ -97,6 +97,8 @@ struct synaptics_data { unsigned long int capabilities; /* Capabilities */ unsigned long int ext_cap; /* Extended Capabilities */ unsigned long int identity; /* Identification */ + int x_res; /* X resolution in units/mm */ + int y_res; /* Y resolution in units/mm */ unsigned char pkt_type; /* packet type - old, new, etc */ unsigned char mode; /* current mode byte */ diff --git a/include/linux/input.h b/include/linux/input.h index 
6fed4f6a9c9..8b3bc3e0d14 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -53,6 +53,7 @@ struct input_absinfo { __s32 maximum; __s32 fuzz; __s32 flat; + __s32 resolution; }; #define EVIOCGVERSION _IOR('E', 0x01, int) /* get driver version */ @@ -1109,6 +1110,7 @@ struct input_dev { int absmin[ABS_MAX + 1]; int absfuzz[ABS_MAX + 1]; int absflat[ABS_MAX + 1]; + int absres[ABS_MAX + 1]; int (*open)(struct input_dev *dev); void (*close)(struct input_dev *dev); -- cgit v1.2.3-70-g09d2 From d06063cc221fdefcab86589e79ddfdb7c0e14b63 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 10 Apr 2009 09:01:23 -0700 Subject: Move FAULT_FLAG_xyz into handle_mm_fault() callers This allows the callers to now pass down the full set of FAULT_FLAG_xyz flags to handle_mm_fault(). All callers have been (mechanically) converted to the new calling convention, there's almost certainly room for architectures to clean up their code and then add FAULT_FLAG_RETRY when that support is added. Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/avr32/mm/fault.c | 2 +- arch/cris/mm/fault.c | 2 +- arch/frv/mm/fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m32r/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/mn10300/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/powerpc/platforms/cell/spu_fault.c | 2 +- arch/s390/lib/uaccess_pt.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/sh/mm/fault_32.c | 2 +- arch/sh/mm/tlbflush_64.c | 2 +- arch/sparc/mm/fault_32.c | 4 ++-- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/xtensa/mm/fault.c | 2 +- include/linux/mm.h | 4 ++-- mm/memory.c | 8 ++++---- 25 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 4829f96585b..00a31deaa96 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -146,7 +146,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. */ - fault = handle_mm_fault(mm, vma, address, cause > 0); + fault = handle_mm_fault(mm, vma, address, cause > 0 ? FAULT_FLAG_WRITE : 0); up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 0455557a289..6fdcbb70982 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -208,7 +208,7 @@ good_area: * than endlessly redo the fault. */ survive: - fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, fsr & (1 << 11)); + fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, (fsr & (1 << 11)) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 62d4abbaa65..b61d86d3deb 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -133,7 +133,7 @@ good_area: * fault. */ survive: - fault = handle_mm_fault(mm, vma, address, writeaccess); + fault = handle_mm_fault(mm, vma, address, writeaccess ? 
FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index c4c76db90f9..f925115e325 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -163,7 +163,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * the fault. */ - fault = handle_mm_fault(mm, vma, address, writeaccess & 1); + fault = handle_mm_fault(mm, vma, address, (writeaccess & 1) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 05093d41d98..30f5d100a81 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -163,7 +163,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, ear0, write); + fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 23088bed111..19261a99e62 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -154,7 +154,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re * sure we exit gracefully rather than endlessly redo the * fault. */ - fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0); + fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { /* * We ran out of memory, or some other thing happened diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 4a71df4c1b3..7274b47f4c2 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -196,7 +196,7 @@ survive: */ addr = (address & PAGE_MASK); set_thread_fault_code(error_code); - fault = handle_mm_fault(mm, vma, addr, write); + fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index f493f03231d..d0e35cf99fc 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -155,7 +155,7 @@ good_area: */ survive: - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); #ifdef DEBUG printk("handle_mm_fault returns %d\n",fault); #endif diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 5e67cd1fab4..956607a63f4 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -232,7 +232,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, is_write); + fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 55767ad9f00..6751ce9ede9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -102,7 +102,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? 
FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 33cf25025da..a62e1e138bc 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -258,7 +258,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 92c7fa4ecc3..bfb6dd6ab38 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -202,7 +202,7 @@ good_area: * fault. */ - fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) != 0); + fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { /* * We hit a shared mapping outside of the file, or some diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5beffc8f481..830bef0a113 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -302,7 +302,7 @@ good_area: * the fault. */ survive: - ret = handle_mm_fault(mm, vma, address, is_write); + ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(ret & VM_FAULT_ERROR)) { if (ret & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/powerpc/platforms/cell/spu_fault.c b/arch/powerpc/platforms/cell/spu_fault.c index 95d8dadf2d8..d06ba87f1a1 100644 --- a/arch/powerpc/platforms/cell/spu_fault.c +++ b/arch/powerpc/platforms/cell/spu_fault.c @@ -70,7 +70,7 @@ int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea, } ret = 0; - *flt = handle_mm_fault(mm, vma, ea, is_write); + *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index b0b84c35b0a..cb5d59eab0e 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -66,7 +66,7 @@ static int __handle_fault(struct mm_struct *mm, unsigned long address, } survive: - fault = handle_mm_fault(mm, vma, address, write_access); + fault = handle_mm_fault(mm, vma, address, write_access ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 220a152c836..74eb26bf197 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -352,7 +352,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { up_read(&mm->mmap_sem); diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c index 2c50f80fc33..cc8ddbdf3d7 100644 --- a/arch/sh/mm/fault_32.c +++ b/arch/sh/mm/fault_32.c @@ -133,7 +133,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, writeaccess); + fault = handle_mm_fault(mm, vma, address, writeaccess ? 
FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index 7876997ba19..fcbb6e135ce 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -187,7 +187,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, writeaccess); + fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 12e447fc854..a5e30c642ee 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -241,7 +241,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -484,7 +484,7 @@ good_area: if(!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - switch (handle_mm_fault(mm, vma, address, write)) { + switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) { case VM_FAULT_SIGBUS: case VM_FAULT_OOM: goto do_sigbus; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 4ab8993b086..e5620b27c8b 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -398,7 +398,7 @@ good_area: goto bad_area; } - fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE)); + fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 7384d8accfe..637c6505dc0 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -65,7 +65,7 @@ good_area: do { int fault; - fault = handle_mm_fault(mm, vma, address, is_write); + fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c403526d5d1..78a5fff857b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1113,7 +1113,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index bdd860d93f7..bc0733359a8 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -106,7 +106,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, is_write); + fault = handle_mm_fault(mm, vma, address, is_write ? 
FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/include/linux/mm.h b/include/linux/mm.h index cf260d848eb..d006e93d5c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -810,11 +810,11 @@ extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); #ifdef CONFIG_MMU extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access); + unsigned long address, unsigned int flags); #else static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, - int write_access) + unsigned int flags) { /* should never happen if there's no MMU */ BUG(); diff --git a/mm/memory.c b/mm/memory.c index e6a9700359d..98bcb90d595 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1310,8 +1310,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; - ret = handle_mm_fault(mm, vma, start, - foll_flags & FOLL_WRITE); + + /* FOLL_WRITE matches FAULT_FLAG_WRITE! */ + ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; @@ -2958,13 +2959,12 @@ unlock: * By the time we get here, we already hold the mm semaphore */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned int flags = write_access ? FAULT_FLAG_WRITE : 0; __set_current_state(TASK_RUNNING); -- cgit v1.2.3-70-g09d2 From f9ab94cee313746573b2d693bc2afb807ebb0998 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 22 Jun 2009 10:12:20 +0100 Subject: dm: introduce num_flush_requests Introduce num_flush_requests for a target to set to say how many flush instructions (empty barriers) it wants to receive. These are sent by __clone_and_map_empty_barrier with map_info->flush_request going from 0 to (num_flush_requests - 1). Old targets without flush support won't receive any flush requests. 
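To make the interface concrete, here is a hypothetical sketch that is not taken from this patch: a target stacked over two devices would set ti->num_flush_requests = 2 in its constructor, and its map function could then route each empty barrier by map_info->flush_request. The example_c structure and all names are invented for the illustration:

/* Hypothetical target code, not from this patch: route each of the two
 * empty barriers (flush_request 0 and 1) to one underlying device. */
#include <linux/bio.h>
#include <linux/device-mapper.h>

struct example_c {			/* invented per-target data */
	struct dm_dev *dev[2];
};

static int example_map(struct dm_target *ti, struct bio *bio,
		       union map_info *map_context)
{
	struct example_c *ec = ti->private;

	if (unlikely(bio_empty_barrier(bio))) {
		/* flush_request runs from 0 to num_flush_requests - 1 */
		bio->bi_bdev = ec->dev[map_context->flush_request]->bdev;
		return DM_MAPIO_REMAPPED;
	}

	/* Normal data bios would be remapped here; this sketch just
	 * sends everything to the first device. */
	bio->bi_bdev = ec->dev[0]->bdev;
	return DM_MAPIO_REMAPPED;
}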
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/device-mapper.h | 11 +++++++++++ 2 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7d9ca709433..badb7519ccc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -750,6 +750,40 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, return clone; } +static void __flush_target(struct clone_info *ci, struct dm_target *ti, + unsigned flush_nr) +{ + struct dm_target_io *tio = alloc_tio(ci->md); + struct bio *clone; + + tio->io = ci->io; + tio->ti = ti; + + memset(&tio->info, 0, sizeof(tio->info)); + tio->info.flush_request = flush_nr; + + clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); + __bio_clone(clone, ci->bio); + clone->bi_destructor = dm_bio_destructor; + + __map_bio(ti, clone, tio); +} + +static int __clone_and_map_empty_barrier(struct clone_info *ci) +{ + unsigned target_nr = 0, flush_nr; + struct dm_target *ti; + + while ((ti = dm_table_get_target(ci->map, target_nr++))) + for (flush_nr = 0; flush_nr < ti->num_flush_requests; + flush_nr++) + __flush_target(ci, ti, flush_nr); + + ci->sector_count = 0; + + return 0; +} + static int __clone_and_map(struct clone_info *ci) { struct bio *clone, *bio = ci->bio; @@ -757,6 +791,9 @@ static int __clone_and_map(struct clone_info *ci) sector_t len = 0, max; struct dm_target_io *tio; + if (unlikely(bio_empty_barrier(bio))) + return __clone_and_map_empty_barrier(ci); + ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) return -EIO; @@ -877,6 +914,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.io->md = md; ci.sector = bio->bi_sector; ci.sector_count = bio_sectors(bio); + if (unlikely(bio_empty_barrier(bio))) + ci.sector_count = 1; ci.idx = bio->bi_idx; start_io_acct(ci.io); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 49c2362977f..fc36a4d0772 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -21,6 +21,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; union map_info { void *ptr; unsigned long long ll; + unsigned flush_request; }; /* @@ -167,6 +168,16 @@ struct dm_target { /* Always a power of 2 */ sector_t split_io; + /* + * A number of zero-length barrier requests that will be submitted + * to the target for the purpose of flushing cache. + * + * The request number will be placed in union map_info->flush_request. + * It is a responsibility of the target driver to remap these requests + * to the real underlying devices. + */ + unsigned num_flush_requests; + /* * These are automatically filled in by * dm_table_get_device. -- cgit v1.2.3-70-g09d2 From 60935eb21d3c5bac79618000f38f92c249d153c4 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Mon, 22 Jun 2009 10:12:30 +0100 Subject: dm ioctl: support cookies for udev Add support for passing a 32 bit "cookie" into the kernel with the DM_SUSPEND, DM_DEV_RENAME and DM_DEV_REMOVE ioctls. The (unsigned) value of this cookie is returned to userspace alongside the uevents issued by these ioctls in the variable DM_COOKIE. This means the userspace process issuing these ioctls can be notified by udev after udev has completed any actions triggered. To minimise the interface extension, we pass the cookie into the kernel in the event_nr field which is otherwise unused when calling these ioctls. 
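As a rough illustration of the userspace side (not part of this patch, and real code would normally go through libdevmapper rather than raw ioctls), a caller holding an open /dev/mapper/control descriptor might pass a cookie like this; the device name is a placeholder:

/* Rough userspace sketch, not from this patch: pass a cookie to
 * DM_DEV_REMOVE in event_nr. The kernel echoes it back to udev as
 * DM_COOKIE in the resulting uevent. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

static int dm_remove_with_cookie(int control_fd, uint32_t cookie)
{
	struct dm_ioctl io;

	memset(&io, 0, sizeof(io));
	io.version[0] = DM_VERSION_MAJOR;
	io.version[1] = DM_VERSION_MINOR;
	io.version[2] = DM_VERSION_PATCHLEVEL;
	io.data_size = sizeof(io);
	strncpy(io.name, "example-vol", sizeof(io.name) - 1);
	io.event_nr = cookie;	/* otherwise unused by DM_DEV_REMOVE */

	return ioctl(control_fd, DM_DEV_REMOVE, &io);
}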
Incrementing the version number allows userspace to determine in advance whether or not the kernel supports the cookie. If the kernel does support this but userspace does not, there should be no impact as the new variable will just get ignored. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 14 ++++++++++---- drivers/md/dm.c | 25 +++++++++++++++++++------ drivers/md/dm.h | 3 ++- include/linux/dm-ioctl.h | 14 ++++++++++++-- 4 files changed, 43 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 1128d3fba79..1c871736f48 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -276,7 +276,7 @@ retry: up_write(&_hash_lock); } -static int dm_hash_rename(const char *old, const char *new) +static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) { char *new_name, *old_name; struct hash_cell *hc; @@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new) dm_table_put(table); } - dm_kobject_uevent(hc->md); + dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); dm_put(hc->md); up_write(&_hash_lock); @@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) __hash_remove(hc); up_write(&_hash_lock); + + dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); + dm_put(md); param->data_size = 0; return 0; @@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) return r; param->data_size = 0; - return dm_hash_rename(param->name, new_name); + return dm_hash_rename(param->event_nr, param->name, new_name); } static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) @@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param) if (dm_suspended(md)) r = dm_resume(md); - if (!r) + + if (!r) { + dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); r = __dev_status(md, param); + } dm_put(md); return r; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 36142e947ff..a9210bb594e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -24,6 +24,13 @@ #define DM_MSG_PREFIX "core" +/* + * Cookies are numeric values sent with CHANGE and REMOVE + * uevents while resuming, removing or renaming the device. + */ +#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" +#define DM_COOKIE_LENGTH 24 + static const char *_name = DM_NAME; static unsigned int major = 0; @@ -1731,11 +1738,7 @@ int dm_resume(struct mapped_device *md) clear_bit(DMF_SUSPENDED, &md->flags); dm_table_unplug_all(map); - - dm_kobject_uevent(md); - r = 0; - out: dm_table_put(map); mutex_unlock(&md->suspend_lock); @@ -1746,9 +1749,19 @@ out: /*----------------------------------------------------------------- * Event notification. 
*---------------------------------------------------------------*/ -void dm_kobject_uevent(struct mapped_device *md) +void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie) { - kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); + char udev_cookie[DM_COOKIE_LENGTH]; + char *envp[] = { udev_cookie, NULL }; + + if (!cookie) + kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + else { + snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", + DM_COOKIE_ENV_VAR_NAME, cookie); + kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); + } } uint32_t dm_next_uevent_seq(struct mapped_device *md) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a31506d93e9..b5935c610c4 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -92,7 +92,8 @@ void dm_stripe_exit(void); int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md); -void dm_kobject_uevent(struct mapped_device *md); +void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie); int dm_kcopyd_init(void); void dm_kcopyd_exit(void); diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 48e44ee2b46..2ab84c83c31 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -123,6 +123,16 @@ struct dm_ioctl { __u32 target_count; /* in/out */ __s32 open_count; /* out */ __u32 flags; /* in/out */ + + /* + * event_nr holds either the event number (input and output) or the + * udev cookie value (input only). + * The DM_DEV_WAIT ioctl takes an event number as input. + * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls + * use the field as a cookie to return in the DM_COOKIE + * variable with the uevents they issue. + * For output, the ioctls return the event number, not the cookie. + */ __u32 event_nr; /* in/out */ __u32 padding; @@ -256,9 +266,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 14 +#define DM_VERSION_MINOR 15 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2008-04-23)" +#define DM_VERSION_EXTRA "-ioctl (2009-04-01)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3-70-g09d2 From 5ab97588fb266187b88d1ad893251c94388f18ba Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 22 Jun 2009 10:12:32 +0100 Subject: dm table: replace struct io_restrictions with struct queue_limits Use blk_stack_limits() to stack block limits (including topology) rather than duplicate the equivalent within Device Mapper. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 138 +++++++++++++----------------------------- include/linux/device-mapper.h | 16 +---- 2 files changed, 45 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e3bcfb8b15a..41ec2bf9fbe 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -66,7 +66,7 @@ struct dm_table { * These are optimistic limits taken from all the * targets, some targets will need smaller limits. */ - struct io_restrictions limits; + struct queue_limits limits; /* events get handed up using this callback */ void (*event_fn)(void *); @@ -88,43 +88,6 @@ static unsigned int int_log(unsigned int n, unsigned int base) return result; } -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? 
l : min(l, r)) - -/* - * Combine two io_restrictions, always taking the lower value. - */ -static void combine_restrictions_low(struct io_restrictions *lhs, - struct io_restrictions *rhs) -{ - lhs->max_sectors = - min_not_zero(lhs->max_sectors, rhs->max_sectors); - - lhs->max_phys_segments = - min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); - - lhs->max_hw_segments = - min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); - - lhs->logical_block_size = max(lhs->logical_block_size, - rhs->logical_block_size); - - lhs->max_segment_size = - min_not_zero(lhs->max_segment_size, rhs->max_segment_size); - - lhs->max_hw_sectors = - min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors); - - lhs->seg_boundary_mask = - min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); - - lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn); - - lhs->no_cluster |= rhs->no_cluster; -} - /* * Calculate the index of the child node of the n'th node k'th key. */ @@ -511,10 +474,14 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, return 0; } +/* + * Returns the minimum that is _not_ zero, unless both are zero. + */ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - struct io_restrictions *rs = &ti->limits; char b[BDEVNAME_SIZE]; if (unlikely(!q)) { @@ -523,15 +490,9 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) return; } - /* - * Combine the device limits low. - * - * FIXME: if we move an io_restriction struct - * into q this would just be a call to - * combine_restrictions_low() - */ - rs->max_sectors = - min_not_zero(rs->max_sectors, queue_max_sectors(q)); + if (blk_stack_limits(&ti->limits, &q->limits, 0) < 0) + DMWARN("%s: target device %s is misaligned", + dm_device_name(ti->table->md), bdevname(bdev, b)); /* * Check if merge fn is supported. 
@@ -540,33 +501,9 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) */ if (q->merge_bvec_fn && !ti->type->merge) - rs->max_sectors = - min_not_zero(rs->max_sectors, + ti->limits.max_sectors = + min_not_zero(ti->limits.max_sectors, (unsigned int) (PAGE_SIZE >> 9)); - - rs->max_phys_segments = - min_not_zero(rs->max_phys_segments, - queue_max_phys_segments(q)); - - rs->max_hw_segments = - min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q)); - - rs->logical_block_size = max(rs->logical_block_size, - queue_logical_block_size(q)); - - rs->max_segment_size = - min_not_zero(rs->max_segment_size, queue_max_segment_size(q)); - - rs->max_hw_sectors = - min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q)); - - rs->seg_boundary_mask = - min_not_zero(rs->seg_boundary_mask, - queue_segment_boundary(q)); - - rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q)); - - rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); } EXPORT_SYMBOL_GPL(dm_set_device_limits); @@ -704,24 +641,32 @@ int dm_split_args(int *argc, char ***argvp, char *input) return 0; } -static void check_for_valid_limits(struct io_restrictions *rs) +static void init_valid_queue_limits(struct queue_limits *limits) { - if (!rs->max_sectors) - rs->max_sectors = SAFE_MAX_SECTORS; - if (!rs->max_hw_sectors) - rs->max_hw_sectors = SAFE_MAX_SECTORS; - if (!rs->max_phys_segments) - rs->max_phys_segments = MAX_PHYS_SEGMENTS; - if (!rs->max_hw_segments) - rs->max_hw_segments = MAX_HW_SEGMENTS; - if (!rs->logical_block_size) - rs->logical_block_size = 1 << SECTOR_SHIFT; - if (!rs->max_segment_size) - rs->max_segment_size = MAX_SEGMENT_SIZE; - if (!rs->seg_boundary_mask) - rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - if (!rs->bounce_pfn) - rs->bounce_pfn = -1; + if (!limits->max_sectors) + limits->max_sectors = SAFE_MAX_SECTORS; + if (!limits->max_hw_sectors) + limits->max_hw_sectors = SAFE_MAX_SECTORS; + if (!limits->max_phys_segments) + limits->max_phys_segments = MAX_PHYS_SEGMENTS; + if (!limits->max_hw_segments) + limits->max_hw_segments = MAX_HW_SEGMENTS; + if (!limits->logical_block_size) + limits->logical_block_size = 1 << SECTOR_SHIFT; + if (!limits->physical_block_size) + limits->physical_block_size = 1 << SECTOR_SHIFT; + if (!limits->io_min) + limits->io_min = 1 << SECTOR_SHIFT; + if (!limits->max_segment_size) + limits->max_segment_size = MAX_SEGMENT_SIZE; + if (!limits->seg_boundary_mask) + limits->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + if (!limits->bounce_pfn) + limits->bounce_pfn = -1; + /* + * The other fields (alignment_offset, io_opt, misaligned) + * hold 0 from the kzalloc(). + */ } /* @@ -841,9 +786,12 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; - /* FIXME: the plan is to combine high here and then have - * the merge fn apply the target level restrictions. 
*/ - combine_restrictions_low(&t->limits, &tgt->limits); + if (blk_stack_limits(&t->limits, &tgt->limits, 0) < 0) + DMWARN("%s: target device (start sect %llu len %llu) " + "is misaligned", + dm_device_name(t->md), + (unsigned long long) tgt->begin, + (unsigned long long) tgt->len); return 0; bad: @@ -886,7 +834,7 @@ int dm_table_complete(struct dm_table *t) int r = 0; unsigned int leaf_nodes; - check_for_valid_limits(&t->limits); + init_valid_queue_limits(&t->limits); r = validate_hardware_logical_block_alignment(t); if (r) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index fc36a4d0772..236880c1dc3 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -144,18 +144,6 @@ struct target_type { struct list_head list; }; -struct io_restrictions { - unsigned long bounce_pfn; - unsigned long seg_boundary_mask; - unsigned max_hw_sectors; - unsigned max_sectors; - unsigned max_segment_size; - unsigned short logical_block_size; - unsigned short max_hw_segments; - unsigned short max_phys_segments; - unsigned char no_cluster; /* inverted so that 0 is default */ -}; - struct dm_target { struct dm_table *table; struct target_type *type; @@ -164,7 +152,7 @@ struct dm_target { sector_t begin; sector_t len; - /* FIXME: turn this into a mask, and merge with io_restrictions */ + /* FIXME: turn this into a mask, and merge with queue_limits */ /* Always a power of 2 */ sector_t split_io; @@ -182,7 +170,7 @@ struct dm_target { * These are automatically filled in by * dm_table_get_device. */ - struct io_restrictions limits; + struct queue_limits limits; /* target specific data */ void *private; -- cgit v1.2.3-70-g09d2 From af4874e03ed82f050d5872d8c39ce64bf16b5c38 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 22 Jun 2009 10:12:33 +0100 Subject: dm targets: introduce iterate_devices fn Add .iterate_devices to 'struct target_type' to allow a function to be called for all devices in a DM target. It is implemented for all targets except those in dm-snap.c (origin and snapshot). (The raid1 version number jumps to 1.12 because we originally reserved 1.1 to 1.11 for 'block_on_error' but ended up using 'handle_errors' instead.)
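To make the callout pattern concrete, a hypothetical caller (all names invented for this example, not from the patch) could count the devices behind a target as follows; a non-zero return from the callout stops the iteration early, which is how the delay and multipath implementations below bail out on error:

/* Hypothetical illustration, not from this patch: count the devices a
 * target uses via the new iterate_devices method. */
#include <linux/device-mapper.h>

static int count_device(struct dm_target *ti, struct dm_dev *dev,
			sector_t physical_start, void *data)
{
	unsigned *count = data;

	(*count)++;
	return 0;		/* zero continues the iteration */
}

static unsigned example_count_devices(struct dm_target *ti)
{
	unsigned count = 0;

	if (ti->type->iterate_devices)
		ti->type->iterate_devices(ti, count_device, &count);

	return count;
}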
Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon Cc: martin.petersen@oracle.com --- drivers/md/dm-crypt.c | 11 ++++++++++- drivers/md/dm-delay.c | 20 +++++++++++++++++++- drivers/md/dm-linear.c | 11 ++++++++++- drivers/md/dm-mpath.c | 23 ++++++++++++++++++++++- drivers/md/dm-raid1.c | 17 ++++++++++++++++- drivers/md/dm-stripe.c | 18 +++++++++++++++++- include/linux/device-mapper.h | 11 +++++++++++ 7 files changed, 105 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 04db6c4004a..9933eb861c7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1313,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static int crypt_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct crypt_config *cc = ti->private; + + return fn(ti, cc->dev, cc->start, data); +} + static struct target_type crypt_target = { .name = "crypt", - .version= {1, 6, 0}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, @@ -1326,6 +1334,7 @@ static struct target_type crypt_target = { .resume = crypt_resume, .message = crypt_message, .merge = crypt_merge, + .iterate_devices = crypt_iterate_devices, }; static int __init dm_crypt_init(void) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 8ad8a9044bb..4e5b843cd4d 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -318,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type, return 0; } +static int delay_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct delay_c *dc = ti->private; + int ret = 0; + + ret = fn(ti, dc->dev_read, dc->start_read, data); + if (ret) + goto out; + + if (dc->dev_write) + ret = fn(ti, dc->dev_write, dc->start_write, data); + +out: + return ret; +} + static struct target_type delay_target = { .name = "delay", - .version = {1, 0, 2}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, @@ -328,6 +345,7 @@ static struct target_type delay_target = { .presuspend = delay_presuspend, .resume = delay_resume, .status = delay_status, + .iterate_devices = delay_iterate_devices, }; static int __init dm_delay_init(void) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index ecbb17421da..9184b6deb86 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -134,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static int linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct linear_c *lc = ti->private; + + return fn(ti, lc->dev, lc->start, data); +} + static struct target_type linear_target = { .name = "linear", - .version= {1, 0, 3}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, @@ -144,6 +152,7 @@ static struct target_type linear_target = { .status = linear_status, .ioctl = linear_ioctl, .merge = linear_merge, + .iterate_devices = linear_iterate_devices, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 890c0e8ed13..f8aeaaa54af 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1450,12 +1450,32 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, return r ? 
: __blkdev_driver_ioctl(bdev, mode, cmd, arg); } +static int multipath_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *p; + int ret = 0; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(p, &pg->pgpaths, list) { + ret = fn(ti, p->path.dev, ti->begin, data); + if (ret) + goto out; + } + } + +out: + return ret; +} + /*----------------------------------------------------------------- * Module setup *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 0, 5}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, @@ -1466,6 +1486,7 @@ static struct target_type multipath_target = { .status = multipath_status, .message = multipath_message, .ioctl = multipath_ioctl, + .iterate_devices = multipath_iterate_devices, }; static int __init dm_multipath_init(void) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 076fbb4e967..ce8868c768c 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type, return 0; } +static int mirror_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct mirror_set *ms = ti->private; + int ret = 0; + unsigned i; + + for (i = 0; !ret && i < ms->nr_mirrors; i++) + ret = fn(ti, ms->mirror[i].dev, + ms->mirror[i].offset, data); + + return ret; +} + static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 20}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, @@ -1295,6 +1309,7 @@ static struct target_type mirror_target = { .postsuspend = mirror_postsuspend, .resume = mirror_resume, .status = mirror_status, + .iterate_devices = mirror_iterate_devices, }; static int __init dm_mirror_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index c64fe827a5f..b240e85ae39 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -313,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, return error; } +static int stripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct stripe_c *sc = ti->private; + int ret = 0; + unsigned i = 0; + + do + ret = fn(ti, sc->stripe[i].dev, + sc->stripe[i].physical_start, data); + while (!ret && ++i < sc->stripes); + + return ret; +} + static struct target_type stripe_target = { .name = "striped", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, .map = stripe_map, .end_io = stripe_end_io, .status = stripe_status, + .iterate_devices = stripe_iterate_devices, }; int __init dm_stripe_init(void) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 236880c1dc3..deac3b4e5e1 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -11,6 +11,7 @@ #include #include +struct dm_dev; struct dm_target; struct dm_table; struct mapped_device; @@ -81,6 +82,15 @@ typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd, typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm, struct bio_vec *biovec, int max_size); +typedef int (*iterate_devices_callout_fn) (struct dm_target *ti, + struct dm_dev *dev, + sector_t physical_start, + void *data); + +typedef 
int (*dm_iterate_devices_fn) (struct dm_target *ti, + iterate_devices_callout_fn fn, + void *data); + /* * Returns: * 0: The target can handle the next I/O immediately. @@ -139,6 +149,7 @@ struct target_type { dm_ioctl_fn ioctl; dm_merge_fn merge; dm_busy_fn busy; + dm_iterate_devices_fn iterate_devices; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3-70-g09d2 From 754c5fc7ebb417b23601a6222a6005cc2e7f2913 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 22 Jun 2009 10:12:34 +0100 Subject: dm: calculate queue limits during resume not load Currently, device-mapper maintains a separate instance of 'struct queue_limits' for each table of each device. When the configuration of a device is to be changed, first its table is loaded and this structure is populated, then the device is 'resumed' and the calculated queue_limits are applied. This places restrictions on how userspace may process related devices, where it is often advantageous to 'load' tables for several devices at once before 'resuming' them together. As the new queue_limits only take effect after the 'resume', if they are changing and one device uses another, the latter must be 'resumed' before the former may be 'loaded'. This patch moves the calculation of these queue_limits out of the 'load' operation into 'resume'. Since we are no longer pre-calculating this struct, we no longer need to maintain copies within our dm structs. dm_set_device_limits() now passes the 'start' of the device's data area (aka pe_start) as the 'offset' to blk_stack_limits(). init_valid_queue_limits() is replaced by blk_set_default_limits(). Signed-off-by: Mike Snitzer Cc: martin.petersen@oracle.com Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 185 +++++++++++++++++++++++------------------- drivers/md/dm.c | 12 ++- drivers/md/dm.h | 5 +- include/linux/device-mapper.h | 10 +-- 4 files changed, 117 insertions(+), 95 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 267817edc84..09a57113955 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -62,12 +62,6 @@ struct dm_table { /* a list of devices used by this table */ struct list_head devices; - /* - * These are optimistic limits taken from all the - * targets, some targets will need smaller limits. - */ - struct queue_limits limits; - /* events get handed up using this callback */ void (*event_fn)(void *); void *event_context; @@ -346,18 +340,21 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) /* * If possible, this checks an area of a destination device is valid. 
*/ -static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev, - sector_t start, sector_t len) +static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, + sector_t start, void *data) { - sector_t dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; + sector_t dev_size = + i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; unsigned short logical_block_size_sectors = - ti->limits.logical_block_size >> SECTOR_SHIFT; + limits->logical_block_size >> SECTOR_SHIFT; char b[BDEVNAME_SIZE]; if (!dev_size) return 1; - if ((start >= dev_size) || (start + len > dev_size)) { + if ((start >= dev_size) || (start + ti->len > dev_size)) { DMWARN("%s: %s too small for target", dm_device_name(ti->table->md), bdevname(bdev, b)); return 0; @@ -371,16 +368,16 @@ static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev, "logical block size %hu of %s", dm_device_name(ti->table->md), (unsigned long long)start, - ti->limits.logical_block_size, bdevname(bdev, b)); + limits->logical_block_size, bdevname(bdev, b)); return 0; } - if (len & (logical_block_size_sectors - 1)) { + if (ti->len & (logical_block_size_sectors - 1)) { DMWARN("%s: len=%llu not aligned to h/w " "logical block size %hu of %s", dm_device_name(ti->table->md), - (unsigned long long)len, - ti->limits.logical_block_size, bdevname(bdev, b)); + (unsigned long long)ti->len, + limits->logical_block_size, bdevname(bdev, b)); return 0; } @@ -479,18 +476,21 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, */ #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) -void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) +int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, + sector_t start, void *data) { + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; struct request_queue *q = bdev_get_queue(bdev); char b[BDEVNAME_SIZE]; if (unlikely(!q)) { DMWARN("%s: Cannot set limits for nonexistent device %s", dm_device_name(ti->table->md), bdevname(bdev, b)); - return; + return 0; } - if (blk_stack_limits(&ti->limits, &q->limits, 0) < 0) + if (blk_stack_limits(limits, &q->limits, start) < 0) DMWARN("%s: target device %s is misaligned", dm_device_name(ti->table->md), bdevname(bdev, b)); @@ -501,32 +501,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) */ if (q->merge_bvec_fn && !ti->type->merge) - ti->limits.max_sectors = - min_not_zero(ti->limits.max_sectors, + limits->max_sectors = + min_not_zero(limits->max_sectors, (unsigned int) (PAGE_SIZE >> 9)); + return 0; } EXPORT_SYMBOL_GPL(dm_set_device_limits); int dm_get_device(struct dm_target *ti, const char *path, sector_t start, sector_t len, fmode_t mode, struct dm_dev **result) { - int r = __table_get_device(ti->table, ti, path, - start, len, mode, result); - - if (r) - return r; - - dm_set_device_limits(ti, (*result)->bdev); - - if (!device_area_is_valid(ti, (*result)->bdev, start, len)) { - dm_put_device(ti, *result); - *result = NULL; - return -EINVAL; - } - - return r; + return __table_get_device(ti->table, ti, path, + start, len, mode, result); } + /* * Decrement a devices use count and remove it if necessary. 
*/ @@ -641,34 +630,6 @@ int dm_split_args(int *argc, char ***argvp, char *input) return 0; } -static void init_valid_queue_limits(struct queue_limits *limits) -{ - if (!limits->max_sectors) - limits->max_sectors = SAFE_MAX_SECTORS; - if (!limits->max_hw_sectors) - limits->max_hw_sectors = SAFE_MAX_SECTORS; - if (!limits->max_phys_segments) - limits->max_phys_segments = MAX_PHYS_SEGMENTS; - if (!limits->max_hw_segments) - limits->max_hw_segments = MAX_HW_SEGMENTS; - if (!limits->logical_block_size) - limits->logical_block_size = 1 << SECTOR_SHIFT; - if (!limits->physical_block_size) - limits->physical_block_size = 1 << SECTOR_SHIFT; - if (!limits->io_min) - limits->io_min = 1 << SECTOR_SHIFT; - if (!limits->max_segment_size) - limits->max_segment_size = MAX_SEGMENT_SIZE; - if (!limits->seg_boundary_mask) - limits->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - if (!limits->bounce_pfn) - limits->bounce_pfn = -1; - /* - * The other fields (alignment_offset, io_opt, misaligned) - * hold 0 from the kzalloc(). - */ -} - /* * Impose necessary and sufficient conditions on a devices's table such * that any incoming bio which respects its logical_block_size can be @@ -676,14 +637,15 @@ static void init_valid_queue_limits(struct queue_limits *limits) * two or more targets, the size of each piece it gets split into must * be compatible with the logical_block_size of the target processing it. */ -static int validate_hardware_logical_block_alignment(struct dm_table *table) +static int validate_hardware_logical_block_alignment(struct dm_table *table, + struct queue_limits *limits) { /* * This function uses arithmetic modulo the logical_block_size * (in units of 512-byte sectors). */ unsigned short device_logical_block_size_sects = - table->limits.logical_block_size >> SECTOR_SHIFT; + limits->logical_block_size >> SECTOR_SHIFT; /* * Offset of the start of the next table entry, mod logical_block_size. @@ -697,6 +659,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table) unsigned short remaining = 0; struct dm_target *uninitialized_var(ti); + struct queue_limits ti_limits; unsigned i = 0; /* @@ -705,12 +668,19 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table) while (i < dm_table_get_num_targets(table)) { ti = dm_table_get_target(table, i++); + blk_set_default_limits(&ti_limits); + + /* combine all target devices' limits */ + if (ti->type->iterate_devices) + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + /* * If the remaining sectors fall entirely within this * table entry are they compatible with its logical_block_size? 
*/ if (remaining < ti->len && - remaining & ((ti->limits.logical_block_size >> + remaining & ((ti_limits.logical_block_size >> SECTOR_SHIFT) - 1)) break; /* Error */ @@ -723,11 +693,11 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table) if (remaining) { DMWARN("%s: table line %u (start sect %llu len %llu) " - "not aligned to hardware logical block size %hu", + "not aligned to h/w logical block size %hu", dm_device_name(table->md), i, (unsigned long long) ti->begin, (unsigned long long) ti->len, - table->limits.logical_block_size); + limits->logical_block_size); return -EINVAL; } @@ -786,12 +756,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; - if (blk_stack_limits(&t->limits, &tgt->limits, 0) < 0) - DMWARN("%s: target device (start sect %llu len %llu) " - "is misaligned", - dm_device_name(t->md), - (unsigned long long) tgt->begin, - (unsigned long long) tgt->len); return 0; bad: @@ -834,12 +798,6 @@ int dm_table_complete(struct dm_table *t) int r = 0; unsigned int leaf_nodes; - init_valid_queue_limits(&t->limits); - - r = validate_hardware_logical_block_alignment(t); - if (r) - return r; - /* how many indexes will the btree have ? */ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); @@ -914,6 +872,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } +/* + * Establish the new table's queue_limits and validate them. + */ +int dm_calculate_queue_limits(struct dm_table *table, + struct queue_limits *limits) +{ + struct dm_target *uninitialized_var(ti); + struct queue_limits ti_limits; + unsigned i = 0; + + blk_set_default_limits(limits); + + while (i < dm_table_get_num_targets(table)) { + blk_set_default_limits(&ti_limits); + + ti = dm_table_get_target(table, i++); + + if (!ti->type->iterate_devices) + goto combine_limits; + + /* + * Combine queue limits of all the devices this target uses. + */ + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + + /* + * Check each device area is consistent with the target's + * overall queue limits. + */ + if (!ti->type->iterate_devices(ti, device_area_is_valid, + &ti_limits)) + return -EINVAL; + +combine_limits: + /* + * Merge this target's queue limits into the overall limits + * for the table. + */ + if (blk_stack_limits(limits, &ti_limits, 0) < 0) + DMWARN("%s: target device " + "(start sect %llu len %llu) " + "is misaligned", + dm_device_name(table->md), + (unsigned long long) ti->begin, + (unsigned long long) ti->len); + } + + return validate_hardware_logical_block_alignment(table, limits); +} + /* * Set the integrity profile for this device if all devices used have * matching profiles. @@ -953,14 +962,24 @@ no_integrity: return; } -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits) { + /* + * Each target device in the table has a data area that should normally + * be aligned such that the DM device's alignment_offset is 0. + * FIXME: Propagate alignment_offsets up the stack and warn of + * sub-optimal or inconsistent settings. 
+ */ + limits->alignment_offset = 0; + limits->misaligned = 0; + /* * Copy table's limits to the DM device's request_queue */ - q->limits = t->limits; + q->limits = *limits; - if (t->limits.no_cluster) + if (limits->no_cluster) queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); else queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a9210bb594e..f609793a92d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1313,7 +1313,8 @@ static void __set_size(struct mapped_device *md, sector_t size) mutex_unlock(&md->bdev->bd_inode->i_mutex); } -static int __bind(struct mapped_device *md, struct dm_table *t) +static int __bind(struct mapped_device *md, struct dm_table *t, + struct queue_limits *limits) { struct request_queue *q = md->queue; sector_t size; @@ -1337,7 +1338,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t) write_lock(&md->map_lock); md->map = t; - dm_table_set_restrictions(t, q); + dm_table_set_restrictions(t, q, limits); write_unlock(&md->map_lock); return 0; @@ -1562,6 +1563,7 @@ static void dm_queue_flush(struct mapped_device *md) */ int dm_swap_table(struct mapped_device *md, struct dm_table *table) { + struct queue_limits limits; int r = -EINVAL; mutex_lock(&md->suspend_lock); @@ -1570,8 +1572,12 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) if (!dm_suspended(md)) goto out; + r = dm_calculate_queue_limits(table, &limits); + if (r) + goto out; + __unbind(md); - r = __bind(md, table); + r = __bind(md, table, &limits); out: mutex_unlock(&md->suspend_lock); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index b5935c610c4..604e85caadf 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -41,7 +41,10 @@ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); +int dm_calculate_queue_limits(struct dm_table *table, + struct queue_limits *limits); +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index deac3b4e5e1..e6bf3b8c7bf 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -103,7 +103,8 @@ void dm_error(const char *message); /* * Combine device limits. */ -void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev); +int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, + sector_t start, void *data); struct dm_dev { struct block_device *bdev; @@ -163,7 +164,6 @@ struct dm_target { sector_t begin; sector_t len; - /* FIXME: turn this into a mask, and merge with queue_limits */ /* Always a power of 2 */ sector_t split_io; @@ -177,12 +177,6 @@ struct dm_target { */ unsigned num_flush_requests; - /* - * These are automatically filled in by - * dm_table_get_device. 
- */ - struct queue_limits limits; - /* target specific data */ void *private; -- cgit v1.2.3-70-g09d2 From f5db4af466e2dca0fe822019812d586ca910b00c Mon Sep 17 00:00:00 2001 From: Jonthan Brassow Date: Mon, 22 Jun 2009 10:12:35 +0100 Subject: dm raid1: add userspace log This patch contains a device-mapper mirror log module that forwards requests to userspace for processing. The structures used for communication between kernel and userspace are located in include/linux/dm-log-userspace.h. Due to the frequency, diversity, and 2-way communication nature of the exchanges between kernel and userspace, 'connector' was chosen as the interface for communication. The first log implementations written in userspace - "clustered-disk" and "clustered-core" - support clustered shared storage. A userspace daemon (in the LVM2 source code repository) uses openAIS/corosync to process requests in an ordered fashion with the rest of the nodes in the cluster so as to prevent log state corruption. Other implementations with no association to LVM or openAIS/corosync, are certainly possible. (Imagine if two machines are writing to the same region of a mirror. They would both mark the region dirty, but you need a cluster-aware entity that can handle properly marking the region clean when they are done. Otherwise, you might clear the region when the first machine is done, not the second.) Signed-off-by: Jonathan Brassow Cc: Evgeniy Polyakov Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/dm-log.txt | 54 +++ drivers/md/Kconfig | 11 + drivers/md/Makefile | 3 + drivers/md/dm-log-userspace-base.c | 696 +++++++++++++++++++++++++++++++++ drivers/md/dm-log-userspace-transfer.c | 276 +++++++++++++ drivers/md/dm-log-userspace-transfer.h | 18 + include/linux/Kbuild | 1 + include/linux/connector.h | 4 +- include/linux/dm-log-userspace.h | 386 ++++++++++++++++++ 9 files changed, 1448 insertions(+), 1 deletion(-) create mode 100644 Documentation/device-mapper/dm-log.txt create mode 100644 drivers/md/dm-log-userspace-base.c create mode 100644 drivers/md/dm-log-userspace-transfer.c create mode 100644 drivers/md/dm-log-userspace-transfer.h create mode 100644 include/linux/dm-log-userspace.h (limited to 'include/linux') diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt new file mode 100644 index 00000000000..994dd75475a --- /dev/null +++ b/Documentation/device-mapper/dm-log.txt @@ -0,0 +1,54 @@ +Device-Mapper Logging +===================== +The device-mapper logging code is used by some of the device-mapper +RAID targets to track regions of the disk that are not consistent. +A region (or portion of the address space) of the disk may be +inconsistent because a RAID stripe is currently being operated on or +a machine died while the region was being altered. In the case of +mirrors, a region would be considered dirty/inconsistent while you +are writing to it because the writes need to be replicated for all +the legs of the mirror and may not reach the legs at the same time. +Once all writes are complete, the region is considered clean again. + +There is a generic logging interface that the device-mapper RAID +implementations use to perform logging operations (see +dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different +logging implementations are available and provide different +capabilities. 
The list includes: + +Type Files +==== ===== +disk drivers/md/dm-log.c +core drivers/md/dm-log.c +userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h + +The "disk" log type +------------------- +This log implementation commits the log state to disk. This way, the +logging state survives reboots/crashes. + +The "core" log type +------------------- +This log implementation keeps the log state in memory. The log state +will not survive a reboot or crash, but there may be a small boost in +performance. This method can also be used if no storage device is +available for storing log state. + +The "userspace" log type +------------------------ +This log type simply provides a way to export the log API to userspace, +so log implementations can be done there. This is done by forwarding most +logging requests to userspace, where a daemon receives and processes the +request. + +The structure used for communication between kernel and userspace are +located in include/linux/dm-log-userspace.h. Due to the frequency, +diversity, and 2-way communication nature of the exchanges between +kernel and userspace, 'connector' is used as the interface for +communication. + +There are currently two userspace log implementations that leverage this +framework - "clustered_disk" and "clustered_core". These implementations +provide a cluster-coherent log for shared-storage. Device-mapper mirroring +can be used in a shared-storage environment when the cluster log implementations +are employed. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 09f93fa6891..020f9573fd8 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -231,6 +231,17 @@ config DM_MIRROR Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. +config DM_LOG_USERSPACE + tristate "Mirror userspace logging (EXPERIMENTAL)" + depends on DM_MIRROR && EXPERIMENTAL && NET + select CONNECTOR + ---help--- + The userspace logging module provides a mechanism for + relaying the dm-dirty-log API to userspace. Log designs + which are more suited to userspace implementation (e.g. + shared storage logs) or experimental logs can be implemented + by leveraging this framework. + config DM_ZERO tristate "Zero target" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index dade52f6073..1dc4185bd78 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -8,6 +8,8 @@ dm-multipath-y += dm-path-selector.o dm-mpath.o dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o dm-mirror-y += dm-raid1.o +dm-log-userspace-y \ + += dm-log-userspace-base.o dm-log-userspace-transfer.o md-mod-y += md.o bitmap.o raid456-y += raid5.o raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ @@ -40,6 +42,7 @@ obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o +obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o quiet_cmd_unroll = UNROLL $@ diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 00000000000..e69b9656099 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c @@ -0,0 +1,696 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. 
+ */
+
+#include <linux/bio.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+
+#include "dm-log-userspace-transfer.h"
+
+struct flush_entry {
+	int type;
+	region_t region;
+	struct list_head list;
+};
+
+struct log_c {
+	struct dm_target *ti;
+	uint32_t region_size;
+	region_t region_count;
+	char uuid[DM_UUID_LEN];
+
+	char *usr_argv_str;
+	uint32_t usr_argc;
+
+	/*
+	 * in_sync_hint gets set when doing is_remote_recovering.  It
+	 * represents the first region that needs recovery.  IOW, the
+	 * first zero bit of sync_bits.  This can be useful to limit
+	 * traffic for calls like is_remote_recovering and get_resync_work,
+	 * but take care in its use for anything else.
+	 */
+	uint64_t in_sync_hint;
+
+	spinlock_t flush_lock;
+	struct list_head flush_list;  /* only for clear and mark requests */
+};
+
+static mempool_t *flush_entry_pool;
+
+static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct flush_entry), gfp_mask);
+}
+
+static void flush_entry_free(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
+static int userspace_do_request(struct log_c *lc, const char *uuid,
+				int request_type, char *data, size_t data_size,
+				char *rdata, size_t *rdata_size)
+{
+	int r;
+
+	/*
+	 * If the server isn't there, -ESRCH is returned,
+	 * and we must keep trying until the server is
+	 * restored.
+	 */
+retry:
+	r = dm_consult_userspace(uuid, request_type, data,
+				 data_size, rdata, rdata_size);
+
+	if (r != -ESRCH)
+		return r;
+
+	DMERR("Userspace log server not found.");
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(2*HZ);
+		DMWARN("Attempting to contact userspace log server...");
+		r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
+					 strlen(lc->usr_argv_str) + 1,
+					 NULL, NULL);
+		if (!r)
+			break;
+	}
+	DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
+	r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
+				 0, NULL, NULL);
+	if (!r)
+		goto retry;
+
+	DMERR("Error trying to resume userspace log: %d", r);
+
+	return -ESRCH;
+}
+
+static int build_constructor_string(struct dm_target *ti,
+				    unsigned argc, char **argv,
+				    char **ctr_str)
+{
+	int i, str_size;
+	char *str = NULL;
+
+	*ctr_str = NULL;
+
+	for (i = 0, str_size = 0; i < argc; i++)
+		str_size += strlen(argv[i]) + 1; /* +1 for space between args */
+
+	str_size += 20; /* Max number of chars in a printed u64 number */
+
+	str = kzalloc(str_size, GFP_KERNEL);
+	if (!str) {
+		DMWARN("Unable to allocate memory for constructor string");
+		return -ENOMEM;
+	}
+
+	for (i = 0, str_size = 0; i < argc; i++)
+		str_size += sprintf(str + str_size, "%s ", argv[i]);
+	str_size += sprintf(str + str_size, "%llu",
+			    (unsigned long long)ti->len);
+
+	*ctr_str = str;
+	return str_size;
+}
+
+/*
+ * userspace_ctr
+ *
+ * argv contains:
+ *	<UUID> <other args>
+ * Where 'other args' is the userspace implementation specific log
+ * arguments.  An example might be:
+ *	<UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
+ *
+ * So, this module will strip off the <UUID> for identification purposes
+ * when communicating with userspace about a log; but will pass on everything
+ * else.
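+ *
+ * As a concrete sketch (the UUID and argument values here are
+ * illustrative only, not taken from a real configuration): given the
+ * table arguments
+ *
+ *	MyLogUUID clustered_disk 2 253:4 1024
+ *
+ * this module records "MyLogUUID" and, via build_constructor_string(),
+ * forwards "clustered_disk 2 253:4 1024 <ti->len>" to the userspace
+ * server registered under that UUID.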
+ */ +static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, + unsigned argc, char **argv) +{ + int r = 0; + int str_size; + char *ctr_str = NULL; + struct log_c *lc = NULL; + uint64_t rdata; + size_t rdata_size = sizeof(rdata); + + if (argc < 3) { + DMWARN("Too few arguments to userspace dirty log"); + return -EINVAL; + } + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (!lc) { + DMWARN("Unable to allocate userspace log context."); + return -ENOMEM; + } + + lc->ti = ti; + + if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { + DMWARN("UUID argument too long."); + kfree(lc); + return -EINVAL; + } + + strncpy(lc->uuid, argv[0], DM_UUID_LEN); + spin_lock_init(&lc->flush_lock); + INIT_LIST_HEAD(&lc->flush_list); + + str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); + if (str_size < 0) { + kfree(lc); + return str_size; + } + + /* Send table string */ + r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR, + ctr_str, str_size, NULL, NULL); + + if (r == -ESRCH) { + DMERR("Userspace log server not found"); + goto out; + } + + /* Since the region size does not change, get it now */ + rdata_size = sizeof(rdata); + r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE, + NULL, 0, (char *)&rdata, &rdata_size); + + if (r) { + DMERR("Failed to get region size of dirty log"); + goto out; + } + + lc->region_size = (uint32_t)rdata; + lc->region_count = dm_sector_div_up(ti->len, lc->region_size); + +out: + if (r) { + kfree(lc); + kfree(ctr_str); + } else { + lc->usr_argv_str = ctr_str; + lc->usr_argc = argc; + log->context = lc; + } + + return r; +} + +static void userspace_dtr(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR, + NULL, 0, + NULL, NULL); + + kfree(lc->usr_argv_str); + kfree(lc); + + return; +} + +static int userspace_presuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND, + NULL, 0, + NULL, NULL); + + return r; +} + +static int userspace_postsuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND, + NULL, 0, + NULL, NULL); + + return r; +} + +static int userspace_resume(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + lc->in_sync_hint = 0; + r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME, + NULL, 0, + NULL, NULL); + + return r; +} + +static uint32_t userspace_get_region_size(struct dm_dirty_log *log) +{ + struct log_c *lc = log->context; + + return lc->region_size; +} + +/* + * userspace_is_clean + * + * Check whether a region is clean. If there is any sort of + * failure when consulting the server, we return not clean. + * + * Returns: 1 if clean, 0 otherwise + */ +static int userspace_is_clean(struct dm_dirty_log *log, region_t region) +{ + int r; + uint64_t region64 = (uint64_t)region; + int64_t is_clean; + size_t rdata_size; + struct log_c *lc = log->context; + + rdata_size = sizeof(is_clean); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, + (char *)®ion64, sizeof(region64), + (char *)&is_clean, &rdata_size); + + return (r) ? 0 : (int)is_clean; +} + +/* + * userspace_in_sync + * + * Check if the region is in-sync. If there is any sort + * of failure when consulting the server, we assume that + * the region is not in sync. 
+ * + * If 'can_block' is set, return immediately + * + * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK + */ +static int userspace_in_sync(struct dm_dirty_log *log, region_t region, + int can_block) +{ + int r; + uint64_t region64 = region; + int64_t in_sync; + size_t rdata_size; + struct log_c *lc = log->context; + + /* + * We can never respond directly - even if in_sync_hint is + * set. This is because another machine could see a device + * failure and mark the region out-of-sync. If we don't go + * to userspace to ask, we might think the region is in-sync + * and allow a read to pick up data that is stale. (This is + * very unlikely if a device actually fails; but it is very + * likely if a connection to one device from one machine fails.) + * + * There still might be a problem if the mirror caches the region + * state as in-sync... but then this call would not be made. So, + * that is a mirror problem. + */ + if (!can_block) + return -EWOULDBLOCK; + + rdata_size = sizeof(in_sync); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, + (char *)®ion64, sizeof(region64), + (char *)&in_sync, &rdata_size); + return (r) ? 0 : (int)in_sync; +} + +/* + * userspace_flush + * + * This function is ok to block. + * The flush happens in two stages. First, it sends all + * clear/mark requests that are on the list. Then it + * tells the server to commit them. This gives the + * server a chance to optimise the commit, instead of + * doing it for every request. + * + * Additionally, we could implement another thread that + * sends the requests up to the server - reducing the + * load on flush. Then the flush would have less in + * the list and be responsible for the finishing commit. + * + * Returns: 0 on success, < 0 on failure + */ +static int userspace_flush(struct dm_dirty_log *log) +{ + int r = 0; + unsigned long flags; + struct log_c *lc = log->context; + LIST_HEAD(flush_list); + struct flush_entry *fe, *tmp_fe; + + spin_lock_irqsave(&lc->flush_lock, flags); + list_splice_init(&lc->flush_list, &flush_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + if (list_empty(&flush_list)) + return 0; + + /* + * FIXME: Count up requests, group request types, + * allocate memory to stick all requests in and + * send to server in one go. Failing the allocation, + * do it one by one. + */ + + list_for_each_entry(fe, &flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + goto fail; + } + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, + NULL, 0, NULL, NULL); + +fail: + /* + * We can safely remove these entries, even if failure. + * Calling code will receive an error and will know that + * the log facility has failed. + */ + list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + + if (r) + dm_table_event(lc->ti->table); + + return r; +} + +/* + * userspace_mark_region + * + * This function should avoid blocking unless absolutely required. + * (Memory allocation is valid for blocking.) 
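+ *
+ * (In this implementation, "valid for blocking" translates into the
+ * GFP_NOIO mempool_alloc() below, which may wait but never fails;
+ * contrast userspace_clear_region() further down, which must not
+ * block and therefore allocates with GFP_ATOMIC and tolerates
+ * failure.)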
+ */ +static void userspace_mark_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct flush_entry *fe; + + /* Wait for an allocation, but _never_ fail */ + fe = mempool_alloc(flush_entry_pool, GFP_NOIO); + BUG_ON(!fe); + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_MARK_REGION; + fe->region = region; + list_add(&fe->list, &lc->flush_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_clear_region + * + * This function must not block. + * So, the alloc can't block. In the worst case, it is ok to + * fail. It would simply mean we can't clear the region. + * Does nothing to current sync context, but does mean + * the region will be re-sync'ed on a reload of the mirror + * even though it is in-sync. + */ +static void userspace_clear_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct flush_entry *fe; + + /* + * If we fail to allocate, we skip the clearing of + * the region. This doesn't hurt us in any way, except + * to cause the region to be resync'ed when the + * device is activated next time. + */ + fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); + if (!fe) { + DMERR("Failed to allocate memory to clear region."); + return; + } + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_CLEAR_REGION; + fe->region = region; + list_add(&fe->list, &lc->flush_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_get_resync_work + * + * Get a region that needs recovery. It is valid to return + * an error for this function. + * + * Returns: 1 if region filled, 0 if no work, <0 on error + */ +static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) +{ + int r; + size_t rdata_size; + struct log_c *lc = log->context; + struct { + int64_t i; /* 64-bit for mix arch compatibility */ + region_t r; + } pkg; + + if (lc->in_sync_hint >= lc->region_count) + return 0; + + rdata_size = sizeof(pkg); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, + NULL, 0, + (char *)&pkg, &rdata_size); + + *region = pkg.r; + return (r) ? r : (int)pkg.i; +} + +/* + * userspace_set_region_sync + * + * Set the sync status of a given region. This function + * must not fail. + */ +static void userspace_set_region_sync(struct dm_dirty_log *log, + region_t region, int in_sync) +{ + int r; + struct log_c *lc = log->context; + struct { + region_t r; + int64_t i; + } pkg; + + pkg.r = region; + pkg.i = (int64_t)in_sync; + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, + (char *)&pkg, sizeof(pkg), + NULL, NULL); + + /* + * It would be nice to be able to report failures. + * However, it is easy emough to detect and resolve. + */ + return; +} + +/* + * userspace_get_sync_count + * + * If there is any sort of failure when consulting the server, + * we assume that the sync count is zero. 
+ * + * Returns: sync count on success, 0 on failure + */ +static region_t userspace_get_sync_count(struct dm_dirty_log *log) +{ + int r; + size_t rdata_size; + uint64_t sync_count; + struct log_c *lc = log->context; + + rdata_size = sizeof(sync_count); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, + NULL, 0, + (char *)&sync_count, &rdata_size); + + if (r) + return 0; + + if (sync_count >= lc->region_count) + lc->in_sync_hint = lc->region_count; + + return (region_t)sync_count; +} + +/* + * userspace_status + * + * Returns: amount of space consumed + */ +static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, + char *result, unsigned maxlen) +{ + int r = 0; + size_t sz = (size_t)maxlen; + struct log_c *lc = log->context; + + switch (status_type) { + case STATUSTYPE_INFO: + r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, + NULL, 0, + result, &sz); + + if (r) { + sz = 0; + DMEMIT("%s 1 COM_FAILURE", log->type->name); + } + break; + case STATUSTYPE_TABLE: + sz = 0; + DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1, + lc->uuid, lc->usr_argv_str); + break; + } + return (r) ? 0 : (int)sz; +} + +/* + * userspace_is_remote_recovering + * + * Returns: 1 if region recovering, 0 otherwise + */ +static int userspace_is_remote_recovering(struct dm_dirty_log *log, + region_t region) +{ + int r; + uint64_t region64 = region; + struct log_c *lc = log->context; + static unsigned long long limit; + struct { + int64_t is_recovering; + uint64_t in_sync_hint; + } pkg; + size_t rdata_size = sizeof(pkg); + + /* + * Once the mirror has been reported to be in-sync, + * it will never again ask for recovery work. So, + * we can safely say there is not a remote machine + * recovering if the device is in-sync. (in_sync_hint + * must be reset at resume time.) 
+ */ + if (region < lc->in_sync_hint) + return 0; + else if (jiffies < limit) + return 1; + + limit = jiffies + (HZ / 4); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, + (char *)®ion64, sizeof(region64), + (char *)&pkg, &rdata_size); + if (r) + return 1; + + lc->in_sync_hint = pkg.in_sync_hint; + + return (int)pkg.is_recovering; +} + +static struct dm_dirty_log_type _userspace_type = { + .name = "userspace", + .module = THIS_MODULE, + .ctr = userspace_ctr, + .dtr = userspace_dtr, + .presuspend = userspace_presuspend, + .postsuspend = userspace_postsuspend, + .resume = userspace_resume, + .get_region_size = userspace_get_region_size, + .is_clean = userspace_is_clean, + .in_sync = userspace_in_sync, + .flush = userspace_flush, + .mark_region = userspace_mark_region, + .clear_region = userspace_clear_region, + .get_resync_work = userspace_get_resync_work, + .set_region_sync = userspace_set_region_sync, + .get_sync_count = userspace_get_sync_count, + .status = userspace_status, + .is_remote_recovering = userspace_is_remote_recovering, +}; + +static int __init userspace_dirty_log_init(void) +{ + int r = 0; + + flush_entry_pool = mempool_create(100, flush_entry_alloc, + flush_entry_free, NULL); + + if (!flush_entry_pool) { + DMWARN("Unable to create flush_entry_pool: No memory."); + return -ENOMEM; + } + + r = dm_ulog_tfr_init(); + if (r) { + DMWARN("Unable to initialize userspace log communications"); + mempool_destroy(flush_entry_pool); + return r; + } + + r = dm_dirty_log_type_register(&_userspace_type); + if (r) { + DMWARN("Couldn't register userspace dirty log type"); + dm_ulog_tfr_exit(); + mempool_destroy(flush_entry_pool); + return r; + } + + DMINFO("version 1.0.0 loaded"); + return 0; +} + +static void __exit userspace_dirty_log_exit(void) +{ + dm_dirty_log_type_unregister(&_userspace_type); + dm_ulog_tfr_exit(); + mempool_destroy(flush_entry_pool); + + DMINFO("version 1.0.0 unloaded"); + return; +} + +module_init(userspace_dirty_log_init); +module_exit(userspace_dirty_log_exit); + +MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); +MODULE_AUTHOR("Jonathan Brassow "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 00000000000..0ca1ee768a1 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c @@ -0,0 +1,276 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "dm-log-userspace-transfer.h" + +static uint32_t dm_ulog_seq; + +/* + * Netlink/Connector is an unreliable protocol. How long should + * we wait for a response before assuming it was lost and retrying? + * (If we do receive a response after this time, it will be discarded + * and the response to the resent request will be waited for. 
+ */ +#define DM_ULOG_RETRY_TIMEOUT (15 * HZ) + +/* + * Pre-allocated space for speed + */ +#define DM_ULOG_PREALLOCED_SIZE 512 +static struct cn_msg *prealloced_cn_msg; +static struct dm_ulog_request *prealloced_ulog_tfr; + +static struct cb_id ulog_cn_id = { + .idx = CN_IDX_DM, + .val = CN_VAL_DM_USERSPACE_LOG +}; + +static DEFINE_MUTEX(dm_ulog_lock); + +struct receiving_pkg { + struct list_head list; + struct completion complete; + + uint32_t seq; + + int error; + size_t *data_size; + char *data; +}; + +static DEFINE_SPINLOCK(receiving_list_lock); +static struct list_head receiving_list; + +static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) +{ + int r; + struct cn_msg *msg = prealloced_cn_msg; + + memset(msg, 0, sizeof(struct cn_msg)); + + msg->id.idx = ulog_cn_id.idx; + msg->id.val = ulog_cn_id.val; + msg->ack = 0; + msg->seq = tfr->seq; + msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; + + r = cn_netlink_send(msg, 0, gfp_any()); + + return r; +} + +/* + * Parameters for this function can be either msg or tfr, but not + * both. This function fills in the reply for a waiting request. + * If just msg is given, then the reply is simply an ACK from userspace + * that the request was received. + * + * Returns: 0 on success, -ENOENT on failure + */ +static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) +{ + uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; + struct receiving_pkg *pkg; + + /* + * The 'receiving_pkg' entries in this list are statically + * allocated on the stack in 'dm_consult_userspace'. + * Each process that is waiting for a reply from the user + * space server will have an entry in this list. + * + * We are safe to do it this way because the stack space + * is unique to each process, but still addressable by + * other processes. + */ + list_for_each_entry(pkg, &receiving_list, list) { + if (rtn_seq != pkg->seq) + continue; + + if (msg) { + pkg->error = -msg->ack; + /* + * If we are trying again, we will need to know our + * storage capacity. Otherwise, along with the + * error code, we make explicit that we have no data. + */ + if (pkg->error != -EAGAIN) + *(pkg->data_size) = 0; + } else if (tfr->data_size > *(pkg->data_size)) { + DMERR("Insufficient space to receive package [%u] " + "(%u vs %lu)", tfr->request_type, + tfr->data_size, *(pkg->data_size)); + + *(pkg->data_size) = 0; + pkg->error = -ENOSPC; + } else { + pkg->error = tfr->error; + memcpy(pkg->data, tfr->data, tfr->data_size); + *(pkg->data_size) = tfr->data_size; + } + complete(&pkg->complete); + return 0; + } + + return -ENOENT; +} + +/* + * This is the connector callback that delivers data + * that was sent from userspace. + */ +static void cn_ulog_callback(void *data) +{ + struct cn_msg *msg = (struct cn_msg *)data; + struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); + + spin_lock(&receiving_list_lock); + if (msg->len == 0) + fill_pkg(msg, NULL); + else if (msg->len < sizeof(*tfr)) + DMERR("Incomplete message received (expected %u, got %u): [%u]", + (unsigned)sizeof(*tfr), msg->len, msg->seq); + else + fill_pkg(NULL, tfr); + spin_unlock(&receiving_list_lock); +} + +/** + * dm_consult_userspace + * @uuid: log's uuid (must be DM_UUID_LEN in size) + * @request_type: found in include/linux/dm-log-userspace.h + * @data: data to tx to the server + * @data_size: size of data in bytes + * @rdata: place to put return data from server + * @rdata_size: value-result (amount of space given/amount of space used) + * + * rdata_size is undefined on failure. 
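+ *
+ * A typical kernel-side call (this one is taken from userspace_ctr()
+ * in dm-log-userspace-base.c) looks like:
+ *
+ *	r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
+ *				 NULL, 0, (char *)&rdata, &rdata_size);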
+ * + * Memory used to communicate with userspace is zero'ed + * before populating to ensure that no unwanted bits leak + * from kernel space to user-space. All userspace log communications + * between kernel and user space go through this function. + * + * Returns: 0 on success, -EXXX on failure + **/ +int dm_consult_userspace(const char *uuid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size) +{ + int r = 0; + size_t dummy = 0; + int overhead_size = + sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); + struct dm_ulog_request *tfr = prealloced_ulog_tfr; + struct receiving_pkg pkg; + + if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { + DMINFO("Size of tfr exceeds preallocated size"); + return -EINVAL; + } + + if (!rdata_size) + rdata_size = &dummy; +resend: + /* + * We serialize the sending of requests so we can + * use the preallocated space. + */ + mutex_lock(&dm_ulog_lock); + + memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); + memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->seq = dm_ulog_seq++; + + /* + * Must be valid request type (all other bits set to + * zero). This reserves other bits for possible future + * use. + */ + tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; + + tfr->data_size = data_size; + if (data && data_size) + memcpy(tfr->data, data, data_size); + + memset(&pkg, 0, sizeof(pkg)); + init_completion(&pkg.complete); + pkg.seq = tfr->seq; + pkg.data_size = rdata_size; + pkg.data = rdata; + spin_lock(&receiving_list_lock); + list_add(&(pkg.list), &receiving_list); + spin_unlock(&receiving_list_lock); + + r = dm_ulog_sendto_server(tfr); + + mutex_unlock(&dm_ulog_lock); + + if (r) { + DMERR("Unable to send log request [%u] to userspace: %d", + request_type, r); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + + goto out; + } + + r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + if (!r) { + DMWARN("[%s] Request timed out: [%u/%u] - retrying", + (strlen(uuid) > 8) ? + (uuid + (strlen(uuid) - 8)) : (uuid), + request_type, pkg.seq); + goto resend; + } + + r = pkg.error; + if (r == -EAGAIN) + goto resend; + +out: + return r; +} + +int dm_ulog_tfr_init(void) +{ + int r; + void *prealloced; + + INIT_LIST_HEAD(&receiving_list); + + prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); + if (!prealloced) + return -ENOMEM; + + prealloced_cn_msg = prealloced; + prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); + + r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); + if (r) { + cn_del_callback(&ulog_cn_id); + return r; + } + + return 0; +} + +void dm_ulog_tfr_exit(void) +{ + cn_del_callback(&ulog_cn_id); + kfree(prealloced_cn_msg); +} diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 00000000000..c26d8e4e271 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. 
+ */ + +#ifndef __DM_LOG_USERSPACE_TRANSFER_H__ +#define __DM_LOG_USERSPACE_TRANSFER_H__ + +#define DM_MSG_PREFIX "dm-log-userspace" + +int dm_ulog_tfr_init(void); +void dm_ulog_tfr_exit(void); +int dm_consult_userspace(const char *uuid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size); + +#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 03f22076381..334a3593cdf 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -57,6 +57,7 @@ header-y += dlmconstants.h header-y += dlm_device.h header-y += dlm_netlink.h header-y += dm-ioctl.h +header-y += dm-log-userspace.h header-y += dn.h header-y += dqblk_xfs.h header-y += efs_fs_sb.h diff --git a/include/linux/connector.h b/include/linux/connector.h index b9966e64604..b68d27850d5 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -41,8 +41,10 @@ #define CN_IDX_BB 0x5 /* BlackBoard, from the TSP GPL sampling framework */ #define CN_DST_IDX 0x6 #define CN_DST_VAL 0x1 +#define CN_IDX_DM 0x7 /* Device Mapper */ +#define CN_VAL_DM_USERSPACE_LOG 0x1 -#define CN_NETLINK_USERS 7 +#define CN_NETLINK_USERS 8 /* * Maximum connector's message size. diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h new file mode 100644 index 00000000000..642e3017b51 --- /dev/null +++ b/include/linux/dm-log-userspace.h @@ -0,0 +1,386 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#ifndef __DM_LOG_USERSPACE_H__ +#define __DM_LOG_USERSPACE_H__ + +#include /* For DM_UUID_LEN */ + +/* + * The device-mapper userspace log module consists of a kernel component and + * a user-space component. The kernel component implements the API defined + * in dm-dirty-log.h. Its purpose is simply to pass the parameters and + * return values of those API functions between kernel and user-space. + * + * Below are defined the 'request_types' - DM_ULOG_CTR, DM_ULOG_DTR, etc. + * These request types represent the different functions in the device-mapper + * dirty log API. Each of these is described in more detail below. + * + * The user-space program must listen for requests from the kernel (representing + * the various API functions) and process them. + * + * User-space begins by setting up the communication link (error checking + * removed for clarity): + * fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); + * addr.nl_family = AF_NETLINK; + * addr.nl_groups = CN_IDX_DM; + * addr.nl_pid = 0; + * r = bind(fd, (struct sockaddr *) &addr, sizeof(addr)); + * opt = addr.nl_groups; + * setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &opt, sizeof(opt)); + * + * User-space will then wait to receive requests form the kernel, which it + * will process as described below. The requests are received in the form, + * ((struct dm_ulog_request) + (additional data)). Depending on the request + * type, there may or may not be 'additional data'. In the descriptions below, + * you will see 'Payload-to-userspace' and 'Payload-to-kernel'. The + * 'Payload-to-userspace' is what the kernel sends in 'additional data' as + * necessary parameters to complete the request. The 'Payload-to-kernel' is + * the 'additional data' returned to the kernel that contains the necessary + * results of the request. The 'data_size' field in the dm_ulog_request + * structure denotes the availability and amount of payload data. 
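+ *
+ * Continuing the setup sketch above, the daemon's receive path might
+ * look like this (error handling omitted; the buffer size and the
+ * handle_request() helper are illustrative, not part of this
+ * interface):
+ *
+ *	char buf[2048];
+ *	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
+ *	struct cn_msg *msg;
+ *	struct dm_ulog_request *rq;
+ *
+ *	recv(fd, buf, sizeof(buf), 0);
+ *	msg = (struct cn_msg *)NLMSG_DATA(nlh);
+ *	rq = (struct dm_ulog_request *)msg->data;
+ *	handle_request(rq);
+ *
+ * where handle_request() dispatches on
+ * DM_ULOG_REQUEST_TYPE(rq->request_type) as described below.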
+ */
+
+/*
+ * DM_ULOG_CTR corresponds to (found in dm-dirty-log.h):
+ * int (*ctr)(struct dm_dirty_log *log, struct dm_target *ti,
+ *	      unsigned argc, char **argv);
+ *
+ * Payload-to-userspace:
+ *	A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * The UUID contained in the dm_ulog_request structure is the reference that
+ * will be used by all request types to a specific log.  The constructor must
+ * record this association with the instance created.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_CTR 1
+
+/*
+ * DM_ULOG_DTR corresponds to (found in dm-dirty-log.h):
+ * void (*dtr)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being destroyed.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_DTR 2
+
+/*
+ * DM_ULOG_PRESUSPEND corresponds to (found in dm-dirty-log.h):
+ * int (*presuspend)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being presuspended.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_PRESUSPEND 3
+
+/*
+ * DM_ULOG_POSTSUSPEND corresponds to (found in dm-dirty-log.h):
+ * int (*postsuspend)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being postsuspended.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_POSTSUSPEND 4
+
+/*
+ * DM_ULOG_RESUME corresponds to (found in dm-dirty-log.h):
+ * int (*resume)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being resumed.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_RESUME 5
+
+/*
+ * DM_ULOG_GET_REGION_SIZE corresponds to (found in dm-dirty-log.h):
+ * uint32_t (*get_region_size)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	uint64_t - contains the region size
+ *
+ * The region size is something that was determined at constructor time.
+ * It is returned in the payload area and 'data_size' is set to
+ * reflect this.
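+ *
+ * As a sketch of the server side of this request (illustrative only;
+ * 'rq' is the received dm_ulog_request and 'my_region_size' is
+ * whatever the daemon chose at DM_ULOG_CTR time):
+ *
+ *	uint64_t rs = my_region_size;
+ *
+ *	memcpy(rq->data, &rs, sizeof(rs));
+ *	rq->data_size = sizeof(rs);
+ *	rq->error = 0;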
+ * + * When the request has been processed, user-space must return the + * dm_ulog_request to the kernel - setting the 'error' field appropriately. + */ +#define DM_ULOG_GET_REGION_SIZE 6 + +/* + * DM_ULOG_IS_CLEAN corresponds to (found in dm-dirty-log.h): + * int (*is_clean)(struct dm_dirty_log *log, region_t region); + * + * Payload-to-userspace: + * uint64_t - the region to get clean status on + * Payload-to-kernel: + * int64_t - 1 if clean, 0 otherwise + * + * Payload is sizeof(uint64_t) and contains the region for which the clean + * status is being made. + * + * When the request has been processed, user-space must return the + * dm_ulog_request to the kernel - filling the payload with 0 (not clean) or + * 1 (clean), setting 'data_size' and 'error' appropriately. + */ +#define DM_ULOG_IS_CLEAN 7 + +/* + * DM_ULOG_IN_SYNC corresponds to (found in dm-dirty-log.h): + * int (*in_sync)(struct dm_dirty_log *log, region_t region, + * int can_block); + * + * Payload-to-userspace: + * uint64_t - the region to get sync status on + * Payload-to-kernel: + * int64_t - 1 if in-sync, 0 otherwise + * + * Exactly the same as 'is_clean' above, except this time asking "has the + * region been recovered?" vs. "is the region not being modified?" + */ +#define DM_ULOG_IN_SYNC 8 + +/* + * DM_ULOG_FLUSH corresponds to (found in dm-dirty-log.h): + * int (*flush)(struct dm_dirty_log *log); + * + * Payload-to-userspace: + * None. + * Payload-to-kernel: + * None. + * + * No incoming or outgoing payload. Simply flush log state to disk. + * + * When the request has been processed, user-space must return the + * dm_ulog_request to the kernel - setting the 'error' field and clearing + * 'data_size' appropriately. + */ +#define DM_ULOG_FLUSH 9 + +/* + * DM_ULOG_MARK_REGION corresponds to (found in dm-dirty-log.h): + * void (*mark_region)(struct dm_dirty_log *log, region_t region); + * + * Payload-to-userspace: + * uint64_t [] - region(s) to mark + * Payload-to-kernel: + * None. + * + * Incoming payload contains the one or more regions to mark dirty. + * The number of regions contained in the payload can be determined from + * 'data_size/sizeof(uint64_t)'. + * + * When the request has been processed, user-space must return the + * dm_ulog_request to the kernel - setting the 'error' field and clearing + * 'data_size' appropriately. + */ +#define DM_ULOG_MARK_REGION 10 + +/* + * DM_ULOG_CLEAR_REGION corresponds to (found in dm-dirty-log.h): + * void (*clear_region)(struct dm_dirty_log *log, region_t region); + * + * Payload-to-userspace: + * uint64_t [] - region(s) to clear + * Payload-to-kernel: + * None. + * + * Incoming payload contains the one or more regions to mark clean. + * The number of regions contained in the payload can be determined from + * 'data_size/sizeof(uint64_t)'. + * + * When the request has been processed, user-space must return the + * dm_ulog_request to the kernel - setting the 'error' field and clearing + * 'data_size' appropriately. + */ +#define DM_ULOG_CLEAR_REGION 11 + +/* + * DM_ULOG_GET_RESYNC_WORK corresponds to (found in dm-dirty-log.h): + * int (*get_resync_work)(struct dm_dirty_log *log, region_t *region); + * + * Payload-to-userspace: + * None. + * Payload-to-kernel: + * { + * int64_t i; -- 1 if recovery necessary, 0 otherwise + * uint64_t r; -- The region to recover if i=1 + * } + * 'data_size' should be set appropriately. + * + * When the request has been processed, user-space must return the + * dm_ulog_request to the kernel - setting the 'error' field appropriately. 
+ */
+#define DM_ULOG_GET_RESYNC_WORK 12
+
+/*
+ * DM_ULOG_SET_REGION_SYNC corresponds to (found in dm-dirty-log.h):
+ * void (*set_region_sync)(struct dm_dirty_log *log,
+ *			   region_t region, int in_sync);
+ *
+ * Payload-to-userspace:
+ *	{
+ *		uint64_t - region to set sync state on
+ *		int64_t  - 0 if not-in-sync, 1 if in-sync
+ *	}
+ * Payload-to-kernel:
+ *	None.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_SET_REGION_SYNC 13
+
+/*
+ * DM_ULOG_GET_SYNC_COUNT corresponds to (found in dm-dirty-log.h):
+ * region_t (*get_sync_count)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	uint64_t - the number of in-sync regions
+ *
+ * No incoming payload.  Kernel-bound payload contains the number of
+ * regions that are in-sync (in a uint64_t).
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_GET_SYNC_COUNT 14
+
+/*
+ * DM_ULOG_STATUS_INFO corresponds to (found in dm-dirty-log.h):
+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_INFO,
+ *		 char *result, unsigned maxlen);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	Character string containing STATUSTYPE_INFO
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_STATUS_INFO 15
+
+/*
+ * DM_ULOG_STATUS_TABLE corresponds to (found in dm-dirty-log.h):
+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_TABLE,
+ *		 char *result, unsigned maxlen);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	Character string containing STATUSTYPE_TABLE
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_STATUS_TABLE 16
+
+/*
+ * DM_ULOG_IS_REMOTE_RECOVERING corresponds to (found in dm-dirty-log.h):
+ * int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *	uint64_t - region to determine recovery status on
+ * Payload-to-kernel:
+ *	{
+ *		int64_t is_recovering;  -- 0 if no, 1 if yes
+ *		uint64_t in_sync_hint;  -- lowest region still needing resync
+ *	}
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_IS_REMOTE_RECOVERING 17
+
+/*
+ * (DM_ULOG_REQUEST_MASK & request_type) to get the request type
+ *
+ * We are reserving 8 bits of the 32-bit 'request_type' field for the
+ * various request types above.  The remaining 24 bits are currently
+ * set to zero and are reserved for future use and compatibility concerns.
+ *
+ * User-space should always use DM_ULOG_REQUEST_TYPE to acquire the
+ * request type from the 'request_type' field to maintain forward
+ * compatibility.
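+ *
+ * For example (sketch):
+ *
+ *	switch (DM_ULOG_REQUEST_TYPE(rq->request_type)) {
+ *	case DM_ULOG_MARK_REGION:
+ *		... mark each uint64_t region in rq->data dirty ...
+ *		break;
+ *	case DM_ULOG_IS_CLEAN:
+ *		... reply with an int64_t 0 or 1 in rq->data ...
+ *		break;
+ *	}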
+ */
+#define DM_ULOG_REQUEST_MASK 0xFF
+#define DM_ULOG_REQUEST_TYPE(request_type) \
+	(DM_ULOG_REQUEST_MASK & (request_type))
+
+struct dm_ulog_request {
+	char uuid[DM_UUID_LEN];	/* Ties a request to a specific mirror log */
+	char padding[7];	/* Padding because DM_UUID_LEN = 129 */
+
+	int32_t error;		/* Used to report back processing errors */
+
+	uint32_t seq;		/* Sequence number for request */
+	uint32_t request_type;	/* DM_ULOG_* defined above */
+	uint32_t data_size;	/* How much data (not including this struct) */
+
+	char data[0];
+};
+
+#endif /* __DM_LOG_USERSPACE_H__ */
-- 
cgit v1.2.3-70-g09d2


From cec47e3d4a861e1d942b3a580d0bbef2700d2bb2 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda
Date: Mon, 22 Jun 2009 10:12:35 +0100
Subject: dm: prepare for request based option

This patch adds core functions for request-based dm.

When struct mapped device (md) is initialized, md->queue has
an I/O scheduler and the following functions are used for
request-based dm as the queue functions:
    make_request_fn: dm_make_request()
    prep_fn:         dm_prep_fn()
    request_fn:      dm_request_fn()
    softirq_done_fn: dm_softirq_done()
    lld_busy_fn:     dm_lld_busy()
Actual initializations are done in another patch (PATCH 2).

Below is a brief summary of how request-based dm behaves, including:
  - making request from bio
  - cloning, mapping and dispatching request
  - completing request and bio
  - suspending md
  - resuming md

bio to request
==============
md->queue->make_request_fn() (dm_make_request()) calls __make_request()
for a bio submitted to the md.
Then, the bio is kept in the queue as a new request or merged into
another request in the queue if possible.

Cloning and Mapping
===================
Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()),
when requests are dispatched after they are sorted by the I/O scheduler.

dm_request_fn() checks the busy state of underlying devices using the
target's busy() function and stops dispatching requests to keep them
on the dm device's queue if busy.
This helps I/O merging, since no merge is done for a request
once it is dispatched to underlying devices.

Actual cloning and mapping are done in dm_prep_fn() and map_request()
called from dm_request_fn().
dm_prep_fn() clones not only the request but also the bios of the request
so that dm can hold bio completion in error cases and prevent
the bio submitter from noticing the error.
(See the "Completion" section below for details.)
After the cloning, the clone is mapped by the target's map_rq() function
and inserted into the underlying device's queue using
blk_insert_cloned_request().

Completion
==========
Request completion can be hooked by rq->end_io(), but then, all bios
in the request will have been completed even in error cases, and the
bio submitter will have noticed the error.
To prevent the bio completion in error cases, request-based dm clones
both bio and request and hooks both bio->bi_end_io() and rq->end_io():
    bio->bi_end_io(): end_clone_bio()
    rq->end_io():     end_clone_request()

Summary of the request completion flow is below:

blk_end_request() for a clone request
  => blk_update_request()
     => bio->bi_end_io() == end_clone_bio() for each clone bio
        => Free the clone bio
        => Success: Complete the original bio (blk_update_request())
           Error:   Don't complete the original bio
  => blk_finish_request()
     => rq->end_io() == end_clone_request()
        => blk_complete_request()
           => dm_softirq_done()
              => Free the clone request
              => Success: Complete the original request
                          (blk_end_request())
                 Error:   Requeue the original request

end_clone_bio() completes the original request for the size of the
original bio in successful cases.
Even if all bios in the original request are completed by that
completion, the original request must not be completed yet to keep
the ordering of request completion for the stacking.
So end_clone_bio() uses blk_update_request() instead of
blk_end_request().
In error cases, end_clone_bio() doesn't complete the original bio.
It just frees the cloned bio and hands the error handling over to
end_clone_request().
end_clone_request(), which is called with the queue lock held, completes
the clone request and the original request in a softirq context
(dm_softirq_done()), which has no queue lock, to avoid a deadlock
issue on submission of another request during the completion:
    - The submitted request may be mapped to the same device
    - Request submission requires the queue lock, but the queue lock
      is already held by the caller and it doesn't know that
The clone request has no clone bio when dm_softirq_done() is called.
So target drivers can't resubmit it again even in error cases.
Instead, they can ask dm core to requeue and remap the original
request in that case.

suspend
=======
Request-based dm uses stopping md->queue as suspend of the md.
For noflush suspend, md->queue is simply stopped.
For flush suspend, a marker request is inserted at the tail of
md->queue, and all requests in md->queue are dispatched until the
marker comes to the front of md->queue.  Then dispatching is stopped
and dm waits for all dispatched requests to complete.  After that,
the marker request is completed, md->queue is stopped, and the waiter
on the suspend queue, md->wait, is woken up.

resume
======
md->queue is started.
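As a sketch, a request-based target's map_rq() under this interface
might look like the following (a hypothetical pass-through target for
illustration; this code is not part of the patch, and 'ctx'/'dev' are
assumed target-private state):

    static int passthrough_map_rq(struct dm_target *ti, struct request *clone,
                                  union map_info *map_context)
    {
        struct passthrough_ctx *ctx = ti->private;      /* hypothetical */
        struct block_device *bdev = ctx->dev->bdev;

        /* Redirect the already-allocated clone to the underlying device */
        clone->q = bdev_get_queue(bdev);
        clone->rq_disk = bdev->bd_disk;

        /* dm core then dispatches the clone via dm_dispatch_request() */
        return DM_MAPIO_REMAPPED;
    }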
Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 14 + drivers/md/dm.c | 705 +++++++++++++++++++++++++++++++++++++++++- drivers/md/dm.h | 1 + include/linux/device-mapper.h | 9 + 4 files changed, 725 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 09a57113955..c5f784419f2 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1080,6 +1080,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) return r; } +int dm_table_any_busy_target(struct dm_table *t) +{ + unsigned i; + struct dm_target *ti; + + for (i = 0; i < t->num_targets; i++) { + ti = t->targets + i; + if (ti->type->busy && ti->type->busy(ti)) + return 1; + } + + return 0; +} + void dm_table_unplug_all(struct dm_table *t) { struct dm_dev_internal *dd; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f609793a92d..be003e5fea3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -78,7 +78,7 @@ struct dm_rq_target_io { */ struct dm_rq_clone_bio_info { struct bio *orig; - struct request *rq; + struct dm_rq_target_io *tio; }; union map_info *dm_get_mapinfo(struct bio *bio) @@ -88,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio) return NULL; } +union map_info *dm_get_rq_mapinfo(struct request *rq) +{ + if (rq && rq->end_io_data) + return &((struct dm_rq_target_io *)rq->end_io_data)->info; + return NULL; +} +EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); + #define MINOR_ALLOCED ((void *)-1) /* @@ -169,6 +177,12 @@ struct mapped_device { /* forced geometry settings */ struct hd_geometry geometry; + /* marker of flush suspend for request-based dm */ + struct request suspend_rq; + + /* For saving the address of __make_request for request based dm */ + make_request_fn *saved_make_request_fn; + /* sysfs handle */ struct kobject kobj; @@ -406,6 +420,26 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) mempool_free(tio, md->tio_pool); } +static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) +{ + return mempool_alloc(md->tio_pool, GFP_ATOMIC); +} + +static void free_rq_tio(struct dm_rq_target_io *tio) +{ + mempool_free(tio, tio->md->tio_pool); +} + +static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) +{ + return mempool_alloc(md->io_pool, GFP_ATOMIC); +} + +static void free_bio_info(struct dm_rq_clone_bio_info *info) +{ + mempool_free(info, info->tio->md->io_pool); +} + static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; @@ -615,6 +649,262 @@ static void clone_endio(struct bio *bio, int error) dec_pending(io, error); } +/* + * Partial completion handling for request-based dm + */ +static void end_clone_bio(struct bio *clone, int error) +{ + struct dm_rq_clone_bio_info *info = clone->bi_private; + struct dm_rq_target_io *tio = info->tio; + struct bio *bio = info->orig; + unsigned int nr_bytes = info->orig->bi_size; + + bio_put(clone); + + if (tio->error) + /* + * An error has already been detected on the request. + * Once error occurred, just let clone->end_io() handle + * the remainder. + */ + return; + else if (error) { + /* + * Don't notice the error to the upper layer yet. + * The error handling decision is made by the target driver, + * when the request is completed. + */ + tio->error = error; + return; + } + + /* + * I/O for the bio successfully completed. + * Notice the data completion to the upper layer. + */ + + /* + * bios are processed from the head of the list. 
+ * So the completing bio should always be rq->bio. + * If it's not, something wrong is happening. + */ + if (tio->orig->bio != bio) + DMERR("bio completion is going in the middle of the request"); + + /* + * Update the original request. + * Do not use blk_end_request() here, because it may complete + * the original request before the clone, and break the ordering. + */ + blk_update_request(tio->orig, 0, nr_bytes); +} + +/* + * Don't touch any member of the md after calling this function because + * the md may be freed in dm_put() at the end of this function. + * Or do dm_get() before calling this function and dm_put() later. + */ +static void rq_completed(struct mapped_device *md, int run_queue) +{ + int wakeup_waiters = 0; + struct request_queue *q = md->queue; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + if (!queue_in_flight(q)) + wakeup_waiters = 1; + spin_unlock_irqrestore(q->queue_lock, flags); + + /* nudge anyone waiting on suspend queue */ + if (wakeup_waiters) + wake_up(&md->wait); + + if (run_queue) + blk_run_queue(q); + + /* + * dm_put() must be at the end of this function. See the comment above + */ + dm_put(md); +} + +static void dm_unprep_request(struct request *rq) +{ + struct request *clone = rq->special; + struct dm_rq_target_io *tio = clone->end_io_data; + + rq->special = NULL; + rq->cmd_flags &= ~REQ_DONTPREP; + + blk_rq_unprep_clone(clone); + free_rq_tio(tio); +} + +/* + * Requeue the original request of a clone. + */ +void dm_requeue_unmapped_request(struct request *clone) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + struct request_queue *q = rq->q; + unsigned long flags; + + dm_unprep_request(rq); + + spin_lock_irqsave(q->queue_lock, flags); + if (elv_queue_empty(q)) + blk_plug_device(q); + blk_requeue_request(q, rq); + spin_unlock_irqrestore(q->queue_lock, flags); + + rq_completed(md, 0); +} +EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); + +static void __stop_queue(struct request_queue *q) +{ + blk_stop_queue(q); +} + +static void stop_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __stop_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void __start_queue(struct request_queue *q) +{ + if (blk_queue_stopped(q)) + blk_start_queue(q); +} + +static void start_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __start_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * Complete the clone and the original request. + * Must be called without queue lock. + */ +static void dm_end_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + + if (blk_pc_request(rq)) { + rq->errors = clone->errors; + rq->resid_len = clone->resid_len; + + if (rq->sense) + /* + * We are using the sense buffer of the original + * request. + * So setting the length of the sense data is enough. 
+ */ + rq->sense_len = clone->sense_len; + } + + BUG_ON(clone->bio); + free_rq_tio(tio); + + blk_end_request_all(rq, error); + + rq_completed(md, 1); +} + +/* + * Request completion handler for request-based dm + */ +static void dm_softirq_done(struct request *rq) +{ + struct request *clone = rq->completion_data; + struct dm_rq_target_io *tio = clone->end_io_data; + dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; + int error = tio->error; + + if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) + error = rq_end_io(tio->ti, clone, error, &tio->info); + + if (error <= 0) + /* The target wants to complete the I/O */ + dm_end_request(clone, error); + else if (error == DM_ENDIO_INCOMPLETE) + /* The target will handle the I/O */ + return; + else if (error == DM_ENDIO_REQUEUE) + /* The target wants to requeue the I/O */ + dm_requeue_unmapped_request(clone); + else { + DMWARN("unimplemented target endio return value: %d", error); + BUG(); + } +} + +/* + * Complete the clone and the original request with the error status + * through softirq context. + */ +static void dm_complete_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct request *rq = tio->orig; + + tio->error = error; + rq->completion_data = clone; + blk_complete_request(rq); +} + +/* + * Complete the not-mapped clone and the original request with the error status + * through softirq context. + * Target's rq_end_io() function isn't called. + * This may be used when the target's map_rq() function fails. + */ +void dm_kill_unmapped_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct request *rq = tio->orig; + + rq->cmd_flags |= REQ_FAILED; + dm_complete_request(clone, error); +} +EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); + +/* + * Called with the queue lock held + */ +static void end_clone_request(struct request *clone, int error) +{ + /* + * For just cleaning up the information of the queue in which + * the clone was dispatched. + * The clone is *NOT* freed actually here because it is alloced from + * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. + */ + __blk_put_request(clone->q, clone); + + /* + * Actual request completion is done in a softirq context which doesn't + * hold the queue lock. Otherwise, deadlock could occur because: + * - another request may be submitted by the upper level driver + * of the stacking during the completion + * - the submission which requires queue lock may be done + * against this queue + */ + dm_complete_request(clone, error); +} + static sector_t max_io_len(struct mapped_device *md, sector_t sector, struct dm_target *ti) { @@ -998,7 +1288,7 @@ out: * The request function that just remaps the bio built up by * dm_merge_bvec. 
*/ -static int dm_request(struct request_queue *q, struct bio *bio) +static int _dm_request(struct request_queue *q, struct bio *bio) { int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; @@ -1035,12 +1325,274 @@ static int dm_request(struct request_queue *q, struct bio *bio) return 0; } +static int dm_make_request(struct request_queue *q, struct bio *bio) +{ + struct mapped_device *md = q->queuedata; + + if (unlikely(bio_barrier(bio))) { + bio_endio(bio, -EOPNOTSUPP); + return 0; + } + + return md->saved_make_request_fn(q, bio); /* call __make_request() */ +} + +static int dm_request_based(struct mapped_device *md) +{ + return blk_queue_stackable(md->queue); +} + +static int dm_request(struct request_queue *q, struct bio *bio) +{ + struct mapped_device *md = q->queuedata; + + if (dm_request_based(md)) + return dm_make_request(q, bio); + + return _dm_request(q, bio); +} + +void dm_dispatch_request(struct request *rq) +{ + int r; + + if (blk_queue_io_stat(rq->q)) + rq->cmd_flags |= REQ_IO_STAT; + + rq->start_time = jiffies; + r = blk_insert_cloned_request(rq->q, rq); + if (r) + dm_complete_request(rq, r); +} +EXPORT_SYMBOL_GPL(dm_dispatch_request); + +static void dm_rq_bio_destructor(struct bio *bio) +{ + struct dm_rq_clone_bio_info *info = bio->bi_private; + struct mapped_device *md = info->tio->md; + + free_bio_info(info); + bio_free(bio, md->bs); +} + +static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, + void *data) +{ + struct dm_rq_target_io *tio = data; + struct mapped_device *md = tio->md; + struct dm_rq_clone_bio_info *info = alloc_bio_info(md); + + if (!info) + return -ENOMEM; + + info->orig = bio_orig; + info->tio = tio; + bio->bi_end_io = end_clone_bio; + bio->bi_private = info; + bio->bi_destructor = dm_rq_bio_destructor; + + return 0; +} + +static int setup_clone(struct request *clone, struct request *rq, + struct dm_rq_target_io *tio) +{ + int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + dm_rq_bio_constructor, tio); + + if (r) + return r; + + clone->cmd = rq->cmd; + clone->cmd_len = rq->cmd_len; + clone->sense = rq->sense; + clone->buffer = rq->buffer; + clone->end_io = end_clone_request; + clone->end_io_data = tio; + + return 0; +} + +static int dm_rq_flush_suspending(struct mapped_device *md) +{ + return !md->suspend_rq.special; +} + +/* + * Called with the queue lock held. + */ +static int dm_prep_fn(struct request_queue *q, struct request *rq) +{ + struct mapped_device *md = q->queuedata; + struct dm_rq_target_io *tio; + struct request *clone; + + if (unlikely(rq == &md->suspend_rq)) { + if (dm_rq_flush_suspending(md)) + return BLKPREP_OK; + else + /* The flush suspend was interrupted */ + return BLKPREP_KILL; + } + + if (unlikely(rq->special)) { + DMWARN("Already has something in rq->special."); + return BLKPREP_KILL; + } + + tio = alloc_rq_tio(md); /* Only one for each original request */ + if (!tio) + /* -ENOMEM */ + return BLKPREP_DEFER; + + tio->md = md; + tio->ti = NULL; + tio->orig = rq; + tio->error = 0; + memset(&tio->info, 0, sizeof(tio->info)); + + clone = &tio->clone; + if (setup_clone(clone, rq, tio)) { + /* -ENOMEM */ + free_rq_tio(tio); + return BLKPREP_DEFER; + } + + rq->special = clone; + rq->cmd_flags |= REQ_DONTPREP; + + return BLKPREP_OK; +} + +static void map_request(struct dm_target *ti, struct request *rq, + struct mapped_device *md) +{ + int r; + struct request *clone = rq->special; + struct dm_rq_target_io *tio = clone->end_io_data; + + /* + * Hold the md reference here for the in-flight I/O. 
+ * We can't rely on the reference count taken by the device opener,
+ * because the device may be closed during the request completion
+ * when all bios are completed.
+ * See the comment in rq_completed() too.
+ */
+ dm_get(md);
+
+ tio->ti = ti;
+ r = ti->type->map_rq(ti, clone, &tio->info);
+ switch (r) {
+ case DM_MAPIO_SUBMITTED:
+ /* The target has taken the I/O to submit by itself later */
+ break;
+ case DM_MAPIO_REMAPPED:
+ /* The target has remapped the I/O so dispatch it */
+ dm_dispatch_request(clone);
+ break;
+ case DM_MAPIO_REQUEUE:
+ /* The target wants to requeue the I/O */
+ dm_requeue_unmapped_request(clone);
+ break;
+ default:
+ if (r > 0) {
+ DMWARN("unimplemented target map return value: %d", r);
+ BUG();
+ }
+
+ /* The target wants to complete the I/O */
+ dm_kill_unmapped_request(clone, r);
+ break;
+ }
+}
+
+/*
+ * q->request_fn for request-based dm.
+ * Called with the queue lock held.
+ */
+static void dm_request_fn(struct request_queue *q)
+{
+ struct mapped_device *md = q->queuedata;
+ struct dm_table *map = dm_get_table(md);
+ struct dm_target *ti;
+ struct request *rq;
+
+ /*
+ * For noflush suspend, check blk_queue_stopped() to immediately
+ * quit I/O dispatching.
+ */
+ while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+ rq = blk_peek_request(q);
+ if (!rq)
+ goto plug_and_out;
+
+ if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */
+ if (queue_in_flight(q))
+ /* Not quiet yet. Wait more */
+ goto plug_and_out;
+
+ /* This device should be quiet now */
+ __stop_queue(q);
+ blk_start_request(rq);
+ __blk_end_request_all(rq, 0);
+ wake_up(&md->wait);
+ goto out;
+ }
+
+ ti = dm_table_find_target(map, blk_rq_pos(rq));
+ if (ti->type->busy && ti->type->busy(ti))
+ goto plug_and_out;
+
+ blk_start_request(rq);
+ spin_unlock(q->queue_lock);
+ map_request(ti, rq, md);
+ spin_lock_irq(q->queue_lock);
+ }
+
+ goto out;
+
+plug_and_out:
+ if (!elv_queue_empty(q))
+ /* Some requests still remain, retry later */
+ blk_plug_device(q);
+
+out:
+ dm_table_put(map);
+
+ return;
+}
+
+int dm_underlying_device_busy(struct request_queue *q)
+{
+ return blk_lld_busy(q);
+}
+EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
+
+static int dm_lld_busy(struct request_queue *q)
+{
+ int r;
+ struct mapped_device *md = q->queuedata;
+ struct dm_table *map = dm_get_table(md);
+
+ if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
+ r = 1;
+ else
+ r = dm_table_any_busy_target(map);
+
+ dm_table_put(map);
+
+ return r;
+}
+
 static void dm_unplug_all(struct request_queue *q)
 {
 struct mapped_device *md = q->queuedata;
 struct dm_table *map = dm_get_table(md);
 
 if (map) {
+ if (dm_request_based(md))
+ generic_unplug_device(q);
+
 dm_table_unplug_all(map);
 dm_table_put(map);
 }
@@ -1055,7 +1607,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 map = dm_get_table(md);
 if (map) {
- r = dm_table_any_congested(map, bdi_bits);
+ /*
+ * Request-based dm cares only about its own queue when
+ * queried for the congestion status of the request_queue
+ */
+ if (dm_request_based(md))
+ r = md->queue->backing_dev_info.state &
+ bdi_bits;
+ else
+ r = dm_table_any_congested(map, bdi_bits);
+
 dm_table_put(map);
 }
 }
@@ -1458,6 +2019,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 {
 int r = 0;
 DECLARE_WAITQUEUE(wait, current);
+ struct request_queue *q = md->queue;
+ unsigned long flags;
 
 dm_unplug_all(md->queue);
 
@@ -1467,7 +2030,14 @@ static int dm_wait_for_completion(struct mapped_device
*md, int interruptible)
 set_current_state(interruptible);
 
 smp_mb();
- if (!atomic_read(&md->pending))
+ if (dm_request_based(md)) {
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (!queue_in_flight(q) && blk_queue_stopped(q)) {
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ break;
+ }
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ } else if (!atomic_read(&md->pending))
 break;
 
 if (interruptible == TASK_INTERRUPTIBLE &&
@@ -1584,6 +2154,67 @@ out:
 return r;
 }
 
+static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
+{
+ md->suspend_rq.special = (void *)0x1;
+}
+
+static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
+{
+ struct request_queue *q = md->queue;
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (!noflush)
+ dm_rq_invalidate_suspend_marker(md);
+ __start_queue(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
+{
+ struct request *rq = &md->suspend_rq;
+ struct request_queue *q = md->queue;
+
+ if (noflush)
+ stop_queue(q);
+ else {
+ blk_rq_init(q, rq);
+ blk_insert_request(q, rq, 0, NULL);
+ }
+}
+
+static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
+{
+ int r = 1;
+ struct request *rq = &md->suspend_rq;
+ struct request_queue *q = md->queue;
+ unsigned long flags;
+
+ if (noflush)
+ return r;
+
+ /* The marker must be protected by queue lock if it is in use */
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (unlikely(rq->ref_count)) {
+ /*
+ * This can happen when the previous flush suspend was
+ * interrupted, the marker is still in the queue and
+ * this flush suspend has been invoked, because we don't
+ * remove the marker at the time of suspend interruption.
+ * We have only one marker per mapped_device, so we can't
+ * start another flush suspend while it is in use.
+ */
+ BUG_ON(!rq->special); /* The marker should be invalidated */
+ DMWARN("Invalidation of the previous flush suspend is still in"
+ " progress. Please retry later.");
+ r = 0;
+ }
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return r;
+}
+
 /*
 * Functions to lock and unlock any filesystem running on the
 * device.
@@ -1623,6 +2254,53 @@ static void unlock_fs(struct mapped_device *md)
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
+/*
+ * Suspend mechanism in request-based dm.
+ *
+ * After the suspend starts, further incoming requests are kept in
+ * the request_queue and deferred.
+ * Remaining requests in the request_queue at the start of suspend are flushed
+ * if it is flush suspend.
+ * The suspend completes when the following conditions have been satisfied,
+ * so wait for them:
+ * 1. q->in_flight is 0 (which means no in_flight request)
+ * 2. queue has been stopped (which means no request dispatching)
+ *
+ *
+ * Noflush suspend
+ * ---------------
+ * Noflush suspend doesn't need to dispatch remaining requests.
+ * So stop the queue immediately. Then, wait for all in_flight requests
+ * to be completed or requeued.
+ *
+ * To abort noflush suspend, start the queue.
+ *
+ *
+ * Flush suspend
+ * -------------
+ * Flush suspend needs to dispatch remaining requests. So stop the queue
+ * after the remaining requests are completed. (Requeued requests must also
+ * be re-dispatched and completed. Until then, we can't stop the queue.)
+ *
+ * During flushing the remaining requests, further incoming requests are also
+ * inserted into the same queue.
To distinguish which requests are to be
+ * flushed, we insert a marker request to the queue at the time of starting
+ * flush suspend, like a barrier.
+ * Dispatching is blocked while the marker is found on the top of the queue.
+ * And the queue is stopped when all in_flight requests are completed, since
+ * that means the remaining requests are completely flushed.
+ * Then, the marker is removed from the queue.
+ *
+ * To abort flush suspend, we also need to take care of the marker, not only
+ * starting the queue.
+ * We don't remove the marker forcibly from the queue since doing so goes
+ * against block-layer conventions. Instead, we put an invalidated mark on
+ * the marker.
+ * When the invalidated marker is found on the top of the queue, it is
+ * immediately removed from the queue, so it doesn't block dispatching.
+ * Because we have only one marker per mapped_device, we can't start another
+ * flush suspend until the invalidated marker is removed from the queue.
+ * So fail and return with -EBUSY in such a case.
+ */
 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
 struct dm_table *map = NULL;
@@ -1637,6 +2315,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 goto out_unlock;
 }
 
+ if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
+ r = -EBUSY;
+ goto out_unlock;
+ }
+
 map = dm_get_table(md);
 
 /*
@@ -1682,6 +2365,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 flush_workqueue(md->wq);
 
+ if (dm_request_based(md))
+ dm_rq_start_suspend(md, noflush);
+
 /*
 * At this point no more requests are entering target request routines.
 * We call dm_wait_for_completion to wait for all existing requests
@@ -1698,6 +2384,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 if (r < 0) {
 dm_queue_flush(md);
 
+ if (dm_request_based(md))
+ dm_rq_abort_suspend(md, noflush);
+
 unlock_fs(md);
 goto out; /* pushback list is already flushed, so skip flush */
 }
@@ -1739,6 +2428,14 @@ int dm_resume(struct mapped_device *md)
 
 dm_queue_flush(md);
 
+ /*
+ * Flushing deferred I/Os must be done after targets are resumed
+ * so that mapping of targets can work correctly.
+ * Request-based dm queues the deferred I/Os in its request_queue.
+ */
+ if (dm_request_based(md))
+ start_queue(md->queue);
+
 unlock_fs(md);
 
 clear_bit(DMF_SUSPENDED, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 604e85caadf..8dcabb1caff 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -50,6 +50,7 @@ void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+int dm_table_any_busy_target(struct dm_table *t);
 
 /*
 * To check the return value from dm_table_find_target().
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e6bf3b8c7bf..0d6310657f3 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -234,6 +234,7 @@ struct gendisk *dm_disk(struct mapped_device *md);
 int dm_suspended(struct mapped_device *md);
 int dm_noflush_suspending(struct dm_target *ti);
 union map_info *dm_get_mapinfo(struct bio *bio);
+union map_info *dm_get_rq_mapinfo(struct request *rq);
 
 /*
 * Geometry functions.
@@ -396,4 +397,12 @@ static inline unsigned long to_bytes(sector_t n)
 return (n << SECTOR_SHIFT);
 }
 
+/*-----------------------------------------------------------------
+ * Helper for block layer and dm core operations
+ *---------------------------------------------------------------*/
+void dm_dispatch_request(struct request *rq);
+void dm_requeue_unmapped_request(struct request *rq);
+void dm_kill_unmapped_request(struct request *rq, int error);
+int dm_underlying_device_busy(struct request_queue *q);
+
 #endif /* _LINUX_DEVICE_MAPPER_H */
-- cgit v1.2.3-70-g09d2

From 099d5270897606473d63091afcc63f53ee1894bc Mon Sep 17 00:00:00 2001
From: Kevin Hilman
Date: Mon, 22 Jun 2009 18:42:42 +0100
Subject: serial: add OMAP wakeup-enable register

Add the wakeup enable register to the list of OMAP-specific UART registers.
This is to support forthcoming OMAP PM enhancements which use the wakeup
feature of the OMAP's 8250-based UART.

Signed-off-by: Kevin Hilman
Signed-off-by: Alan Cox
Signed-off-by: Linus Torvalds
---
 include/linux/serial_reg.h | 1 +
 1 file changed, 1 insertion(+)
(limited to 'include/linux')

diff --git a/include/linux/serial_reg.h b/include/linux/serial_reg.h
index 96c0d93fc2c..850db2e8051 100644
--- a/include/linux/serial_reg.h
+++ b/include/linux/serial_reg.h
@@ -323,6 +323,7 @@
 #define UART_OMAP_MVER 0x14 /* Module version register */
 #define UART_OMAP_SYSC 0x15 /* System configuration register */
 #define UART_OMAP_SYSS 0x16 /* System status register */
+#define UART_OMAP_WER 0x17 /* Wake-up enable register */
 
 #endif /* _LINUX_SERIAL_REG_H */
-- cgit v1.2.3-70-g09d2

From 04896a77a97b87e1611dedd61be88264ef4ac96c Mon Sep 17 00:00:00 2001
From: Robert Love
Date: Mon, 22 Jun 2009 18:43:11 +0100
Subject: msm_serial: serial driver for MSM7K onboard serial peripheral.

Signed-off-by: Brian Swetland
Signed-off-by: Alan Cox
Signed-off-by: Linus Torvalds
---
 drivers/serial/Kconfig | 10 +
 drivers/serial/Makefile | 1 +
 drivers/serial/msm_serial.c | 767 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/serial/msm_serial.h | 117 +++++++
 include/linux/serial_core.h | 3 +
 5 files changed, 898 insertions(+)
 create mode 100644 drivers/serial/msm_serial.c
 create mode 100644 drivers/serial/msm_serial.h
(limited to 'include/linux')

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 1132c5cae7a..037c1e0b7c4 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -1320,6 +1320,16 @@ config SERIAL_SGI_IOC3
 If you have an SGI Altix with an IOC3 serial card,
 say Y or M. Otherwise, say N.
+config SERIAL_MSM + bool "MSM on-chip serial port support" + depends on ARM && ARCH_MSM + select SERIAL_CORE + +config SERIAL_MSM_CONSOLE + bool "MSM serial console support" + depends on SERIAL_MSM=y + select SERIAL_CORE_CONSOLE + config SERIAL_NETX tristate "NetX serial port support" depends on ARM && ARCH_NETX diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile index 45a8658f54d..d5a29981c6c 100644 --- a/drivers/serial/Makefile +++ b/drivers/serial/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_SERIAL_SGI_IOC4) += ioc4_serial.o obj-$(CONFIG_SERIAL_SGI_IOC3) += ioc3_serial.o obj-$(CONFIG_SERIAL_ATMEL) += atmel_serial.o obj-$(CONFIG_SERIAL_UARTLITE) += uartlite.o +obj-$(CONFIG_SERIAL_MSM) += msm_serial.o obj-$(CONFIG_SERIAL_NETX) += netx-serial.o obj-$(CONFIG_SERIAL_OF_PLATFORM) += of_serial.o obj-$(CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL) += nwpserial.o diff --git a/drivers/serial/msm_serial.c b/drivers/serial/msm_serial.c new file mode 100644 index 00000000000..1a7c856f76f --- /dev/null +++ b/drivers/serial/msm_serial.c @@ -0,0 +1,767 @@ +/* + * drivers/serial/msm_serial.c - driver for msm7k serial device and console + * + * Copyright (C) 2007 Google, Inc. + * Author: Robert Love + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#if defined(CONFIG_SERIAL_MSM_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) +# define SUPPORT_SYSRQ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "msm_serial.h" + +struct msm_port { + struct uart_port uart; + char name[16]; + struct clk *clk; + unsigned int imr; +}; + +#define UART_TO_MSM(uart_port) ((struct msm_port *) uart_port) + +static inline void msm_write(struct uart_port *port, unsigned int val, + unsigned int off) +{ + __raw_writel(val, port->membase + off); +} + +static inline unsigned int msm_read(struct uart_port *port, unsigned int off) +{ + return __raw_readl(port->membase + off); +} + +static void msm_stop_tx(struct uart_port *port) +{ + struct msm_port *msm_port = UART_TO_MSM(port); + + msm_port->imr &= ~UART_IMR_TXLEV; + msm_write(port, msm_port->imr, UART_IMR); +} + +static void msm_start_tx(struct uart_port *port) +{ + struct msm_port *msm_port = UART_TO_MSM(port); + + msm_port->imr |= UART_IMR_TXLEV; + msm_write(port, msm_port->imr, UART_IMR); +} + +static void msm_stop_rx(struct uart_port *port) +{ + struct msm_port *msm_port = UART_TO_MSM(port); + + msm_port->imr &= ~(UART_IMR_RXLEV | UART_IMR_RXSTALE); + msm_write(port, msm_port->imr, UART_IMR); +} + +static void msm_enable_ms(struct uart_port *port) +{ + struct msm_port *msm_port = UART_TO_MSM(port); + + msm_port->imr |= UART_IMR_DELTA_CTS; + msm_write(port, msm_port->imr, UART_IMR); +} + +static void handle_rx(struct uart_port *port) +{ + struct tty_struct *tty = port->info->port.tty; + unsigned int sr; + + /* + * Handle overrun. My understanding of the hardware is that overrun + * is not tied to the RX buffer, so we handle the case out of band. 
+ */
+ if ((msm_read(port, UART_SR) & UART_SR_OVERRUN)) {
+ port->icount.overrun++;
+ tty_insert_flip_char(tty, 0, TTY_OVERRUN);
+ msm_write(port, UART_CR_CMD_RESET_ERR, UART_CR);
+ }
+
+ /* and now the main RX loop */
+ while ((sr = msm_read(port, UART_SR)) & UART_SR_RX_READY) {
+ unsigned int c;
+ char flag = TTY_NORMAL;
+
+ c = msm_read(port, UART_RF);
+
+ if (sr & UART_SR_RX_BREAK) {
+ port->icount.brk++;
+ if (uart_handle_break(port))
+ continue;
+ } else if (sr & UART_SR_PAR_FRAME_ERR) {
+ port->icount.frame++;
+ } else {
+ port->icount.rx++;
+ }
+
+ /* Mask conditions we're ignoring. */
+ sr &= port->read_status_mask;
+
+ if (sr & UART_SR_RX_BREAK) {
+ flag = TTY_BREAK;
+ } else if (sr & UART_SR_PAR_FRAME_ERR) {
+ flag = TTY_FRAME;
+ }
+
+ if (!uart_handle_sysrq_char(port, c))
+ tty_insert_flip_char(tty, c, flag);
+ }
+
+ tty_flip_buffer_push(tty);
+}
+
+static void handle_tx(struct uart_port *port)
+{
+ struct circ_buf *xmit = &port->info->xmit;
+ struct msm_port *msm_port = UART_TO_MSM(port);
+ int sent_tx;
+
+ if (port->x_char) {
+ msm_write(port, port->x_char, UART_TF);
+ port->icount.tx++;
+ port->x_char = 0;
+ }
+
+ while (msm_read(port, UART_SR) & UART_SR_TX_READY) {
+ if (uart_circ_empty(xmit)) {
+ /* disable tx interrupts */
+ msm_port->imr &= ~UART_IMR_TXLEV;
+ msm_write(port, msm_port->imr, UART_IMR);
+ break;
+ }
+
+ msm_write(port, xmit->buf[xmit->tail], UART_TF);
+
+ xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+ port->icount.tx++;
+ sent_tx = 1;
+ }
+
+ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+ uart_write_wakeup(port);
+}
+
+static void handle_delta_cts(struct uart_port *port)
+{
+ msm_write(port, UART_CR_CMD_RESET_CTS, UART_CR);
+ port->icount.cts++;
+ wake_up_interruptible(&port->info->delta_msr_wait);
+}
+
+static irqreturn_t msm_irq(int irq, void *dev_id)
+{
+ struct uart_port *port = dev_id;
+ struct msm_port *msm_port = UART_TO_MSM(port);
+ unsigned int misr;
+
+ spin_lock(&port->lock);
+ misr = msm_read(port, UART_MISR);
+ msm_write(port, 0, UART_IMR); /* disable interrupt */
+
+ if (misr & (UART_IMR_RXLEV | UART_IMR_RXSTALE))
+ handle_rx(port);
+ if (misr & UART_IMR_TXLEV)
+ handle_tx(port);
+ if (misr & UART_IMR_DELTA_CTS)
+ handle_delta_cts(port);
+
+ msm_write(port, msm_port->imr, UART_IMR); /* restore interrupt */
+ spin_unlock(&port->lock);
+
+ return IRQ_HANDLED;
+}
+
+static unsigned int msm_tx_empty(struct uart_port *port)
+{
+ return (msm_read(port, UART_SR) & UART_SR_TX_EMPTY) ?
TIOCSER_TEMT : 0; +} + +static unsigned int msm_get_mctrl(struct uart_port *port) +{ + return TIOCM_CAR | TIOCM_CTS | TIOCM_DSR | TIOCM_RTS; +} + +static void msm_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + unsigned int mr; + + mr = msm_read(port, UART_MR1); + + if (!(mctrl & TIOCM_RTS)) { + mr &= ~UART_MR1_RX_RDY_CTL; + msm_write(port, mr, UART_MR1); + msm_write(port, UART_CR_CMD_RESET_RFR, UART_CR); + } else { + mr |= UART_MR1_RX_RDY_CTL; + msm_write(port, mr, UART_MR1); + } +} + +static void msm_break_ctl(struct uart_port *port, int break_ctl) +{ + if (break_ctl) + msm_write(port, UART_CR_CMD_START_BREAK, UART_CR); + else + msm_write(port, UART_CR_CMD_STOP_BREAK, UART_CR); +} + +static void msm_set_baud_rate(struct uart_port *port, unsigned int baud) +{ + unsigned int baud_code, rxstale, watermark; + + switch (baud) { + case 300: + baud_code = UART_CSR_300; + rxstale = 1; + break; + case 600: + baud_code = UART_CSR_600; + rxstale = 1; + break; + case 1200: + baud_code = UART_CSR_1200; + rxstale = 1; + break; + case 2400: + baud_code = UART_CSR_2400; + rxstale = 1; + break; + case 4800: + baud_code = UART_CSR_4800; + rxstale = 1; + break; + case 9600: + baud_code = UART_CSR_9600; + rxstale = 2; + break; + case 14400: + baud_code = UART_CSR_14400; + rxstale = 3; + break; + case 19200: + baud_code = UART_CSR_19200; + rxstale = 4; + break; + case 28800: + baud_code = UART_CSR_28800; + rxstale = 6; + break; + case 38400: + baud_code = UART_CSR_38400; + rxstale = 8; + break; + case 57600: + baud_code = UART_CSR_57600; + rxstale = 16; + break; + case 115200: + default: + baud_code = UART_CSR_115200; + rxstale = 31; + break; + } + + msm_write(port, baud_code, UART_CSR); + + /* RX stale watermark */ + watermark = UART_IPR_STALE_LSB & rxstale; + watermark |= UART_IPR_RXSTALE_LAST; + watermark |= UART_IPR_STALE_TIMEOUT_MSB & (rxstale << 2); + msm_write(port, watermark, UART_IPR); + + /* set RX watermark */ + watermark = (port->fifosize * 3) / 4; + msm_write(port, watermark, UART_RFWR); + + /* set TX watermark */ + msm_write(port, 10, UART_TFWR); +} + +static void msm_reset(struct uart_port *port) +{ + /* reset everything */ + msm_write(port, UART_CR_CMD_RESET_RX, UART_CR); + msm_write(port, UART_CR_CMD_RESET_TX, UART_CR); + msm_write(port, UART_CR_CMD_RESET_ERR, UART_CR); + msm_write(port, UART_CR_CMD_RESET_BREAK_INT, UART_CR); + msm_write(port, UART_CR_CMD_RESET_CTS, UART_CR); + msm_write(port, UART_CR_CMD_SET_RFR, UART_CR); +} + +static void msm_init_clock(struct uart_port *port) +{ + struct msm_port *msm_port = UART_TO_MSM(port); + + clk_enable(msm_port->clk); + + msm_write(port, 0xC0, UART_MREG); + msm_write(port, 0xB2, UART_NREG); + msm_write(port, 0x7D, UART_DREG); + msm_write(port, 0x1C, UART_MNDREG); +} + +static int msm_startup(struct uart_port *port) +{ + struct msm_port *msm_port = UART_TO_MSM(port); + unsigned int data, rfr_level; + int ret; + + snprintf(msm_port->name, sizeof(msm_port->name), + "msm_serial%d", port->line); + + ret = request_irq(port->irq, msm_irq, IRQF_TRIGGER_HIGH, + msm_port->name, port); + if (unlikely(ret)) + return ret; + + msm_init_clock(port); + + if (likely(port->fifosize > 12)) + rfr_level = port->fifosize - 12; + else + rfr_level = port->fifosize; + + /* set automatic RFR level */ + data = msm_read(port, UART_MR1); + data &= ~UART_MR1_AUTO_RFR_LEVEL1; + data &= ~UART_MR1_AUTO_RFR_LEVEL0; + data |= UART_MR1_AUTO_RFR_LEVEL1 & (rfr_level << 2); + data |= UART_MR1_AUTO_RFR_LEVEL0 & rfr_level; + msm_write(port, data, UART_MR1); + + /* make sure 
that RXSTALE count is non-zero */ + data = msm_read(port, UART_IPR); + if (unlikely(!data)) { + data |= UART_IPR_RXSTALE_LAST; + data |= UART_IPR_STALE_LSB; + msm_write(port, data, UART_IPR); + } + + msm_reset(port); + + msm_write(port, 0x05, UART_CR); /* enable TX & RX */ + + /* turn on RX and CTS interrupts */ + msm_port->imr = UART_IMR_RXLEV | UART_IMR_RXSTALE | + UART_IMR_CURRENT_CTS; + msm_write(port, msm_port->imr, UART_IMR); + + return 0; +} + +static void msm_shutdown(struct uart_port *port) +{ + struct msm_port *msm_port = UART_TO_MSM(port); + + msm_port->imr = 0; + msm_write(port, 0, UART_IMR); /* disable interrupts */ + + clk_disable(msm_port->clk); + + free_irq(port->irq, port); +} + +static void msm_set_termios(struct uart_port *port, struct ktermios *termios, + struct ktermios *old) +{ + unsigned long flags; + unsigned int baud, mr; + + spin_lock_irqsave(&port->lock, flags); + + /* calculate and set baud rate */ + baud = uart_get_baud_rate(port, termios, old, 300, 115200); + msm_set_baud_rate(port, baud); + + /* calculate parity */ + mr = msm_read(port, UART_MR2); + mr &= ~UART_MR2_PARITY_MODE; + if (termios->c_cflag & PARENB) { + if (termios->c_cflag & PARODD) + mr |= UART_MR2_PARITY_MODE_ODD; + else if (termios->c_cflag & CMSPAR) + mr |= UART_MR2_PARITY_MODE_SPACE; + else + mr |= UART_MR2_PARITY_MODE_EVEN; + } + + /* calculate bits per char */ + mr &= ~UART_MR2_BITS_PER_CHAR; + switch (termios->c_cflag & CSIZE) { + case CS5: + mr |= UART_MR2_BITS_PER_CHAR_5; + break; + case CS6: + mr |= UART_MR2_BITS_PER_CHAR_6; + break; + case CS7: + mr |= UART_MR2_BITS_PER_CHAR_7; + break; + case CS8: + default: + mr |= UART_MR2_BITS_PER_CHAR_8; + break; + } + + /* calculate stop bits */ + mr &= ~(UART_MR2_STOP_BIT_LEN_ONE | UART_MR2_STOP_BIT_LEN_TWO); + if (termios->c_cflag & CSTOPB) + mr |= UART_MR2_STOP_BIT_LEN_TWO; + else + mr |= UART_MR2_STOP_BIT_LEN_ONE; + + /* set parity, bits per char, and stop bit */ + msm_write(port, mr, UART_MR2); + + /* calculate and set hardware flow control */ + mr = msm_read(port, UART_MR1); + mr &= ~(UART_MR1_CTS_CTL | UART_MR1_RX_RDY_CTL); + if (termios->c_cflag & CRTSCTS) { + mr |= UART_MR1_CTS_CTL; + mr |= UART_MR1_RX_RDY_CTL; + } + msm_write(port, mr, UART_MR1); + + /* Configure status bits to ignore based on termio flags. 
*/ + port->read_status_mask = 0; + if (termios->c_iflag & INPCK) + port->read_status_mask |= UART_SR_PAR_FRAME_ERR; + if (termios->c_iflag & (BRKINT | PARMRK)) + port->read_status_mask |= UART_SR_RX_BREAK; + + uart_update_timeout(port, termios->c_cflag, baud); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static const char *msm_type(struct uart_port *port) +{ + return "MSM"; +} + +static void msm_release_port(struct uart_port *port) +{ + struct platform_device *pdev = to_platform_device(port->dev); + struct resource *resource; + resource_size_t size; + + resource = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (unlikely(!resource)) + return; + size = resource->end - resource->start + 1; + + release_mem_region(port->mapbase, size); + iounmap(port->membase); + port->membase = NULL; +} + +static int msm_request_port(struct uart_port *port) +{ + struct platform_device *pdev = to_platform_device(port->dev); + struct resource *resource; + resource_size_t size; + + resource = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (unlikely(!resource)) + return -ENXIO; + size = resource->end - resource->start + 1; + + if (unlikely(!request_mem_region(port->mapbase, size, "msm_serial"))) + return -EBUSY; + + port->membase = ioremap(port->mapbase, size); + if (!port->membase) { + release_mem_region(port->mapbase, size); + return -EBUSY; + } + + return 0; +} + +static void msm_config_port(struct uart_port *port, int flags) +{ + if (flags & UART_CONFIG_TYPE) { + port->type = PORT_MSM; + msm_request_port(port); + } +} + +static int msm_verify_port(struct uart_port *port, struct serial_struct *ser) +{ + if (unlikely(ser->type != PORT_UNKNOWN && ser->type != PORT_MSM)) + return -EINVAL; + if (unlikely(port->irq != ser->irq)) + return -EINVAL; + return 0; +} + +static void msm_power(struct uart_port *port, unsigned int state, + unsigned int oldstate) +{ + struct msm_port *msm_port = UART_TO_MSM(port); + + switch (state) { + case 0: + clk_enable(msm_port->clk); + break; + case 3: + clk_disable(msm_port->clk); + break; + default: + printk(KERN_ERR "msm_serial: Unknown PM state %d\n", state); + } +} + +static struct uart_ops msm_uart_pops = { + .tx_empty = msm_tx_empty, + .set_mctrl = msm_set_mctrl, + .get_mctrl = msm_get_mctrl, + .stop_tx = msm_stop_tx, + .start_tx = msm_start_tx, + .stop_rx = msm_stop_rx, + .enable_ms = msm_enable_ms, + .break_ctl = msm_break_ctl, + .startup = msm_startup, + .shutdown = msm_shutdown, + .set_termios = msm_set_termios, + .type = msm_type, + .release_port = msm_release_port, + .request_port = msm_request_port, + .config_port = msm_config_port, + .verify_port = msm_verify_port, + .pm = msm_power, +}; + +static struct msm_port msm_uart_ports[] = { + { + .uart = { + .iotype = UPIO_MEM, + .ops = &msm_uart_pops, + .flags = UPF_BOOT_AUTOCONF, + .fifosize = 512, + .line = 0, + }, + }, + { + .uart = { + .iotype = UPIO_MEM, + .ops = &msm_uart_pops, + .flags = UPF_BOOT_AUTOCONF, + .fifosize = 512, + .line = 1, + }, + }, + { + .uart = { + .iotype = UPIO_MEM, + .ops = &msm_uart_pops, + .flags = UPF_BOOT_AUTOCONF, + .fifosize = 64, + .line = 2, + }, + }, +}; + +#define UART_NR ARRAY_SIZE(msm_uart_ports) + +static inline struct uart_port *get_port_from_line(unsigned int line) +{ + return &msm_uart_ports[line].uart; +} + +#ifdef CONFIG_SERIAL_MSM_CONSOLE + +static void msm_console_putchar(struct uart_port *port, int c) +{ + while (!(msm_read(port, UART_SR) & UART_SR_TX_READY)) + ; + msm_write(port, c, UART_TF); +} + +static void msm_console_write(struct console *co, const char 
*s,
+ unsigned int count)
+{
+ struct uart_port *port;
+ struct msm_port *msm_port;
+
+ BUG_ON(co->index < 0 || co->index >= UART_NR);
+
+ port = get_port_from_line(co->index);
+ msm_port = UART_TO_MSM(port);
+
+ spin_lock(&port->lock);
+ uart_console_write(port, s, count, msm_console_putchar);
+ spin_unlock(&port->lock);
+}
+
+static int __init msm_console_setup(struct console *co, char *options)
+{
+ struct uart_port *port;
+ int baud = 0, flow, bits, parity; /* baud stays 0 unless set via options */
+
+ if (unlikely(co->index >= UART_NR || co->index < 0))
+ return -ENXIO;
+
+ port = get_port_from_line(co->index);
+
+ if (unlikely(!port->membase))
+ return -ENXIO;
+
+ port->cons = co;
+
+ msm_init_clock(port);
+
+ if (options)
+ uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+ bits = 8;
+ parity = 'n';
+ flow = 'n';
+ msm_write(port, UART_MR2_BITS_PER_CHAR_8 | UART_MR2_STOP_BIT_LEN_ONE,
+ UART_MR2); /* 8N1 */
+
+ if (baud < 300 || baud > 115200)
+ baud = 115200;
+ msm_set_baud_rate(port, baud);
+
+ msm_reset(port);
+
+ printk(KERN_INFO "msm_serial: console setup on port #%d\n", port->line);
+
+ return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver msm_uart_driver;
+
+static struct console msm_console = {
+ .name = "ttyMSM",
+ .write = msm_console_write,
+ .device = uart_console_device,
+ .setup = msm_console_setup,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+ .data = &msm_uart_driver,
+};
+
+#define MSM_CONSOLE (&msm_console)
+
+#else
+#define MSM_CONSOLE NULL
+#endif
+
+static struct uart_driver msm_uart_driver = {
+ .owner = THIS_MODULE,
+ .driver_name = "msm_serial",
+ .dev_name = "ttyMSM",
+ .nr = UART_NR,
+ .cons = MSM_CONSOLE,
+};
+
+static int __init msm_serial_probe(struct platform_device *pdev)
+{
+ struct msm_port *msm_port;
+ struct resource *resource;
+ struct uart_port *port;
+
+ if (unlikely(pdev->id < 0 || pdev->id >= UART_NR))
+ return -ENXIO;
+
+ printk(KERN_INFO "msm_serial: detected port #%d\n", pdev->id);
+
+ port = get_port_from_line(pdev->id);
+ port->dev = &pdev->dev;
+ msm_port = UART_TO_MSM(port);
+
+ msm_port->clk = clk_get(&pdev->dev, "uart_clk");
+ if (unlikely(IS_ERR(msm_port->clk)))
+ return PTR_ERR(msm_port->clk);
+ port->uartclk = clk_get_rate(msm_port->clk);
+
+ resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (unlikely(!resource))
+ return -ENXIO;
+ port->mapbase = resource->start;
+
+ port->irq = platform_get_irq(pdev, 0);
+ if (unlikely(port->irq < 0))
+ return -ENXIO;
+
+ platform_set_drvdata(pdev, port);
+
+ return uart_add_one_port(&msm_uart_driver, port);
+}
+
+static int __devexit msm_serial_remove(struct platform_device *pdev)
+{
+ struct msm_port *msm_port = platform_get_drvdata(pdev);
+
+ clk_put(msm_port->clk);
+
+ return 0;
+}
+
+static struct platform_driver msm_platform_driver = {
+ .probe = msm_serial_probe,
+ .remove = msm_serial_remove,
+ .driver = {
+ .name = "msm_serial",
+ .owner = THIS_MODULE,
+ },
+};
+
+static int __init msm_serial_init(void)
+{
+ int ret;
+
+ ret = uart_register_driver(&msm_uart_driver);
+ if (unlikely(ret))
+ return ret;
+
+ ret = platform_driver_probe(&msm_platform_driver, msm_serial_probe);
+ if (unlikely(ret))
+ uart_unregister_driver(&msm_uart_driver);
+
+ printk(KERN_INFO "msm_serial: driver initialized\n");
+
+ return ret;
+}
+
+static void __exit msm_serial_exit(void)
+{
+#ifdef CONFIG_SERIAL_MSM_CONSOLE
+ unregister_console(&msm_console);
+#endif
+ platform_driver_unregister(&msm_platform_driver);
+ uart_unregister_driver(&msm_uart_driver);
+}
+
+module_init(msm_serial_init);
+module_exit(msm_serial_exit); + +MODULE_AUTHOR("Robert Love "); +MODULE_DESCRIPTION("Driver for msm7x serial device"); +MODULE_LICENSE("GPL"); diff --git a/drivers/serial/msm_serial.h b/drivers/serial/msm_serial.h new file mode 100644 index 00000000000..689f1fa0e84 --- /dev/null +++ b/drivers/serial/msm_serial.h @@ -0,0 +1,117 @@ +/* + * drivers/serial/msm_serial.h + * + * Copyright (C) 2007 Google, Inc. + * Author: Robert Love + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __DRIVERS_SERIAL_MSM_SERIAL_H +#define __DRIVERS_SERIAL_MSM_SERIAL_H + +#define UART_MR1 0x0000 + +#define UART_MR1_AUTO_RFR_LEVEL0 0x3F +#define UART_MR1_AUTO_RFR_LEVEL1 0x3FF00 +#define UART_MR1_RX_RDY_CTL (1 << 7) +#define UART_MR1_CTS_CTL (1 << 6) + +#define UART_MR2 0x0004 +#define UART_MR2_ERROR_MODE (1 << 6) +#define UART_MR2_BITS_PER_CHAR 0x30 +#define UART_MR2_BITS_PER_CHAR_5 (0x0 << 4) +#define UART_MR2_BITS_PER_CHAR_6 (0x1 << 4) +#define UART_MR2_BITS_PER_CHAR_7 (0x2 << 4) +#define UART_MR2_BITS_PER_CHAR_8 (0x3 << 4) +#define UART_MR2_STOP_BIT_LEN_ONE (0x1 << 2) +#define UART_MR2_STOP_BIT_LEN_TWO (0x3 << 2) +#define UART_MR2_PARITY_MODE_NONE 0x0 +#define UART_MR2_PARITY_MODE_ODD 0x1 +#define UART_MR2_PARITY_MODE_EVEN 0x2 +#define UART_MR2_PARITY_MODE_SPACE 0x3 +#define UART_MR2_PARITY_MODE 0x3 + +#define UART_CSR 0x0008 +#define UART_CSR_115200 0xFF +#define UART_CSR_57600 0xEE +#define UART_CSR_38400 0xDD +#define UART_CSR_28800 0xCC +#define UART_CSR_19200 0xBB +#define UART_CSR_14400 0xAA +#define UART_CSR_9600 0x99 +#define UART_CSR_4800 0x77 +#define UART_CSR_2400 0x55 +#define UART_CSR_1200 0x44 +#define UART_CSR_600 0x33 +#define UART_CSR_300 0x22 + +#define UART_TF 0x000C + +#define UART_CR 0x0010 +#define UART_CR_CMD_NULL (0 << 4) +#define UART_CR_CMD_RESET_RX (1 << 4) +#define UART_CR_CMD_RESET_TX (2 << 4) +#define UART_CR_CMD_RESET_ERR (3 << 4) +#define UART_CR_CMD_RESET_BREAK_INT (4 << 4) +#define UART_CR_CMD_START_BREAK (5 << 4) +#define UART_CR_CMD_STOP_BREAK (6 << 4) +#define UART_CR_CMD_RESET_CTS (7 << 4) +#define UART_CR_CMD_PACKET_MODE (9 << 4) +#define UART_CR_CMD_MODE_RESET (12 << 4) +#define UART_CR_CMD_SET_RFR (13 << 4) +#define UART_CR_CMD_RESET_RFR (14 << 4) +#define UART_CR_TX_DISABLE (1 << 3) +#define UART_CR_TX_ENABLE (1 << 3) +#define UART_CR_RX_DISABLE (1 << 3) +#define UART_CR_RX_ENABLE (1 << 3) + +#define UART_IMR 0x0014 +#define UART_IMR_TXLEV (1 << 0) +#define UART_IMR_RXSTALE (1 << 3) +#define UART_IMR_RXLEV (1 << 4) +#define UART_IMR_DELTA_CTS (1 << 5) +#define UART_IMR_CURRENT_CTS (1 << 6) + +#define UART_IPR_RXSTALE_LAST 0x20 +#define UART_IPR_STALE_LSB 0x1F +#define UART_IPR_STALE_TIMEOUT_MSB 0x3FF80 + +#define UART_IPR 0x0018 +#define UART_TFWR 0x001C +#define UART_RFWR 0x0020 +#define UART_HCR 0x0024 + +#define UART_MREG 0x0028 +#define UART_NREG 0x002C +#define UART_DREG 0x0030 +#define UART_MNDREG 0x0034 +#define UART_IRDA 0x0038 +#define UART_MISR_MODE 0x0040 +#define UART_MISR_RESET 0x0044 +#define UART_MISR_EXPORT 0x0048 +#define UART_MISR_VAL 0x004C +#define UART_TEST_CTRL 0x0050 + +#define UART_SR 0x0008 +#define UART_SR_HUNT_CHAR (1 
<< 7)
+#define UART_SR_RX_BREAK (1 << 6)
+#define UART_SR_PAR_FRAME_ERR (1 << 5)
+#define UART_SR_OVERRUN (1 << 4)
+#define UART_SR_TX_EMPTY (1 << 3)
+#define UART_SR_TX_READY (1 << 2)
+#define UART_SR_RX_FULL (1 << 1)
+#define UART_SR_RX_READY (1 << 0)
+
+#define UART_RF 0x000C
+#define UART_MISR 0x0010
+#define UART_ISR 0x0014
+
+#endif /* __DRIVERS_SERIAL_MSM_SERIAL_H */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 6fd80c4243f..23d2fb051f9 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -171,6 +171,9 @@
 /* Timberdale UART */
 #define PORT_TIMBUART 87
 
+/* Qualcomm MSM SoCs */
+#define PORT_MSM 88
+
 #ifdef __KERNEL__
 #include
-- cgit v1.2.3-70-g09d2

From 9a7aa12f3911853a3574d47d567b81a2a5df7208 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Thu, 4 Jun 2009 15:26:49 +0200
Subject: vfs: Set special lockdep map for dirs only if not set by fs

Some filesystems need to set the lockdep map for i_mutex differently for
different directories. For example OCFS2 has system directories (for orphan
inode tracking and for gathering all system files like journal or quota files
into a single place) which have different locking rules than standard
directories. For a filesystem, setting the lockdep map is naturally done when
the inode is read, but we have to modify unlock_new_inode() so that it does
not overwrite the lockdep map the filesystem has set.

Acked-by: peterz@infradead.org
CC: mingo@redhat.com
Signed-off-by: Jan Kara
Signed-off-by: Joel Becker
---
 fs/inode.c | 17 +++++++++++------
 include/linux/lockdep.h | 15 +++++++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)
(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index f643be565df..04c785bb63c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -665,12 +665,17 @@ void unlock_new_inode(struct inode *inode)
 if (inode->i_mode & S_IFDIR) {
 struct file_system_type *type = inode->i_sb->s_type;
 
- /*
- * ensure nobody is actually holding i_mutex
- */
- mutex_destroy(&inode->i_mutex);
- mutex_init(&inode->i_mutex);
- lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key);
+ /* Set new key only if filesystem hasn't already changed it */
+ if (!lockdep_match_class(&inode->i_mutex,
+ &type->i_mutex_key)) {
+ /*
+ * ensure nobody is actually holding i_mutex
+ */
+ mutex_destroy(&inode->i_mutex);
+ mutex_init(&inode->i_mutex);
+ lockdep_set_class(&inode->i_mutex,
+ &type->i_mutex_dir_key);
+ }
 }
 #endif
 /*
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index da5a5a1f4cd..b25d1b53df0 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -258,6 +258,16 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name,
 #define lockdep_set_subclass(lock, sub) \
 lockdep_init_map(&(lock)->dep_map, #lock, \
 (lock)->dep_map.key, sub)
+/*
+ * Compare locking classes
+ */
+#define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key)
+
+static inline int lockdep_match_key(struct lockdep_map *lock,
+ struct lock_class_key *key)
+{
+ return lock->key == key;
+}
 
 /*
 * Acquire a lock.
@@ -326,6 +336,11 @@ static inline void lockdep_on(void)
 #define lockdep_set_class_and_subclass(lock, key, sub) \
 do { (void)(key); } while (0)
 #define lockdep_set_subclass(lock, sub) do { } while (0)
+/*
+ * We don't define lockdep_match_class() and lockdep_match_key() for the
+ * !LOCKDEP case since the result is not well defined and the caller should
+ * rather #ifdef the call himself.
+ */
 # define INIT_LOCKDEP
 # define lockdep_reset() do { debug_locks = 1; } while (0)
-- cgit v1.2.3-70-g09d2

From 31950eb66ff47c946fd9c65c2f8c94b6b7ba13fc Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 22 Jun 2009 21:18:12 -0700
Subject: mm/init: cpu_hotplug_init() must be initialized before SLAB

SLAB uses get/put_online_cpus(), which use a mutex that is itself only
initialized when cpu_hotplug_init() is called. Currently we hang during boot
in SLAB due to doing that too late.

Reported by James Bottomley and Sachin Sant (and possibly others).
Debugged by Benjamin Herrenschmidt.

This just removes the dynamic initialization of the data structures, and
replaces it with a static one, avoiding this dependency entirely, and
removing one unnecessary special initcall.

Tested-by: Sachin Sant
Tested-by: James Bottomley
Tested-by: Benjamin Herrenschmidt
Signed-off-by: Linus Torvalds
---
 include/linux/cpu.h | 5 -----
 init/main.c | 1 -
 kernel/cpu.c | 13 +++++--------
 3 files changed, 5 insertions(+), 14 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2643d848df9..4d668e05d45 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -69,7 +69,6 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
 int cpu_up(unsigned int cpu);
 void notify_cpu_starting(unsigned int cpu);
 
-extern void cpu_hotplug_init(void);
 extern void cpu_maps_update_begin(void);
 extern void cpu_maps_update_done(void);
 
@@ -84,10 +83,6 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
 {
 }
 
-static inline void cpu_hotplug_init(void)
-{
-}
-
 static inline void cpu_maps_update_begin(void)
 {
 }
diff --git a/init/main.c b/init/main.c
index 09131ec090c..4870dfeb9ee 100644
--- a/init/main.c
+++ b/init/main.c
@@ -678,7 +678,6 @@ asmlinkage void __init start_kernel(void)
 #endif
 page_cgroup_init();
 enable_debug_pagealloc();
- cpu_hotplug_init();
 kmemtrace_init();
 kmemleak_init();
 debug_objects_mem_init();
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8..8ce10043e4a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
 * an ongoing cpu hotplug operation.
 */
 int refcount;
-} cpu_hotplug;
-
-void __init cpu_hotplug_init(void)
-{
- cpu_hotplug.active_writer = NULL;
- mutex_init(&cpu_hotplug.lock);
- cpu_hotplug.refcount = 0;
-}
+} cpu_hotplug = {
+ .active_writer = NULL,
+ .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+ .refcount = 0,
+};
 
 #ifdef CONFIG_HOTPLUG_CPU
-- cgit v1.2.3-70-g09d2

From 616511d039af402670de8500d0e24495113a9cab Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 22 Jun 2009 15:09:13 -0400
Subject: VFS: Uninline the function put_mnt_ns()

In order to allow modules to use it without having to export vfsmount_lock.
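put_mnt_ns() relies on atomic_dec_and_lock(), which takes the spinlock only
when the count actually drops to zero. A minimal sketch of that idiom
(illustration only; struct obj and its fields are hypothetical, not code from
this patch):

	struct obj {
		atomic_t count;
		struct list_head node;
	};

	static void obj_put(struct obj *o, spinlock_t *lock)
	{
		if (!atomic_dec_and_lock(&o->count, lock))
			return;			/* count still positive, lock not taken */
		list_del(&o->node);		/* last reference: unlink under the lock */
		spin_unlock(lock);
		kfree(o);
	}

With that slow path moved out of line, a module only needs the put_mnt_ns()
symbol and never references vfsmount_lock itself.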
Signed-off-by: Trond Myklebust
Signed-off-by: Linus Torvalds
---
 fs/namespace.c | 8 ++++++--
 include/linux/mnt_namespace.h | 9 +--------
 2 files changed, 7 insertions(+), 10 deletions(-)
(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 2dd333b0fe7..6645846f205 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2246,10 +2246,14 @@ void __init mnt_init(void)
 init_mount_tree();
 }
 
-void __put_mnt_ns(struct mnt_namespace *ns)
+void put_mnt_ns(struct mnt_namespace *ns)
 {
- struct vfsmount *root = ns->root;
+ struct vfsmount *root;
 LIST_HEAD(umount_list);
+
+ if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
+ return;
+ root = ns->root;
 ns->root = NULL;
 spin_unlock(&vfsmount_lock);
 down_write(&namespace_sem);
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 3a059298cc1..299d11af5f7 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -26,14 +26,7 @@ struct fs_struct;
 
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 struct fs_struct *);
-extern void __put_mnt_ns(struct mnt_namespace *ns);
-
-static inline void put_mnt_ns(struct mnt_namespace *ns)
-{
- if (atomic_dec_and_lock(&ns->count, &vfsmount_lock))
- /* releases vfsmount_lock */
- __put_mnt_ns(ns);
-}
+extern void put_mnt_ns(struct mnt_namespace *ns);
 
 static inline void exit_mnt_ns(struct task_struct *p)
 {
-- cgit v1.2.3-70-g09d2

From cf8d2c11cb77f129675478792122f50827e5b0ae Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 22 Jun 2009 15:09:13 -0400
Subject: VFS: Add VFS helper functions for setting up private namespaces

The purpose of this patch is to improve the remote mount path lookup support
for distributed filesystems such as the NFSv4 client.

When given a mount command of the form "mount server:/foo/bar /mnt", the
NFSv4 client is required to look up the filehandle for "server:/", and then
look up each component of the remote mount path "foo/bar" in order to find
the directory that is actually going to be mounted on /mnt. Following that
remote mount path may involve following symlinks, crossing server-side mount
points and even following referrals to filesystem volumes on other servers.

Since the standard VFS path lookup code already supports walking paths that
contain all these features (using in-kernel automounts for following
referrals), we would like to be able to reuse that rather than duplicate the
full path traversal functionality in the NFSv4 client code.

This patch therefore defines a VFS helper function, create_mnt_ns(), that
sets up a temporary filesystem namespace and attaches a root filesystem to
it. It exports the create_mnt_ns() and put_mnt_ns() functions for use by
filesystem modules.
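As a sketch of the intended use, a filesystem module could pair the new
helper with vfs_kern_mount() roughly as follows (hypothetical caller, not
part of this patch; note that create_mnt_ns() takes over the mount reference
only on success):

	static struct mnt_namespace *example_private_ns(struct file_system_type *type,
							const char *dev_name, void *data)
	{
		struct vfsmount *mnt;
		struct mnt_namespace *ns;

		mnt = vfs_kern_mount(type, 0, dev_name, data);
		if (IS_ERR(mnt))
			return ERR_CAST(mnt);

		ns = create_mnt_ns(mnt);	/* attaches mnt as the namespace root */
		if (IS_ERR(ns))
			mntput(mnt);		/* on failure the reference is still ours */
		return ns;
	}

The namespace and the mount it pins are later released with a single
put_mnt_ns() call.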
Signed-off-by: Trond Myklebust
Signed-off-by: Linus Torvalds
---
 fs/namespace.c | 45 +++++++++++++++++++++++++++++++++++--------
 include/linux/mnt_namespace.h | 1 +
 2 files changed, 38 insertions(+), 8 deletions(-)
(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 6645846f205..a7bea8c8bd4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1937,6 +1937,21 @@ dput_out:
 return retval;
 }
 
+static struct mnt_namespace *alloc_mnt_ns(void)
+{
+ struct mnt_namespace *new_ns;
+
+ new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+ if (!new_ns)
+ return ERR_PTR(-ENOMEM);
+ atomic_set(&new_ns->count, 1);
+ new_ns->root = NULL;
+ INIT_LIST_HEAD(&new_ns->list);
+ init_waitqueue_head(&new_ns->poll);
+ new_ns->event = 0;
+ return new_ns;
+}
+
 /*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
@@ -1948,14 +1963,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
 struct vfsmount *p, *q;
 
- new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
- if (!new_ns)
- return ERR_PTR(-ENOMEM);
-
- atomic_set(&new_ns->count, 1);
- INIT_LIST_HEAD(&new_ns->list);
- init_waitqueue_head(&new_ns->poll);
- new_ns->event = 0;
+ new_ns = alloc_mnt_ns();
+ if (IS_ERR(new_ns))
+ return new_ns;
 
 down_write(&namespace_sem);
 /* First pass: copy the tree topology */
@@ -2019,6 +2029,24 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 return new_ns;
 }
 
+/**
+ * create_mnt_ns - creates a private namespace and adds a root filesystem
+ * @mnt: pointer to the new root filesystem mountpoint
+ */
+struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
+{
+ struct mnt_namespace *new_ns;
+
+ new_ns = alloc_mnt_ns();
+ if (!IS_ERR(new_ns)) {
+ mnt->mnt_ns = new_ns;
+ new_ns->root = mnt;
+ list_add(&new_ns->list, &new_ns->root->mnt_list);
+ }
+ return new_ns;
+}
+EXPORT_SYMBOL(create_mnt_ns);
+
 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 char __user *, type, unsigned long, flags, void __user *, data)
 {
@@ -2264,3 +2292,4 @@ void put_mnt_ns(struct mnt_namespace *ns)
 release_mounts(&umount_list);
 kfree(ns);
 }
+EXPORT_SYMBOL(put_mnt_ns);
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 299d11af5f7..3beb2592b03 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -24,6 +24,7 @@ struct proc_mounts {
 
 struct fs_struct;
 
+extern struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt);
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 struct fs_struct *);
 extern void put_mnt_ns(struct mnt_namespace *ns);
-- cgit v1.2.3-70-g09d2

From 02ab18b0f497bed623814677577b76cc97234085 Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Sun, 14 Jun 2009 04:32:04 -0300
Subject: V4L/DVB (12072): gspca-ov519: add extra controls

This patch adds autobrightness (so that it can be turned off to make the
already present brightness control work) and light frequency filtering
controls. The lightfreq control needed 2 different entries in the ctrls
array, as the number of options differs depending on the sensor. One of the
2 entries is always disabled, of course.
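For illustration, the new menu entries can be enumerated from userspace with
the standard VIDIOC_QUERYMENU ioctl, which the driver answers in its new
sd_querymenu() hook. A hedged sketch (the device node is an assumption, not
part of this patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/videodev2.h>

	int main(void)
	{
		struct v4l2_querymenu menu;
		int fd = open("/dev/video0", O_RDWR);	/* assumed device node */

		if (fd < 0)
			return 1;
		memset(&menu, 0, sizeof(menu));
		menu.id = V4L2_CID_POWER_LINE_FREQUENCY;
		/* the ioctl fails past the last (or a disabled) entry */
		for (menu.index = 0; ioctl(fd, VIDIOC_QUERYMENU, &menu) == 0; menu.index++)
			printf("%u: %s\n", menu.index, (const char *)menu.name);
		return 0;
	}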
Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/gspca/ov519.c | 214 ++++++++++++++++++++++++++++++++++++-- include/linux/videodev2.h | 3 +- 2 files changed, 207 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/video/gspca/ov519.c b/drivers/media/video/gspca/ov519.c index 188866ac6ce..baa488dd33d 100644 --- a/drivers/media/video/gspca/ov519.c +++ b/drivers/media/video/gspca/ov519.c @@ -65,6 +65,8 @@ struct sd { __u8 colors; __u8 hflip; __u8 vflip; + __u8 autobrightness; + __u8 freq; __u8 stopped; /* Streaming is temporarily paused */ @@ -94,11 +96,17 @@ static int sd_sethflip(struct gspca_dev *gspca_dev, __s32 val); static int sd_gethflip(struct gspca_dev *gspca_dev, __s32 *val); static int sd_setvflip(struct gspca_dev *gspca_dev, __s32 val); static int sd_getvflip(struct gspca_dev *gspca_dev, __s32 *val); +static int sd_setautobrightness(struct gspca_dev *gspca_dev, __s32 val); +static int sd_getautobrightness(struct gspca_dev *gspca_dev, __s32 *val); +static int sd_setfreq(struct gspca_dev *gspca_dev, __s32 val); +static int sd_getfreq(struct gspca_dev *gspca_dev, __s32 *val); static void setbrightness(struct gspca_dev *gspca_dev); static void setcontrast(struct gspca_dev *gspca_dev); static void setcolors(struct gspca_dev *gspca_dev); +static void setautobrightness(struct sd *sd); +static void setfreq(struct sd *sd); -static struct ctrl sd_ctrls[] = { +static const struct ctrl sd_ctrls[] = { { { .id = V4L2_CID_BRIGHTNESS, @@ -141,7 +149,7 @@ static struct ctrl sd_ctrls[] = { .set = sd_setcolors, .get = sd_getcolors, }, -/* next controls work with ov7670 only */ +/* The flip controls work with ov7670 only */ #define HFLIP_IDX 3 { { @@ -172,6 +180,51 @@ static struct ctrl sd_ctrls[] = { .set = sd_setvflip, .get = sd_getvflip, }, +#define AUTOBRIGHT_IDX 5 + { + { + .id = V4L2_CID_AUTOBRIGHTNESS, + .type = V4L2_CTRL_TYPE_BOOLEAN, + .name = "Auto Brightness", + .minimum = 0, + .maximum = 1, + .step = 1, +#define AUTOBRIGHT_DEF 1 + .default_value = AUTOBRIGHT_DEF, + }, + .set = sd_setautobrightness, + .get = sd_getautobrightness, + }, +#define FREQ_IDX 6 + { + { + .id = V4L2_CID_POWER_LINE_FREQUENCY, + .type = V4L2_CTRL_TYPE_MENU, + .name = "Light frequency filter", + .minimum = 0, + .maximum = 2, /* 0: 0, 1: 50Hz, 2:60Hz */ + .step = 1, +#define FREQ_DEF 0 + .default_value = FREQ_DEF, + }, + .set = sd_setfreq, + .get = sd_getfreq, + }, +#define OV7670_FREQ_IDX 7 + { + { + .id = V4L2_CID_POWER_LINE_FREQUENCY, + .type = V4L2_CTRL_TYPE_MENU, + .name = "Light frequency filter", + .minimum = 0, + .maximum = 3, /* 0: 0, 1: 50Hz, 2:60Hz 3: Auto Hz */ + .step = 1, +#define OV7670_FREQ_DEF 3 + .default_value = OV7670_FREQ_DEF, + }, + .set = sd_setfreq, + .get = sd_getfreq, + }, }; static const struct v4l2_pix_format ov519_vga_mode[] = { @@ -416,7 +469,7 @@ static const struct ov_i2c_regvals norm_6x30[] = { { 0x07, 0x2d }, /* Sharpness */ { 0x0c, 0x20 }, { 0x0d, 0x20 }, - { 0x0e, 0x20 }, + { 0x0e, 0xa0 }, /* Was 0x20, bit7 enables a 2x gain which we need */ { 0x0f, 0x05 }, { 0x10, 0x9a }, { 0x11, 0x00 }, /* Pixel clock = fastest */ @@ -1659,9 +1712,21 @@ static int sd_config(struct gspca_dev *gspca_dev, sd->colors = COLOR_DEF; sd->hflip = HFLIP_DEF; sd->vflip = VFLIP_DEF; - if (sd->sensor != SEN_OV7670) - gspca_dev->ctrl_dis = (1 << HFLIP_IDX) - | (1 << VFLIP_IDX); + sd->autobrightness = AUTOBRIGHT_DEF; + if (sd->sensor == SEN_OV7670) { + sd->freq = OV7670_FREQ_DEF; + gspca_dev->ctrl_dis = 1 << FREQ_IDX; + } else { + 
sd->freq = FREQ_DEF; + gspca_dev->ctrl_dis = (1 << HFLIP_IDX) | (1 << VFLIP_IDX) | + (1 << OV7670_FREQ_IDX); + } + if (sd->sensor == SEN_OV7640 || sd->sensor == SEN_OV7670) + gspca_dev->ctrl_dis |= 1 << AUTOBRIGHT_IDX; + /* OV8610 Frequency filter control should work but needs testing */ + if (sd->sensor == SEN_OV8610) + gspca_dev->ctrl_dis |= 1 << FREQ_IDX; + return 0; error: PDEBUG(D_ERR, "OV519 Config failed"); @@ -2233,7 +2298,6 @@ static int set_ov_sensor_window(struct sd *sd) msleep(10); /* need to sleep between read and write to * same reg! */ i2c_w(sd, OV7670_REG_VREF, v); - sethvflip(sd); } else { i2c_w(sd, 0x17, hwsbase); i2c_w(sd, 0x18, hwebase + (sd->gspca_dev.width >> hwscale)); @@ -2268,6 +2332,9 @@ static int sd_start(struct gspca_dev *gspca_dev) setcontrast(gspca_dev); setbrightness(gspca_dev); setcolors(gspca_dev); + sethvflip(sd); + setautobrightness(sd); + setfreq(sd); ret = ov51x_restart(sd); if (ret < 0) @@ -2394,8 +2461,7 @@ static void setbrightness(struct gspca_dev *gspca_dev) break; case SEN_OV7620: /* 7620 doesn't like manual changes when in auto mode */ -/*fixme - * if (!sd->auto_brt) */ + if (!sd->autobrightness) i2c_w(sd, OV7610_REG_BRT, val); break; case SEN_OV7670: @@ -2482,6 +2548,70 @@ static void setcolors(struct gspca_dev *gspca_dev) } } +static void setautobrightness(struct sd *sd) +{ + if (sd->sensor == SEN_OV7640 || sd->sensor == SEN_OV7670) + return; + + i2c_w_mask(sd, 0x2d, sd->autobrightness ? 0x10 : 0x00, 0x10); +} + +static void setfreq(struct sd *sd) +{ + if (sd->sensor == SEN_OV7670) { + switch (sd->freq) { + case 0: /* Banding filter disabled */ + i2c_w_mask(sd, OV7670_REG_COM8, 0, OV7670_COM8_BFILT); + break; + case 1: /* 50 hz */ + i2c_w_mask(sd, OV7670_REG_COM8, OV7670_COM8_BFILT, + OV7670_COM8_BFILT); + i2c_w_mask(sd, OV7670_REG_COM11, 0x08, 0x18); + break; + case 2: /* 60 hz */ + i2c_w_mask(sd, OV7670_REG_COM8, OV7670_COM8_BFILT, + OV7670_COM8_BFILT); + i2c_w_mask(sd, OV7670_REG_COM11, 0x00, 0x18); + break; + case 3: /* Auto hz */ + i2c_w_mask(sd, OV7670_REG_COM8, OV7670_COM8_BFILT, + OV7670_COM8_BFILT); + i2c_w_mask(sd, OV7670_REG_COM11, OV7670_COM11_HZAUTO, + 0x18); + break; + } + } else { + switch (sd->freq) { + case 0: /* Banding filter disabled */ + i2c_w_mask(sd, 0x2d, 0x00, 0x04); + i2c_w_mask(sd, 0x2a, 0x00, 0x80); + break; + case 1: /* 50 hz (filter on and framerate adj) */ + i2c_w_mask(sd, 0x2d, 0x04, 0x04); + i2c_w_mask(sd, 0x2a, 0x80, 0x80); + /* 20 fps -> 16.667 fps */ + if (sd->sensor == SEN_OV6620 || + sd->sensor == SEN_OV6630) + i2c_w(sd, 0x2b, 0x5e); + else + i2c_w(sd, 0x2b, 0xac); + break; + case 2: /* 60 hz (filter on, ...) */ + i2c_w_mask(sd, 0x2d, 0x04, 0x04); + if (sd->sensor == SEN_OV6620 || + sd->sensor == SEN_OV6630) { + /* 20 fps -> 15 fps */ + i2c_w_mask(sd, 0x2a, 0x80, 0x80); + i2c_w(sd, 0x2b, 0xa8); + } else { + /* no framerate adj. 
*/ + i2c_w_mask(sd, 0x2a, 0x00, 0x80); + } + break; + } + } +} + static int sd_setbrightness(struct gspca_dev *gspca_dev, __s32 val) { struct sd *sd = (struct sd *) gspca_dev; @@ -2572,6 +2702,71 @@ static int sd_getvflip(struct gspca_dev *gspca_dev, __s32 *val) return 0; } +static int sd_setautobrightness(struct gspca_dev *gspca_dev, __s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->autobrightness = val; + if (gspca_dev->streaming) + setautobrightness(sd); + return 0; +} + +static int sd_getautobrightness(struct gspca_dev *gspca_dev, __s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + *val = sd->autobrightness; + return 0; +} + +static int sd_setfreq(struct gspca_dev *gspca_dev, __s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->freq = val; + if (gspca_dev->streaming) + setfreq(sd); + return 0; +} + +static int sd_getfreq(struct gspca_dev *gspca_dev, __s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + *val = sd->freq; + return 0; +} + +static int sd_querymenu(struct gspca_dev *gspca_dev, + struct v4l2_querymenu *menu) +{ + struct sd *sd = (struct sd *) gspca_dev; + + switch (menu->id) { + case V4L2_CID_POWER_LINE_FREQUENCY: + switch (menu->index) { + case 0: /* V4L2_CID_POWER_LINE_FREQUENCY_DISABLED */ + strcpy((char *) menu->name, "NoFliker"); + return 0; + case 1: /* V4L2_CID_POWER_LINE_FREQUENCY_50HZ */ + strcpy((char *) menu->name, "50 Hz"); + return 0; + case 2: /* V4L2_CID_POWER_LINE_FREQUENCY_60HZ */ + strcpy((char *) menu->name, "60 Hz"); + return 0; + case 3: + if (sd->sensor != SEN_OV7670) + return -EINVAL; + + strcpy((char *) menu->name, "Automatic"); + return 0; + } + break; + } + return -EINVAL; +} + /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, @@ -2582,6 +2777,7 @@ static const struct sd_desc sd_desc = { .start = sd_start, .stopN = sd_stopN, .pkt_scan = sd_pkt_scan, + .querymenu = sd_querymenu, }; /* -- module initialisation -- */ diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index f24eceecc5a..772d226cb5c 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -894,9 +894,10 @@ enum v4l2_colorfx { V4L2_COLORFX_BW = 1, V4L2_COLORFX_SEPIA = 2, }; +#define V4L2_CID_AUTOBRIGHTNESS (V4L2_CID_BASE+32) /* last CID + 1 */ -#define V4L2_CID_LASTP1 (V4L2_CID_BASE+32) +#define V4L2_CID_LASTP1 (V4L2_CID_BASE+33) /* MPEG-class control IDs defined by V4L2 */ #define V4L2_CID_MPEG_BASE (V4L2_CTRL_CLASS_MPEG | 0x900) -- cgit v1.2.3-70-g09d2 From 1876bb923c98c605eca69f0bfe295f7b5f5eba28 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 14 Jun 2009 06:45:50 -0300 Subject: V4L/DVB (12079): gspca_ov519: add support for the ov511 bridge gspca_ov519: add support for the ov511 bridge Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/gspca/ov519.c | 533 +++++++++++++++++++++++++++++++++++++- include/linux/videodev2.h | 1 + 2 files changed, 521 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/video/gspca/ov519.c b/drivers/media/video/gspca/ov519.c index 9d4b69dbf96..1f8e2613ecc 100644 --- a/drivers/media/video/gspca/ov519.c +++ b/drivers/media/video/gspca/ov519.c @@ -76,8 +76,8 @@ struct sd { __u8 stopped; /* Streaming is temporarily paused */ - __u8 frame_rate; /* current Framerate (OV519 only) */ - __u8 clockdiv; /* clockdiv override for OV519 only */ + __u8 frame_rate; /* current Framerate */ + __u8 clockdiv; /* clockdiv override */ char sensor; /* Type of image sensor chip (SEN_*) 
*/ #define SEN_UNKNOWN 0 @@ -304,17 +304,77 @@ static const struct v4l2_pix_format ov518_sif_mode[] = { .priv = 0}, }; +static const struct v4l2_pix_format ov511_vga_mode[] = { + {320, 240, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE, + .bytesperline = 320, + .sizeimage = 320 * 240 * 3, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 1}, + {640, 480, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE, + .bytesperline = 640, + .sizeimage = 640 * 480 * 2, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 0}, +}; +static const struct v4l2_pix_format ov511_sif_mode[] = { + {160, 120, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE, + .bytesperline = 160, + .sizeimage = 40000, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 3}, + {176, 144, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE, + .bytesperline = 176, + .sizeimage = 40000, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 1}, + {320, 240, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE, + .bytesperline = 320, + .sizeimage = 320 * 240 * 3, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 2}, + {352, 288, V4L2_PIX_FMT_OV511, V4L2_FIELD_NONE, + .bytesperline = 352, + .sizeimage = 352 * 288 * 3, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 0}, +}; /* Registers common to OV511 / OV518 */ +#define R51x_FIFO_PSIZE 0x30 /* 2 bytes wide w/ OV518(+) */ #define R51x_SYS_RESET 0x50 + /* Reset type flags */ + #define OV511_RESET_OMNICE 0x08 #define R51x_SYS_INIT 0x53 #define R51x_SYS_SNAP 0x52 #define R51x_SYS_CUST_ID 0x5F #define R51x_COMP_LUT_BEGIN 0x80 /* OV511 Camera interface register numbers */ +#define R511_CAM_DELAY 0x10 +#define R511_CAM_EDGE 0x11 +#define R511_CAM_PXCNT 0x12 +#define R511_CAM_LNCNT 0x13 +#define R511_CAM_PXDIV 0x14 +#define R511_CAM_LNDIV 0x15 +#define R511_CAM_UV_EN 0x16 +#define R511_CAM_LINE_MODE 0x17 +#define R511_CAM_OPTS 0x18 + +#define R511_SNAP_FRAME 0x19 +#define R511_SNAP_PXCNT 0x1A +#define R511_SNAP_LNCNT 0x1B +#define R511_SNAP_PXDIV 0x1C +#define R511_SNAP_LNDIV 0x1D +#define R511_SNAP_UV_EN 0x1E +#define R511_SNAP_OPTS 0x1F + +#define R511_DRAM_FLOW_CTL 0x20 +#define R511_FIFO_OPTS 0x31 +#define R511_I2C_CTL 0x40 #define R511_SYS_LED_CTL 0x55 /* OV511+ only */ -#define OV511_RESET_NOREGS 0x3F /* All but OV511 & regs */ +#define R511_COMP_EN 0x78 +#define R511_COMP_LUT_EN 0x79 /* OV518 Camera interface register numbers */ #define R518_GPIO_OUT 0x56 /* OV518(+) only */ @@ -1079,13 +1139,128 @@ static int ov518_reg_w32(struct sd *sd, __u16 index, u32 value, int n) return ret; } +static int ov511_i2c_w(struct sd *sd, __u8 reg, __u8 value) +{ + int rc, retries; + + PDEBUG(D_USBO, "i2c 0x%02x -> [0x%02x]", value, reg); + + /* Three byte write cycle */ + for (retries = 6; ; ) { + /* Select camera register */ + rc = reg_w(sd, R51x_I2C_SADDR_3, reg); + if (rc < 0) + return rc; + + /* Write "value" to I2C data port of OV511 */ + rc = reg_w(sd, R51x_I2C_DATA, value); + if (rc < 0) + return rc; + + /* Initiate 3-byte write cycle */ + rc = reg_w(sd, R511_I2C_CTL, 0x01); + if (rc < 0) + return rc; + + do + rc = reg_r(sd, R511_I2C_CTL); + while (rc > 0 && ((rc & 1) == 0)); /* Retry until idle */ + + if (rc < 0) + return rc; + + if ((rc & 2) == 0) /* Ack? 
*/ + break; + if (--retries < 0) { + PDEBUG(D_USBO, "i2c write retries exhausted"); + return -1; + } + } + + return 0; +} + +static int ov511_i2c_r(struct sd *sd, __u8 reg) +{ + int rc, value, retries; + + /* Two byte write cycle */ + for (retries = 6; ; ) { + /* Select camera register */ + rc = reg_w(sd, R51x_I2C_SADDR_2, reg); + if (rc < 0) + return rc; + + /* Initiate 2-byte write cycle */ + rc = reg_w(sd, R511_I2C_CTL, 0x03); + if (rc < 0) + return rc; + + do + rc = reg_r(sd, R511_I2C_CTL); + while (rc > 0 && ((rc & 1) == 0)); /* Retry until idle */ + + if (rc < 0) + return rc; + + if ((rc & 2) == 0) /* Ack? */ + break; + + /* I2C abort */ + reg_w(sd, R511_I2C_CTL, 0x10); + + if (--retries < 0) { + PDEBUG(D_USBI, "i2c write retries exhausted"); + return -1; + } + } + + /* Two byte read cycle */ + for (retries = 6; ; ) { + /* Initiate 2-byte read cycle */ + rc = reg_w(sd, R511_I2C_CTL, 0x05); + if (rc < 0) + return rc; + + do + rc = reg_r(sd, R511_I2C_CTL); + while (rc > 0 && ((rc & 1) == 0)); /* Retry until idle */ + + if (rc < 0) + return rc; + + if ((rc & 2) == 0) /* Ack? */ + break; + + /* I2C abort */ + rc = reg_w(sd, R511_I2C_CTL, 0x10); + if (rc < 0) + return rc; + + if (--retries < 0) { + PDEBUG(D_USBI, "i2c read retries exhausted"); + return -1; + } + } + + value = reg_r(sd, R51x_I2C_DATA); + + PDEBUG(D_USBI, "i2c [0x%02X] -> 0x%02X", reg, value); + + /* This is needed to make i2c_w() work */ + rc = reg_w(sd, R511_I2C_CTL, 0x05); + if (rc < 0) + return rc; + + return value; +} /* * The OV518 I2C I/O procedure is different, hence, this function. * This is normally only called from i2c_w(). Note that this function * always succeeds regardless of whether the sensor is present and working. */ -static int i2c_w(struct sd *sd, +static int ov518_i2c_w(struct sd *sd, __u8 reg, __u8 value) { @@ -1120,7 +1295,7 @@ static int i2c_w(struct sd *sd, * This is normally only called from i2c_r(). Note that this function * always succeeds regardless of whether the sensor is present and working. */ -static int i2c_r(struct sd *sd, __u8 reg) +static int ov518_i2c_r(struct sd *sd, __u8 reg) { int rc, value; @@ -1143,6 +1318,34 @@ static int i2c_r(struct sd *sd, __u8 reg) return value; } +static int i2c_w(struct sd *sd, __u8 reg, __u8 value) +{ + switch (sd->bridge) { + case BRIDGE_OV511: + case BRIDGE_OV511PLUS: + return ov511_i2c_w(sd, reg, value); + case BRIDGE_OV518: + case BRIDGE_OV518PLUS: + case BRIDGE_OV519: + return ov518_i2c_w(sd, reg, value); + } + return -1; /* Should never happen */ +} + +static int i2c_r(struct sd *sd, __u8 reg) +{ + switch (sd->bridge) { + case BRIDGE_OV511: + case BRIDGE_OV511PLUS: + return ov511_i2c_r(sd, reg); + case BRIDGE_OV518: + case BRIDGE_OV518PLUS: + case BRIDGE_OV519: + return ov518_i2c_r(sd, reg); + } + return -1; /* Should never happen */ +} + /* Writes bits at positions specified by mask to an I2C reg. Bits that are in * the same position as 1's in "mask" are cleared and set to "value". 
Bits * that are in the same position as 0's in "mask" are preserved, regardless @@ -1490,9 +1693,31 @@ static void ov51x_led_control(struct sd *sd, int on) } } -/* OV518 quantization tables are 8x4 (instead of 8x8) */ -static int ov518_upload_quan_tables(struct sd *sd) +static int ov51x_upload_quan_tables(struct sd *sd) { + const unsigned char yQuanTable511[] = { + 0, 1, 1, 2, 2, 3, 3, 4, + 1, 1, 1, 2, 2, 3, 4, 4, + 1, 1, 2, 2, 3, 4, 4, 4, + 2, 2, 2, 3, 4, 4, 4, 4, + 2, 2, 3, 4, 4, 5, 5, 5, + 3, 3, 4, 4, 5, 5, 5, 5, + 3, 4, 4, 4, 5, 5, 5, 5, + 4, 4, 4, 4, 5, 5, 5, 5 + }; + + const unsigned char uvQuanTable511[] = { + 0, 2, 2, 3, 4, 4, 4, 4, + 2, 2, 2, 4, 4, 4, 4, 4, + 2, 2, 3, 4, 4, 4, 4, 4, + 3, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4 + }; + + /* OV518 quantization tables are 8x4 (instead of 8x8) */ const unsigned char yQuanTable518[] = { 5, 4, 5, 6, 6, 7, 7, 7, 5, 5, 5, 5, 6, 7, 7, 7, @@ -1507,14 +1732,23 @@ static int ov518_upload_quan_tables(struct sd *sd) 7, 7, 7, 7, 7, 7, 8, 8 }; - const unsigned char *pYTable = yQuanTable518; - const unsigned char *pUVTable = uvQuanTable518; + const unsigned char *pYTable, *pUVTable; unsigned char val0, val1; - int i, rc, reg = R51x_COMP_LUT_BEGIN; + int i, size, rc, reg = R51x_COMP_LUT_BEGIN; PDEBUG(D_PROBE, "Uploading quantization tables"); - for (i = 0; i < 16; i++) { + if (sd->bridge == BRIDGE_OV511 || sd->bridge == BRIDGE_OV511PLUS) { + pYTable = yQuanTable511; + pUVTable = uvQuanTable511; + size = 32; + } else { + pYTable = yQuanTable518; + pUVTable = uvQuanTable518; + size = 16; + } + + for (i = 0; i < size; i++) { val0 = *pYTable++; val1 = *pYTable++; val0 &= 0x0f; @@ -1529,7 +1763,7 @@ static int ov518_upload_quan_tables(struct sd *sd) val0 &= 0x0f; val1 &= 0x0f; val0 |= val1 << 4; - rc = reg_w(sd, reg + 16, val0); + rc = reg_w(sd, reg + size, val0); if (rc < 0) return rc; @@ -1539,6 +1773,87 @@ static int ov518_upload_quan_tables(struct sd *sd) return 0; } +/* This initializes the OV511/OV511+ and the sensor */ +static int ov511_configure(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + int rc; + + /* For 511 and 511+ */ + const struct ov_regvals init_511[] = { + { R51x_SYS_RESET, 0x7f }, + { R51x_SYS_INIT, 0x01 }, + { R51x_SYS_RESET, 0x7f }, + { R51x_SYS_INIT, 0x01 }, + { R51x_SYS_RESET, 0x3f }, + { R51x_SYS_INIT, 0x01 }, + { R51x_SYS_RESET, 0x3d }, + }; + + const struct ov_regvals norm_511[] = { + { R511_DRAM_FLOW_CTL, 0x01 }, + { R51x_SYS_SNAP, 0x00 }, + { R51x_SYS_SNAP, 0x02 }, + { R51x_SYS_SNAP, 0x00 }, + { R511_FIFO_OPTS, 0x1f }, + { R511_COMP_EN, 0x00 }, + { R511_COMP_LUT_EN, 0x03 }, + }; + + const struct ov_regvals norm_511_p[] = { + { R511_DRAM_FLOW_CTL, 0xff }, + { R51x_SYS_SNAP, 0x00 }, + { R51x_SYS_SNAP, 0x02 }, + { R51x_SYS_SNAP, 0x00 }, + { R511_FIFO_OPTS, 0xff }, + { R511_COMP_EN, 0x00 }, + { R511_COMP_LUT_EN, 0x03 }, + }; + + const struct ov_regvals compress_511[] = { + { 0x70, 0x1f }, + { 0x71, 0x05 }, + { 0x72, 0x06 }, + { 0x73, 0x06 }, + { 0x74, 0x14 }, + { 0x75, 0x03 }, + { 0x76, 0x04 }, + { 0x77, 0x04 }, + }; + + PDEBUG(D_PROBE, "Device custom id %x", reg_r(sd, R51x_SYS_CUST_ID)); + + rc = write_regvals(sd, init_511, ARRAY_SIZE(init_511)); + if (rc < 0) + return rc; + + switch (sd->bridge) { + case BRIDGE_OV511: + rc = write_regvals(sd, norm_511, ARRAY_SIZE(norm_511)); + if (rc < 0) + return rc; + break; + case BRIDGE_OV511PLUS: + rc = write_regvals(sd, norm_511_p, ARRAY_SIZE(norm_511_p)); + if (rc < 0) + 
return rc; + break; + } + + /* Init compression */ + rc = write_regvals(sd, compress_511, ARRAY_SIZE(compress_511)); + if (rc < 0) + return rc; + + rc = ov51x_upload_quan_tables(sd); + if (rc < 0) { + PDEBUG(D_ERR, "Error uploading quantization tables"); + return rc; + } + + return 0; +} + /* This initializes the OV518/OV518+ and the sensor */ static int ov518_configure(struct gspca_dev *gspca_dev) { @@ -1615,7 +1930,7 @@ static int ov518_configure(struct gspca_dev *gspca_dev) break; } - rc = ov518_upload_quan_tables(sd); + rc = ov51x_upload_quan_tables(sd); if (rc < 0) { PDEBUG(D_ERR, "Error uploading quantization tables"); return rc; @@ -1661,6 +1976,10 @@ static int sd_config(struct gspca_dev *gspca_dev, sd->invert_led = id->driver_info & BRIDGE_INVERT_LED; switch (sd->bridge) { + case BRIDGE_OV511: + case BRIDGE_OV511PLUS: + ret = ov511_configure(gspca_dev); + break; case BRIDGE_OV518: case BRIDGE_OV518PLUS: ret = ov518_configure(gspca_dev); @@ -1719,6 +2038,16 @@ static int sd_config(struct gspca_dev *gspca_dev, cam = &gspca_dev->cam; switch (sd->bridge) { + case BRIDGE_OV511: + case BRIDGE_OV511PLUS: + if (!sd->sif) { + cam->cam_mode = ov511_vga_mode; + cam->nmodes = ARRAY_SIZE(ov511_vga_mode); + } else { + cam->cam_mode = ov511_sif_mode; + cam->nmodes = ARRAY_SIZE(ov511_sif_mode); + } + break; case BRIDGE_OV518: case BRIDGE_OV518PLUS: if (!sd->sif) { @@ -1810,6 +2139,126 @@ static int sd_init(struct gspca_dev *gspca_dev) return 0; } +/* Set up the OV511/OV511+ with the given image parameters. + * + * Do not put any sensor-specific code in here (including I2C I/O functions) + */ +static int ov511_mode_init_regs(struct sd *sd) +{ + int hsegs, vsegs, packet_size, fps, needed; + int interlaced = 0; + struct usb_host_interface *alt; + struct usb_interface *intf; + + intf = usb_ifnum_to_if(sd->gspca_dev.dev, sd->gspca_dev.iface); + alt = usb_altnum_to_altsetting(intf, sd->gspca_dev.alt); + if (!alt) { + PDEBUG(D_ERR, "Couldn't get altsetting"); + return -EIO; + } + + packet_size = le16_to_cpu(alt->endpoint[0].desc.wMaxPacketSize); + reg_w(sd, R51x_FIFO_PSIZE, packet_size >> 5); + + reg_w(sd, R511_CAM_UV_EN, 0x01); + reg_w(sd, R511_SNAP_UV_EN, 0x01); + reg_w(sd, R511_SNAP_OPTS, 0x03); + + /* Here I'm assuming that snapshot size == image size. + * I hope that's always true. 
--claudio + */ + hsegs = (sd->gspca_dev.width >> 3) - 1; + vsegs = (sd->gspca_dev.height >> 3) - 1; + + reg_w(sd, R511_CAM_PXCNT, hsegs); + reg_w(sd, R511_CAM_LNCNT, vsegs); + reg_w(sd, R511_CAM_PXDIV, 0x00); + reg_w(sd, R511_CAM_LNDIV, 0x00); + + /* YUV420, low pass filter on */ + reg_w(sd, R511_CAM_OPTS, 0x03); + + /* Snapshot additions */ + reg_w(sd, R511_SNAP_PXCNT, hsegs); + reg_w(sd, R511_SNAP_LNCNT, vsegs); + reg_w(sd, R511_SNAP_PXDIV, 0x00); + reg_w(sd, R511_SNAP_LNDIV, 0x00); + + /******** Set the framerate ********/ + if (frame_rate > 0) + sd->frame_rate = frame_rate; + + switch (sd->sensor) { + case SEN_OV6620: + /* No framerate control, doesn't like higher rates yet */ + sd->clockdiv = 3; + break; + + /* Note once the FIXME's in mode_init_ov_sensor_regs() are fixed + for more sensors we need to do this for them too */ + case SEN_OV7620: + case SEN_OV7640: + if (sd->gspca_dev.width == 320) + interlaced = 1; + /* Fall through */ + case SEN_OV6630: + case SEN_OV76BE: + case SEN_OV7610: + case SEN_OV7670: + switch (sd->frame_rate) { + case 30: + case 25: + /* Not enough bandwidth to do 640x480 @ 30 fps */ + if (sd->gspca_dev.width != 640) { + sd->clockdiv = 0; + break; + } + /* Fall through for 640x480 case */ + default: +/* case 20: */ +/* case 15: */ + sd->clockdiv = 1; + break; + case 10: + sd->clockdiv = 2; + break; + case 5: + sd->clockdiv = 5; + break; + } + if (interlaced) { + sd->clockdiv = (sd->clockdiv + 1) * 2 - 1; + /* Higher then 10 does not work */ + if (sd->clockdiv > 10) + sd->clockdiv = 10; + } + break; + + case SEN_OV8610: + /* No framerate control ?? */ + sd->clockdiv = 0; + break; + } + + /* Check if we have enough bandwidth to disable compression */ + fps = (interlaced ? 60 : 30) / (sd->clockdiv + 1) + 1; + needed = fps * sd->gspca_dev.width * sd->gspca_dev.height * 3 / 2; + /* 1400 is a conservative estimate of the max nr of isoc packets/sec */ + if (needed > 1400 * packet_size) { + /* Enable Y and UV quantization and compression */ + reg_w(sd, R511_COMP_EN, 0x07); + reg_w(sd, R511_COMP_LUT_EN, 0x03); + } else { + reg_w(sd, R511_COMP_EN, 0x06); + reg_w(sd, R511_COMP_LUT_EN, 0x00); + } + + reg_w(sd, R51x_SYS_RESET, OV511_RESET_OMNICE); + reg_w(sd, R51x_SYS_RESET, 0); + + return 0; +} + /* Sets up the OV518/OV518+ with the given image parameters * * OV518 needs a completely different approach, until we can figure out what @@ -2363,6 +2812,10 @@ static int sd_start(struct gspca_dev *gspca_dev) int ret = 0; switch (sd->bridge) { + case BRIDGE_OV511: + case BRIDGE_OV511PLUS: + ret = ov511_mode_init_regs(sd); + break; case BRIDGE_OV518: case BRIDGE_OV518PLUS: ret = ov518_mode_init_regs(sd); @@ -2403,6 +2856,56 @@ static void sd_stopN(struct gspca_dev *gspca_dev) ov51x_led_control(sd, 0); } +static void ov511_pkt_scan(struct gspca_dev *gspca_dev, + struct gspca_frame *frame, /* target */ + __u8 *in, /* isoc packet */ + int len) /* iso packet length */ +{ + struct sd *sd = (struct sd *) gspca_dev; + + /* SOF/EOF packets have 1st to 8th bytes zeroed and the 9th + * byte non-zero. The EOF packet has image width/height in the + * 10th and 11th bytes. 
The 9th byte is given as follows: + * + * bit 7: EOF + * 6: compression enabled + * 5: 422/420/400 modes + * 4: 422/420/400 modes + * 3: 1 + * 2: snapshot button on + * 1: snapshot frame + * 0: even/odd field + */ + if (!(in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6] | in[7]) && + (in[8] & 0x08)) { + if (in[8] & 0x80) { + /* Frame end */ + if ((in[9] + 1) * 8 != gspca_dev->width || + (in[10] + 1) * 8 != gspca_dev->height) { + PDEBUG(D_ERR, "Invalid frame size, got: %dx%d," + " requested: %dx%d\n", + (in[9] + 1) * 8, (in[10] + 1) * 8, + gspca_dev->width, gspca_dev->height); + gspca_dev->last_packet_type = DISCARD_PACKET; + return; + } + /* Add 11 byte footer to frame, might be useful */ + gspca_frame_add(gspca_dev, LAST_PACKET, frame, in, 11); + return; + } else { + /* Frame start */ + gspca_frame_add(gspca_dev, FIRST_PACKET, frame, in, 0); + sd->packet_nr = 0; + } + } + + /* Ignore the packet number */ + len--; + + /* intermediate packet */ + gspca_frame_add(gspca_dev, INTER_PACKET, frame, in, len); +} + static void ov518_pkt_scan(struct gspca_dev *gspca_dev, struct gspca_frame *frame, /* target */ __u8 *data, /* isoc packet */ @@ -2495,6 +2998,7 @@ static void sd_pkt_scan(struct gspca_dev *gspca_dev, switch (sd->bridge) { case BRIDGE_OV511: case BRIDGE_OV511PLUS: + ov511_pkt_scan(gspca_dev, frame, data, len); break; case BRIDGE_OV518: case BRIDGE_OV518PLUS: @@ -2862,12 +3366,15 @@ static const __devinitdata struct usb_device_id device_table[] = { {USB_DEVICE(0x045e, 0x028c), .driver_info = BRIDGE_OV519 }, {USB_DEVICE(0x054c, 0x0154), .driver_info = BRIDGE_OV519 }, {USB_DEVICE(0x054c, 0x0155), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x05a9, 0x0511), .driver_info = BRIDGE_OV511 }, {USB_DEVICE(0x05a9, 0x0518), .driver_info = BRIDGE_OV518 }, {USB_DEVICE(0x05a9, 0x0519), .driver_info = BRIDGE_OV519 }, {USB_DEVICE(0x05a9, 0x0530), .driver_info = BRIDGE_OV519 }, {USB_DEVICE(0x05a9, 0x4519), .driver_info = BRIDGE_OV519 }, {USB_DEVICE(0x05a9, 0x8519), .driver_info = BRIDGE_OV519 }, + {USB_DEVICE(0x05a9, 0xa511), .driver_info = BRIDGE_OV511PLUS }, {USB_DEVICE(0x05a9, 0xa518), .driver_info = BRIDGE_OV518PLUS }, + {USB_DEVICE(0x0813, 0x0002), .driver_info = BRIDGE_OV511PLUS }, {} }; diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 772d226cb5c..8a025d51090 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -348,6 +348,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_SQ905C v4l2_fourcc('9', '0', '5', 'C') /* compressed RGGB bayer */ #define V4L2_PIX_FMT_PJPG v4l2_fourcc('P', 'J', 'P', 'G') /* Pixart 73xx JPEG */ #define V4L2_PIX_FMT_YVYU v4l2_fourcc('Y', 'V', 'Y', 'U') /* 16 YVU 4:2:2 */ +#define V4L2_PIX_FMT_OV511 v4l2_fourcc('O', '5', '1', '1') /* ov511 JPEG */ #define V4L2_PIX_FMT_OV518 v4l2_fourcc('O', '5', '1', '8') /* ov518 JPEG */ /* -- cgit v1.2.3-70-g09d2 From d5fdd6babcfc2b0e6a8da1acf492a69fb54b4c47 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Tue, 23 Jun 2009 04:31:07 -0700 Subject: ipv6: Use correct data types for ICMPv6 type and code Change all the code that deals directly with ICMPv6 type and code values to use u8 instead of a signed int as that's the actual data type. Signed-off-by: Brian Haley Signed-off-by: David S. 
Miller --- include/linux/icmpv6.h | 6 +++--- include/net/protocol.h | 2 +- include/net/rawv6.h | 2 +- include/net/xfrm.h | 2 +- net/dccp/ipv6.c | 2 +- net/ipv6/ah6.c | 2 +- net/ipv6/esp6.c | 2 +- net/ipv6/icmp.c | 12 ++++++------ net/ipv6/ip6_tunnel.c | 18 +++++++++--------- net/ipv6/ipcomp6.c | 2 +- net/ipv6/mip6.c | 2 +- net/ipv6/raw.c | 4 ++-- net/ipv6/route.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/tunnel6.c | 2 +- net/ipv6/udp.c | 6 +++--- net/ipv6/udp_impl.h | 2 +- net/ipv6/udplite.c | 2 +- net/ipv6/xfrm6_tunnel.c | 2 +- net/sctp/ipv6.c | 2 +- 20 files changed, 38 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index 10d701eec48..b6a85183c33 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -175,16 +175,16 @@ struct icmp6_filter { extern void icmpv6_send(struct sk_buff *skb, - int type, int code, + u8 type, u8 code, __u32 info, struct net_device *dev); extern int icmpv6_init(void); -extern int icmpv6_err_convert(int type, int code, +extern int icmpv6_err_convert(u8 type, u8 code, int *err); extern void icmpv6_cleanup(void); extern void icmpv6_param_prob(struct sk_buff *skb, - int code, int pos); + u8 code, int pos); struct flowi; struct in6_addr; diff --git a/include/net/protocol.h b/include/net/protocol.h index ffa5b8b1f1d..1089d5aabd4 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -53,7 +53,7 @@ struct inet6_protocol void (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, + u8 type, u8 code, int offset, __be32 info); int (*gso_send_check)(struct sk_buff *skb); diff --git a/include/net/rawv6.h b/include/net/rawv6.h index 8a22599f26b..f6b9b830df8 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -6,7 +6,7 @@ #include void raw6_icmp_error(struct sk_buff *, int nexthdr, - int type, int code, int inner_offset, __be32); + u8 type, u8 code, int inner_offset, __be32); int raw6_local_deliver(struct sk_buff *, int); extern int rawv6_rcv(struct sock *sk, diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 736bca45088..9e3a3f4c1f6 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1274,7 +1274,7 @@ struct xfrm_tunnel { struct xfrm6_tunnel { int (*handler)(struct sk_buff *skb); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info); + u8 type, u8 code, int offset, __be32 info); struct xfrm6_tunnel *next; int priority; }; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 05ea7440d9e..3e70faab298 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -85,7 +85,7 @@ static inline __u32 dccp_v6_init_sequence(struct sk_buff *skb) } static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data; const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 52449f7a1b7..86f42a288c4 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -405,7 +405,7 @@ out: } static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index c2f250150db..678bb95b152 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -354,7 +354,7 
@@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) } static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 36dff880718..eab62a7a8f0 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -117,7 +117,7 @@ static __inline__ void icmpv6_xmit_unlock(struct sock *sk) /* * Slightly more convenient version of icmpv6_send. */ -void icmpv6_param_prob(struct sk_buff *skb, int code, int pos) +void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev); kfree_skb(skb); @@ -161,7 +161,7 @@ static int is_ineligible(struct sk_buff *skb) /* * Check the ICMP output rate limit */ -static inline int icmpv6_xrlim_allow(struct sock *sk, int type, +static inline int icmpv6_xrlim_allow(struct sock *sk, u8 type, struct flowi *fl) { struct dst_entry *dst; @@ -305,7 +305,7 @@ static inline void mip6_addr_swap(struct sk_buff *skb) {} /* * Send an ICMP message in response to a packet in error */ -void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, +void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct net_device *dev) { struct net *net = dev_net(skb->dev); @@ -590,7 +590,7 @@ out: icmpv6_xmit_unlock(sk); } -static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info) +static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { struct inet6_protocol *ipprot; int inner_offset; @@ -643,7 +643,7 @@ static int icmpv6_rcv(struct sk_buff *skb) struct in6_addr *saddr, *daddr; struct ipv6hdr *orig_hdr; struct icmp6hdr *hdr; - int type; + u8 type; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); @@ -914,7 +914,7 @@ static const struct icmp6_err { }, }; -int icmpv6_err_convert(int type, int code, int *err) +int icmpv6_err_convert(u8 type, u8 code, int *err) { int fatal = 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 404d16a97d5..51f410e7775 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -394,13 +394,13 @@ parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, - int *type, int *code, int *msg, __u32 *info, int offset) + u8 *type, u8 *code, int *msg, __u32 *info, int offset) { struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data; struct ip6_tnl *t; int rel_msg = 0; - int rel_type = ICMPV6_DEST_UNREACH; - int rel_code = ICMPV6_ADDR_UNREACH; + u8 rel_type = ICMPV6_DEST_UNREACH; + u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; __u16 len; int err = -ENOENT; @@ -488,11 +488,11 @@ out: static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { int rel_msg = 0; - int rel_type = type; - int rel_code = code; + u8 rel_type = type; + u8 rel_code = code; __u32 rel_info = ntohl(info); int err; struct sk_buff *skb2; @@ -586,11 +586,11 @@ out: static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { int rel_msg = 0; - int rel_type = type; - int rel_code = code; + u8 rel_type = type; + u8 rel_code = code; __u32 rel_info = ntohl(info); int err; diff --git 
a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 3a0b3be7ece..79c172f1ff0 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -51,7 +51,7 @@ #include static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { __be32 spi; struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index f995e19c87a..f797e8c6f3b 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -54,7 +54,7 @@ static inline void *mip6_padn(__u8 *data, __u8 padlen) return data + padlen; } -static inline void mip6_param_prob(struct sk_buff *skb, int code, int pos) +static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos) { icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8b0b6f94806..d6c3c1c34b2 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -310,7 +310,7 @@ out: static void rawv6_err(struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -343,7 +343,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, } void raw6_icmp_error(struct sk_buff *skb, int nexthdr, - int type, int code, int inner_offset, __be32 info) + u8 type, u8 code, int inner_offset, __be32 info) { struct sock *sk; int hash; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 658293ea05b..1473ee0a1f5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1865,7 +1865,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) * Drop the packet on the floor */ -static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes) +static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { int type; struct dst_entry *dst = skb_dst(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 53b6a4192b1..58810c65b63 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -317,7 +317,7 @@ failure: } static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 669f280989c..633ad789eff 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -124,7 +124,7 @@ drop: } static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 023beda6b22..33b59bd92c4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -312,7 +312,7 @@ csum_copy_err: } void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info, + u8 type, u8 code, int offset, __be32 info, struct udp_table *udptable) { struct ipv6_pinfo *np; @@ -346,8 +346,8 @@ out: } static __inline__ void udpv6_err(struct sk_buff *skb, - struct inet6_skb_parm *opt, int type, - int code, int offset, __be32 info ) + struct inet6_skb_parm *opt, u8 type, + u8 code, int offset, __be32 info ) { __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 
23779208c33..6bb303471e2 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -9,7 +9,7 @@ extern int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int ); extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, - int , int , int , __be32 , struct udp_table *); + u8 , u8 , int , __be32 , struct udp_table *); extern int udp_v6_get_port(struct sock *sk, unsigned short snum); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index ba162a82458..4818c48688f 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -20,7 +20,7 @@ static int udplitev6_rcv(struct sk_buff *skb) static void udplitev6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table); } diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 80193db224d..81a95c00e50 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -262,7 +262,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { /* xfrm6_tunnel native err handling */ switch (type) { diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index a63de3f7f18..6a4b1909414 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -133,7 +133,7 @@ static struct notifier_block sctp_inet6addr_notifier = { /* ICMP error handler. */ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { struct inet6_dev *idev; struct sock *sk; -- cgit v1.2.3-70-g09d2 From 788c7df451467df71638dd79a2d63d78c6e13b9c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 23 Jun 2009 13:49:05 +0100 Subject: hugetlb: fault flags instead of write_access handle_mm_fault() is now passing fault flags rather than write_access down to hugetlb_fault(), so better recognize that in hugetlb_fault(), and in hugetlb_no_page(). 
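For illustration, the idiom at each converted site changes roughly as follows (a minimal sketch; FAULT_FLAG_WRITE is the flag bit the generic fault path already passes down, and the variable names mirror the hunks below):

	/* before: a boolean write_access was threaded through the call chain */
	if (write_access && !pte_write(entry))
		ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page);

	/* after: the caller's fault flags arrive unmodified and write
	 * intent is tested explicitly against FAULT_FLAG_WRITE:
	 */
	if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry))
		ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page);
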
Signed-off-by: Hugh Dickins Acked-by: Wu Fengguang Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a05a5ef3339..2723513a565 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -33,7 +33,7 @@ void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(int, char *); unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access); + unsigned long address, unsigned int flags); int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, int acctflags); @@ -98,7 +98,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define pud_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) -#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) +#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a56e6f3ce97..d0351e31f47 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1985,7 +1985,7 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, int write_access) + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; @@ -2053,7 +2053,7 @@ retry: * any allocations necessary to record that reservation occur outside * the spinlock. */ - if (write_access && !(vma->vm_flags & VM_SHARED)) + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -2072,7 +2072,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); - if (write_access && !(vma->vm_flags & VM_SHARED)) { + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } @@ -2091,7 +2091,7 @@ backout_unlocked: } int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pte_t *ptep; pte_t entry; @@ -2112,7 +2112,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_lock(&hugetlb_instantiation_mutex); entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + ret = hugetlb_no_page(mm, vma, address, ptep, flags); goto out_mutex; } @@ -2126,7 +2126,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * page now as it is used to determine if a reservation has been * consumed. 
*/ - if (write_access && !pte_write(entry)) { + if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@ -2143,7 +2143,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page_table_lock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); @@ -2152,7 +2152,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) + if (huge_ptep_set_access_flags(vma, address, ptep, entry, + flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, entry); out_page_table_lock: -- cgit v1.2.3-70-g09d2 From a5c9b696ec109bb54d547fdb437a7a0c2d514670 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 23 Jun 2009 12:36:58 -0700 Subject: mm: pass mm to grab_swap_token If a kthread happens to use get_user_pages() on an mm (as KSM does), there's a chance that it will end up trying to read in a swap page, then oops in grab_swap_token() because the kthread has no mm: GUP passes down the right mm, so grab_swap_token() ought to be using it. We have not identified a stronger case than KSM's daemon (not yet in mainline), but the issue must have come up before, since RHEL has included a fix for this for years (though a different fix, they just back out of grab_swap_token if current->mm is unset: which is what we first proposed, but using the right mm here seems more correct). Reported-by: Izik Eidus Signed-off-by: Johannes Weiner Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 12 ++++++------ mm/memory.c | 2 +- mm/thrash.c | 32 +++++++++++++++----------------- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index c88b36665f7..7c15334f3ff 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -298,8 +298,8 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; /* linux/mm/thrash.c */ -extern struct mm_struct * swap_token_mm; -extern void grab_swap_token(void); +extern struct mm_struct *swap_token_mm; +extern void grab_swap_token(struct mm_struct *); extern void __put_swap_token(struct mm_struct *); static inline int has_swap_token(struct mm_struct *mm) @@ -419,10 +419,10 @@ static inline swp_entry_t get_swap_page(void) } /* linux/mm/thrash.c */ -#define put_swap_token(x) do { } while(0) -#define grab_swap_token() do { } while(0) -#define has_swap_token(x) 0 -#define disable_swap_token() do { } while(0) +#define put_swap_token(mm) do { } while (0) +#define grab_swap_token(mm) do { } while (0) +#define has_swap_token(mm) 0 +#define disable_swap_token() do { } while (0) static inline void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) diff --git a/mm/memory.c b/mm/memory.c index 50da9511aa7..f46ac18ba23 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2519,7 +2519,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(); /* Contend for token _before_ read-in */ + grab_swap_token(mm); /* Contend for token _before_ read-in */ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { diff --git 
a/mm/thrash.c b/mm/thrash.c index c4c5205a9c3..2372d4ed5dd 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; static unsigned int global_faults; -void grab_swap_token(void) +void grab_swap_token(struct mm_struct *mm) { int current_interval; global_faults++; - current_interval = global_faults - current->mm->faultstamp; + current_interval = global_faults - mm->faultstamp; if (!spin_trylock(&swap_token_lock)) return; /* First come first served */ if (swap_token_mm == NULL) { - current->mm->token_priority = current->mm->token_priority + 2; - swap_token_mm = current->mm; + mm->token_priority = mm->token_priority + 2; + swap_token_mm = mm; goto out; } - if (current->mm != swap_token_mm) { - if (current_interval < current->mm->last_interval) - current->mm->token_priority++; + if (mm != swap_token_mm) { + if (current_interval < mm->last_interval) + mm->token_priority++; else { - if (likely(current->mm->token_priority > 0)) - current->mm->token_priority--; + if (likely(mm->token_priority > 0)) + mm->token_priority--; } /* Check if we deserve the token */ - if (current->mm->token_priority > - swap_token_mm->token_priority) { - current->mm->token_priority += 2; - swap_token_mm = current->mm; + if (mm->token_priority > swap_token_mm->token_priority) { + mm->token_priority += 2; + swap_token_mm = mm; } } else { /* Token holder came in again! */ - current->mm->token_priority += 2; + mm->token_priority += 2; } out: - current->mm->faultstamp = global_faults; - current->mm->last_interval = current_interval; + mm->faultstamp = global_faults; + mm->last_interval = current_interval; spin_unlock(&swap_token_lock); -return; } /* Called on process exit. */ -- cgit v1.2.3-70-g09d2 From 01ff53f416757da416413bc32229770a8448b6ef Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 23 Jun 2009 12:37:01 -0700 Subject: rmap: fixup page_referenced() for nommu systems After the recent changes that went into mm/vmscan.c to overhaul stuff, we ended up with these warnings on no-mmu systems: mm/vmscan.c: In function `shrink_page_list': mm/vmscan.c:580: warning: unused variable `vm_flags' mm/vmscan.c: In function `shrink_active_list': mm/vmscan.c:1294: warning: `vm_flags' may be used uninitialized in this function mm/vmscan.c:1242: note: `vm_flags' was declared here This is because the no-mmu function defines page_referenced() to work on the first argument only (the page). It does not clear the vm_flags given to it because for no-mmu systems, they never actually get utilized. Since that is no longer strictly true, we need to set vm_flags to 0 like everyone else so gcc can do proper dead code elimination without annoying us with unused warnings. 
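The troublesome call sites look roughly like this (a sketch of the shrink_active_list() pattern; the exact surrounding logic in mm/vmscan.c differs slightly):

	unsigned long vm_flags;

	/* With the old no-mmu macro only 'page' was consumed, so gcc could
	 * not prove that vm_flags had been written before this read:
	 */
	if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags) &&
	    (vm_flags & VM_EXEC))
		list_add(&page->lru, &l_active);

Turning the macro into a static inline that stores 0 through *vm_flags gives the compiler a definite initialization on no-mmu builds as well, without changing behaviour for any existing caller.
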
Signed-off-by: Mike Frysinger Cc: David Howells Acked-by: David McCullough Cc: Greg Ungerer Cc: Paul Mundt Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 216d024f830..bf116d0dbf2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -118,7 +118,14 @@ int try_to_munlock(struct page *); #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) -#define page_referenced(page, locked, cnt, flags) TestClearPageReferenced(page) +static inline int page_referenced(struct page *page, int is_locked, + struct mem_cgroup *cnt, + unsigned long *vm_flags) +{ + *vm_flags = 0; + return TestClearPageReferenced(page); +} + #define try_to_unmap(page, refs) SWAP_FAIL static inline int page_mkclean(struct page *page) -- cgit v1.2.3-70-g09d2 From f007e99c8e2e322b8331aba72414715119a2920d Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Sat, 23 May 2009 00:41:15 +0800 Subject: Intel-IOMMU, intr-remap: source-id checking To support domain-isolation usages, the platform hardware must be capable of uniquely identifying the requestor (source-id) for each interrupt message. Without source-id checking for interrupt remapping, a rogue guest/VM with assigned devices can launch interrupt attacks to bring down another guest/VM or the VMM itself. This patch adds source-id checking for interrupt remapping, and thereby actually isolates interrupts for guests/VMs with assigned devices. Because the PCI subsystem is not yet initialized when IOAPIC entries are set up, use read_pci_config_byte to access PCI config space directly. Signed-off-by: Weidong Han Signed-off-by: David Woodhouse --- arch/x86/kernel/apic/io_apic.c | 6 +++ drivers/pci/intr_remapping.c | 120 +++++++++++++++++++++++++++++++++++++++-- drivers/pci/intr_remapping.h | 2 + include/linux/dmar.h | 11 ++++ 4 files changed, 136 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b7a79207295..4d0216fcb36 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1414,6 +1414,9 @@ int setup_ioapic_entry(int apic_id, int irq, irte.vector = vector; irte.dest_id = IRTE_DEST(destination); + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, apic_id); + modify_irte(irq, &irte); ir_entry->index2 = (index >> 15) & 0x1; @@ -3290,6 +3293,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); + /* Set source-id of interrupt request */ + set_msi_sid(&irte, pdev); + modify_irte(irq, &irte); msg->address_hi = MSI_ADDR_BASE_HI; diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 44025a0c2bb..4f5b8712931 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -10,6 +10,8 @@ #include #include "intr_remapping.h" #include +#include +#include "pci.h" static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; static int ir_ioapic_num; @@ -418,6 +420,91 @@ int free_irte(int irq) return rc; } +/* + * source validation type + */ +#define SVT_NO_VERIFY 0x0 /* no verification is required */ +#define SVT_VERIFY_SID_SQ 0x1 /* verify using SID and SQ fields */ +#define SVT_VERIFY_BUS 0x2 /* verify bus of request-id */ + +/* + * source-id qualifier + */ +#define SQ_ALL_16 0x0 /* verify all 16 bits of request-id */ +#define 
SQ_13_IGNORE_1 0x1 /* verify most significant 13 bits, ignore + * the third least significant bit + */ +#define SQ_13_IGNORE_2 0x2 /* verify most significant 13 bits, ignore + * the second and third least significant bits + */ +#define SQ_13_IGNORE_3 0x3 /* verify most significant 13 bits, ignore + * the least three significant bits + */ + +/* + * set SVT, SQ and SID fields of irte to verify + * source ids of interrupt requests + */ +static void set_irte_sid(struct irte *irte, unsigned int svt, + unsigned int sq, unsigned int sid) +{ + irte->svt = svt; + irte->sq = sq; + irte->sid = sid; +} + +int set_ioapic_sid(struct irte *irte, int apic) +{ + int i; + u16 sid = 0; + + if (!irte) + return -1; + + for (i = 0; i < MAX_IO_APICS; i++) { + if (ir_ioapic[i].id == apic) { + sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; + break; + } + } + + if (sid == 0) { + pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic); + return -1; + } + + set_irte_sid(irte, 1, 0, sid); + + return 0; +} + +int set_msi_sid(struct irte *irte, struct pci_dev *dev) +{ + struct pci_dev *bridge; + + if (!irte || !dev) + return -1; + + /* PCIe device or Root Complex integrated PCI device */ + if (dev->is_pcie || !dev->bus->parent) { + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, + (dev->bus->number << 8) | dev->devfn); + return 0; + } + + bridge = pci_find_upstream_pcie_bridge(dev); + if (bridge) { + if (bridge->is_pcie) /* this is a PCIE-to-PCI/PCIX bridge */ + set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16, + (bridge->bus->number << 8) | dev->bus->number); + else /* this is a legacy PCI bridge */ + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, + (bridge->bus->number << 8) | bridge->devfn); + } + + return 0; +} + static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) { u64 addr; @@ -624,6 +711,35 @@ error: return -1; } +static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope, + struct intel_iommu *iommu) +{ + struct acpi_dmar_pci_path *path; + u8 bus; + int count; + + bus = scope->bus; + path = (struct acpi_dmar_pci_path *)(scope + 1); + count = (scope->length - sizeof(struct acpi_dmar_device_scope)) + / sizeof(struct acpi_dmar_pci_path); + + while (--count > 0) { + /* + * Access PCI directly due to the PCI + * subsystem isn't initialized yet. + */ + bus = read_pci_config_byte(bus, path->dev, path->fn, + PCI_SECONDARY_BUS); + path++; + } + + ir_ioapic[ir_ioapic_num].bus = bus; + ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn); + ir_ioapic[ir_ioapic_num].iommu = iommu; + ir_ioapic[ir_ioapic_num].id = scope->enumeration_id; + ir_ioapic_num++; +} + static int ir_parse_ioapic_scope(struct acpi_dmar_header *header, struct intel_iommu *iommu) { @@ -648,9 +764,7 @@ static int ir_parse_ioapic_scope(struct acpi_dmar_header *header, " 0x%Lx\n", scope->enumeration_id, drhd->address); - ir_ioapic[ir_ioapic_num].iommu = iommu; - ir_ioapic[ir_ioapic_num].id = scope->enumeration_id; - ir_ioapic_num++; + ir_parse_one_ioapic_scope(scope, iommu); } start += scope->length; } diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h index ca48f0df8ac..63a263c1841 100644 --- a/drivers/pci/intr_remapping.h +++ b/drivers/pci/intr_remapping.h @@ -3,6 +3,8 @@ struct ioapic_scope { struct intel_iommu *iommu; unsigned int id; + unsigned int bus; /* PCI bus number */ + unsigned int devfn; /* PCI devfn number */ }; #define IR_X2APIC_MODE(mode) (mode ? 
(1 << 11) : 0) diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 1731fb5fd77..4a2b162c256 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -126,6 +126,8 @@ extern int free_irte(int irq); extern int irq_remapped(int irq); extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev); extern struct intel_iommu *map_ioapic_to_ir(int apic); +extern int set_ioapic_sid(struct irte *irte, int apic); +extern int set_msi_sid(struct irte *irte, struct pci_dev *dev); #else static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) { @@ -156,6 +158,15 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic) { return NULL; } +static inline int set_ioapic_sid(struct irte *irte, int apic) +{ + return 0; +} +static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev) +{ + return 0; +} + #define irq_remapped(irq) (0) #define enable_intr_remapping(mode) (-1) #define disable_intr_remapping() (0) -- cgit v1.2.3-70-g09d2 From 9d9609851003ebed15957f0f2ce18492739ee124 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:37 -0400 Subject: Audit: clean up all op= output to include string quoting A number of places in the audit system we send an op= followed by a string that includes spaces. Somehow this works but it's just wrong. This patch moves all of those that I could find to be quoted. Example: Change From: type=CONFIG_CHANGE msg=audit(1244666690.117:31): auid=0 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 op=remove rule key="number2" list=4 res=0 Change To: type=CONFIG_CHANGE msg=audit(1244666690.117:31): auid=0 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 op="remove rule" key="number2" list=4 res=0 Signed-off-by: Eric Paris --- include/linux/audit.h | 3 +++ kernel/audit.c | 9 +++++++++ kernel/audit_tree.c | 10 ++++------ kernel/audit_watch.c | 6 +----- kernel/auditfilter.c | 12 +++++------- kernel/auditsc.c | 8 ++------ 6 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 4fa2810b675..3c7a358241a 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -599,6 +599,8 @@ extern void audit_log_untrustedstring(struct audit_buffer *ab, extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, struct path *path); +extern void audit_log_key(struct audit_buffer *ab, + char *key); extern void audit_log_lost(const char *message); extern int audit_update_lsm_rules(void); @@ -621,6 +623,7 @@ extern int audit_enabled; #define audit_log_n_untrustedstring(a,n,s) do { ; } while (0) #define audit_log_untrustedstring(a,s) do { ; } while (0) #define audit_log_d_path(b, p, d) do { ; } while (0) +#define audit_log_key(b, k) do { ; } while (0) #define audit_enabled 0 #endif #endif diff --git a/kernel/audit.c b/kernel/audit.c index e07ad2340db..6194c50e203 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1450,6 +1450,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, kfree(pathname); } +void audit_log_key(struct audit_buffer *ab, char *key) +{ + audit_log_format(ab, " key="); + if (key) + audit_log_untrustedstring(ab, key); + else + audit_log_format(ab, "(null)"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 1f6396d7668..3ff0731284a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -441,13 +441,11 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a 
half-baked one */ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule dir="); + audit_log_format(ab, "op="); + audit_log_string(ab, "remove rule"); + audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); - if (rule->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, rule->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=1", rule->listnr); audit_log_end(ab); rule->tree = NULL; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index b49ab019fdf..0e96dbc60ea 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -234,11 +234,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc audit_log_string(ab, op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, w->path); - if (r->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, r->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, r->filterkey); audit_log_format(ab, " list=%d res=1", r->listnr); audit_log_end(ab); } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 21b623595aa..a70604047f3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1079,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, security_release_secctx(ctx, len); } } - audit_log_format(ab, " op=%s rule key=", action); - if (rule->filterkey) - audit_log_untrustedstring(ab, rule->filterkey); - else - audit_log_format(ab, "(null)"); + audit_log_format(ab, " op="); + audit_log_string(ab, action); + audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); } @@ -1147,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return PTR_ERR(entry); err = audit_add_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "add", + audit_log_rule_change(loginuid, sessionid, sid, "add rule", &entry->rule, !err); if (err) @@ -1163,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return PTR_ERR(entry); err = audit_del_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "remove", + audit_log_rule_change(loginuid, sessionid, sid, "remove rule", &entry->rule, !err); audit_free_rule(entry); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 0b862cac6ca..2de95d1582b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1137,7 +1137,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (has_cntl) audit_log_n_hex(*ab, buf, to_send); else - audit_log_format(*ab, "\"%s\"", buf); + audit_log_string(*ab, buf); p += to_send; len_left -= to_send; @@ -1372,11 +1372,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_task_info(ab, tsk); - if (context->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, context->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, context->filterkey); audit_log_end(ab); for (aux = context->aux; aux; aux = aux->next) { -- cgit v1.2.3-70-g09d2 From 3e63cbb1efca7dd3137de1bb475e2e068e38ef23 Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Fri, 19 Jun 2009 14:28:07 -0400 Subject: fs: Add new pre-allocation ioctls to vfs for compatibility with legacy xfs ioctls This patch adds ioctls to vfs for compatibility with legacy XFS pre-allocation ioctls 
(XFS_IOC_*RESVP*). The implementation effectively invokes sys_fallocate for the new ioctls. Also handles the compat_ioctl case. Note: These legacy ioctls are also implemented by OCFS2. [AV: folded fixes from hch] Signed-off-by: Ankit Jain Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/compat_ioctl.c | 48 +++++++++++++++++++++++++++++++++++++++++ fs/ioctl.c | 35 ++++++++++++++++++++++++++++++ fs/open.c | 58 +++++++++++++++++++++++++------------------------- include/linux/falloc.h | 21 ++++++++++++++++++ include/linux/fs.h | 6 ++++++ 5 files changed, 139 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c135202c38b..626c7483b4d 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -1779,6 +1780,41 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) return sys_ioctl(fd, cmd, (unsigned long)tn); } +/* on ia32 l_start is on a 32-bit boundary */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +struct space_resv_32 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +}; + +#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32) +#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) + +/* just account for different alignment */ +static int compat_ioctl_preallocate(struct file *file, unsigned long arg) +{ + struct space_resv_32 __user *p32 = (void __user *)arg; + struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); + + if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || + copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || + copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || + copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || + copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || + copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || + copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) + return -EFAULT; + + return ioctl_preallocate(file, p); +} +#endif + typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int, unsigned long, struct file *); @@ -2756,6 +2792,18 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, case FIOQSIZE: break; +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) + case FS_IOC_RESVSP_32: + case FS_IOC_RESVSP64_32: + error = compat_ioctl_preallocate(filp, arg); + goto out_fput; +#else + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + error = ioctl_preallocate(filp, (void __user *)arg); + goto out_fput; +#endif + case FIBMAP: case FIGETBSZ: case FIONREAD: diff --git a/fs/ioctl.c b/fs/ioctl.c index 001f8d3118f..5612880fcbe 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -403,6 +404,37 @@ EXPORT_SYMBOL(generic_block_fiemap); #endif /* CONFIG_BLOCK */ +/* + * This provides compatibility with legacy XFS pre-allocation ioctls + * which predate the fallocate syscall. + * + * Only the l_start, l_len and l_whence fields of the 'struct space_resv' + * are used here, rest are ignored. 
+ */ +int ioctl_preallocate(struct file *filp, void __user *argp) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct space_resv sr; + + if (copy_from_user(&sr, argp, sizeof(sr))) + return -EFAULT; + + switch (sr.l_whence) { + case SEEK_SET: + break; + case SEEK_CUR: + sr.l_start += filp->f_pos; + break; + case SEEK_END: + sr.l_start += i_size_read(inode); + break; + default: + return -EINVAL; + } + + return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); +} + static int file_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -414,6 +446,9 @@ static int file_ioctl(struct file *filp, unsigned int cmd, return ioctl_fibmap(filp, p); case FIONREAD: return put_user(i_size_read(inode) - filp->f_pos, p); + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + return ioctl_preallocate(filp, p); } return vfs_ioctl(filp, cmd, arg); diff --git a/fs/open.c b/fs/open.c index 7200e23d925..dd98e807602 100644 --- a/fs/open.c +++ b/fs/open.c @@ -378,63 +378,63 @@ SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64); #endif #endif /* BITS_PER_LONG == 32 */ -SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) + +int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct file *file; - struct inode *inode; - long ret = -EINVAL; + struct inode *inode = file->f_path.dentry->d_inode; + long ret; if (offset < 0 || len <= 0) - goto out; + return -EINVAL; /* Return error if mode is not supported */ - ret = -EOPNOTSUPP; if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) - goto out; + return -EOPNOTSUPP; - ret = -EBADF; - file = fget(fd); - if (!file) - goto out; if (!(file->f_mode & FMODE_WRITE)) - goto out_fput; + return -EBADF; /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ ret = security_file_permission(file, MAY_WRITE); if (ret) - goto out_fput; + return ret; - inode = file->f_path.dentry->d_inode; - - ret = -ESPIPE; if (S_ISFIFO(inode->i_mode)) - goto out_fput; + return -ESPIPE; - ret = -ENODEV; /* * Let individual file system decide if it supports preallocation * for directories or not. */ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) - goto out_fput; + return -ENODEV; - ret = -EFBIG; /* Check for wrap through zero too */ if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) - goto out_fput; + return -EFBIG; - if (inode->i_op->fallocate) - ret = inode->i_op->fallocate(inode, mode, offset, len); - else - ret = -EOPNOTSUPP; + if (!inode->i_op->fallocate) + return -EOPNOTSUPP; -out_fput: - fput(file); -out: - return ret; + return inode->i_op->fallocate(inode, mode, offset, len); } + +SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) +{ + struct file *file; + int error = -EBADF; + + file = fget(fd); + if (file) { + error = do_fallocate(file, mode, offset, len); + fput(file); + } + + return error; +} + #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) { diff --git a/include/linux/falloc.h b/include/linux/falloc.h index 8e912ab6a07..3c155107d61 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -3,4 +3,25 @@ #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ +#ifdef __KERNEL__ + +/* + * Space reservation ioctls and argument structure + * are designed to be compatible with the legacy XFS ioctls. 
+ */ +struct space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserved area */ +}; + +#define FS_IOC_RESVSP _IOW('X', 40, struct space_resv) +#define FS_IOC_RESVSP64 _IOW('X', 42, struct space_resv) + +#endif /* __KERNEL__ */ + #endif /* _FALLOC_H_ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ff5e4e0195..79e302ddde0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1906,6 +1906,8 @@ static inline int break_lease(struct inode *inode, unsigned int mode) extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); +extern int do_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); extern long do_sys_open(int dfd, const char __user *filename, int flags, int mode); extern struct file *filp_open(const char *, int, int); @@ -1914,6 +1916,10 @@ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, extern int filp_close(struct file *, fl_owner_t id); extern char * getname(const char __user *); +/* fs/ioctl.c */ + +extern int ioctl_preallocate(struct file *filp, void __user *argp); + /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(unsigned long); -- cgit v1.2.3-70-g09d2 From f19d4a8fa6f9b6ccf54df0971c97ffcaa390b7b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 19:50:45 -0400 Subject: add caching of ACLs in struct inode No helpers, no conversions yet. Signed-off-by: Al Viro --- fs/inode.c | 10 ++++++++++ include/linux/fs.h | 7 +++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index f643be565df..e193cd592fa 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -189,6 +190,9 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) } inode->i_private = NULL; inode->i_mapping = mapping; +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; +#endif #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; @@ -227,6 +231,12 @@ void destroy_inode(struct inode *inode) ima_inode_free(inode); security_inode_free(inode); fsnotify_inode_delete(inode); +#ifdef CONFIG_FS_POSIX_ACL + if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) + posix_acl_release(inode->i_acl); + if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) + posix_acl_release(inode->i_default_acl); +#endif if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else diff --git a/include/linux/fs.h b/include/linux/fs.h index 79e302ddde0..0872372184f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -710,6 +710,9 @@ static inline int mapping_writably_mapped(struct address_space *mapping) #define i_size_ordered_init(inode) do { } while (0) #endif +struct posix_acl; +#define ACL_NOT_CACHED ((void *)(-1)) + struct inode { struct hlist_node i_hash; struct list_head i_list; @@ -772,6 +775,10 @@ struct inode { atomic_t i_writecount; #ifdef CONFIG_SECURITY void *i_security; +#endif +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; #endif void *i_private; /* fs or device private pointer */ }; -- cgit v1.2.3-70-g09d2 From 6582a0e6f6bc7bf64817b9e1a424782855292ab0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 19:53:58 -0400 Subject: switch ext3 to inode->i_acl Signed-off-by: Al 
Viro --- fs/ext3/acl.c | 22 ++++++++++------------ fs/ext3/acl.h | 4 ---- fs/ext3/inode.c | 4 ---- fs/ext3/super.c | 16 ---------------- include/linux/ext3_fs_i.h | 4 ---- 5 files changed, 10 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e0c74545171..a9707689d9e 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -134,7 +134,7 @@ ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl) if (acl) { spin_lock(&inode->i_lock); acl = *i_acl; - if (acl != EXT3_ACL_NOT_CACHED) + if (acl != ACL_NOT_CACHED) acl = posix_acl_dup(acl); spin_unlock(&inode->i_lock); } @@ -147,7 +147,7 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl) { spin_lock(&inode->i_lock); - if (*i_acl != EXT3_ACL_NOT_CACHED) + if (*i_acl != ACL_NOT_CACHED) posix_acl_release(*i_acl); *i_acl = posix_acl_dup(acl); spin_unlock(&inode->i_lock); @@ -161,7 +161,6 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl, static struct posix_acl * ext3_get_acl(struct inode *inode, int type) { - struct ext3_inode_info *ei = EXT3_I(inode); int name_index; char *value = NULL; struct posix_acl *acl; @@ -172,15 +171,15 @@ ext3_get_acl(struct inode *inode, int type) switch(type) { case ACL_TYPE_ACCESS: - acl = ext3_iget_acl(inode, &ei->i_acl); - if (acl != EXT3_ACL_NOT_CACHED) + acl = ext3_iget_acl(inode, &inode->i_acl); + if (acl != ACL_NOT_CACHED) return acl; name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - acl = ext3_iget_acl(inode, &ei->i_default_acl); - if (acl != EXT3_ACL_NOT_CACHED) + acl = ext3_iget_acl(inode, &inode->i_default_acl); + if (acl != ACL_NOT_CACHED) return acl; name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; break; @@ -206,11 +205,11 @@ ext3_get_acl(struct inode *inode, int type) if (!IS_ERR(acl)) { switch(type) { case ACL_TYPE_ACCESS: - ext3_iset_acl(inode, &ei->i_acl, acl); + ext3_iset_acl(inode, &inode->i_acl, acl); break; case ACL_TYPE_DEFAULT: - ext3_iset_acl(inode, &ei->i_default_acl, acl); + ext3_iset_acl(inode, &inode->i_default_acl, acl); break; } } @@ -226,7 +225,6 @@ static int ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { - struct ext3_inode_info *ei = EXT3_I(inode); int name_index; void *value = NULL; size_t size = 0; @@ -274,11 +272,11 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, if (!error) { switch(type) { case ACL_TYPE_ACCESS: - ext3_iset_acl(inode, &ei->i_acl, acl); + ext3_iset_acl(inode, &inode->i_acl, acl); break; case ACL_TYPE_DEFAULT: - ext3_iset_acl(inode, &ei->i_default_acl, acl); + ext3_iset_acl(inode, &inode->i_default_acl, acl); break; } } diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 42da16b8cac..07d15a3a596 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -53,10 +53,6 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL -/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl - if the ACL has not been cached */ -#define EXT3_ACL_NOT_CACHED ((void *)-1) - /* acl.c */ extern int ext3_permission (struct inode *, int); extern int ext3_acl_chmod (struct inode *); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 05dea8132fc..5f51fed5c75 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2752,10 +2752,6 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT3_I(inode); -#ifdef CONFIG_EXT3_FS_POSIX_ACL - ei->i_acl = EXT3_ACL_NOT_CACHED; - ei->i_default_acl = EXT3_ACL_NOT_CACHED; -#endif ei->i_block_alloc_info = NULL; 
ret = __ext3_get_inode_loc(inode, &iloc, 0); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 601e881e610..524b349c629 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -464,10 +464,6 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS); if (!ei) return NULL; -#ifdef CONFIG_EXT3_FS_POSIX_ACL - ei->i_acl = EXT3_ACL_NOT_CACHED; - ei->i_default_acl = EXT3_ACL_NOT_CACHED; -#endif ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; return &ei->vfs_inode; @@ -518,18 +514,6 @@ static void destroy_inodecache(void) static void ext3_clear_inode(struct inode *inode) { struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; -#ifdef CONFIG_EXT3_FS_POSIX_ACL - if (EXT3_I(inode)->i_acl && - EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) { - posix_acl_release(EXT3_I(inode)->i_acl); - EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED; - } - if (EXT3_I(inode)->i_default_acl && - EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) { - posix_acl_release(EXT3_I(inode)->i_default_acl); - EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED; - } -#endif ext3_discard_reservation(inode); EXT3_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h index 7894dd0f3b7..ca1bfe90004 100644 --- a/include/linux/ext3_fs_i.h +++ b/include/linux/ext3_fs_i.h @@ -103,10 +103,6 @@ struct ext3_inode_info { */ struct rw_semaphore xattr_sem; #endif -#ifdef CONFIG_EXT3_FS_POSIX_ACL - struct posix_acl *i_acl; - struct posix_acl *i_default_acl; -#endif struct list_head i_orphan; /* unlinked but open inodes */ -- cgit v1.2.3-70-g09d2 From 7a77b15d9294749809de918e24bebc39e0fbc9ab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 21:01:13 -0400 Subject: switch reiserfs to usual conventions for caching ACLs Signed-off-by: Al Viro --- fs/reiserfs/super.c | 12 ++++++------ fs/reiserfs/xattr_acl.c | 21 ++++++++------------- include/linux/reiserfs_acl.h | 4 ++-- 3 files changed, 16 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 2969773cfc2..b194451fc04 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -530,8 +530,8 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_prealloc_list); inode_init_once(&ei->vfs_inode); #ifdef CONFIG_REISERFS_FS_POSIX_ACL - ei->i_acl_access = NULL; - ei->i_acl_default = NULL; + ei->i_acl_access = ACL_NOT_CACHED; + ei->i_acl_default = ACL_NOT_CACHED; #endif } @@ -586,14 +586,14 @@ static void reiserfs_clear_inode(struct inode *inode) struct posix_acl *acl; acl = REISERFS_I(inode)->i_acl_access; - if (acl && !IS_ERR(acl)) + if (acl && acl != ACL_NOT_CACHED) posix_acl_release(acl); - REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_access = ACL_NOT_CACHED; acl = REISERFS_I(inode)->i_acl_default; - if (acl && !IS_ERR(acl)) + if (acl && acl != ACL_NOT_CACHED) posix_acl_release(acl); - REISERFS_I(inode)->i_acl_default = NULL; + REISERFS_I(inode)->i_acl_default = ACL_NOT_CACHED; } #else #define reiserfs_clear_inode NULL diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index a1a7e3530e1..7b3aeb9327d 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -192,19 +192,19 @@ static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl) { spin_lock(&inode->i_lock); - if (*i_acl != ERR_PTR(-ENODATA)) + if (*i_acl != ACL_NOT_CACHED) posix_acl_release(*i_acl); - *i_acl = acl ? 
posix_acl_dup(acl) : ERR_PTR(-ENODATA); + *i_acl = posix_acl_dup(acl); spin_unlock(&inode->i_lock); } static inline struct posix_acl *iget_acl(struct inode *inode, struct posix_acl **i_acl) { - struct posix_acl *acl = ERR_PTR(-ENODATA); + struct posix_acl *acl = ACL_NOT_CACHED; spin_lock(&inode->i_lock); - if (*i_acl != ERR_PTR(-ENODATA)) + if (*i_acl != ACL_NOT_CACHED) acl = posix_acl_dup(*i_acl); spin_unlock(&inode->i_lock); @@ -239,15 +239,13 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) } acl = iget_acl(inode, p_acl); - if (acl && !IS_ERR(acl)) + if (acl != ACL_NOT_CACHED) return acl; - else if (PTR_ERR(acl) == -ENODATA) - return NULL; size = reiserfs_xattr_get(inode, name, NULL, 0); if (size < 0) { if (size == -ENODATA || size == -ENOSYS) { - *p_acl = ERR_PTR(-ENODATA); + *p_acl = NULL; return NULL; } return ERR_PTR(size); @@ -262,7 +260,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) /* This shouldn't actually happen as it should have been caught above.. but just in case */ acl = NULL; - *p_acl = ERR_PTR(-ENODATA); + *p_acl = acl; } else if (retval < 0) { acl = ERR_PTR(retval); } else { @@ -379,11 +377,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, } acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) { - if (PTR_ERR(acl) == -ENODATA) - goto apply_umask; + if (IS_ERR(acl)) return PTR_ERR(acl); - } if (acl) { struct posix_acl *acl_copy; diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index 8cc65757e47..8f4d8d718b1 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -58,12 +58,12 @@ extern struct xattr_handler reiserfs_posix_acl_access_handler; static inline void reiserfs_init_acl_access(struct inode *inode) { - REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_access = ACL_NOT_CACHED; } static inline void reiserfs_init_acl_default(struct inode *inode) { - REISERFS_I(inode)->i_acl_default = NULL; + REISERFS_I(inode)->i_acl_default = ACL_NOT_CACHED; } #else -- cgit v1.2.3-70-g09d2 From 281eede0328c84a8f20e0e85b807d5b51c3de4f2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 21:07:04 -0400 Subject: switch reiserfs to inode->i_acl Signed-off-by: Al Viro --- fs/reiserfs/inode.c | 4 ---- fs/reiserfs/super.c | 24 ------------------------ fs/reiserfs/xattr_acl.c | 10 ++++------ include/linux/reiserfs_acl.h | 17 ----------------- include/linux/reiserfs_fs_i.h | 4 ---- 5 files changed, 4 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6fd0f47e45d..a14d6cd9eed 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1131,8 +1131,6 @@ static void init_inode(struct inode *inode, struct treepath *path) REISERFS_I(inode)->i_trans_id = 0; REISERFS_I(inode)->i_jl = NULL; mutex_init(&(REISERFS_I(inode)->i_mmap)); - reiserfs_init_acl_access(inode); - reiserfs_init_acl_default(inode); reiserfs_init_xattr_rwsem(inode); if (stat_data_v1(ih)) { @@ -1834,8 +1832,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); mutex_init(&(REISERFS_I(inode)->i_mmap)); - reiserfs_init_acl_access(inode); - reiserfs_init_acl_default(inode); reiserfs_init_xattr_rwsem(inode); /* key to search for correct place for new stat data */ diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b194451fc04..d3aeb061612 100644 --- a/fs/reiserfs/super.c +++ 
b/fs/reiserfs/super.c @@ -529,10 +529,6 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_prealloc_list); inode_init_once(&ei->vfs_inode); -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - ei->i_acl_access = ACL_NOT_CACHED; - ei->i_acl_default = ACL_NOT_CACHED; -#endif } static int init_inodecache(void) @@ -580,25 +576,6 @@ static void reiserfs_dirty_inode(struct inode *inode) reiserfs_write_unlock(inode->i_sb); } -#ifdef CONFIG_REISERFS_FS_POSIX_ACL -static void reiserfs_clear_inode(struct inode *inode) -{ - struct posix_acl *acl; - - acl = REISERFS_I(inode)->i_acl_access; - if (acl && acl != ACL_NOT_CACHED) - posix_acl_release(acl); - REISERFS_I(inode)->i_acl_access = ACL_NOT_CACHED; - - acl = REISERFS_I(inode)->i_acl_default; - if (acl && acl != ACL_NOT_CACHED) - posix_acl_release(acl); - REISERFS_I(inode)->i_acl_default = ACL_NOT_CACHED; -} -#else -#define reiserfs_clear_inode NULL -#endif - #ifdef CONFIG_QUOTA static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -612,7 +589,6 @@ static const struct super_operations reiserfs_sops = { .write_inode = reiserfs_write_inode, .dirty_inode = reiserfs_dirty_inode, .delete_inode = reiserfs_delete_inode, - .clear_inode = reiserfs_clear_inode, .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, .sync_fs = reiserfs_sync_fs, diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 7b3aeb9327d..b6e473faa8b 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -223,16 +223,15 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) struct posix_acl *acl, **p_acl; int size; int retval; - struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; - p_acl = &reiserfs_i->i_acl_access; + p_acl = &inode->i_acl; break; case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &reiserfs_i->i_acl_default; + p_acl = &inode->i_default_acl; break; default: return ERR_PTR(-EINVAL); @@ -288,7 +287,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, struct posix_acl **p_acl; size_t size = 0; int error; - struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -296,7 +294,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; - p_acl = &reiserfs_i->i_acl_access; + p_acl = &inode->i_acl; if (acl) { mode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); @@ -311,7 +309,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, break; case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &reiserfs_i->i_acl_default; + p_acl = &inode->i_default_acl; if (!S_ISDIR(inode->i_mode)) return acl ? 
-EACCES : 0; break; diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index 8f4d8d718b1..b4448853900 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -56,15 +56,6 @@ int reiserfs_cache_default_acl(struct inode *dir); extern struct xattr_handler reiserfs_posix_acl_default_handler; extern struct xattr_handler reiserfs_posix_acl_access_handler; -static inline void reiserfs_init_acl_access(struct inode *inode) -{ - REISERFS_I(inode)->i_acl_access = ACL_NOT_CACHED; -} - -static inline void reiserfs_init_acl_default(struct inode *inode) -{ - REISERFS_I(inode)->i_acl_default = ACL_NOT_CACHED; -} #else #define reiserfs_cache_default_acl(inode) 0 @@ -86,12 +77,4 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, { return 0; } - -static inline void reiserfs_init_acl_access(struct inode *inode) -{ -} - -static inline void reiserfs_init_acl_default(struct inode *inode) -{ -} #endif diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h index 76360b36ac3..89f4d3abbf5 100644 --- a/include/linux/reiserfs_fs_i.h +++ b/include/linux/reiserfs_fs_i.h @@ -54,10 +54,6 @@ struct reiserfs_inode_info { unsigned int i_trans_id; struct reiserfs_journal_list *i_jl; struct mutex i_mmap; -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - struct posix_acl *i_acl_access; - struct posix_acl *i_acl_default; -#endif #ifdef CONFIG_REISERFS_FS_XATTR struct rw_semaphore i_xattr_sem; #endif -- cgit v1.2.3-70-g09d2 From 06b16e9f68edaa1e71aee943d3c030bcf7380af1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 19:56:00 -0400 Subject: switch shmem to inode->i_acl Signed-off-by: Al Viro --- include/linux/shmem_fs.h | 8 -------- mm/shmem.c | 9 ++++----- mm/shmem_acl.c | 29 ++++++----------------------- 3 files changed, 10 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index fd83f2584b1..abff6c9b413 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -19,10 +19,6 @@ struct shmem_inode_info { swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ struct list_head swaplist; /* chain of maybes on swap */ struct inode vfs_inode; -#ifdef CONFIG_TMPFS_POSIX_ACL - struct posix_acl *i_acl; - struct posix_acl *i_default_acl; -#endif }; struct shmem_sb_info { @@ -45,7 +41,6 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) #ifdef CONFIG_TMPFS_POSIX_ACL int shmem_permission(struct inode *, int); int shmem_acl_init(struct inode *, struct inode *); -void shmem_acl_destroy_inode(struct inode *); extern struct xattr_handler shmem_xattr_acl_access_handler; extern struct xattr_handler shmem_xattr_acl_default_handler; @@ -57,9 +52,6 @@ static inline int shmem_acl_init(struct inode *inode, struct inode *dir) { return 0; } -static inline void shmem_acl_destroy_inode(struct inode *inode) -{ -} #endif /* CONFIG_TMPFS_POSIX_ACL */ #endif diff --git a/mm/shmem.c b/mm/shmem.c index e89d7ec18ed..5f2019fc789 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2379,6 +2379,10 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); if (!p) return NULL; +#ifdef CONFIG_TMPFS_POSIX_ACL + p->vfs_inode.i_acl = NULL; + p->vfs_inode.i_default_acl = NULL; +#endif return &p->vfs_inode; } @@ -2388,7 +2392,6 @@ static void shmem_destroy_inode(struct inode *inode) /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - 
shmem_acl_destroy_inode(inode); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -2397,10 +2400,6 @@ static void init_once(void *foo) struct shmem_inode_info *p = (struct shmem_inode_info *) foo; inode_init_once(&p->vfs_inode); -#ifdef CONFIG_TMPFS_POSIX_ACL - p->i_acl = NULL; - p->i_default_acl = NULL; -#endif } static int init_inodecache(void) diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 8e5aadd7dcd..606a8e757a4 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -22,11 +22,11 @@ shmem_get_acl(struct inode *inode, int type) spin_lock(&inode->i_lock); switch(type) { case ACL_TYPE_ACCESS: - acl = posix_acl_dup(SHMEM_I(inode)->i_acl); + acl = posix_acl_dup(inode->i_acl); break; case ACL_TYPE_DEFAULT: - acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); + acl = posix_acl_dup(inode->i_default_acl); break; } spin_unlock(&inode->i_lock); @@ -45,13 +45,13 @@ shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) spin_lock(&inode->i_lock); switch(type) { case ACL_TYPE_ACCESS: - free = SHMEM_I(inode)->i_acl; - SHMEM_I(inode)->i_acl = posix_acl_dup(acl); + free = inode->i_acl; + inode->i_acl = posix_acl_dup(acl); break; case ACL_TYPE_DEFAULT: - free = SHMEM_I(inode)->i_default_acl; - SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); + free = inode->i_default_acl; + inode->i_default_acl = posix_acl_dup(acl); break; } spin_unlock(&inode->i_lock); @@ -154,23 +154,6 @@ shmem_acl_init(struct inode *inode, struct inode *dir) return generic_acl_init(inode, dir, &shmem_acl_ops); } -/** - * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode - * - * This is done before destroying the actual inode. - */ - -void -shmem_acl_destroy_inode(struct inode *inode) -{ - if (SHMEM_I(inode)->i_acl) - posix_acl_release(SHMEM_I(inode)->i_acl); - SHMEM_I(inode)->i_acl = NULL; - if (SHMEM_I(inode)->i_default_acl) - posix_acl_release(SHMEM_I(inode)->i_default_acl); - SHMEM_I(inode)->i_default_acl = NULL; -} - /** * shmem_check_acl - check_acl() callback for generic_permission() */ -- cgit v1.2.3-70-g09d2 From 073aaa1b142461d91f83da66db1184d7c1b1edea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Jun 2009 12:11:54 -0400 Subject: helpers for acl caching + switch to those helpers: get_cached_acl(inode, type), set_cached_acl(inode, type, acl), forget_cached_acl(inode, type). ubifs/xattr.c needed includes reordered, the rest is a plain switchover. 
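For illustration, after this switchover a filesystem's ->get_acl() collapses to one common shape. A minimal sketch, assuming a hypothetical myfs_read_acl_xattr() standing in for the filesystem's own xattr lookup (that name is invented for this note, not part of the patch):

	static struct posix_acl *myfs_get_acl(struct inode *inode, int type)
	{
		struct posix_acl *acl;

		/* Fast path: take a reference to the cached ACL, if any. */
		acl = get_cached_acl(inode, type);
		if (acl != ACL_NOT_CACHED)
			return acl;

		/* Slow path: read from backing store, then cache the result. */
		acl = myfs_read_acl_xattr(inode, type);	/* hypothetical helper */
		if (!IS_ERR(acl))
			set_cached_acl(inode, type, acl);
		return acl;
	}

Note that on a cache hit get_cached_acl() hands back its own posix_acl_dup()'d reference, taken under i_lock, so callers still drop it with posix_acl_release() exactly as before.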
Signed-off-by: Al Viro --- fs/btrfs/acl.c | 44 +++++-------------------- fs/ext2/acl.c | 79 ++++++++++---------------------------------- fs/ext3/acl.c | 83 +++++++++++------------------------------------ fs/ext4/acl.c | 65 +++++-------------------------------- fs/jffs2/acl.c | 60 +++++++--------------------------- fs/jfs/acl.c | 32 ++++++++---------- fs/jfs/xattr.c | 10 ++---- fs/reiserfs/xattr_acl.c | 49 ++++++---------------------- fs/ubifs/xattr.c | 2 +- include/linux/posix_acl.h | 64 ++++++++++++++++++++++++++++++++++++ 10 files changed, 155 insertions(+), 333 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6db8a42a3e5..f128427b995 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -29,51 +29,28 @@ #ifdef CONFIG_FS_POSIX_ACL -static void btrfs_update_cached_acl(struct inode *inode, - struct posix_acl **p_acl, - struct posix_acl *acl) -{ - spin_lock(&inode->i_lock); - if (*p_acl && *p_acl != ACL_NOT_CACHED) - posix_acl_release(*p_acl); - *p_acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); -} - static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { int size; const char *name; char *value = NULL; - struct posix_acl *acl = NULL, **p_acl; + struct posix_acl *acl; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; - p_acl = &inode->i_acl; break; case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &inode->i_default_acl; break; default: - return ERR_PTR(-EINVAL); + BUG(); } - /* Handle the cached NULL acl case without locking */ - acl = ACCESS_ONCE(*p_acl); - if (!acl) - return acl; - - spin_lock(&inode->i_lock); - acl = *p_acl; - if (acl != ACL_NOT_CACHED) - acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); - - if (acl != ACL_NOT_CACHED) - return acl; - size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_NOFS); @@ -82,13 +59,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); if (size > 0) { acl = posix_acl_from_xattr(value, size); - btrfs_update_cached_acl(inode, p_acl, acl); + set_cached_acl(inode, type, acl); } kfree(value); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; - btrfs_update_cached_acl(inode, p_acl, acl); + set_cached_acl(inode, type, acl); } else { acl = ERR_PTR(-EIO); } @@ -121,7 +98,6 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; const char *name; - struct posix_acl **p_acl; char *value = NULL; mode_t mode; @@ -141,13 +117,11 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) ret = 0; inode->i_mode = mode; name = POSIX_ACL_XATTR_ACCESS; - p_acl = &inode->i_acl; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? 
-EINVAL : 0; name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &inode->i_default_acl; break; default: return -EINVAL; @@ -172,7 +146,7 @@ out: kfree(value); if (!ret) - btrfs_update_cached_acl(inode, p_acl, acl); + set_cached_acl(inode, type, acl); return ret; } diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d2ffddc1211..d636e1297ca 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -125,30 +125,6 @@ fail: return ERR_PTR(-EINVAL); } -static inline struct posix_acl * -ext2_iget_acl(struct inode *inode, struct posix_acl **i_acl) -{ - struct posix_acl *acl = ACL_NOT_CACHED; - - spin_lock(&inode->i_lock); - if (*i_acl != ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); - - return acl; -} - -static inline void -ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl, - struct posix_acl *acl) -{ - spin_lock(&inode->i_lock); - if (*i_acl != ACL_NOT_CACHED) - posix_acl_release(*i_acl); - *i_acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); -} - /* * inode->i_mutex: don't care */ @@ -163,23 +139,19 @@ ext2_get_acl(struct inode *inode, int type) if (!test_opt(inode->i_sb, POSIX_ACL)) return NULL; - switch(type) { - case ACL_TYPE_ACCESS: - acl = ext2_iget_acl(inode, &inode->i_acl); - if (acl != ACL_NOT_CACHED) - return acl; - name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - acl = ext2_iget_acl(inode, &inode->i_default_acl); - if (acl != ACL_NOT_CACHED) - return acl; - name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - - default: - return ERR_PTR(-EINVAL); + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); } retval = ext2_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { @@ -196,17 +168,9 @@ ext2_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); kfree(value); - if (!IS_ERR(acl)) { - switch(type) { - case ACL_TYPE_ACCESS: - ext2_iset_acl(inode, &inode->i_acl, acl); - break; + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); - case ACL_TYPE_DEFAULT: - ext2_iset_acl(inode, &inode->i_default_acl, acl); - break; - } - } return acl; } @@ -261,17 +225,8 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) error = ext2_xattr_set(inode, name_index, "", value, size, 0); kfree(value); - if (!error) { - switch(type) { - case ACL_TYPE_ACCESS: - ext2_iset_acl(inode, &inode->i_acl, acl); - break; - - case ACL_TYPE_DEFAULT: - ext2_iset_acl(inode, &inode->i_default_acl, acl); - break; - } - } + if (!error) + set_cached_acl(inode, type, acl); return error; } diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index a9707689d9e..e167bae37ef 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -126,33 +126,6 @@ fail: return ERR_PTR(-EINVAL); } -static inline struct posix_acl * -ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl) -{ - struct posix_acl *acl = ACCESS_ONCE(*i_acl); - - if (acl) { - spin_lock(&inode->i_lock); - acl = *i_acl; - if (acl != ACL_NOT_CACHED) - acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); - } - - return acl; -} - -static inline void -ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl, - struct posix_acl *acl) -{ - spin_lock(&inode->i_lock); - if (*i_acl != ACL_NOT_CACHED) - posix_acl_release(*i_acl); - *i_acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); -} - /* * Inode operation get_posix_acl(). 
* @@ -169,24 +142,21 @@ ext3_get_acl(struct inode *inode, int type) if (!test_opt(inode->i_sb, POSIX_ACL)) return NULL; - switch(type) { - case ACL_TYPE_ACCESS: - acl = ext3_iget_acl(inode, &inode->i_acl); - if (acl != ACL_NOT_CACHED) - return acl; - name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - acl = ext3_iget_acl(inode, &inode->i_default_acl); - if (acl != ACL_NOT_CACHED) - return acl; - name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - - default: - return ERR_PTR(-EINVAL); + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); } + retval = ext3_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { value = kmalloc(retval, GFP_NOFS); @@ -202,17 +172,9 @@ ext3_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); kfree(value); - if (!IS_ERR(acl)) { - switch(type) { - case ACL_TYPE_ACCESS: - ext3_iset_acl(inode, &inode->i_acl, acl); - break; + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); - case ACL_TYPE_DEFAULT: - ext3_iset_acl(inode, &inode->i_default_acl, acl); - break; - } - } return acl; } @@ -269,17 +231,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, value, size, 0); kfree(value); - if (!error) { - switch(type) { - case ACL_TYPE_ACCESS: - ext3_iset_acl(inode, &inode->i_acl, acl); - break; - case ACL_TYPE_DEFAULT: - ext3_iset_acl(inode, &inode->i_default_acl, acl); - break; - } - } + if (!error) + set_cached_acl(inode, type, acl); + return error; } diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 0084e3a19d8..f6d8967149c 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -126,33 +126,6 @@ fail: return ERR_PTR(-EINVAL); } -static inline struct posix_acl * -ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl) -{ - struct posix_acl *acl = ACCESS_ONCE(*i_acl); - - if (acl) { - spin_lock(&inode->i_lock); - acl = *i_acl; - if (acl != ACL_NOT_CACHED) - acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); - } - - return acl; -} - -static inline void -ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl, - struct posix_acl *acl) -{ - spin_lock(&inode->i_lock); - if (*i_acl != ACL_NOT_CACHED) - posix_acl_release(*i_acl); - *i_acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); -} - /* * Inode operation get_posix_acl(). 
* @@ -169,23 +142,19 @@ ext4_get_acl(struct inode *inode, int type) if (!test_opt(inode->i_sb, POSIX_ACL)) return NULL; + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + switch (type) { case ACL_TYPE_ACCESS: - acl = ext4_iget_acl(inode, &inode->i_acl); - if (acl != ACL_NOT_CACHED) - return acl; name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; - case ACL_TYPE_DEFAULT: - acl = ext4_iget_acl(inode, &inode->i_default_acl); - if (acl != ACL_NOT_CACHED) - return acl; name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; break; - default: - return ERR_PTR(-EINVAL); + BUG(); } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { @@ -202,17 +171,9 @@ ext4_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); kfree(value); - if (!IS_ERR(acl)) { - switch (type) { - case ACL_TYPE_ACCESS: - ext4_iset_acl(inode, &inode->i_acl, acl); - break; + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); - case ACL_TYPE_DEFAULT: - ext4_iset_acl(inode, &inode->i_default_acl, acl); - break; - } - } return acl; } @@ -269,17 +230,9 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, value, size, 0); kfree(value); - if (!error) { - switch (type) { - case ACL_TYPE_ACCESS: - ext4_iset_acl(inode, &inode->i_acl, acl); - break; + if (!error) + set_cached_acl(inode, type, acl); - case ACL_TYPE_DEFAULT: - ext4_iset_acl(inode, &inode->i_default_acl, acl); - break; - } - } return error; } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index ac16589ebbd..edd2ad6416d 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -156,47 +156,25 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) return ERR_PTR(-EINVAL); } -static struct posix_acl *jffs2_iget_acl(struct inode *inode, struct posix_acl **i_acl) -{ - struct posix_acl *acl = ACL_NOT_CACHED; - - spin_lock(&inode->i_lock); - if (*i_acl != ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); - return acl; -} - -static void jffs2_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl) -{ - spin_lock(&inode->i_lock); - if (*i_acl != ACL_NOT_CACHED) - posix_acl_release(*i_acl); - *i_acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); -} - static struct posix_acl *jffs2_get_acl(struct inode *inode, int type) { struct posix_acl *acl; char *value = NULL; int rc, xprefix; + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + switch (type) { case ACL_TYPE_ACCESS: - acl = jffs2_iget_acl(inode, &inode->i_acl); - if (acl != ACL_NOT_CACHED) - return acl; xprefix = JFFS2_XPREFIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - acl = jffs2_iget_acl(inode, &inode->i_default_acl); - if (acl != ACL_NOT_CACHED) - return acl; xprefix = JFFS2_XPREFIX_ACL_DEFAULT; break; default: - return ERR_PTR(-EINVAL); + BUG(); } rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0); if (rc > 0) { @@ -214,16 +192,8 @@ static struct posix_acl *jffs2_get_acl(struct inode *inode, int type) } if (value) kfree(value); - if (!IS_ERR(acl)) { - switch (type) { - case ACL_TYPE_ACCESS: - jffs2_iset_acl(inode, &inode->i_acl, acl); - break; - case ACL_TYPE_DEFAULT: - jffs2_iset_acl(inode, &inode->i_default_acl, acl); - break; - } - } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); return acl; } @@ -283,16 +253,8 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return -EINVAL; } rc = __jffs2_set_acl(inode, xprefix, acl); - if (!rc) { - switch(type) { - case ACL_TYPE_ACCESS: - jffs2_iset_acl(inode, &inode->i_acl, 
acl); - break; - case ACL_TYPE_DEFAULT: - jffs2_iset_acl(inode, &inode->i_default_acl, acl); - break; - } - } + if (!rc) + set_cached_acl(inode, type, acl); return rc; } @@ -336,7 +298,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) *i_mode &= ~current_umask(); } else { if (S_ISDIR(*i_mode)) - jffs2_iset_acl(inode, &inode->i_default_acl, acl); + set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); clone = posix_acl_clone(acl, GFP_KERNEL); if (!clone) @@ -347,7 +309,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) return rc; } if (rc > 0) - jffs2_iset_acl(inode, &inode->i_acl, clone); + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); posix_acl_release(clone); } diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 5fcfc9857c1..f272bf032e1 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -31,26 +31,24 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) { struct posix_acl *acl; char *ea_name; - struct posix_acl **p_acl; int size; char *value = NULL; + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + switch(type) { case ACL_TYPE_ACCESS: ea_name = POSIX_ACL_XATTR_ACCESS; - p_acl = &inode->i_acl; break; case ACL_TYPE_DEFAULT: ea_name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &inode->i_default_acl; break; default: return ERR_PTR(-EINVAL); } - if (*p_acl != ACL_NOT_CACHED) - return posix_acl_dup(*p_acl); - size = __jfs_getxattr(inode, ea_name, NULL, 0); if (size > 0) { @@ -61,17 +59,18 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) } if (size < 0) { - if (size == -ENODATA) { - *p_acl = NULL; + if (size == -ENODATA) acl = NULL; - } else + else acl = ERR_PTR(size); } else { acl = posix_acl_from_xattr(value, size); - if (!IS_ERR(acl)) - *p_acl = posix_acl_dup(acl); } kfree(value); + if (!IS_ERR(acl)) { + set_cached_acl(inode, type, acl); + posix_acl_release(acl); + } return acl; } @@ -79,7 +78,6 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type, struct posix_acl *acl) { char *ea_name; - struct posix_acl **p_acl; int rc; int size = 0; char *value = NULL; @@ -90,11 +88,9 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type, switch(type) { case ACL_TYPE_ACCESS: ea_name = POSIX_ACL_XATTR_ACCESS; - p_acl = &inode->i_acl; break; case ACL_TYPE_DEFAULT: ea_name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &inode->i_default_acl; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; @@ -114,11 +110,9 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type, out: kfree(value); - if (!rc) { - if (*p_acl && (*p_acl != ACL_NOT_CACHED)) - posix_acl_release(*p_acl); - *p_acl = posix_acl_dup(acl); - } + if (!rc) + set_cached_acl(inode, type, acl); + return rc; } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index f6e90e34359..fad364548bc 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -727,10 +727,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, /* * We're changing the ACL. Get rid of the cached one */ - acl =inode->i_acl; - if (acl != ACL_NOT_CACHED) - posix_acl_release(acl); - inode->i_acl = ACL_NOT_CACHED; + forget_cached_acl(inode, ACL_TYPE_ACCESS); return 0; } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { @@ -746,10 +743,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, /* * We're changing the default ACL. 
Get rid of the cached one */ - acl = inode->i_default_acl; - if (acl && (acl != ACL_NOT_CACHED)) - posix_acl_release(acl); - inode->i_default_acl = ACL_NOT_CACHED; + forget_cached_acl(inode, ACL_TYPE_DEFAULT); return 0; } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index b6e473faa8b..35d6e672a27 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -188,29 +188,6 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) return ERR_PTR(-EINVAL); } -static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl, - struct posix_acl *acl) -{ - spin_lock(&inode->i_lock); - if (*i_acl != ACL_NOT_CACHED) - posix_acl_release(*i_acl); - *i_acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); -} - -static inline struct posix_acl *iget_acl(struct inode *inode, - struct posix_acl **i_acl) -{ - struct posix_acl *acl = ACL_NOT_CACHED; - - spin_lock(&inode->i_lock); - if (*i_acl != ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); - - return acl; -} - /* * Inode operation get_posix_acl(). * @@ -220,31 +197,29 @@ static inline struct posix_acl *iget_acl(struct inode *inode, struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) { char *name, *value; - struct posix_acl *acl, **p_acl; + struct posix_acl *acl; int size; int retval; + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; - p_acl = &inode->i_acl; break; case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &inode->i_default_acl; break; default: - return ERR_PTR(-EINVAL); + BUG(); } - acl = iget_acl(inode, p_acl); - if (acl != ACL_NOT_CACHED) - return acl; - size = reiserfs_xattr_get(inode, name, NULL, 0); if (size < 0) { if (size == -ENODATA || size == -ENOSYS) { - *p_acl = NULL; + set_cached_acl(inode, type, NULL); return NULL; } return ERR_PTR(size); @@ -259,14 +234,13 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) /* This shouldn't actually happen as it should have been caught above.. but just in case */ acl = NULL; - *p_acl = acl; } else if (retval < 0) { acl = ERR_PTR(retval); } else { acl = posix_acl_from_disk(value, retval); - if (!IS_ERR(acl)) - iset_acl(inode, p_acl, acl); } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); kfree(value); return acl; @@ -284,7 +258,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, { char *name; void *value = NULL; - struct posix_acl **p_acl; size_t size = 0; int error; @@ -294,7 +267,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; - p_acl = &inode->i_acl; if (acl) { mode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); @@ -309,7 +281,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, break; case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; - p_acl = &inode->i_default_acl; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; @@ -342,7 +313,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, kfree(value); if (!error) - iset_acl(inode, p_acl, acl); + set_cached_acl(inode, type, acl); return error; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index cfd31e229c8..adafcf55653 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -55,9 +55,9 @@ * ACL support is not implemented. 
*/ +#include "ubifs.h" #include #include -#include "ubifs.h" /* * Limit the number of extended attributes per inode so that the total size diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 4bc241290c2..0cdba01b775 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -83,4 +83,68 @@ extern int posix_acl_chmod_masq(struct posix_acl *, mode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); +static inline struct posix_acl *get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p, *acl; + switch (type) { + case ACL_TYPE_ACCESS: + p = &inode->i_acl; + break; + case ACL_TYPE_DEFAULT: + p = &inode->i_default_acl; + break; + default: + return ERR_PTR(-EINVAL); + } + acl = ACCESS_ONCE(*p); + if (acl) { + spin_lock(&inode->i_lock); + acl = *p; + if (acl != ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } + return acl; +} + +static inline void set_cached_acl(struct inode *inode, + int type, + struct posix_acl *acl) +{ + struct posix_acl *old = NULL; + spin_lock(&inode->i_lock); + switch (type) { + case ACL_TYPE_ACCESS: + old = inode->i_acl; + inode->i_acl = posix_acl_dup(acl); + break; + case ACL_TYPE_DEFAULT: + old = inode->i_default_acl; + inode->i_default_acl = posix_acl_dup(acl); + break; + } + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} + +static inline void forget_cached_acl(struct inode *inode, int type) +{ + struct posix_acl *old = NULL; + spin_lock(&inode->i_lock); + switch (type) { + case ACL_TYPE_ACCESS: + old = inode->i_acl; + inode->i_acl = ACL_NOT_CACHED; + break; + case ACL_TYPE_DEFAULT: + old = inode->i_default_acl; + inode->i_default_acl = ACL_NOT_CACHED; + break; + } + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} + #endif /* __LINUX_POSIX_ACL_H */ -- cgit v1.2.3-70-g09d2 From 641cf4a668e9e69d2bc061e953422ff72a91f86e Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Wed, 24 Jun 2009 22:28:52 +0200 Subject: inline functions left without protection of ifdef (acl) Signed-off-by: Al Viro --- include/linux/posix_acl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 0cdba01b775..c513466c7dc 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -83,6 +83,7 @@ extern int posix_acl_chmod_masq(struct posix_acl *, mode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); +#ifdef CONFIG_FS_POSIX_ACL static inline struct posix_acl *get_cached_acl(struct inode *inode, int type) { struct posix_acl **p, *acl; @@ -146,5 +147,5 @@ static inline void forget_cached_acl(struct inode *inode, int type) if (old != ACL_NOT_CACHED) posix_acl_release(old); } - +#endif #endif /* __LINUX_POSIX_ACL_H */ -- cgit v1.2.3-70-g09d2 From 72c04902d1e27c8a324014cff1d4475c11b1cecd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 24 Jun 2009 16:58:48 -0400 Subject: Get "no acls for this inode" right, fix shmem breakage Signed-off-by: Al Viro --- fs/btrfs/inode.c | 6 ++---- fs/jffs2/acl.c | 3 +-- include/linux/posix_acl.h | 9 +++++++++ mm/shmem.c | 5 +---- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78ad38ddd01..dbe1aabf96c 100644 --- a/fs/btrfs/inode.c +++ 
b/fs/btrfs/inode.c @@ -2122,10 +2122,8 @@ static void btrfs_read_locked_inode(struct inode *inode) * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); - if (!maybe_acls) { - inode->i_acl = NULL; - inode->i_default_acl = NULL; - } + if (!maybe_acls) + cache_no_acl(inode); BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index edd2ad6416d..8fcb6239218 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -284,8 +284,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) struct posix_acl *acl, *clone; int rc; - inode->i_default_acl = NULL; - inode->i_acl = NULL; + cache_no_acl(inode); if (S_ISLNK(*i_mode)) return 0; /* Symlink always has no-ACL */ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index c513466c7dc..065a3652a3e 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -148,4 +148,13 @@ static inline void forget_cached_acl(struct inode *inode, int type) posix_acl_release(old); } #endif + +static inline void cache_no_acl(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = NULL; + inode->i_default_acl = NULL; +#endif +} + #endif /* __LINUX_POSIX_ACL_H */ diff --git a/mm/shmem.c b/mm/shmem.c index 5f2019fc789..d713239ce2c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1558,6 +1558,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); + cache_no_acl(inode); switch (mode & S_IFMT) { default: @@ -2379,10 +2380,6 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); if (!p) return NULL; -#ifdef CONFIG_TMPFS_POSIX_ACL - p->vfs_inode.i_acl = NULL; - p->vfs_inode.i_default_acl = NULL; -#endif return &p->vfs_inode; } -- cgit v1.2.3-70-g09d2 From 0803172778901e24a75ab074798d98c2b7411559 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 8 Sep 2009 17:53:04 -0700 Subject: dmaengine: kill tx_list The tx_list attribute of struct dma_async_tx_descriptor is common to most, but not all, DMA driver implementations. None of the upper-level code (dmaengine/async_tx) uses it, so allow drivers to implement it locally if they need it. This saves sizeof(struct list_head) bytes for drivers that do not manage descriptors with a linked list (e.g. ioatdma v2 and v3).
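A driver that still wants to chain descriptors can keep the list head in its own descriptor wrapper instead. An illustrative sketch, not taken from any in-tree driver (struct mydma_desc and mydma_queue_desc() are invented names):

	struct mydma_desc {
		struct dma_async_tx_descriptor txd;
		struct list_head node;	/* driver-local stand-in for tx_list */
	};

	static void mydma_queue_desc(struct list_head *pending,
				     struct mydma_desc *desc)
	{
		/* Chain the descriptor on the driver's own pending list. */
		list_add_tail(&desc->node, pending);
	}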
Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 1 - include/linux/dmaengine.h | 3 --- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 5a87384ea4f..562d182eae6 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -933,7 +933,6 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, { tx->chan = chan; spin_lock_init(&tx->lock); - INIT_LIST_HEAD(&tx->tx_list); } EXPORT_SYMBOL(dma_async_tx_descriptor_init); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ffefba81c81..f114bc7790b 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -180,8 +180,6 @@ typedef void (*dma_async_tx_callback)(void *dma_async_param); * @flags: flags to augment operation preparation, control completion, and * communicate status * @phys: physical address of the descriptor - * @tx_list: driver common field for operations that require multiple - * descriptors * @chan: target channel for this operation * @tx_submit: set the prepared descriptor(s) to be executed by the engine * @callback: routine to call after this operation is complete @@ -195,7 +193,6 @@ struct dma_async_tx_descriptor { dma_cookie_t cookie; enum dma_ctrl_flags flags; /* not a 'long' to pack with cookie */ dma_addr_t phys; - struct list_head tx_list; struct dma_chan *chan; dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx); dma_async_tx_callback callback; -- cgit v1.2.3-70-g09d2